SIISelLowering.cpp source code [llvm_projects/llvm/lib/Target/AMDGPU/SIISelLowering.cpp]

1	//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// Custom DAG lowering for SI
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "SIISelLowering.h"
15	#include "AMDGPU.h"
16	#include "AMDGPUInstrInfo.h"
17	#include "AMDGPULaneMaskUtils.h"
18	#include "AMDGPUSelectionDAGInfo.h"
19	#include "AMDGPUTargetMachine.h"
20	#include "GCNSubtarget.h"
21	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22	#include "SIMachineFunctionInfo.h"
23	#include "SIRegisterInfo.h"
24	#include "llvm/ADT/APInt.h"
25	#include "llvm/ADT/FloatingPointMode.h"
26	#include "llvm/ADT/Statistic.h"
27	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
28	#include "llvm/Analysis/UniformityAnalysis.h"
29	#include "llvm/CodeGen/Analysis.h"
30	#include "llvm/CodeGen/ByteProvider.h"
31	#include "llvm/CodeGen/FunctionLoweringInfo.h"
32	#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
33	#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
34	#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
35	#include "llvm/CodeGen/MachineFrameInfo.h"
36	#include "llvm/CodeGen/MachineFunction.h"
37	#include "llvm/CodeGen/MachineLoopInfo.h"
38	#include "llvm/CodeGen/PseudoSourceValueManager.h"
39	#include "llvm/CodeGen/SDPatternMatch.h"
40	#include "llvm/IR/DiagnosticInfo.h"
41	#include "llvm/IR/IRBuilder.h"
42	#include "llvm/IR/IntrinsicInst.h"
43	#include "llvm/IR/IntrinsicsAMDGPU.h"
44	#include "llvm/IR/IntrinsicsR600.h"
45	#include "llvm/IR/MDBuilder.h"
46	#include "llvm/Support/CommandLine.h"
47	#include "llvm/Support/KnownBits.h"
48	#include "llvm/Support/ModRef.h"
49	#include "llvm/Transforms/Utils/LowerAtomic.h"
50	#include <optional>
51
52	using namespace llvm;
53	using namespace llvm::SDPatternMatch;
54
55	#define DEBUG_TYPE "si-lower"
56
57	STATISTIC(NumTailCalls, "Number of tail calls");
58
59	static cl::opt<bool>
60	DisableLoopAlignment("amdgpu-disable-loop-alignment",
61	cl::desc ("Do not align and prefetch loops"),
62	cl::init(Val: false));
63
64	static cl::opt<bool> UseDivergentRegisterIndexing(
65	"amdgpu-use-divergent-register-indexing", cl::Hidden,
66	cl::desc ("Use indirect register addressing for divergent indexes"),
67	cl::init(Val: false));
68
69	static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
70	const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
71	return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
72	}
73
74	static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
75	const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
76	return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
77	}
78
79	static unsigned findFirstFreeSGPR(CCState &CCInfo) {
80	unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
81	for (unsigned Reg = `0`; Reg < NumSGPRs; ++Reg) {
82	if (!CCInfo.isAllocated(Reg: AMDGPU::SGPR0 + Reg)) {
83	return AMDGPU::SGPR0 + Reg;
84	}
85	}
86	llvm_unreachable("Cannot allocate sgpr");
87	}
88
89	SITargetLowering::SITargetLowering(const TargetMachine &TM,
90	const GCNSubtarget &STI)
91	: AMDGPUTargetLowering (TM, STI, STI), Subtarget(&STI) {
92	addRegisterClass(VT: MVT::i1, RC: &AMDGPU::VReg_1RegClass);
93	addRegisterClass(VT: MVT::i64, RC: &AMDGPU::SReg_64RegClass);
94
95	addRegisterClass(VT: MVT::i32, RC: &AMDGPU::SReg_32RegClass);
96
97	const SIRegisterInfo *TRI = STI.getRegisterInfo();
98	const TargetRegisterClass *V32RegClass =
99	TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: `32`);
100	addRegisterClass(VT: MVT::f32, RC: V32RegClass);
101
102	addRegisterClass(VT: MVT::v2i32, RC: &AMDGPU::SReg_64RegClass);
103
104	const TargetRegisterClass *V64RegClass =
105	TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: `64`);
106
107	addRegisterClass(VT: MVT::f64, RC: V64RegClass);
108	addRegisterClass(VT: MVT::v2f32, RC: V64RegClass);
109	addRegisterClass(VT: MVT::Untyped, RC: V64RegClass);
110
111	addRegisterClass(VT: MVT::v3i32, RC: &AMDGPU::SGPR_96RegClass);
112	addRegisterClass(VT: MVT::v3f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: `96`));
113
114	addRegisterClass(VT: MVT::v2i64, RC: &AMDGPU::SGPR_128RegClass);
115	addRegisterClass(VT: MVT::v2f64, RC: &AMDGPU::SGPR_128RegClass);
116
117	addRegisterClass(VT: MVT::v4i32, RC: &AMDGPU::SGPR_128RegClass);
118	addRegisterClass(VT: MVT::v4f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: `128`));
119
120	addRegisterClass(VT: MVT::v5i32, RC: &AMDGPU::SGPR_160RegClass);
121	addRegisterClass(VT: MVT::v5f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: `160`));
122
123	addRegisterClass(VT: MVT::v6i32, RC: &AMDGPU::SGPR_192RegClass);
124	addRegisterClass(VT: MVT::v6f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: `192`));
125
126	addRegisterClass(VT: MVT::v3i64, RC: &AMDGPU::SGPR_192RegClass);
127	addRegisterClass(VT: MVT::v3f64, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: `192`));
128
129	addRegisterClass(VT: MVT::v7i32, RC: &AMDGPU::SGPR_224RegClass);
130	addRegisterClass(VT: MVT::v7f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: `224`));
131
132	addRegisterClass(VT: MVT::v8i32, RC: &AMDGPU::SGPR_256RegClass);
133	addRegisterClass(VT: MVT::v8f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: `256`));
134
135	addRegisterClass(VT: MVT::v4i64, RC: &AMDGPU::SGPR_256RegClass);
136	addRegisterClass(VT: MVT::v4f64, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: `256`));
137
138	addRegisterClass(VT: MVT::v9i32, RC: &AMDGPU::SGPR_288RegClass);
139	addRegisterClass(VT: MVT::v9f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: `288`));
140
141	addRegisterClass(VT: MVT::v10i32, RC: &AMDGPU::SGPR_320RegClass);
142	addRegisterClass(VT: MVT::v10f32,
143	RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: `320`));
144
145	addRegisterClass(VT: MVT::v11i32, RC: &AMDGPU::SGPR_352RegClass);
146	addRegisterClass(VT: MVT::v11f32,
147	RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: `352`));
148
149	addRegisterClass(VT: MVT::v12i32, RC: &AMDGPU::SGPR_384RegClass);
150	addRegisterClass(VT: MVT::v12f32,
151	RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: `384`));
152
153	addRegisterClass(VT: MVT::v16i32, RC: &AMDGPU::SGPR_512RegClass);
154	addRegisterClass(VT: MVT::v16f32,
155	RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: `512`));
156
157	addRegisterClass(VT: MVT::v8i64, RC: &AMDGPU::SGPR_512RegClass);
158	addRegisterClass(VT: MVT::v8f64, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: `512`));
159
160	addRegisterClass(VT: MVT::v16i64, RC: &AMDGPU::SGPR_1024RegClass);
161	addRegisterClass(VT: MVT::v16f64,
162	RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: `1024`));
163
164	if (Subtarget->has16BitInsts()) {
165	if (Subtarget->useRealTrue16Insts()) {
166	addRegisterClass(VT: MVT::i16, RC: &AMDGPU::VGPR_16RegClass);
167	addRegisterClass(VT: MVT::f16, RC: &AMDGPU::VGPR_16RegClass);
168	addRegisterClass(VT: MVT::bf16, RC: &AMDGPU::VGPR_16RegClass);
169	} else {
170	addRegisterClass(VT: MVT::i16, RC: &AMDGPU::SReg_32RegClass);
171	addRegisterClass(VT: MVT::f16, RC: &AMDGPU::SReg_32RegClass);
172	addRegisterClass(VT: MVT::bf16, RC: &AMDGPU::SReg_32RegClass);
173	}
174
175	// Unless there are also VOP3P operations, no operations are really legal.
176	addRegisterClass(VT: MVT::v2i16, RC: &AMDGPU::SReg_32RegClass);
177	addRegisterClass(VT: MVT::v2f16, RC: &AMDGPU::SReg_32RegClass);
178	addRegisterClass(VT: MVT::v2bf16, RC: &AMDGPU::SReg_32RegClass);
179	addRegisterClass(VT: MVT::v4i16, RC: &AMDGPU::SReg_64RegClass);
180	addRegisterClass(VT: MVT::v4f16, RC: &AMDGPU::SReg_64RegClass);
181	addRegisterClass(VT: MVT::v4bf16, RC: &AMDGPU::SReg_64RegClass);
182	addRegisterClass(VT: MVT::v8i16, RC: &AMDGPU::SGPR_128RegClass);
183	addRegisterClass(VT: MVT::v8f16, RC: &AMDGPU::SGPR_128RegClass);
184	addRegisterClass(VT: MVT::v8bf16, RC: &AMDGPU::SGPR_128RegClass);
185	addRegisterClass(VT: MVT::v16i16, RC: &AMDGPU::SGPR_256RegClass);
186	addRegisterClass(VT: MVT::v16f16, RC: &AMDGPU::SGPR_256RegClass);
187	addRegisterClass(VT: MVT::v16bf16, RC: &AMDGPU::SGPR_256RegClass);
188	addRegisterClass(VT: MVT::v32i16, RC: &AMDGPU::SGPR_512RegClass);
189	addRegisterClass(VT: MVT::v32f16, RC: &AMDGPU::SGPR_512RegClass);
190	addRegisterClass(VT: MVT::v32bf16, RC: &AMDGPU::SGPR_512RegClass);
191	}
192
193	addRegisterClass(VT: MVT::v32i32, RC: &AMDGPU::VReg_1024RegClass);
194	addRegisterClass(VT: MVT::v32f32,
195	RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: `1024`));
196
197	computeRegisterProperties(TRI: Subtarget->getRegisterInfo());
198
199	setMinFunctionAlignment(Align (`4`));
200	setPrefFunctionAlignment(Align (STI.getInstCacheLineSize()));
201
202	// The boolean content concept here is too inflexible. Compares only ever
203	// really produce a 1-bit result. Any copy/extend from these will turn into a
204	// select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
205	// it's what most targets use.
206	setBooleanContents(ZeroOrOneBooleanContent);
207	setBooleanVectorContents(ZeroOrOneBooleanContent);
208
209	// We need to custom lower vector stores from local memory
210	setOperationAction(Ops: ISD::LOAD,
211	VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
212	MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
213	MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
214	MVT::i1, MVT::v32i32},
215	Action: Custom);
216
217	setOperationAction(Ops: ISD::STORE,
218	VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
219	MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
220	MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
221	MVT::i1, MVT::v32i32},
222	Action: Custom);
223
224	if (isTypeLegal(VT: MVT::bf16)) {
225	for (unsigned Opc :
226	{ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
227	ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
228	ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
229	ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
230	ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
231	ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
232	ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
233	ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
234	ISD::SETCC}) {
235	setOperationAction(Op: Opc, VT: MVT::bf16, Action: Promote);
236	}
237
238	setOperationAction(Op: ISD::FP_ROUND, VT: MVT::bf16, Action: Expand);
239
240	setOperationAction(Op: ISD::SELECT, VT: MVT::bf16, Action: Promote);
241	AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::bf16, DestVT: MVT::i16);
242
243	setOperationAction(Op: ISD::FABS, VT: MVT::bf16, Action: Legal);
244	setOperationAction(Op: ISD::FNEG, VT: MVT::bf16, Action: Legal);
245	setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::bf16, Action: Legal);
246
247	// We only need to custom lower because we can't specify an action for bf16
248	// sources.
249	setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i32, Action: Custom);
250	setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i32, Action: Custom);
251	}
252
253	setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i16, Action: Expand);
254	setTruncStoreAction(ValVT: MVT::v3i32, MemVT: MVT::v3i16, Action: Expand);
255	setTruncStoreAction(ValVT: MVT::v4i32, MemVT: MVT::v4i16, Action: Expand);
256	setTruncStoreAction(ValVT: MVT::v8i32, MemVT: MVT::v8i16, Action: Expand);
257	setTruncStoreAction(ValVT: MVT::v16i32, MemVT: MVT::v16i16, Action: Expand);
258	setTruncStoreAction(ValVT: MVT::v32i32, MemVT: MVT::v32i16, Action: Expand);
259	setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i8, Action: Expand);
260	setTruncStoreAction(ValVT: MVT::v4i32, MemVT: MVT::v4i8, Action: Expand);
261	setTruncStoreAction(ValVT: MVT::v8i32, MemVT: MVT::v8i8, Action: Expand);
262	setTruncStoreAction(ValVT: MVT::v16i32, MemVT: MVT::v16i8, Action: Expand);
263	setTruncStoreAction(ValVT: MVT::v32i32, MemVT: MVT::v32i8, Action: Expand);
264	setTruncStoreAction(ValVT: MVT::v2i16, MemVT: MVT::v2i8, Action: Expand);
265	setTruncStoreAction(ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Expand);
266	setTruncStoreAction(ValVT: MVT::v8i16, MemVT: MVT::v8i8, Action: Expand);
267	setTruncStoreAction(ValVT: MVT::v16i16, MemVT: MVT::v16i8, Action: Expand);
268	setTruncStoreAction(ValVT: MVT::v32i16, MemVT: MVT::v32i8, Action: Expand);
269
270	setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i16, Action: Expand);
271	setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i32, Action: Expand);
272	setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i8, Action: Expand);
273	setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i8, Action: Expand);
274	setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i16, Action: Expand);
275	setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i32, Action: Expand);
276	setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i32, Action: Expand);
277
278	setOperationAction(Ops: ISD::GlobalAddress, VTs: {MVT::i32, MVT::i64}, Action: Custom);
279	setOperationAction(Ops: ISD::ExternalSymbol, VTs: {MVT::i32, MVT::i64}, Action: Custom);
280
281	setOperationAction(Op: ISD::SELECT, VT: MVT::i1, Action: Promote);
282	setOperationAction(Op: ISD::SELECT, VT: MVT::i64, Action: Custom);
283	setOperationAction(Op: ISD::SELECT, VT: MVT::f64, Action: Promote);
284	AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::f64, DestVT: MVT::i64);
285
286	setOperationAction(Ops: ISD::FSQRT, VTs: {MVT::f32, MVT::f64}, Action: Custom);
287
288	setOperationAction(Ops: ISD::SELECT_CC,
289	VTs: {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Action: Expand);
290
291	setOperationAction(Op: ISD::SETCC, VT: MVT::i1, Action: Promote);
292	setOperationAction(Ops: ISD::SETCC, VTs: {MVT::v2i1, MVT::v4i1}, Action: Expand);
293	AddPromotedToType(Opc: ISD::SETCC, OrigVT: MVT::i1, DestVT: MVT::i32);
294
295	setOperationAction(Ops: ISD::TRUNCATE,
296	VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
297	MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
298	MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
299	Action: Expand);
300	setOperationAction(Ops: ISD::FP_ROUND,
301	VTs: {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
302	MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
303	MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
304	Action: Expand);
305
306	setOperationAction(Ops: ISD::SIGN_EXTEND_INREG,
307	VTs: {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
308	MVT::v3i16, MVT::v4i16, MVT::Other},
309	Action: Custom);
310
311	setOperationAction(Op: ISD::BRCOND, VT: MVT::Other, Action: Custom);
312	setOperationAction(Ops: ISD::BR_CC,
313	VTs: {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Action: Expand);
314
315	setOperationAction(Ops: {ISD::ABS, ISD::UADDO, ISD::USUBO}, VT: MVT::i32, Action: Legal);
316
317	setOperationAction(Ops: {ISD::UADDO_CARRY, ISD::USUBO_CARRY}, VT: MVT::i32, Action: Legal);
318
319	setOperationAction(Ops: {ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, VT: MVT::i64,
320	Action: Expand);
321
322	#if 0
323	setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal);
324	#endif
325
326	// We only support LOAD/STORE and vector manipulation ops for vectors
327	// with > 4 elements.
328	for (MVT VT :
329	{MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
330	MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
331	MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
332	MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
333	MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
334	MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
335	MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
336	MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
337	for (unsigned Op = `0`; Op < ISD::BUILTIN_OP_END; ++Op) {
338	switch (Op) {
339	case ISD::LOAD:
340	case ISD::STORE:
341	case ISD::BUILD_VECTOR:
342	case ISD::BITCAST:
343	case ISD::UNDEF:
344	case ISD::EXTRACT_VECTOR_ELT:
345	case ISD::INSERT_VECTOR_ELT:
346	case ISD::SCALAR_TO_VECTOR:
347	case ISD::IS_FPCLASS:
348	break;
349	case ISD::EXTRACT_SUBVECTOR:
350	case ISD::INSERT_SUBVECTOR:
351	case ISD::CONCAT_VECTORS:
352	setOperationAction(Op, VT, Action: Custom);
353	break;
354	default:
355	setOperationAction(Op, VT, Action: Expand);
356	break;
357	}
358	}
359	}
360
361	setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::v4f32, Action: Expand);
362
363	// TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
364	// is expanded to avoid having two separate loops in case the index is a VGPR.
365
366	// Most operations are naturally 32-bit vector operations. We only support
367	// load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
368	for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
369	setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
370	AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v4i32);
371
372	setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
373	AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v4i32);
374
375	setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
376	AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v4i32);
377
378	setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
379	AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v4i32);
380	}
381
382	for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
383	setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
384	AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v6i32);
385
386	setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
387	AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v6i32);
388
389	setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
390	AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v6i32);
391
392	setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
393	AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v6i32);
394	}
395
396	for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
397	setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
398	AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v8i32);
399
400	setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
401	AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v8i32);
402
403	setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
404	AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v8i32);
405
406	setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
407	AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v8i32);
408	}
409
410	for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
411	setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
412	AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v16i32);
413
414	setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
415	AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v16i32);
416
417	setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
418	AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v16i32);
419
420	setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
421	AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v16i32);
422	}
423
424	for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
425	setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
426	AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v32i32);
427
428	setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
429	AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v32i32);
430
431	setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
432	AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v32i32);
433
434	setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
435	AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v32i32);
436	}
437
438	setOperationAction(Ops: ISD::VECTOR_SHUFFLE,
439	VTs: {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
440	MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
441	Action: Custom);
442
443	if (Subtarget->hasPkMovB32()) {
444	// TODO: 16-bit element vectors should be legal with even aligned elements.
445	// TODO: Can be legal with wider source types than the result with
446	// subregister extracts.
447	setOperationAction(Ops: ISD::VECTOR_SHUFFLE, VTs: {MVT::v2i32, MVT::v2f32}, Action: Legal);
448	}
449
450	setOperationAction(Ops: {ISD::AND, ISD::OR, ISD::XOR}, VT: MVT::v2i32, Action: Legal);
451	// Prevent SELECT v2i32 from being implemented with the above bitwise ops and
452	// instead lower to cndmask in SITargetLowering::LowerSELECT().
453	setOperationAction(Op: ISD::SELECT, VT: MVT::v2i32, Action: Custom);
454	// Enable MatchRotate to produce ISD::ROTR, which is later transformed to
455	// alignbit.
456	setOperationAction(Op: ISD::ROTR, VT: MVT::v2i32, Action: Custom);
457
458	setOperationAction(Ops: ISD::BUILD_VECTOR, VTs: {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
459	Action: Custom);
460
461	// Avoid stack access for these.
462	// TODO: Generalize to more vector types.
463	setOperationAction(Ops: {ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
464	VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
465	MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
466	Action: Custom);
467
468	// Deal with vec3 vector operations when widened to vec4.
469	setOperationAction(Ops: ISD::INSERT_SUBVECTOR,
470	VTs: {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Action: Custom);
471
472	// Deal with vec5/6/7 vector operations when widened to vec8.
473	setOperationAction(Ops: ISD::INSERT_SUBVECTOR,
474	VTs: {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
475	MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
476	MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
477	MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
478	Action: Custom);
479
480	// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
481	// and output demarshalling
482	setOperationAction(Ops: ISD::ATOMIC_CMP_SWAP, VTs: {MVT::i32, MVT::i64}, Action: Custom);
483
484	// We can't return success/failure, only the old value,
485	// let LLVM add the comparison
486	setOperationAction(Ops: ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VTs: {MVT::i32, MVT::i64},
487	Action: Expand);
488
489	setOperationAction(Ops: ISD::ADDRSPACECAST, VTs: {MVT::i32, MVT::i64}, Action: Custom);
490
491	setOperationAction(Ops: ISD::BITREVERSE, VTs: {MVT::i32, MVT::i64}, Action: Legal);
492
493	// FIXME: This should be narrowed to i32, but that only happens if i64 is
494	// illegal.
495	// FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
496	setOperationAction(Ops: ISD::BSWAP, VTs: {MVT::i64, MVT::i32}, Action: Legal);
497
498	// On SI this is s_memtime and s_memrealtime on VI.
499	setOperationAction(Op: ISD::READCYCLECOUNTER, VT: MVT::i64, Action: Legal);
500
501	if (Subtarget->hasSMemRealTime() \|\|
502	Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
503	setOperationAction(Op: ISD::READSTEADYCOUNTER, VT: MVT::i64, Action: Legal);
504	setOperationAction(Ops: {ISD::TRAP, ISD::DEBUGTRAP}, VT: MVT::Other, Action: Custom);
505
506	if (Subtarget->has16BitInsts()) {
507	setOperationAction(Ops: {ISD::FPOW, ISD::FPOWI}, VT: MVT::f16, Action: Promote);
508	setOperationAction(Ops: {ISD::FLOG, ISD::FEXP, ISD::FLOG10}, VT: MVT::f16, Action: Custom);
509	setOperationAction(Ops: ISD::IS_FPCLASS, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Legal);
510	setOperationAction(Ops: {ISD::FLOG2, ISD::FEXP2}, VT: MVT::f16, Action: Legal);
511	setOperationAction(Op: ISD::FCANONICALIZE, VT: MVT::f16, Action: Legal);
512	} else {
513	setOperationAction(Op: ISD::FSQRT, VT: MVT::f16, Action: Custom);
514	}
515
516	if (Subtarget->hasMadMacF32Insts())
517	setOperationAction(Op: ISD::FMAD, VT: MVT::f32, Action: Legal);
518
519	setOperationAction(Ops: {ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT: MVT::i32, Action: Custom);
520	setOperationAction(Ops: {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, VT: MVT::i32, Action: Custom);
521
522	// We only really have 32-bit BFE instructions (and 16-bit on VI).
523	//
524	// On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
525	// effort to match them now. We want this to be false for i64 cases when the
526	// extraction isn't restricted to the upper or lower half. Ideally we would
527	// have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
528	// span the midpoint are probably relatively rare, so don't worry about them
529	// for now.
530	setHasExtractBitsInsn(true);
531
532	// Clamp modifier on add/sub
533	if (Subtarget->hasIntClamp())
534	setOperationAction(Ops: {ISD::UADDSAT, ISD::USUBSAT}, VT: MVT::i32, Action: Legal);
535
536	if (Subtarget->hasAddNoCarryInsts())
537	setOperationAction(Ops: {ISD::SADDSAT, ISD::SSUBSAT}, VTs: {MVT::i16, MVT::i32},
538	Action: Legal);
539
540	setOperationAction(
541	Ops: {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
542	VTs: {MVT::f32, MVT::f64}, Action: Custom);
543
544	// These are really only legal for ieee_mode functions. We should be avoiding
545	// them for functions that don't have ieee_mode enabled, so just say they are
546	// legal.
547	setOperationAction(Ops: {ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
548	VTs: {MVT::f32, MVT::f64}, Action: Legal);
549
550	if (Subtarget->haveRoundOpsF64())
551	setOperationAction(Ops: {ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, VT: MVT::f64,
552	Action: Legal);
553	else
554	setOperationAction(Ops: {ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
555	VT: MVT::f64, Action: Custom);
556
557	setOperationAction(Op: ISD::FFLOOR, VT: MVT::f64, Action: Legal);
558	setOperationAction(Ops: {ISD::FLDEXP, ISD::STRICT_FLDEXP}, VTs: {MVT::f32, MVT::f64},
559	Action: Legal);
560	setOperationAction(Ops: ISD::FFREXP, VTs: {MVT::f32, MVT::f64}, Action: Custom);
561
562	setOperationAction(Ops: {ISD::FSIN, ISD::FCOS, ISD::FDIV}, VT: MVT::f32, Action: Custom);
563	setOperationAction(Op: ISD::FDIV, VT: MVT::f64, Action: Custom);
564
565	setOperationAction(Ops: ISD::BF16_TO_FP, VTs: {MVT::i16, MVT::f32, MVT::f64}, Action: Expand);
566	setOperationAction(Ops: ISD::FP_TO_BF16, VTs: {MVT::i16, MVT::f32, MVT::f64}, Action: Expand);
567
568	setOperationAction(Ops: {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT: MVT::i32,
569	Action: Custom);
570	setOperationAction(Ops: {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT: MVT::i16,
571	Action: Custom);
572	setOperationAction(Ops: {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT: MVT::i1,
573	Action: Custom);
574
575	// Custom lower these because we can't specify a rule based on an illegal
576	// source bf16.
577	setOperationAction(Ops: {ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, VT: MVT::f32, Action: Custom);
578	setOperationAction(Ops: {ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, VT: MVT::f64, Action: Custom);
579
580	if (Subtarget->has16BitInsts()) {
581	setOperationAction(Ops: {ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
582	ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
583	VT: MVT::i16, Action: Legal);
584
585	AddPromotedToType(Opc: ISD::SIGN_EXTEND, OrigVT: MVT::i16, DestVT: MVT::i32);
586
587	setOperationAction(Ops: {ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC},
588	VT: MVT::i16, Action: Expand);
589
590	setOperationAction(Ops: {ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
591	ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
592	ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
593	ISD::CTPOP},
594	VT: MVT::i16, Action: Promote);
595
596	setOperationAction(Op: ISD::LOAD, VT: MVT::i16, Action: Custom);
597
598	setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i16, Action: Expand);
599
600	setOperationAction(Op: ISD::FP16_TO_FP, VT: MVT::i16, Action: Promote);
601	AddPromotedToType(Opc: ISD::FP16_TO_FP, OrigVT: MVT::i16, DestVT: MVT::i32);
602	setOperationAction(Op: ISD::FP_TO_FP16, VT: MVT::i16, Action: Promote);
603	AddPromotedToType(Opc: ISD::FP_TO_FP16, OrigVT: MVT::i16, DestVT: MVT::i32);
604
605	setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT: MVT::i16, Action: Custom);
606	setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i16, Action: Custom);
607	setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i1, Action: Custom);
608
609	setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i32, Action: Custom);
610
611	// F16 - Constant Actions.
612	setOperationAction(Op: ISD::ConstantFP, VT: MVT::f16, Action: Legal);
613	setOperationAction(Op: ISD::ConstantFP, VT: MVT::bf16, Action: Legal);
614
615	// F16 - Load/Store Actions.
616	setOperationAction(Op: ISD::LOAD, VT: MVT::f16, Action: Promote);
617	AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f16, DestVT: MVT::i16);
618	setOperationAction(Op: ISD::STORE, VT: MVT::f16, Action: Promote);
619	AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f16, DestVT: MVT::i16);
620
621	// BF16 - Load/Store Actions.
622	setOperationAction(Op: ISD::LOAD, VT: MVT::bf16, Action: Promote);
623	AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::bf16, DestVT: MVT::i16);
624	setOperationAction(Op: ISD::STORE, VT: MVT::bf16, Action: Promote);
625	AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::bf16, DestVT: MVT::i16);
626
627	// F16 - VOP1 Actions.
628	setOperationAction(Ops: {ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
629	ISD::FSIN, ISD::FROUND},
630	VT: MVT::f16, Action: Custom);
631
632	// BF16 - VOP1 Actions.
633	if (Subtarget->hasBF16TransInsts())
634	setOperationAction(Ops: {ISD::FCOS, ISD::FSIN, ISD::FDIV}, VT: MVT::bf16, Action: Custom);
635
636	setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
637	ISD::FP_TO_UINT_SAT},
638	VT: MVT::f16, Action: Promote);
639	setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
640	ISD::FP_TO_UINT_SAT},
641	VT: MVT::bf16, Action: Promote);
642
643	// F16 - VOP2 Actions.
644	setOperationAction(Ops: {ISD::BR_CC, ISD::SELECT_CC}, VTs: {MVT::f16, MVT::bf16},
645	Action: Expand);
646	setOperationAction(Ops: {ISD::FLDEXP, ISD::STRICT_FLDEXP}, VT: MVT::f16, Action: Custom);
647	setOperationAction(Op: ISD::FFREXP, VT: MVT::f16, Action: Custom);
648	setOperationAction(Op: ISD::FDIV, VT: MVT::f16, Action: Custom);
649
650	// F16 - VOP3 Actions.
651	setOperationAction(Op: ISD::FMA, VT: MVT::f16, Action: Legal);
652	if (STI.hasMadF16())
653	setOperationAction(Op: ISD::FMAD, VT: MVT::f16, Action: Legal);
654
655	for (MVT VT :
656	{MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
657	MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
658	MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
659	for (unsigned Op = `0`; Op < ISD::BUILTIN_OP_END; ++Op) {
660	switch (Op) {
661	case ISD::LOAD:
662	case ISD::STORE:
663	case ISD::BUILD_VECTOR:
664	case ISD::BITCAST:
665	case ISD::UNDEF:
666	case ISD::EXTRACT_VECTOR_ELT:
667	case ISD::INSERT_VECTOR_ELT:
668	case ISD::INSERT_SUBVECTOR:
669	case ISD::SCALAR_TO_VECTOR:
670	case ISD::IS_FPCLASS:
671	break;
672	case ISD::EXTRACT_SUBVECTOR:
673	case ISD::CONCAT_VECTORS:
674	case ISD::FSIN:
675	case ISD::FCOS:
676	setOperationAction(Op, VT, Action: Custom);
677	break;
678	default:
679	setOperationAction(Op, VT, Action: Expand);
680	break;
681	}
682	}
683	}
684
685	// v_perm_b32 can handle either of these.
686	setOperationAction(Ops: ISD::BSWAP, VTs: {MVT::i16, MVT::v2i16}, Action: Legal);
687	setOperationAction(Op: ISD::BSWAP, VT: MVT::v4i16, Action: Custom);
688
689	// XXX - Do these do anything? Vector constants turn into build_vector.
690	setOperationAction(Ops: ISD::Constant, VTs: {MVT::v2i16, MVT::v2f16}, Action: Legal);
691
692	setOperationAction(Ops: ISD::UNDEF, VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
693	Action: Legal);
694
695	setOperationAction(Op: ISD::STORE, VT: MVT::v2i16, Action: Promote);
696	AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2i16, DestVT: MVT::i32);
697	setOperationAction(Op: ISD::STORE, VT: MVT::v2f16, Action: Promote);
698	AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f16, DestVT: MVT::i32);
699
700	setOperationAction(Op: ISD::LOAD, VT: MVT::v2i16, Action: Promote);
701	AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2i16, DestVT: MVT::i32);
702	setOperationAction(Op: ISD::LOAD, VT: MVT::v2f16, Action: Promote);
703	AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f16, DestVT: MVT::i32);
704
705	setOperationAction(Op: ISD::AND, VT: MVT::v2i16, Action: Promote);
706	AddPromotedToType(Opc: ISD::AND, OrigVT: MVT::v2i16, DestVT: MVT::i32);
707	setOperationAction(Op: ISD::OR, VT: MVT::v2i16, Action: Promote);
708	AddPromotedToType(Opc: ISD::OR, OrigVT: MVT::v2i16, DestVT: MVT::i32);
709	setOperationAction(Op: ISD::XOR, VT: MVT::v2i16, Action: Promote);
710	AddPromotedToType(Opc: ISD::XOR, OrigVT: MVT::v2i16, DestVT: MVT::i32);
711
712	setOperationAction(Op: ISD::LOAD, VT: MVT::v4i16, Action: Promote);
713	AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
714	setOperationAction(Op: ISD::LOAD, VT: MVT::v4f16, Action: Promote);
715	AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
716	setOperationAction(Op: ISD::LOAD, VT: MVT::v4bf16, Action: Promote);
717	AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4bf16, DestVT: MVT::v2i32);
718
719	setOperationAction(Op: ISD::STORE, VT: MVT::v4i16, Action: Promote);
720	AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
721	setOperationAction(Op: ISD::STORE, VT: MVT::v4f16, Action: Promote);
722	AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
723	setOperationAction(Op: ISD::STORE, VT: MVT::v4bf16, Action: Promote);
724	AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4bf16, DestVT: MVT::v2i32);
725
726	setOperationAction(Op: ISD::LOAD, VT: MVT::v8i16, Action: Promote);
727	AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8i16, DestVT: MVT::v4i32);
728	setOperationAction(Op: ISD::LOAD, VT: MVT::v8f16, Action: Promote);
729	AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f16, DestVT: MVT::v4i32);
730	setOperationAction(Op: ISD::LOAD, VT: MVT::v8bf16, Action: Promote);
731	AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8bf16, DestVT: MVT::v4i32);
732
733	setOperationAction(Op: ISD::STORE, VT: MVT::v4i16, Action: Promote);
734	AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
735	setOperationAction(Op: ISD::STORE, VT: MVT::v4f16, Action: Promote);
736	AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
737
738	setOperationAction(Op: ISD::STORE, VT: MVT::v8i16, Action: Promote);
739	AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8i16, DestVT: MVT::v4i32);
740	setOperationAction(Op: ISD::STORE, VT: MVT::v8f16, Action: Promote);
741	AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f16, DestVT: MVT::v4i32);
742	setOperationAction(Op: ISD::STORE, VT: MVT::v8bf16, Action: Promote);
743	AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8bf16, DestVT: MVT::v4i32);
744
745	setOperationAction(Op: ISD::LOAD, VT: MVT::v16i16, Action: Promote);
746	AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16i16, DestVT: MVT::v8i32);
747	setOperationAction(Op: ISD::LOAD, VT: MVT::v16f16, Action: Promote);
748	AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f16, DestVT: MVT::v8i32);
749	setOperationAction(Op: ISD::LOAD, VT: MVT::v16bf16, Action: Promote);
750	AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16bf16, DestVT: MVT::v8i32);
751
752	setOperationAction(Op: ISD::STORE, VT: MVT::v16i16, Action: Promote);
753	AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16i16, DestVT: MVT::v8i32);
754	setOperationAction(Op: ISD::STORE, VT: MVT::v16f16, Action: Promote);
755	AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f16, DestVT: MVT::v8i32);
756	setOperationAction(Op: ISD::STORE, VT: MVT::v16bf16, Action: Promote);
757	AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16bf16, DestVT: MVT::v8i32);
758
759	setOperationAction(Op: ISD::LOAD, VT: MVT::v32i16, Action: Promote);
760	AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32i16, DestVT: MVT::v16i32);
761	setOperationAction(Op: ISD::LOAD, VT: MVT::v32f16, Action: Promote);
762	AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32f16, DestVT: MVT::v16i32);
763	setOperationAction(Op: ISD::LOAD, VT: MVT::v32bf16, Action: Promote);
764	AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32bf16, DestVT: MVT::v16i32);
765
766	setOperationAction(Op: ISD::STORE, VT: MVT::v32i16, Action: Promote);
767	AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32i16, DestVT: MVT::v16i32);
768	setOperationAction(Op: ISD::STORE, VT: MVT::v32f16, Action: Promote);
769	AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32f16, DestVT: MVT::v16i32);
770	setOperationAction(Op: ISD::STORE, VT: MVT::v32bf16, Action: Promote);
771	AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32bf16, DestVT: MVT::v16i32);
772
773	setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
774	VT: MVT::v2i32, Action: Expand);
775	setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::v2f32, Action: Expand);
776
777	setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
778	VT: MVT::v4i32, Action: Expand);
779
780	setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
781	VT: MVT::v8i32, Action: Expand);
782
783	setOperationAction(Ops: ISD::BUILD_VECTOR, VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
784	Action: Subtarget->hasVOP3PInsts() ? Legal : Custom);
785
786	setOperationAction(Ops: ISD::FNEG, VTs: {MVT::v2f16, MVT::v2bf16}, Action: Legal);
787	// This isn't really legal, but this avoids the legalizer unrolling it (and
788	// allows matching fneg (fabs x) patterns)
789	setOperationAction(Ops: ISD::FABS, VTs: {MVT::v2f16, MVT::v2bf16}, Action: Legal);
790
791	// Can do this in one BFI plus a constant materialize.
792	setOperationAction(Ops: ISD::FCOPYSIGN,
793	VTs: {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
794	MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
795	MVT::v32f16, MVT::v32bf16},
796	Action: Custom);
797
798	setOperationAction(
799	Ops: {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
800	VT: MVT::f16, Action: Custom);
801	setOperationAction(Ops: {ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, VT: MVT::f16, Action: Legal);
802
803	setOperationAction(Ops: {ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
804	ISD::FMAXIMUMNUM},
805	VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
806	Action: Custom);
807
808	setOperationAction(Ops: {ISD::FMINNUM, ISD::FMAXNUM},
809	VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
810	Action: Expand);
811
812	for (MVT Vec16 :
813	{MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
814	MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
815	setOperationAction(
816	Ops: {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
817	VT: Vec16, Action: Custom);
818	setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec16, Action: Expand);
819	}
820	}
821
822	if (Subtarget->hasVOP3PInsts()) {
823	setOperationAction(Ops: {ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
824	ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
825	ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
826	VT: MVT::v2i16, Action: Legal);
827
828	setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
829	ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
830	VT: MVT::v2f16, Action: Legal);
831
832	setOperationAction(Ops: ISD::EXTRACT_VECTOR_ELT,
833	VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Action: Custom);
834
835	setOperationAction(Ops: ISD::VECTOR_SHUFFLE,
836	VTs: {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
837	MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
838	MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
839	Action: Custom);
840
841	for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
842	// Split vector operations.
843	setOperationAction(Ops: {ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
844	ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN,
845	ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
846	ISD::SSUBSAT},
847	VT, Action: Custom);
848
849	for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
850	// Split vector operations.
851	setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
852	VT, Action: Custom);
853
854	setOperationAction(
855	Ops: {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
856	VTs: {MVT::v2f16, MVT::v4f16}, Action: Custom);
857
858	setOperationAction(Op: ISD::FEXP, VT: MVT::v2f16, Action: Custom);
859	setOperationAction(Ops: ISD::SELECT, VTs: {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
860	Action: Custom);
861
862	if (Subtarget->hasBF16PackedInsts()) {
863	for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
864	// Split vector operations.
865	setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
866	VT, Action: Custom);
867	}
868
869	if (Subtarget->hasPackedFP32Ops()) {
870	setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
871	VT: MVT::v2f32, Action: Legal);
872	setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA},
873	VTs: {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
874	Action: Custom);
875	}
876	}
877
878	setOperationAction(Ops: {ISD::FNEG, ISD::FABS}, VT: MVT::v4f16, Action: Custom);
879
880	if (Subtarget->has16BitInsts()) {
881	setOperationAction(Op: ISD::SELECT, VT: MVT::v2i16, Action: Promote);
882	AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2i16, DestVT: MVT::i32);
883	setOperationAction(Op: ISD::SELECT, VT: MVT::v2f16, Action: Promote);
884	AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2f16, DestVT: MVT::i32);
885	} else {
886	// Legalization hack.
887	setOperationAction(Ops: ISD::SELECT, VTs: {MVT::v2i16, MVT::v2f16}, Action: Custom);
888
889	setOperationAction(Ops: {ISD::FNEG, ISD::FABS}, VT: MVT::v2f16, Action: Custom);
890	}
891
892	setOperationAction(Ops: ISD::SELECT,
893	VTs: {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
894	MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
895	MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
896	MVT::v32f16, MVT::v32bf16},
897	Action: Custom);
898
899	setOperationAction(Ops: {ISD::SMULO, ISD::UMULO}, VT: MVT::i64, Action: Custom);
900
901	if (Subtarget->hasVectorMulU64())
902	setOperationAction(Op: ISD::MUL, VT: MVT::i64, Action: Legal);
903	else if (Subtarget->hasScalarSMulU64())
904	setOperationAction(Op: ISD::MUL, VT: MVT::i64, Action: Custom);
905
906	if (Subtarget->hasMad64_32())
907	setOperationAction(Ops: {ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT: MVT::i32, Action: Custom);
908
909	if (Subtarget->hasSafeSmemPrefetch() \|\| Subtarget->hasVmemPrefInsts())
910	setOperationAction(Op: ISD::PREFETCH, VT: MVT::Other, Action: Custom);
911
912	if (Subtarget->hasIEEEMinimumMaximumInsts()) {
913	setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM},
914	VTs: {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Action: Legal);
915	} else {
916	// FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
917	if (Subtarget->hasMinimum3Maximum3F32())
918	setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM}, VT: MVT::f32, Action: Legal);
919
920	if (Subtarget->hasMinimum3Maximum3PKF16()) {
921	setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM}, VT: MVT::v2f16, Action: Legal);
922
923	// If only the vector form is available, we need to widen to a vector.
924	if (!Subtarget->hasMinimum3Maximum3F16())
925	setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM}, VT: MVT::f16, Action: Custom);
926	}
927	}
928
929	if (Subtarget->hasVOP3PInsts()) {
930	// We want to break these into v2f16 pieces, not scalarize.
931	setOperationAction(Ops: {ISD::FMINIMUM, ISD::FMAXIMUM},
932	VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
933	Action: Custom);
934	}
935
936	if (Subtarget->hasIntMinMax64())
937	setOperationAction(Ops: {ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, VT: MVT::i64,
938	Action: Legal);
939
940	setOperationAction(Ops: ISD::INTRINSIC_WO_CHAIN,
941	VTs: {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
942	MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
943	MVT::i8},
944	Action: Custom);
945
946	setOperationAction(Ops: ISD::INTRINSIC_W_CHAIN,
947	VTs: {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
948	MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
949	MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
950	MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
951	Action: Custom);
952
953	setOperationAction(Ops: ISD::INTRINSIC_VOID,
954	VTs: {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
955	MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
956	MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
957	MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
958	Action: Custom);
959
960	setOperationAction(Op: ISD::STACKSAVE, VT: MVT::Other, Action: Custom);
961	setOperationAction(Op: ISD::GET_ROUNDING, VT: MVT::i32, Action: Custom);
962	setOperationAction(Op: ISD::SET_ROUNDING, VT: MVT::Other, Action: Custom);
963	setOperationAction(Op: ISD::GET_FPENV, VT: MVT::i64, Action: Custom);
964	setOperationAction(Op: ISD::SET_FPENV, VT: MVT::i64, Action: Custom);
965
966	// TODO: Could move this to custom lowering, could benefit from combines on
967	// extract of relevant bits.
968	setOperationAction(Op: ISD::GET_FPMODE, VT: MVT::i32, Action: Legal);
969
970	setOperationAction(Op: ISD::MUL, VT: MVT::i1, Action: Promote);
971
972	if (Subtarget->hasBF16ConversionInsts()) {
973	setOperationAction(Ops: ISD::FP_ROUND, VTs: {MVT::bf16, MVT::v2bf16}, Action: Custom);
974	setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v2bf16, Action: Legal);
975	}
976
977	if (Subtarget->hasBF16PackedInsts()) {
978	setOperationAction(
979	Ops: {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
980	VT: MVT::v2bf16, Action: Legal);
981	}
982
983	if (Subtarget->hasBF16TransInsts()) {
984	setOperationAction(Ops: {ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, VT: MVT::bf16, Action: Legal);
985	}
986
987	if (Subtarget->hasCvtPkF16F32Inst()) {
988	setOperationAction(Ops: ISD::FP_ROUND,
989	VTs: {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
990	Action: Custom);
991	}
992
993	setTargetDAGCombine({ISD::ADD,
994	ISD::PTRADD,
995	ISD::UADDO_CARRY,
996	ISD::SUB,
997	ISD::USUBO_CARRY,
998	ISD::MUL,
999	ISD::FADD,
1000	ISD::FSUB,
1001	ISD::FDIV,
1002	ISD::FMUL,
1003	ISD::FMINNUM,
1004	ISD::FMAXNUM,
1005	ISD::FMINNUM_IEEE,
1006	ISD::FMAXNUM_IEEE,
1007	ISD::FMINIMUM,
1008	ISD::FMAXIMUM,
1009	ISD::FMINIMUMNUM,
1010	ISD::FMAXIMUMNUM,
1011	ISD::FMA,
1012	ISD::SMIN,
1013	ISD::SMAX,
1014	ISD::UMIN,
1015	ISD::UMAX,
1016	ISD::SETCC,
1017	ISD::SELECT,
1018	ISD::SMIN,
1019	ISD::SMAX,
1020	ISD::UMIN,
1021	ISD::UMAX,
1022	ISD::AND,
1023	ISD::OR,
1024	ISD::XOR,
1025	ISD::SHL,
1026	ISD::SRL,
1027	ISD::SRA,
1028	ISD::FSHR,
1029	ISD::SINT_TO_FP,
1030	ISD::UINT_TO_FP,
1031	ISD::FCANONICALIZE,
1032	ISD::SCALAR_TO_VECTOR,
1033	ISD::ZERO_EXTEND,
1034	ISD::SIGN_EXTEND_INREG,
1035	ISD::ANY_EXTEND,
1036	ISD::EXTRACT_VECTOR_ELT,
1037	ISD::INSERT_VECTOR_ELT,
1038	ISD::FCOPYSIGN});
1039
1040	if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1041	setTargetDAGCombine(ISD::FP_ROUND);
1042
1043	// All memory operations. Some folding on the pointer operand is done to help
1044	// matching the constant offsets in the addressing modes.
1045	setTargetDAGCombine({ISD::LOAD,
1046	ISD::STORE,
1047	ISD::ATOMIC_LOAD,
1048	ISD::ATOMIC_STORE,
1049	ISD::ATOMIC_CMP_SWAP,
1050	ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1051	ISD::ATOMIC_SWAP,
1052	ISD::ATOMIC_LOAD_ADD,
1053	ISD::ATOMIC_LOAD_SUB,
1054	ISD::ATOMIC_LOAD_AND,
1055	ISD::ATOMIC_LOAD_OR,
1056	ISD::ATOMIC_LOAD_XOR,
1057	ISD::ATOMIC_LOAD_NAND,
1058	ISD::ATOMIC_LOAD_MIN,
1059	ISD::ATOMIC_LOAD_MAX,
1060	ISD::ATOMIC_LOAD_UMIN,
1061	ISD::ATOMIC_LOAD_UMAX,
1062	ISD::ATOMIC_LOAD_FADD,
1063	ISD::ATOMIC_LOAD_FMIN,
1064	ISD::ATOMIC_LOAD_FMAX,
1065	ISD::ATOMIC_LOAD_UINC_WRAP,
1066	ISD::ATOMIC_LOAD_UDEC_WRAP,
1067	ISD::ATOMIC_LOAD_USUB_COND,
1068	ISD::ATOMIC_LOAD_USUB_SAT,
1069	ISD::INTRINSIC_VOID,
1070	ISD::INTRINSIC_W_CHAIN});
1071
1072	// FIXME: In other contexts we pretend this is a per-function property.
1073	setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
1074
1075	setSchedulingPreference(Sched::RegPressure);
1076	}
1077
1078	const GCNSubtarget SITargetLowering::getSubtarget() const* { return Subtarget; }
1079
1080	ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
1081	static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1082	return RCRegs;
1083	}
1084
1085	//===----------------------------------------------------------------------===//
1086	// TargetLowering queries
1087	//===----------------------------------------------------------------------===//
1088
1089	// v_mad_mix support a conversion from f16 to f32.*
1090	//
1091	// There is only one special case when denormals are enabled we don't currently,
1092	// where this is OK to use.
1093	bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1094	EVT DestVT, EVT SrcVT) const {
1095	return DestVT.getScalarType() == MVT::f32 &&
1096	((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) \|\|
1097	(Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1098	SrcVT.getScalarType() == MVT::f16) \|\|
1099	(Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1100	SrcVT.getScalarType() == MVT::bf16)) &&
1101	// TODO: This probably only requires no input flushing?
1102	denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
1103	}
1104
1105	bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
1106	LLT DestTy, LLT SrcTy) const {
1107	return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) \|\|
1108	(Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1109	DestTy.getScalarSizeInBits() == `32` &&
1110	SrcTy.getScalarSizeInBits() == `16` &&
1111	// TODO: This probably only requires no input flushing?
1112	denormalModeIsFlushAllF32(MF: *MI.getMF());
1113	}
1114
1115	bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
1116	// SI has some legal vector types, but no legal vector operations. Say no
1117	// shuffles are legal in order to prefer scalarizing some vector operations.
1118	return false;
1119	}
1120
1121	MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1122	CallingConv::ID CC,
1123	EVT VT) const {
1124	if (CC == CallingConv::AMDGPU_KERNEL)
1125	return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1126
1127	if (VT.isVector()) {
1128	EVT ScalarVT = VT.getScalarType();
1129	unsigned Size = ScalarVT.getSizeInBits();
1130	if (Size == `16`) {
1131	return Subtarget->has16BitInsts()
1132	? MVT::getVectorVT(VT: ScalarVT.getSimpleVT(), NumElements: `2`)
1133	: MVT::i32;
1134	}
1135
1136	if (Size < `16`)
1137	return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1138	return Size == `32` ? ScalarVT.getSimpleVT() : MVT::i32;
1139	}
1140
1141	if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == `16`)
1142	return MVT::i32;
1143
1144	if (VT.getSizeInBits() > `32`)
1145	return MVT::i32;
1146
1147	return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1148	}
1149
1150	unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1151	CallingConv::ID CC,
1152	EVT VT) const {
1153	if (CC == CallingConv::AMDGPU_KERNEL)
1154	return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1155
1156	if (VT.isVector()) {
1157	unsigned NumElts = VT.getVectorNumElements();
1158	EVT ScalarVT = VT.getScalarType();
1159	unsigned Size = ScalarVT.getSizeInBits();
1160
1161	// FIXME: Should probably promote 8-bit vectors to i16.
1162	if (Size == `16`)
1163	return (NumElts + `1`) / `2`;
1164
1165	if (Size <= `32`)
1166	return NumElts;
1167
1168	if (Size > `32`)
1169	return NumElts * ((Size + `31`) / `32`);
1170	} else if (VT.getSizeInBits() > `32`)
1171	return (VT.getSizeInBits() + `31`) / `32`;
1172
1173	return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1174	}
1175
1176	unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
1177	LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1178	unsigned &NumIntermediates, MVT &RegisterVT) const {
1179	if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1180	unsigned NumElts = VT.getVectorNumElements();
1181	EVT ScalarVT = VT.getScalarType();
1182	unsigned Size = ScalarVT.getSizeInBits();
1183	// FIXME: We should fix the ABI to be the same on targets without 16-bit
1184	// support, but unless we can properly handle 3-vectors, it will be still be
1185	// inconsistent.
1186	if (Size == `16`) {
1187	MVT SimpleIntermediateVT =
1188	MVT::getVectorVT(VT: ScalarVT.getSimpleVT(), EC: ElementCount::getFixed(MinVal: `2`));
1189	IntermediateVT = SimpleIntermediateVT;
1190	RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
1191	NumIntermediates = (NumElts + `1`) / `2`;
1192	return (NumElts + `1`) / `2`;
1193	}
1194
1195	if (Size == `32`) {
1196	RegisterVT = ScalarVT.getSimpleVT();
1197	IntermediateVT = RegisterVT;
1198	NumIntermediates = NumElts;
1199	return NumIntermediates;
1200	}
1201
1202	if (Size < `16` && Subtarget->has16BitInsts()) {
1203	// FIXME: Should probably form v2i16 pieces
1204	RegisterVT = MVT::i16;
1205	IntermediateVT = ScalarVT;
1206	NumIntermediates = NumElts;
1207	return NumIntermediates;
1208	}
1209
1210	if (Size != `16` && Size <= `32`) {
1211	RegisterVT = MVT::i32;
1212	IntermediateVT = ScalarVT;
1213	NumIntermediates = NumElts;
1214	return NumIntermediates;
1215	}
1216
1217	if (Size > `32`) {
1218	RegisterVT = MVT::i32;
1219	IntermediateVT = RegisterVT;
1220	NumIntermediates = NumElts * ((Size + `31`) / `32`);
1221	return NumIntermediates;
1222	}
1223	}
1224
1225	return TargetLowering::getVectorTypeBreakdownForCallingConv(
1226	Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1227	}
1228
1229	static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1230	const DataLayout &DL, Type *Ty,
1231	unsigned MaxNumLanes) {
1232	assert(MaxNumLanes != `0`);
1233
1234	LLVMContext &Ctx = Ty->getContext();
1235	if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
1236	unsigned NumElts = std::min(a: MaxNumLanes, b: VT->getNumElements());
1237	return EVT::getVectorVT(Context&: Ctx, VT: TLI.getValueType(DL, Ty: VT->getElementType()),
1238	NumElements: NumElts);
1239	}
1240
1241	return TLI.getValueType(DL, Ty);
1242	}
1243
1244	// Peek through TFE struct returns to only use the data size.
1245	static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1246	const DataLayout &DL, Type *Ty,
1247	unsigned MaxNumLanes) {
1248	auto *ST = dyn_cast<StructType>(Val: Ty);
1249	if (!ST)
1250	return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1251
1252	// TFE intrinsics return an aggregate type.
1253	assert(ST->getNumContainedTypes() == `2` &&
1254	ST->getContainedType(`1`)->isIntegerTy(`32`));
1255	return memVTFromLoadIntrData(TLI, DL, Ty: ST->getContainedType(i: `0`), MaxNumLanes);
1256	}
1257
1258	/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1259	/// in-memory representation. This return value is a custom type because there
1260	/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1261	/// could cause issues during codegen, these address space 7 pointers will be
1262	/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1263	/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1264	/// for cost modeling, to work. (This also sets us up decently for doing the
1265	/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1266	MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1267	if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == `160`)
1268	return MVT::amdgpuBufferFatPointer;
1269	if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1270	DL.getPointerSizeInBits(AS) == `192`)
1271	return MVT::amdgpuBufferStridedPointer;
1272	return AMDGPUTargetLowering::getPointerTy(DL, AS);
1273	}
1274	/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1275	/// v8i32 when padding is added.
1276	/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1277	/// also v8i32 with padding.
1278	MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1279	if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1280	DL.getPointerSizeInBits(AS) == `160`) \|\|
1281	(AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1282	DL.getPointerSizeInBits(AS) == `192`))
1283	return MVT::v8i32;
1284	return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1285	}
1286
1287	static unsigned getIntrMemWidth(unsigned IntrID) {
1288	switch (IntrID) {
1289	case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1290	case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1291	case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1292	return `8`;
1293	case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1294	case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1295	case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1296	case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1297	case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1298	case Intrinsic::amdgcn_flat_load_monitor_b32:
1299	case Intrinsic::amdgcn_global_load_monitor_b32:
1300	return `32`;
1301	case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1302	case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1303	case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1304	case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1305	case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1306	case Intrinsic::amdgcn_flat_load_monitor_b64:
1307	case Intrinsic::amdgcn_global_load_monitor_b64:
1308	return `64`;
1309	case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1310	case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1311	case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1312	case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1313	case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1314	case Intrinsic::amdgcn_flat_load_monitor_b128:
1315	case Intrinsic::amdgcn_global_load_monitor_b128:
1316	return `128`;
1317	default:
1318	llvm_unreachable("Unknown width");
1319	}
1320	}
1321
1322	static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI,
1323	unsigned ArgIdx) {
1324	Value *OrderingArg = CI.getArgOperand(i: ArgIdx);
1325	unsigned Ord = cast<ConstantInt>(Val: OrderingArg)->getZExtValue();
1326	switch (AtomicOrderingCABI(Ord)) {
1327	case AtomicOrderingCABI::acquire:
1328	return AtomicOrdering::Acquire;
1329	break;
1330	case AtomicOrderingCABI::release:
1331	return AtomicOrdering::Release;
1332	break;
1333	case AtomicOrderingCABI::seq_cst:
1334	return AtomicOrdering::SequentiallyConsistent;
1335	break;
1336	default:
1337	return AtomicOrdering::Monotonic;
1338	}
1339	}
1340
1341	static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx) {
1342	MDNode *ScopeMD = cast<MDNode>(
1343	Val: cast<MetadataAsValue>(Val: CI.getArgOperand(i: ArgIdx))->getMetadata());
1344	StringRef Scope = cast<MDString>(Val: ScopeMD->getOperand(I: `0`))->getString();
1345	return CI.getContext().getOrInsertSyncScopeID(SSN: Scope);
1346	}
1347
1348	void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos,
1349	const CallBase &CI,
1350	MachineFunction &MF,
1351	unsigned IntrID) const {
1352	MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
1353	if (CI.hasMetadata(KindID: LLVMContext::MD_invariant_load))
1354	Flags \|= MachineMemOperand::MOInvariant;
1355	if (CI.hasMetadata(KindID: LLVMContext::MD_nontemporal))
1356	Flags \|= MachineMemOperand::MONonTemporal;
1357	Flags \|= getTargetMMOFlags(I: CI);
1358
1359	if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1360	AMDGPU::lookupRsrcIntrinsic(Intr: IntrID)) {
1361	AttributeSet Attr =
1362	Intrinsic::getFnAttributes(C&: CI.getContext(), id: (Intrinsic::ID)IntrID);
1363	MemoryEffects ME = Attr.getMemoryEffects();
1364	if (ME.doesNotAccessMemory())
1365	return;
1366
1367	bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1368	if (!IsSPrefetch) {
1369	auto *Aux = cast<ConstantInt>(Val: CI.getArgOperand(i: CI.arg_size() - `1`));
1370	if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1371	Flags \|= MachineMemOperand::MOVolatile;
1372	}
1373	Flags \|= MachineMemOperand::MODereferenceable;
1374
1375	IntrinsicInfo Info;
1376	// TODO: Should images get their own address space?
1377	Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1378
1379	const AMDGPU::MIMGBaseOpcodeInfo BaseOpcode = nullptr*;
1380	if (RsrcIntr->IsImage) {
1381	const AMDGPU::ImageDimIntrinsicInfo *Intr =
1382	AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID);
1383	BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
1384	Info.align.reset();
1385	}
1386
1387	Value *RsrcArg = CI.getArgOperand(i: RsrcIntr->RsrcArg);
1388	if (auto *RsrcPtrTy = dyn_cast<PointerType>(Val: RsrcArg->getType())) {
1389	if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1390	// We conservatively set the memory operand of a buffer intrinsic to the
1391	// base resource pointer, so that we can access alias information about
1392	// those pointers. Cases like "this points at the same value
1393	// but with a different offset" are handled in
1394	// areMemAccessesTriviallyDisjoint.
1395	Info.ptrVal = RsrcArg;
1396	}
1397
1398	if (ME.onlyReadsMemory()) {
1399	if (RsrcIntr->IsImage) {
1400	unsigned MaxNumLanes = `4`;
1401
1402	if (!BaseOpcode->Gather4) {
1403	// If this isn't a gather, we may have excess loaded elements in the
1404	// IR type. Check the dmask for the real number of elements loaded.
1405	unsigned DMask =
1406	cast<ConstantInt>(Val: CI.getArgOperand(i: `0`))->getZExtValue();
1407	MaxNumLanes = DMask == `0` ? `1` : llvm::popcount(Value: DMask);
1408	}
1409
1410	Info.memVT = memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(),
1411	Ty: CI.getType(), MaxNumLanes);
1412	} else {
1413	Info.memVT =
1414	memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(), Ty: CI.getType(),
1415	MaxNumLanes: std::numeric_limits<unsigned>::max());
1416	}
1417
1418	// FIXME: What does alignment mean for an image?
1419	Info.opc = ISD::INTRINSIC_W_CHAIN;
1420	Info.flags = Flags \| MachineMemOperand::MOLoad;
1421	} else if (ME.onlyWritesMemory()) {
1422	Info.opc = ISD::INTRINSIC_VOID;
1423
1424	Type *DataTy = CI.getArgOperand(i: `0`)->getType();
1425	if (RsrcIntr->IsImage) {
1426	unsigned DMask = cast<ConstantInt>(Val: CI.getArgOperand(i: `1`))->getZExtValue();
1427	unsigned DMaskLanes = DMask == `0` ? `1` : llvm::popcount(Value: DMask);
1428	Info.memVT = memVTFromLoadIntrData(TLI: *this, DL: MF.getDataLayout(), Ty: DataTy,
1429	MaxNumLanes: DMaskLanes);
1430	} else
1431	Info.memVT = getValueType(DL: MF.getDataLayout(), Ty: DataTy);
1432
1433	Info.flags = Flags \| MachineMemOperand::MOStore;
1434	} else {
1435	// Atomic, NoReturn Sampler or prefetch
1436	Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1437	: ISD::INTRINSIC_W_CHAIN;
1438
1439	switch (IntrID) {
1440	default:
1441	Info.flags = Flags \| MachineMemOperand::MOLoad;
1442	if (!IsSPrefetch)
1443	Info.flags \|= MachineMemOperand::MOStore;
1444
1445	if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) \|\| IsSPrefetch) {
1446	// Fake memory access type for no return sampler intrinsics
1447	Info.memVT = MVT::i32;
1448	} else {
1449	// XXX - Should this be volatile without known ordering?
1450	Info.flags \|= MachineMemOperand::MOVolatile;
1451	Info.memVT = MVT::getVT(Ty: CI.getArgOperand(i: `0`)->getType());
1452	}
1453	break;
1454	case Intrinsic::amdgcn_raw_buffer_load_lds:
1455	case Intrinsic::amdgcn_raw_buffer_load_async_lds:
1456	case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1457	case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
1458	case Intrinsic::amdgcn_struct_buffer_load_lds:
1459	case Intrinsic::amdgcn_struct_buffer_load_async_lds:
1460	case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
1461	case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
1462	unsigned Width = cast<ConstantInt>(Val: CI.getArgOperand(i: `2`))->getZExtValue();
1463
1464	// Entry 0: Load from buffer.
1465	// Don't set an offset, since the pointer value always represents the
1466	// base of the buffer.
1467	Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: Width * `8`);
1468	Info.flags = Flags \| MachineMemOperand::MOLoad;
1469	Infos.push_back(Elt: Info);
1470
1471	// Entry 1: Store to LDS.
1472	// Instruction offset is applied, and an additional per-lane offset
1473	// which we simulate using a larger memory type.
1474	Info.memVT = EVT::getIntegerVT(
1475	Context&: CI.getContext(), BitWidth: Width * `8` * Subtarget->getWavefrontSize());
1476	Info.ptrVal = CI.getArgOperand(i: `1`); // LDS destination pointer
1477	Info.offset = cast<ConstantInt>(Val: CI.getArgOperand(i: CI.arg_size() - `2`))
1478	->getZExtValue();
1479	Info.fallbackAddressSpace = AMDGPUAS::LOCAL_ADDRESS;
1480	Info.flags = Flags \| MachineMemOperand::MOStore;
1481	Infos.push_back(Elt: Info);
1482	return;
1483	}
1484	case Intrinsic::amdgcn_raw_atomic_buffer_load:
1485	case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1486	case Intrinsic::amdgcn_struct_atomic_buffer_load:
1487	case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1488	Info.memVT =
1489	memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(), Ty: CI.getType(),
1490	MaxNumLanes: std::numeric_limits<unsigned>::max());
1491	Info.flags = Flags \| MachineMemOperand::MOLoad;
1492	Infos.push_back(Elt: Info);
1493	return;
1494	}
1495	}
1496	}
1497	Infos.push_back(Elt: Info);
1498	return;
1499	}
1500
1501	IntrinsicInfo Info;
1502	switch (IntrID) {
1503	case Intrinsic::amdgcn_ds_ordered_add:
1504	case Intrinsic::amdgcn_ds_ordered_swap: {
1505	Info.opc = ISD::INTRINSIC_W_CHAIN;
1506	Info.memVT = MVT::getVT(Ty: CI.getType());
1507	Info.ptrVal = CI.getOperand(i_nocapture: `0`);
1508	Info.align.reset();
1509	Info.flags = Flags \| MachineMemOperand::MOLoad \| MachineMemOperand::MOStore;
1510
1511	const ConstantInt *Vol = cast<ConstantInt>(Val: CI.getOperand(i_nocapture: `4`));
1512	if (!Vol->isZero())
1513	Info.flags \|= MachineMemOperand::MOVolatile;
1514
1515	Infos.push_back(Elt: Info);
1516	return;
1517	}
1518	case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1519	case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1520	Info.opc = ISD::INTRINSIC_W_CHAIN;
1521	Info.memVT = MVT::getVT(Ty: CI.getOperand(i_nocapture: `0`)->getType());
1522	Info.ptrVal = nullptr;
1523	Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1524	Info.flags = MachineMemOperand::MOLoad \| MachineMemOperand::MOStore;
1525	Infos.push_back(Elt: Info);
1526	return;
1527	}
1528	case Intrinsic::amdgcn_ds_append:
1529	case Intrinsic::amdgcn_ds_consume: {
1530	Info.opc = ISD::INTRINSIC_W_CHAIN;
1531	Info.memVT = MVT::getVT(Ty: CI.getType());
1532	Info.ptrVal = CI.getOperand(i_nocapture: `0`);
1533	Info.align.reset();
1534	Info.flags = Flags \| MachineMemOperand::MOLoad \| MachineMemOperand::MOStore;
1535
1536	const ConstantInt *Vol = cast<ConstantInt>(Val: CI.getOperand(i_nocapture: `1`));
1537	if (!Vol->isZero())
1538	Info.flags \|= MachineMemOperand::MOVolatile;
1539
1540	Infos.push_back(Elt: Info);
1541	return;
1542	}
1543	case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1544	case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1545	Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1546	? ISD::INTRINSIC_W_CHAIN
1547	: ISD::INTRINSIC_VOID;
1548	Info.memVT = MVT::getVT(Ty: CI.getType());
1549	Info.ptrVal = CI.getOperand(i_nocapture: `0`);
1550	Info.memVT = MVT::i64;
1551	Info.size = `8`;
1552	Info.align.reset();
1553	Info.flags = Flags \| MachineMemOperand::MOLoad \| MachineMemOperand::MOStore;
1554	Infos.push_back(Elt: Info);
1555	return;
1556	}
1557	case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1558	case Intrinsic::amdgcn_image_bvh_intersect_ray:
1559	case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1560	Info.opc = ISD::INTRINSIC_W_CHAIN;
1561	Info.memVT =
1562	MVT::getVT(Ty: IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1563	? CI.getType()
1564	: cast<StructType>(Val: CI.getType())
1565	->getElementType(N: `0`)); // XXX: what is correct VT?
1566
1567	Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1568	Info.align.reset();
1569	Info.flags = Flags \| MachineMemOperand::MOLoad \|
1570	MachineMemOperand::MODereferenceable;
1571	Infos.push_back(Elt: Info);
1572	return;
1573	}
1574	case Intrinsic::amdgcn_global_atomic_fmin_num:
1575	case Intrinsic::amdgcn_global_atomic_fmax_num:
1576	case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1577	case Intrinsic::amdgcn_flat_atomic_fmin_num:
1578	case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1579	Info.opc = ISD::INTRINSIC_W_CHAIN;
1580	Info.memVT = MVT::getVT(Ty: CI.getType());
1581	Info.ptrVal = CI.getOperand(i_nocapture: `0`);
1582	Info.align.reset();
1583	Info.flags =
1584	Flags \| MachineMemOperand::MOLoad \| MachineMemOperand::MOStore \|
1585	MachineMemOperand::MODereferenceable \| MachineMemOperand::MOVolatile;
1586	Infos.push_back(Elt: Info);
1587	return;
1588	}
1589	case Intrinsic::amdgcn_cluster_load_b32:
1590	case Intrinsic::amdgcn_cluster_load_b64:
1591	case Intrinsic::amdgcn_cluster_load_b128:
1592	case Intrinsic::amdgcn_ds_load_tr6_b96:
1593	case Intrinsic::amdgcn_ds_load_tr4_b64:
1594	case Intrinsic::amdgcn_ds_load_tr8_b64:
1595	case Intrinsic::amdgcn_ds_load_tr16_b128:
1596	case Intrinsic::amdgcn_global_load_tr6_b96:
1597	case Intrinsic::amdgcn_global_load_tr4_b64:
1598	case Intrinsic::amdgcn_global_load_tr_b64:
1599	case Intrinsic::amdgcn_global_load_tr_b128:
1600	case Intrinsic::amdgcn_ds_read_tr4_b64:
1601	case Intrinsic::amdgcn_ds_read_tr6_b96:
1602	case Intrinsic::amdgcn_ds_read_tr8_b64:
1603	case Intrinsic::amdgcn_ds_read_tr16_b64: {
1604	Info.opc = ISD::INTRINSIC_W_CHAIN;
1605	Info.memVT = MVT::getVT(Ty: CI.getType());
1606	Info.ptrVal = CI.getOperand(i_nocapture: `0`);
1607	Info.align.reset();
1608	Info.flags = Flags \| MachineMemOperand::MOLoad;
1609	Infos.push_back(Elt: Info);
1610	return;
1611	}
1612	case Intrinsic::amdgcn_flat_load_monitor_b32:
1613	case Intrinsic::amdgcn_flat_load_monitor_b64:
1614	case Intrinsic::amdgcn_flat_load_monitor_b128:
1615	case Intrinsic::amdgcn_global_load_monitor_b32:
1616	case Intrinsic::amdgcn_global_load_monitor_b64:
1617	case Intrinsic::amdgcn_global_load_monitor_b128: {
1618	Info.opc = ISD::INTRINSIC_W_CHAIN;
1619	Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1620	Info.ptrVal = CI.getOperand(i_nocapture: `0`);
1621	Info.align.reset();
1622	Info.flags = MachineMemOperand::MOLoad;
1623	Info.order = parseAtomicOrderingCABIArg(CI, ArgIdx: `1`);
1624	Info.ssid = parseSyncscopeMDArg(CI, ArgIdx: `2`);
1625	Infos.push_back(Elt: Info);
1626	return;
1627	}
1628	case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1629	case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1630	case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1631	Info.opc = ISD::INTRINSIC_W_CHAIN;
1632	Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1633	Info.ptrVal = CI.getOperand(i_nocapture: `0`);
1634	Info.align.reset();
1635	Info.flags = (MachineMemOperand::MOLoad \| MOCooperative);
1636	Info.order = parseAtomicOrderingCABIArg(CI, ArgIdx: `1`);
1637	Info.ssid = parseSyncscopeMDArg(CI, ArgIdx: `2`);
1638	Infos.push_back(Elt: Info);
1639	return;
1640	}
1641	case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1642	case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1643	case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1644	Info.opc = ISD::INTRINSIC_VOID;
1645	Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1646	Info.ptrVal = CI.getArgOperand(i: `0`);
1647	Info.align.reset();
1648	Info.flags = (MachineMemOperand::MOStore \| MOCooperative);
1649	Info.order = parseAtomicOrderingCABIArg(CI, ArgIdx: `2`);
1650	Info.ssid = parseSyncscopeMDArg(CI, ArgIdx: `3`);
1651	Infos.push_back(Elt: Info);
1652	return;
1653	}
1654	case Intrinsic::amdgcn_ds_gws_init:
1655	case Intrinsic::amdgcn_ds_gws_barrier:
1656	case Intrinsic::amdgcn_ds_gws_sema_v:
1657	case Intrinsic::amdgcn_ds_gws_sema_br:
1658	case Intrinsic::amdgcn_ds_gws_sema_p:
1659	case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1660	Info.opc = ISD::INTRINSIC_VOID;
1661
1662	const GCNTargetMachine &TM =
1663	static_cast<const GCNTargetMachine &>(getTargetMachine());
1664
1665	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1666	Info.ptrVal = MFI->getGWSPSV(TM);
1667
1668	// This is an abstract access, but we need to specify a type and size.
1669	Info.memVT = MVT::i32;
1670	Info.size = `4`;
1671	Info.align = Align (`4`);
1672
1673	if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1674	Info.flags = Flags \| MachineMemOperand::MOLoad;
1675	else
1676	Info.flags = Flags \| MachineMemOperand::MOStore;
1677	Infos.push_back(Elt: Info);
1678	return;
1679	}
1680	case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1681	case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1682	case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1683	case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1684	case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1685	case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1686	case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1687	case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1688	// Entry 0: Load from source (global/flat).
1689	Info.opc = ISD::INTRINSIC_VOID;
1690	Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1691	Info.ptrVal = CI.getArgOperand(i: `0`); // Global pointer
1692	Info.offset = cast<ConstantInt>(Val: CI.getArgOperand(i: `2`))->getSExtValue();
1693	Info.flags = Flags \| MachineMemOperand::MOLoad;
1694	Infos.push_back(Elt: Info);
1695
1696	// Entry 1: Store to LDS (same offset).
1697	Info.flags = Flags \| MachineMemOperand::MOStore;
1698	Info.ptrVal = CI.getArgOperand(i: `1`); // LDS pointer
1699	Infos.push_back(Elt: Info);
1700	return;
1701	}
1702	case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1703	case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1704	case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1705	case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1706	// Entry 0: Load from LDS.
1707	Info.opc = ISD::INTRINSIC_VOID;
1708	Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1709	Info.ptrVal = CI.getArgOperand(i: `1`); // LDS pointer
1710	Info.offset = cast<ConstantInt>(Val: CI.getArgOperand(i: `2`))->getSExtValue();
1711	Info.flags = Flags \| MachineMemOperand::MOLoad;
1712	Infos.push_back(Elt: Info);
1713
1714	// Entry 1: Store to global (same offset).
1715	Info.flags = Flags \| MachineMemOperand::MOStore;
1716	Info.ptrVal = CI.getArgOperand(i: `0`); // Global pointer
1717	Infos.push_back(Elt: Info);
1718	return;
1719	}
1720	case Intrinsic::amdgcn_load_to_lds:
1721	case Intrinsic::amdgcn_load_async_to_lds:
1722	case Intrinsic::amdgcn_global_load_lds:
1723	case Intrinsic::amdgcn_global_load_async_lds: {
1724	unsigned Width = cast<ConstantInt>(Val: CI.getArgOperand(i: `2`))->getZExtValue();
1725	auto *Aux = cast<ConstantInt>(Val: CI.getArgOperand(i: CI.arg_size() - `1`));
1726	bool IsVolatile = Aux->getZExtValue() & AMDGPU::CPol::VOLATILE;
1727	if (IsVolatile)
1728	Flags \|= MachineMemOperand::MOVolatile;
1729
1730	// Entry 0: Load from source (global/flat).
1731	Info.opc = ISD::INTRINSIC_VOID;
1732	Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: Width * `8`);
1733	Info.ptrVal = CI.getArgOperand(i: `0`); // Source pointer
1734	Info.offset = cast<ConstantInt>(Val: CI.getArgOperand(i: `3`))->getSExtValue();
1735	Info.flags = Flags \| MachineMemOperand::MOLoad;
1736	Infos.push_back(Elt: Info);
1737
1738	// Entry 1: Store to LDS.
1739	// Same offset from the instruction, but an additional per-lane offset is
1740	// added. Represent that using a wider memory type.
1741	Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(),
1742	BitWidth: Width * `8` * Subtarget->getWavefrontSize());
1743	Info.ptrVal = CI.getArgOperand(i: `1`); // LDS destination pointer
1744	Info.flags = Flags \| MachineMemOperand::MOStore;
1745	Infos.push_back(Elt: Info);
1746	return;
1747	}
1748	case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1749	case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1750	case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1751	case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1752	Info.opc = ISD::INTRINSIC_W_CHAIN;
1753
1754	const GCNTargetMachine &TM =
1755	static_cast<const GCNTargetMachine &>(getTargetMachine());
1756
1757	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1758	Info.ptrVal = MFI->getGWSPSV(TM);
1759
1760	// This is an abstract access, but we need to specify a type and size.
1761	Info.memVT = MVT::i32;
1762	Info.size = `4`;
1763	Info.align = Align (`4`);
1764
1765	Info.flags = Flags \| MachineMemOperand::MOLoad \| MachineMemOperand::MOStore;
1766	Infos.push_back(Elt: Info);
1767	return;
1768	}
1769	case Intrinsic::amdgcn_s_prefetch_data:
1770	case Intrinsic::amdgcn_flat_prefetch:
1771	case Intrinsic::amdgcn_global_prefetch: {
1772	Info.opc = ISD::INTRINSIC_VOID;
1773	Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: `8`);
1774	Info.ptrVal = CI.getArgOperand(i: `0`);
1775	Info.flags = Flags \| MachineMemOperand::MOLoad;
1776	Infos.push_back(Elt: Info);
1777	return;
1778	}
1779	default:
1780	return;
1781	}
1782	}
1783
1784	void SITargetLowering::CollectTargetIntrinsicOperands(
1785	const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1786	switch (cast<IntrinsicInst>(Val: I).getIntrinsicID()) {
1787	case Intrinsic::amdgcn_addrspacecast_nonnull: {
1788	// The DAG's ValueType loses the addrspaces.
1789	// Add them as 2 extra Constant operands "from" and "to".
1790	unsigned SrcAS = I.getOperand(i_nocapture: `0`)->getType()->getPointerAddressSpace();
1791	unsigned DstAS = I.getType()->getPointerAddressSpace();
1792	Ops.push_back(Elt: DAG.getTargetConstant(Val: SrcAS, DL: SDLoc (), VT: MVT::i32));
1793	Ops.push_back(Elt: DAG.getTargetConstant(Val: DstAS, DL: SDLoc (), VT: MVT::i32));
1794	break;
1795	}
1796	default:
1797	break;
1798	}
1799	}
1800
1801	bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
1802	SmallVectorImpl<Value *> &Ops,
1803	Type &AccessTy) const* {
1804	Value Ptr = nullptr*;
1805	switch (II->getIntrinsicID()) {
1806	case Intrinsic::amdgcn_cluster_load_b128:
1807	case Intrinsic::amdgcn_cluster_load_b64:
1808	case Intrinsic::amdgcn_cluster_load_b32:
1809	case Intrinsic::amdgcn_ds_append:
1810	case Intrinsic::amdgcn_ds_consume:
1811	case Intrinsic::amdgcn_ds_load_tr8_b64:
1812	case Intrinsic::amdgcn_ds_load_tr16_b128:
1813	case Intrinsic::amdgcn_ds_load_tr4_b64:
1814	case Intrinsic::amdgcn_ds_load_tr6_b96:
1815	case Intrinsic::amdgcn_ds_read_tr4_b64:
1816	case Intrinsic::amdgcn_ds_read_tr6_b96:
1817	case Intrinsic::amdgcn_ds_read_tr8_b64:
1818	case Intrinsic::amdgcn_ds_read_tr16_b64:
1819	case Intrinsic::amdgcn_ds_ordered_add:
1820	case Intrinsic::amdgcn_ds_ordered_swap:
1821	case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1822	case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1823	case Intrinsic::amdgcn_flat_atomic_fmax_num:
1824	case Intrinsic::amdgcn_flat_atomic_fmin_num:
1825	case Intrinsic::amdgcn_global_atomic_fmax_num:
1826	case Intrinsic::amdgcn_global_atomic_fmin_num:
1827	case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1828	case Intrinsic::amdgcn_global_load_tr_b64:
1829	case Intrinsic::amdgcn_global_load_tr_b128:
1830	case Intrinsic::amdgcn_global_load_tr4_b64:
1831	case Intrinsic::amdgcn_global_load_tr6_b96:
1832	case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1833	case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1834	case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1835	case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1836	Ptr = II->getArgOperand(i: `0`);
1837	break;
1838	case Intrinsic::amdgcn_load_to_lds:
1839	case Intrinsic::amdgcn_load_async_to_lds:
1840	case Intrinsic::amdgcn_global_load_lds:
1841	case Intrinsic::amdgcn_global_load_async_lds:
1842	case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1843	case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1844	case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1845	case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1846	case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1847	case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1848	case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1849	case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1850	Ptr = II->getArgOperand(i: `1`);
1851	break;
1852	default:
1853	return false;
1854	}
1855	AccessTy = II->getType();
1856	Ops.push_back(Elt: Ptr);
1857	return true;
1858	}
1859
1860	bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1861	unsigned AddrSpace) const {
1862	if (!Subtarget->hasFlatInstOffsets()) {
1863	// Flat instructions do not have offsets, and only have the register
1864	// address.
1865	return AM.BaseOffs == `0` && AM.Scale == `0`;
1866	}
1867
1868	decltype(SIInstrFlags::FLAT) FlatVariant =
1869	AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ? SIInstrFlags::FlatGlobal
1870	: AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch
1871	: SIInstrFlags::FLAT;
1872
1873	return AM.Scale == `0` &&
1874	(AM.BaseOffs == `0` \|\| Subtarget->getInstrInfo()->isLegalFLATOffset(
1875	Offset: AM.BaseOffs, AddrSpace, FlatVariant));
1876	}
1877
1878	bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1879	if (Subtarget->hasFlatGlobalInsts())
1880	return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS);
1881
1882	if (!Subtarget->hasAddr64() \|\| Subtarget->useFlatForGlobal()) {
1883	// Assume the we will use FLAT for all global memory accesses
1884	// on VI.
1885	// FIXME: This assumption is currently wrong. On VI we still use
1886	// MUBUF instructions for the r + i addressing mode. As currently
1887	// implemented, the MUBUF instructions only work on buffer < 4GB.
1888	// It may be possible to support > 4GB buffers with MUBUF instructions,
1889	// by setting the stride value in the resource descriptor which would
1890	// increase the size limit to (stride 4GB). However, this is risky,*
1891	// because it has never been validated.
1892	return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS);
1893	}
1894
1895	return isLegalMUBUFAddressingMode(AM);
1896	}
1897
1898	bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1899	// MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1900	// additionally can do r + r + i with addr64. 32-bit has more addressing
1901	// mode options. Depending on the resource constant, it can also do
1902	// (i64 r0) + (i32 r1) (i14 i).*
1903	//
1904	// Private arrays end up using a scratch buffer most of the time, so also
1905	// assume those use MUBUF instructions. Scratch loads / stores are currently
1906	// implemented as mubuf instructions with offen bit set, so slightly
1907	// different than the normal addr64.
1908	const SIInstrInfo *TII = Subtarget->getInstrInfo();
1909	if (!TII->isLegalMUBUFImmOffset(Imm: AM.BaseOffs))
1910	return false;
1911
1912	// FIXME: Since we can split immediate into soffset and immediate offset,
1913	// would it make sense to allow any immediate?
1914
1915	switch (AM.Scale) {
1916	case `0`: // r + i or just i, depending on HasBaseReg.
1917	return true;
1918	case `1`:
1919	return true; // We have r + r or r + i.
1920	case `2`:
1921	if (AM.HasBaseReg) {
1922	// Reject 2 r + r.*
1923	return false;
1924	}
1925
1926	// Allow 2 r as r + r*
1927	// Or 2 r + i is allowed as r + r + i.*
1928	return true;
1929	default: // Don't allow n r*
1930	return false;
1931	}
1932	}
1933
1934	bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1935	const AddrMode &AM, Type *Ty,
1936	unsigned AS,
1937	Instruction I) const* {
1938	// No global is ever allowed as a base.
1939	if (AM.BaseGV)
1940	return false;
1941
1942	if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1943	return isLegalGlobalAddressingMode(AM);
1944
1945	if (AS == AMDGPUAS::CONSTANT_ADDRESS \|\|
1946	AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT \|\|
1947	AS == AMDGPUAS::BUFFER_FAT_POINTER \|\| AS == AMDGPUAS::BUFFER_RESOURCE \|\|
1948	AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
1949	// If the offset isn't a multiple of 4, it probably isn't going to be
1950	// correctly aligned.
1951	// FIXME: Can we get the real alignment here?
1952	if (AM.BaseOffs % `4` != `0`)
1953	return isLegalMUBUFAddressingMode(AM);
1954
1955	if (!Subtarget->hasScalarSubwordLoads()) {
1956	// There are no SMRD extloads, so if we have to do a small type access we
1957	// will use a MUBUF load.
1958	// FIXME?: We also need to do this if unaligned, but we don't know the
1959	// alignment here.
1960	if (Ty->isSized() && DL.getTypeStoreSize(Ty) < `4`)
1961	return isLegalGlobalAddressingMode(AM);
1962	}
1963
1964	if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1965	// SMRD instructions have an 8-bit, dword offset on SI.
1966	if (!isUInt<`8`>(x: AM.BaseOffs / `4`))
1967	return false;
1968	} else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1969	// On CI+, this can also be a 32-bit literal constant offset. If it fits
1970	// in 8-bits, it can use a smaller encoding.
1971	if (!isUInt<`32`>(x: AM.BaseOffs / `4`))
1972	return false;
1973	} else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1974	// On VI, these use the SMEM format and the offset is 20-bit in bytes.
1975	if (!isUInt<`20`>(x: AM.BaseOffs))
1976	return false;
1977	} else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1978	// On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1979	// for S_BUFFER_ instructions).*
1980	if (!isInt<`21`>(x: AM.BaseOffs))
1981	return false;
1982	} else {
1983	// On GFX12, all offsets are signed 24-bit in bytes.
1984	if (!isInt<`24`>(x: AM.BaseOffs))
1985	return false;
1986	}
1987
1988	if ((AS == AMDGPUAS::CONSTANT_ADDRESS \|\|
1989	AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1990	AM.BaseOffs < `0`) {
1991	// Scalar (non-buffer) loads can only use a negative offset if
1992	// soffset+offset is non-negative. Since the compiler can only prove that
1993	// in a few special cases, it is safer to claim that negative offsets are
1994	// not supported.
1995	return false;
1996	}
1997
1998	if (AM.Scale == `0`) // r + i or just i, depending on HasBaseReg.
1999	return true;
2000
2001	if (AM.Scale == `1` && AM.HasBaseReg)
2002	return true;
2003
2004	return false;
2005	}
2006
2007	if (AS == AMDGPUAS::PRIVATE_ADDRESS)
2008	return Subtarget->hasFlatScratchEnabled()
2009	? isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS)
2010	: isLegalMUBUFAddressingMode(AM);
2011
2012	if (AS == AMDGPUAS::LOCAL_ADDRESS \|\|
2013	(AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
2014	// Basic, single offset DS instructions allow a 16-bit unsigned immediate
2015	// field.
2016	// XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
2017	// an 8-bit dword offset but we don't know the alignment here.
2018	if (!isUInt<`16`>(x: AM.BaseOffs))
2019	return false;
2020
2021	if (AM.Scale == `0`) // r + i or just i, depending on HasBaseReg.
2022	return true;
2023
2024	if (AM.Scale == `1` && AM.HasBaseReg)
2025	return true;
2026
2027	return false;
2028	}
2029
2030	if (AS == AMDGPUAS::FLAT_ADDRESS \|\| AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
2031	// For an unknown address space, this usually means that this is for some
2032	// reason being used for pure arithmetic, and not based on some addressing
2033	// computation. We don't have instructions that compute pointers with any
2034	// addressing modes, so treat them as having no offset like flat
2035	// instructions.
2036	return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS);
2037	}
2038
2039	// Assume a user alias of global for unknown address spaces.
2040	return isLegalGlobalAddressingMode(AM);
2041	}
2042
2043	bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
2044	const MachineFunction &MF) const {
2045	if (AS == AMDGPUAS::GLOBAL_ADDRESS \|\| AS == AMDGPUAS::FLAT_ADDRESS)
2046	return (MemVT.getSizeInBits() <= `4` * `32`);
2047	if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
2048	unsigned MaxPrivateBits = `8` * getSubtarget()->getMaxPrivateElementSize();
2049	return (MemVT.getSizeInBits() <= MaxPrivateBits);
2050	}
2051	if (AS == AMDGPUAS::LOCAL_ADDRESS \|\| AS == AMDGPUAS::REGION_ADDRESS)
2052	return (MemVT.getSizeInBits() <= `2` * `32`);
2053	return true;
2054	}
2055
2056	bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
2057	unsigned Size, unsigned AddrSpace, Align Alignment,
2058	MachineMemOperand::Flags Flags, unsigned IsFast) const* {
2059	if (IsFast)
2060	*IsFast = `0`;
2061
2062	if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS \|\|
2063	AddrSpace == AMDGPUAS::REGION_ADDRESS) {
2064	// Check if alignment requirements for ds_read/write instructions are
2065	// disabled.
2066	if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align (`4`))
2067	return false;
2068
2069	Align RequiredAlignment(
2070	PowerOf2Ceil(A: divideCeil(Numerator: Size, Denominator: `8`))); // Natural alignment.
2071	if (Subtarget->hasLDSMisalignedBugInWGPMode() && Size > `32` &&
2072	Alignment < RequiredAlignment)
2073	return false;
2074
2075	// Either, the alignment requirements are "enabled", or there is an
2076	// unaligned LDS access related hardware bug though alignment requirements
2077	// are "disabled". In either case, we need to check for proper alignment
2078	// requirements.
2079	//
2080	switch (Size) {
2081	case `64`:
2082	// SI has a hardware bug in the LDS / GDS bounds checking: if the base
2083	// address is negative, then the instruction is incorrectly treated as
2084	// out-of-bounds even if base + offsets is in bounds. Split vectorized
2085	// loads here to avoid emitting ds_read2_b32. We may re-combine the
2086	// load later in the SILoadStoreOptimizer.
2087	if (!Subtarget->hasUsableDSOffset() && Alignment < Align (`8`))
2088	return false;
2089
2090	// 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
2091	// can do a 4 byte aligned, 8 byte access in a single operation using
2092	// ds_read2/write2_b32 with adjacent offsets.
2093	RequiredAlignment = Align (`4`);
2094
2095	if (Subtarget->hasUnalignedDSAccessEnabled()) {
2096	// We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2097	// ds_write2_b32 depending on the alignment. In either case with either
2098	// alignment there is no faster way of doing this.
2099
2100	// The numbers returned here and below are not additive, it is a 'speed
2101	// rank'. They are just meant to be compared to decide if a certain way
2102	// of lowering an operation is faster than another. For that purpose
2103	// naturally aligned operation gets it bitsize to indicate that "it
2104	// operates with a speed comparable to N-bit wide load". With the full
2105	// alignment ds128 is slower than ds96 for example. If underaligned it
2106	// is comparable to a speed of a single dword access, which would then
2107	// mean 32 < 128 and it is faster to issue a wide load regardless.
2108	// 1 is simply "slow, don't do it". I.e. comparing an aligned load to a
2109	// wider load which will not be aligned anymore the latter is slower.
2110	if (IsFast)
2111	*IsFast = (Alignment >= RequiredAlignment) ? `64`
2112	: (Alignment < Align (`4`)) ? `32`
2113	: `1`;
2114	return true;
2115	}
2116
2117	break;
2118	case `96`:
2119	if (!Subtarget->hasDS96AndDS128())
2120	return false;
2121
2122	// 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
2123	// gfx8 and older.
2124
2125	if (Subtarget->hasUnalignedDSAccessEnabled()) {
2126	// Naturally aligned access is fastest. However, also report it is Fast
2127	// if memory is aligned less than DWORD. A narrow load or store will be
2128	// be equally slow as a single ds_read_b96/ds_write_b96, but there will
2129	// be more of them, so overall we will pay less penalty issuing a single
2130	// instruction.
2131
2132	// See comment on the values above.
2133	if (IsFast)
2134	*IsFast = (Alignment >= RequiredAlignment) ? `96`
2135	: (Alignment < Align (`4`)) ? `32`
2136	: `1`;
2137	return true;
2138	}
2139
2140	break;
2141	case `128`:
2142	if (!Subtarget->hasDS96AndDS128() \|\| !Subtarget->useDS128())
2143	return false;
2144
2145	// 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
2146	// gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
2147	// single operation using ds_read2/write2_b64.
2148	RequiredAlignment = Align (`8`);
2149
2150	if (Subtarget->hasUnalignedDSAccessEnabled()) {
2151	// Naturally aligned access is fastest. However, also report it is Fast
2152	// if memory is aligned less than DWORD. A narrow load or store will be
2153	// be equally slow as a single ds_read_b128/ds_write_b128, but there
2154	// will be more of them, so overall we will pay less penalty issuing a
2155	// single instruction.
2156
2157	// See comment on the values above.
2158	if (IsFast)
2159	*IsFast = (Alignment >= RequiredAlignment) ? `128`
2160	: (Alignment < Align (`4`)) ? `32`
2161	: `1`;
2162	return true;
2163	}
2164
2165	break;
2166	default:
2167	if (Size > `32`)
2168	return false;
2169
2170	break;
2171	}
2172
2173	// See comment on the values above.
2174	// Note that we have a single-dword or sub-dword here, so if underaligned
2175	// it is a slowest possible access, hence returned value is 0.
2176	if (IsFast)
2177	*IsFast = (Alignment >= RequiredAlignment) ? Size : `0`;
2178
2179	return Alignment >= RequiredAlignment \|\|
2180	Subtarget->hasUnalignedDSAccessEnabled();
2181	}
2182
2183	// FIXME: We have to be conservative here and assume that flat operations
2184	// will access scratch. If we had access to the IR function, then we
2185	// could determine if any private memory was used in the function.
2186	if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS \|\|
2187	AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2188	bool AlignedBy4 = Alignment >= Align (`4`);
2189	if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2190	if (IsFast)
2191	*IsFast = AlignedBy4 ? Size : `1`;
2192	return true;
2193	}
2194
2195	if (IsFast)
2196	*IsFast = AlignedBy4;
2197
2198	return AlignedBy4;
2199	}
2200
2201	// So long as they are correct, wide global memory operations perform better
2202	// than multiple smaller memory ops -- even when misaligned
2203	if (AMDGPU::isExtendedGlobalAddrSpace(AS: AddrSpace)) {
2204	if (IsFast)
2205	*IsFast = Size;
2206
2207	return Alignment >= Align (`4`) \|\|
2208	Subtarget->hasUnalignedBufferAccessEnabled();
2209	}
2210
2211	// Ensure robust out-of-bounds guarantees for buffer accesses are met if
2212	// RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2213	// out-of-bounds behavior, but in the edge case where an access starts
2214	// out-of-bounds and then enter in-bounds, the entire access would be treated
2215	// as out-of-bounds. Prevent misaligned memory accesses by requiring the
2216	// natural alignment of buffer accesses.
2217	if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER \|\|
2218	AddrSpace == AMDGPUAS::BUFFER_RESOURCE \|\|
2219	AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2220	if (!Subtarget->hasRelaxedBufferOOBMode() &&
2221	Alignment < Align (PowerOf2Ceil(A: divideCeil(Numerator: Size, Denominator: `8`))))
2222	return false;
2223	}
2224
2225	// Smaller than dword value must be aligned.
2226	if (Size < `32`)
2227	return false;
2228
2229	// 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2230	// byte-address are ignored, thus forcing Dword alignment.
2231	// This applies to private, global, and constant memory.
2232	if (IsFast)
2233	*IsFast = `1`;
2234
2235	return Size >= `32` && Alignment >= Align (`4`);
2236	}
2237
2238	bool SITargetLowering::allowsMisalignedMemoryAccesses(
2239	EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2240	unsigned IsFast) const* {
2241	return allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace,
2242	Alignment, Flags, IsFast);
2243	}
2244
2245	EVT SITargetLowering::getOptimalMemOpType(
2246	LLVMContext &Context, const MemOp &Op,
2247	const AttributeList &FuncAttributes) const {
2248	// FIXME: Should account for address space here.
2249
2250	// The default fallback uses the private pointer size as a guess for a type to
2251	// use. Make sure we switch these to 64-bit accesses.
2252
2253	if (Op.size() >= `16` &&
2254	Op.isDstAligned(AlignCheck: Align (`4`))) // XXX: Should only do for global
2255	return MVT::v4i32;
2256
2257	if (Op.size() >= `8` && Op.isDstAligned(AlignCheck: Align (`4`)))
2258	return MVT::v2i32;
2259
2260	// Use the default.
2261	return MVT::Other;
2262	}
2263
2264	bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode N) const* {
2265	const MemSDNode *MemNode = cast<MemSDNode>(Val: N);
2266	return MemNode->getMemOperand()->getFlags() & MONoClobber;
2267	}
2268
2269	bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
2270	return AS == AMDGPUAS::LOCAL_ADDRESS \|\| AS == AMDGPUAS::REGION_ADDRESS \|\|
2271	AS == AMDGPUAS::PRIVATE_ADDRESS;
2272	}
2273
2274	bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
2275	unsigned DestAS) const {
2276	if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2277	if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2278	Subtarget->hasGloballyAddressableScratch()) {
2279	// Flat -> private requires subtracting src_flat_scratch_base_lo.
2280	return false;
2281	}
2282
2283	// Flat -> private/local is a simple truncate.
2284	// Flat -> global is no-op
2285	return true;
2286	}
2287
2288	const GCNTargetMachine &TM =
2289	static_cast<const GCNTargetMachine &>(getTargetMachine());
2290	return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2291	}
2292
2293	TargetLoweringBase::LegalizeTypeAction
2294	SITargetLowering::getPreferredVectorAction(MVT VT) const {
2295	if (!VT.isScalableVector() && VT.getVectorNumElements() != `1` &&
2296	VT.getScalarType().bitsLE(VT: MVT::i16))
2297	return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
2298	return TargetLoweringBase::getPreferredVectorAction(VT);
2299	}
2300
2301	bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
2302	Type Ty) const* {
2303	// FIXME: Could be smarter if called for vector constants.
2304	return true;
2305	}
2306
2307	bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
2308	unsigned Index) const {
2309	if (!isOperationLegalOrCustom(Op: ISD::EXTRACT_SUBVECTOR, VT: ResVT))
2310	return false;
2311
2312	// TODO: Add more cases that are cheap.
2313	return Index == `0`;
2314	}
2315
2316	bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2317	// TODO: This should be more aggressive, particular for 16-bit element
2318	// vectors. However there are some mixed improvements and regressions.
2319	EVT EltTy = VT.getVectorElementType();
2320	unsigned MinAlign = Subtarget->useRealTrue16Insts() ? `16` : `32`;
2321	return EltTy.getSizeInBits() % MinAlign == `0`;
2322	}
2323
2324	bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
2325	if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2326	switch (Op) {
2327	case ISD::LOAD:
2328	case ISD::STORE:
2329	return true;
2330	default:
2331	return false;
2332	}
2333	}
2334
2335	// SimplifySetCC uses this function to determine whether or not it should
2336	// create setcc with i1 operands. We don't have instructions for i1 setcc.
2337	if (VT == MVT::i1 && Op == ISD::SETCC)
2338	return false;
2339
2340	return TargetLowering::isTypeDesirableForOp(Op, VT);
2341	}
2342
2343	MachinePointerInfo
2344	SITargetLowering::getKernargSegmentPtrInfo(MachineFunction &MF) const {
2345	// This isn't really a constant pool but close enough.
2346	MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool());
2347	PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS;
2348	return PtrInfo;
2349	}
2350
2351	SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2352	const SDLoc &SL,
2353	SDValue Chain,
2354	uint64_t Offset) const {
2355	const DataLayout &DL = DAG.getDataLayout();
2356	MachineFunction &MF = DAG.getMachineFunction();
2357	const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2358	MVT PtrVT = getPointerTy(DL, AS: AMDGPUAS::CONSTANT_ADDRESS);
2359
2360	auto [InputPtrReg, RC, ArgTy] =
2361	Info->getPreloadedValue(Value: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2362
2363	// We may not have the kernarg segment argument if we have no kernel
2364	// arguments.
2365	if (!InputPtrReg)
2366	return DAG.getConstant(Val: Offset, DL: SL, VT: PtrVT);
2367
2368	MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
2369	SDValue BasePtr = DAG.getCopyFromReg(
2370	Chain, dl: SL, Reg: MRI.getLiveInVirtReg(PReg: InputPtrReg->getRegister()), VT: PtrVT);
2371
2372	return DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Offset));
2373	}
2374
2375	SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2376	const SDLoc &SL) const {
2377	uint64_t Offset =
2378	getImplicitParameterOffset(MF: DAG.getMachineFunction(), Param: FIRST_IMPLICIT);
2379	return lowerKernArgParameterPtr(DAG, SL, Chain: DAG.getEntryNode(), Offset);
2380	}
2381
2382	SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2383	const SDLoc &SL) const {
2384
2385	Function &F = DAG.getMachineFunction().getFunction();
2386	std::optional<uint32_t> KnownSize =
2387	AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
2388	if (KnownSize.has_value())
2389	return DAG.getConstant(Val: *KnownSize, DL: SL, VT: MVT::i32);
2390	return SDValue ();
2391	}
2392
2393	SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2394	const SDLoc &SL, SDValue Val,
2395	bool Signed,
2396	const ISD::InputArg Arg) const* {
2397	// First, if it is a widened vector, narrow it.
2398	if (VT.isVector() &&
2399	VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
2400	EVT NarrowedVT =
2401	EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(),
2402	NumElements: VT.getVectorNumElements());
2403	Val = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: NarrowedVT, N1: Val,
2404	N2: DAG.getConstant(Val: `0`, DL: SL, VT: MVT::i32));
2405	}
2406
2407	// Then convert the vector elements or scalar value.
2408	if (Arg && (Arg->Flags.isSExt() \|\| Arg->Flags.isZExt()) && VT.bitsLT(VT: MemVT)) {
2409	unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2410	Val = DAG.getNode(Opcode: Opc, DL: SL, VT: MemVT, N1: Val, N2: DAG.getValueType(VT));
2411	}
2412
2413	if (MemVT.isFloatingPoint()) {
2414	if (VT.isFloatingPoint()) {
2415	Val = getFPExtOrFPRound(DAG, Op: Val, DL: SL, VT);
2416	} else {
2417	assert(!MemVT.isVector());
2418	EVT IntVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
2419	SDValue Cast = DAG.getBitcast(VT: IntVT, V: Val);
2420	Val = DAG.getAnyExtOrTrunc(Op: Cast, DL: SL, VT);
2421	}
2422	} else if (Signed)
2423	Val = DAG.getSExtOrTrunc(Op: Val, DL: SL, VT);
2424	else
2425	Val = DAG.getZExtOrTrunc(Op: Val, DL: SL, VT);
2426
2427	return Val;
2428	}
2429
2430	SDValue SITargetLowering::lowerKernargMemParameter(
2431	SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2432	uint64_t Offset, Align Alignment, bool Signed,
2433	const ISD::InputArg Arg) const* {
2434
2435	MachinePointerInfo PtrInfo =
2436	getKernargSegmentPtrInfo(MF&: DAG.getMachineFunction());
2437
2438	// Try to avoid using an extload by loading earlier than the argument address,
2439	// and extracting the relevant bits. The load should hopefully be merged with
2440	// the previous argument.
2441	if (MemVT.getStoreSize() < `4` && Alignment < `4`) {
2442	// TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2443	int64_t AlignDownOffset = alignDown(Value: Offset, Align: `4`);
2444	int64_t OffsetDiff = Offset - AlignDownOffset;
2445
2446	EVT IntVT = MemVT.changeTypeToInteger();
2447
2448	// TODO: If we passed in the base kernel offset we could have a better
2449	// alignment than 4, but we don't really need it.
2450	SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset: AlignDownOffset);
2451	SDValue Load = DAG.getLoad(VT: MVT::i32, dl: SL, Chain, Ptr,
2452	PtrInfo: PtrInfo.getWithOffset(O: AlignDownOffset), Alignment: Align (`4`),
2453	MMOFlags: MachineMemOperand::MODereferenceable \|
2454	MachineMemOperand::MOInvariant);
2455
2456	SDValue ShiftAmt = DAG.getConstant(Val: OffsetDiff * `8`, DL: SL, VT: MVT::i32);
2457	SDValue Extract = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: Load, N2: ShiftAmt);
2458
2459	SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: IntVT, Operand: Extract);
2460	ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MemVT, Operand: ArgVal);
2461	ArgVal = convertArgType(DAG, VT, MemVT, SL, Val: ArgVal, Signed, Arg);
2462
2463	return DAG.getMergeValues(Ops: {ArgVal, Load.getValue(R: `1`)}, dl: SL);
2464	}
2465
2466	SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2467	SDValue Load = DAG.getLoad(
2468	VT: MemVT, dl: SL, Chain, Ptr, PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment,
2469	MMOFlags: MachineMemOperand::MODereferenceable \| MachineMemOperand::MOInvariant);
2470
2471	SDValue Val = convertArgType(DAG, VT, MemVT, SL, Val: Load, Signed, Arg);
2472	return DAG.getMergeValues(Ops: {Val, Load.getValue(R: `1`)}, dl: SL);
2473	}
2474
2475	/// Coerce an argument which was passed in a different ABI type to the original
2476	/// expected value type.
2477	SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2478	SDValue Val,
2479	CCValAssign &VA,
2480	const SDLoc &SL) const {
2481	EVT ValVT = VA.getValVT();
2482
2483	// If this is an 8 or 16-bit value, it is really passed promoted
2484	// to 32 bits. Insert an assert[sz]ext to capture this, then
2485	// truncate to the right size.
2486	switch (VA.getLocInfo()) {
2487	case CCValAssign::Full:
2488	return Val;
2489	case CCValAssign::BCvt:
2490	return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: ValVT, Operand: Val);
2491	case CCValAssign::SExt:
2492	Val = DAG.getNode(Opcode: ISD::AssertSext, DL: SL, VT: VA.getLocVT(), N1: Val,
2493	N2: DAG.getValueType(ValVT));
2494	return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ValVT, Operand: Val);
2495	case CCValAssign::ZExt:
2496	Val = DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: VA.getLocVT(), N1: Val,
2497	N2: DAG.getValueType(ValVT));
2498	return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ValVT, Operand: Val);
2499	case CCValAssign::AExt:
2500	return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ValVT, Operand: Val);
2501	default:
2502	llvm_unreachable("Unknown loc info!");
2503	}
2504	}
2505
2506	SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2507	CCValAssign &VA, const SDLoc &SL,
2508	SDValue Chain,
2509	const ISD::InputArg &Arg) const {
2510	MachineFunction &MF = DAG.getMachineFunction();
2511	MachineFrameInfo &MFI = MF.getFrameInfo();
2512
2513	if (Arg.Flags.isByVal()) {
2514	unsigned Size = Arg.Flags.getByValSize();
2515	int FrameIdx = MFI.CreateFixedObject(Size, SPOffset: VA.getLocMemOffset(), IsImmutable: false);
2516	return DAG.getFrameIndex(FI: FrameIdx, VT: MVT::i32);
2517	}
2518
2519	unsigned ArgOffset = VA.getLocMemOffset();
2520	unsigned ArgSize = VA.getValVT().getStoreSize();
2521
2522	int FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset, IsImmutable: true);
2523
2524	// Create load nodes to retrieve arguments from the stack.
2525	SDValue FIN = DAG.getFrameIndex(FI, VT: MVT::i32);
2526
2527	// For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
2528	ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2529	MVT MemVT = VA.getValVT();
2530
2531	switch (VA.getLocInfo()) {
2532	default:
2533	break;
2534	case CCValAssign::BCvt:
2535	MemVT = VA.getLocVT();
2536	break;
2537	case CCValAssign::SExt:
2538	ExtType = ISD::SEXTLOAD;
2539	break;
2540	case CCValAssign::ZExt:
2541	ExtType = ISD::ZEXTLOAD;
2542	break;
2543	case CCValAssign::AExt:
2544	ExtType = ISD::EXTLOAD;
2545	break;
2546	}
2547
2548	SDValue ArgValue = DAG.getExtLoad(
2549	ExtType, dl: SL, VT: VA.getLocVT(), Chain, Ptr: FIN,
2550	PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI), MemVT);
2551
2552	SDValue ConvertedVal = convertABITypeToValueType(DAG, Val: ArgValue, VA, SL);
2553	if (ConvertedVal == ArgValue)
2554	return ConvertedVal;
2555
2556	return DAG.getMergeValues(Ops: {ConvertedVal, ArgValue.getValue(R: `1`)}, dl: SL);
2557	}
2558
2559	SDValue SITargetLowering::lowerWorkGroupId(
2560	SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2561	AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
2562	AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
2563	AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2564	if (!Subtarget->hasClusters())
2565	return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2566
2567	// Clusters are supported. Return the global position in the grid. If clusters
2568	// are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
2569
2570	// WorkGroupIdXYZ = ClusterId == 0 ?
2571	// ClusterIdXYZ :
2572	// ClusterIdXYZ (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ*
2573	SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2574	SDLoc SL(ClusterIdXYZ);
2575	SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2576	SDValue One = DAG.getConstant(Val: `1`, DL: SL, VT);
2577	SDValue ClusterSizeXYZ = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ClusterMaxIdXYZ, N2: One);
2578	SDValue ClusterWorkGroupIdXYZ =
2579	getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2580	SDValue GlobalIdXYZ =
2581	DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ClusterWorkGroupIdXYZ,
2582	N2: DAG.getNode(Opcode: ISD::MUL, DL: SL, VT, N1: ClusterIdXYZ, N2: ClusterSizeXYZ));
2583
2584	switch (MFI.getClusterDims().getKind()) {
2585	case AMDGPU::ClusterDimsAttr::Kind::FixedDims:
2586	case AMDGPU::ClusterDimsAttr::Kind::VariableDims:
2587	return GlobalIdXYZ;
2588	case AMDGPU::ClusterDimsAttr::Kind::NoCluster:
2589	return ClusterIdXYZ;
2590	case AMDGPU::ClusterDimsAttr::Kind::Unknown: {
2591	using namespace AMDGPU::Hwreg;
2592	SDValue ClusterIdField =
2593	DAG.getTargetConstant(Val: HwregEncoding::encode(Values: ID_IB_STS2, Values: `6`, Values: `4`), DL: SL, VT);
2594	SDNode *GetReg =
2595	DAG.getMachineNode(Opcode: AMDGPU::S_GETREG_B32_const, dl: SL, VT, Op1: ClusterIdField);
2596	SDValue ClusterId(GetReg, `0`);
2597	SDValue Zero = DAG.getConstant(Val: `0`, DL: SL, VT);
2598	return DAG.getNode(Opcode: ISD::SELECT_CC, DL: SL, VT, N1: ClusterId, N2: Zero, N3: ClusterIdXYZ,
2599	N4: GlobalIdXYZ, N5: DAG.getCondCode(Cond: ISD::SETEQ));
2600	}
2601	}
2602
2603	llvm_unreachable("nothing should reach here");
2604	}
2605
2606	SDValue SITargetLowering::getPreloadedValue(
2607	SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2608	AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
2609	const ArgDescriptor Reg = nullptr*;
2610	const TargetRegisterClass *RC;
2611	LLT Ty;
2612
2613	CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
2614	const ArgDescriptor WorkGroupIDX =
2615	ArgDescriptor::createRegister(Reg: AMDGPU::TTMP9);
2616	// If GridZ is not programmed in an entry function then the hardware will set
2617	// it to all zeros, so there is no need to mask the GridY value in the low
2618	// order bits.
2619	const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2620	Reg: AMDGPU::TTMP7,
2621	Mask: AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~`0u` : `0xFFFFu`);
2622	const ArgDescriptor WorkGroupIDZ =
2623	ArgDescriptor::createRegister(Reg: AMDGPU::TTMP7, Mask: `0xFFFF0000u`);
2624	const ArgDescriptor ClusterWorkGroupIDX =
2625	ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: `0x0000000Fu`);
2626	const ArgDescriptor ClusterWorkGroupIDY =
2627	ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: `0x000000F0u`);
2628	const ArgDescriptor ClusterWorkGroupIDZ =
2629	ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: `0x00000F00u`);
2630	const ArgDescriptor ClusterWorkGroupMaxIDX =
2631	ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: `0x0000F000u`);
2632	const ArgDescriptor ClusterWorkGroupMaxIDY =
2633	ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: `0x000F0000u`);
2634	const ArgDescriptor ClusterWorkGroupMaxIDZ =
2635	ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: `0x00F00000u`);
2636	const ArgDescriptor ClusterWorkGroupMaxFlatID =
2637	ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: `0x0F000000u`);
2638
2639	auto LoadConstant = [&](unsigned N) {
2640	return DAG.getConstant(Val: N, DL: SDLoc (), VT);
2641	};
2642
2643	if (Subtarget->hasArchitectedSGPRs() &&
2644	(AMDGPU::isCompute(CC) \|\| CC == CallingConv::AMDGPU_Gfx)) {
2645	AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2646	bool HasFixedDims = ClusterDims.isFixedDims();
2647
2648	switch (PVID) {
2649	case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
2650	Reg = &WorkGroupIDX;
2651	RC = &AMDGPU::SReg_32RegClass;
2652	Ty = LLT::scalar(SizeInBits: `32`);
2653	break;
2654	case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
2655	Reg = &WorkGroupIDY;
2656	RC = &AMDGPU::SReg_32RegClass;
2657	Ty = LLT::scalar(SizeInBits: `32`);
2658	break;
2659	case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
2660	Reg = &WorkGroupIDZ;
2661	RC = &AMDGPU::SReg_32RegClass;
2662	Ty = LLT::scalar(SizeInBits: `32`);
2663	break;
2664	case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
2665	if (HasFixedDims && ClusterDims.getDims()[`0`] == `1`)
2666	return LoadConstant (`0`);
2667	Reg = &ClusterWorkGroupIDX;
2668	RC = &AMDGPU::SReg_32RegClass;
2669	Ty = LLT::scalar(SizeInBits: `32`);
2670	break;
2671	case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
2672	if (HasFixedDims && ClusterDims.getDims()[`1`] == `1`)
2673	return LoadConstant (`0`);
2674	Reg = &ClusterWorkGroupIDY;
2675	RC = &AMDGPU::SReg_32RegClass;
2676	Ty = LLT::scalar(SizeInBits: `32`);
2677	break;
2678	case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
2679	if (HasFixedDims && ClusterDims.getDims()[`2`] == `1`)
2680	return LoadConstant (`0`);
2681	Reg = &ClusterWorkGroupIDZ;
2682	RC = &AMDGPU::SReg_32RegClass;
2683	Ty = LLT::scalar(SizeInBits: `32`);
2684	break;
2685	case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
2686	if (HasFixedDims)
2687	return LoadConstant (ClusterDims.getDims()[`0`] - `1`);
2688	Reg = &ClusterWorkGroupMaxIDX;
2689	RC = &AMDGPU::SReg_32RegClass;
2690	Ty = LLT::scalar(SizeInBits: `32`);
2691	break;
2692	case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
2693	if (HasFixedDims)
2694	return LoadConstant (ClusterDims.getDims()[`1`] - `1`);
2695	Reg = &ClusterWorkGroupMaxIDY;
2696	RC = &AMDGPU::SReg_32RegClass;
2697	Ty = LLT::scalar(SizeInBits: `32`);
2698	break;
2699	case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
2700	if (HasFixedDims)
2701	return LoadConstant (ClusterDims.getDims()[`2`] - `1`);
2702	Reg = &ClusterWorkGroupMaxIDZ;
2703	RC = &AMDGPU::SReg_32RegClass;
2704	Ty = LLT::scalar(SizeInBits: `32`);
2705	break;
2706	case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
2707	Reg = &ClusterWorkGroupMaxFlatID;
2708	RC = &AMDGPU::SReg_32RegClass;
2709	Ty = LLT::scalar(SizeInBits: `32`);
2710	break;
2711	default:
2712	break;
2713	}
2714	}
2715
2716	if (!Reg)
2717	std::tie(args&: Reg, args&: RC, args&: Ty) = MFI.getPreloadedValue(Value: PVID);
2718	if (!Reg) {
2719	if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
2720	// It's possible for a kernarg intrinsic call to appear in a kernel with
2721	// no allocated segment, in which case we do not add the user sgpr
2722	// argument, so just return null.
2723	return DAG.getConstant(Val: `0`, DL: SDLoc (), VT);
2724	}
2725
2726	// It's undefined behavior if a function marked with the amdgpu-no-*
2727	// attributes uses the corresponding intrinsic.
2728	return DAG.getPOISON(VT);
2729	}
2730
2731	return loadInputValue(DAG, RC, VT, SL: SDLoc (DAG.getEntryNode()), Arg: *Reg);
2732	}
2733
2734	static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
2735	CallingConv::ID CallConv,
2736	ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2737	FunctionType *FType,
2738	SIMachineFunctionInfo *Info) {
2739	for (unsigned I = `0`, E = Ins.size(), PSInputNum = `0`; I != E; ++I) {
2740	const ISD::InputArg *Arg = &Ins [I];
2741
2742	assert((!Arg->VT.isVector() \|\| Arg->VT.getScalarSizeInBits() == `16`) &&
2743	"vector type argument should have been split");
2744
2745	// First check if it's a PS input addr.
2746	if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2747	PSInputNum <= `15`) {
2748	bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(Index: PSInputNum);
2749
2750	// Inconveniently only the first part of the split is marked as isSplit,
2751	// so skip to the end. We only want to increment PSInputNum once for the
2752	// entire split argument.
2753	if (Arg->Flags.isSplit()) {
2754	while (!Arg->Flags.isSplitEnd()) {
2755	assert((!Arg->VT.isVector() \|\| Arg->VT.getScalarSizeInBits() == `16`) &&
2756	"unexpected vector split in ps argument type");
2757	if (!SkipArg)
2758	Splits.push_back(Elt: *Arg);
2759	Arg = &Ins [++I];
2760	}
2761	}
2762
2763	if (SkipArg) {
2764	// We can safely skip PS inputs.
2765	Skipped.set(Arg->getOrigArgIndex());
2766	++PSInputNum;
2767	continue;
2768	}
2769
2770	Info->markPSInputAllocated(Index: PSInputNum);
2771	if (Arg->Used)
2772	Info->markPSInputEnabled(Index: PSInputNum);
2773
2774	++PSInputNum;
2775	}
2776
2777	Splits.push_back(Elt: *Arg);
2778	}
2779	}
2780
2781	// Allocate special inputs passed in VGPRs.
2782	void SITargetLowering::allocateSpecialEntryInputVGPRs(
2783	CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2784	SIMachineFunctionInfo &Info) const {
2785	const LLT S32 = LLT::scalar(SizeInBits: `32`);
2786	MachineRegisterInfo &MRI = MF.getRegInfo();
2787
2788	if (Info.hasWorkItemIDX()) {
2789	Register Reg = AMDGPU::VGPR0;
2790	MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);
2791
2792	CCInfo.AllocateReg(Reg);
2793	unsigned Mask =
2794	(Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? `0x3ff` : ~`0u`;
2795	Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2796	}
2797
2798	if (Info.hasWorkItemIDY()) {
2799	assert(Info.hasWorkItemIDX());
2800	if (Subtarget->hasPackedTID()) {
2801	Info.setWorkItemIDY(
2802	ArgDescriptor::createRegister(Reg: AMDGPU::VGPR0, Mask: `0x3ff` << `10`));
2803	} else {
2804	unsigned Reg = AMDGPU::VGPR1;
2805	MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);
2806
2807	CCInfo.AllocateReg(Reg);
2808	Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2809	}
2810	}
2811
2812	if (Info.hasWorkItemIDZ()) {
2813	assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2814	if (Subtarget->hasPackedTID()) {
2815	Info.setWorkItemIDZ(
2816	ArgDescriptor::createRegister(Reg: AMDGPU::VGPR0, Mask: `0x3ff` << `20`));
2817	} else {
2818	unsigned Reg = AMDGPU::VGPR2;
2819	MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);
2820
2821	CCInfo.AllocateReg(Reg);
2822	Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2823	}
2824	}
2825	}
2826
2827	// Try to allocate a VGPR at the end of the argument list, or if no argument
2828	// VGPRs are left allocating a stack slot.
2829	// If \p Mask is is given it indicates bitfield position in the register.
2830	// If \p Arg is given use it with new ]p Mask instead of allocating new.
2831	static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~`0u`,
2832	ArgDescriptor Arg = ArgDescriptor ()) {
2833	if (Arg.isSet())
2834	return ArgDescriptor::createArg(Arg, Mask);
2835
2836	ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), `32`);
2837	unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgVGPRs);
2838	if (RegIdx == ArgVGPRs.size()) {
2839	// Spill to stack required.
2840	int64_t Offset = CCInfo.AllocateStack(Size: `4`, Alignment: Align (`4`));
2841
2842	return ArgDescriptor::createStack(Offset, Mask);
2843	}
2844
2845	unsigned Reg = ArgVGPRs [RegIdx];
2846	Reg = CCInfo.AllocateReg(Reg);
2847	assert(Reg != AMDGPU::NoRegister);
2848
2849	MachineFunction &MF = CCInfo.getMachineFunction();
2850	Register LiveInVReg = MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass);
2851	MF.getRegInfo().setType(VReg: LiveInVReg, Ty: LLT::scalar(SizeInBits: `32`));
2852	return ArgDescriptor::createRegister(Reg, Mask);
2853	}
2854
2855	static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2856	const TargetRegisterClass *RC,
2857	unsigned NumArgRegs) {
2858	ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), `32`);
2859	unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgSGPRs);
2860	if (RegIdx == ArgSGPRs.size())
2861	report_fatal_error(reason: "ran out of SGPRs for arguments");
2862
2863	unsigned Reg = ArgSGPRs [RegIdx];
2864	Reg = CCInfo.AllocateReg(Reg);
2865	assert(Reg != AMDGPU::NoRegister);
2866
2867	MachineFunction &MF = CCInfo.getMachineFunction();
2868	MF.addLiveIn(PReg: Reg, RC);
2869	return ArgDescriptor::createRegister(Reg);
2870	}
2871
2872	// If this has a fixed position, we still should allocate the register in the
2873	// CCInfo state. Technically we could get away with this for values passed
2874	// outside of the normal argument range.
2875	static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2876	const TargetRegisterClass *RC,
2877	MCRegister Reg) {
2878	Reg = CCInfo.AllocateReg(Reg);
2879	assert(Reg != AMDGPU::NoRegister);
2880	MachineFunction &MF = CCInfo.getMachineFunction();
2881	MF.addLiveIn(PReg: Reg, RC);
2882	}
2883
2884	static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2885	if (Arg) {
2886	allocateFixedSGPRInputImpl(CCInfo, RC: &AMDGPU::SGPR_32RegClass,
2887	Reg: Arg.getRegister());
2888	} else
2889	Arg = allocateSGPR32InputImpl(CCInfo, RC: &AMDGPU::SGPR_32RegClass, NumArgRegs: `32`);
2890	}
2891
2892	static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2893	if (Arg) {
2894	allocateFixedSGPRInputImpl(CCInfo, RC: &AMDGPU::SGPR_64RegClass,
2895	Reg: Arg.getRegister());
2896	} else
2897	Arg = allocateSGPR32InputImpl(CCInfo, RC: &AMDGPU::SGPR_64RegClass, NumArgRegs: `16`);
2898	}
2899
2900	/// Allocate implicit function VGPR arguments at the end of allocated user
2901	/// arguments.
2902	void SITargetLowering::allocateSpecialInputVGPRs(
2903	CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2904	SIMachineFunctionInfo &Info) const {
2905	const unsigned Mask = `0x3ff`;
2906	ArgDescriptor Arg;
2907
2908	if (Info.hasWorkItemIDX()) {
2909	Arg = allocateVGPR32Input(CCInfo, Mask);
2910	Info.setWorkItemIDX(Arg);
2911	}
2912
2913	if (Info.hasWorkItemIDY()) {
2914	Arg = allocateVGPR32Input(CCInfo, Mask: Mask << `10`, Arg);
2915	Info.setWorkItemIDY(Arg);
2916	}
2917
2918	if (Info.hasWorkItemIDZ())
2919	Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask: Mask << `20`, Arg));
2920	}
2921
2922	/// Allocate implicit function VGPR arguments in fixed registers.
2923	void SITargetLowering::allocateSpecialInputVGPRsFixed(
2924	CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2925	SIMachineFunctionInfo &Info) const {
2926	Register Reg = CCInfo.AllocateReg(Reg: AMDGPU::VGPR31);
2927	if (!Reg)
2928	report_fatal_error(reason: "failed to allocate VGPR for implicit arguments");
2929
2930	const unsigned Mask = `0x3ff`;
2931	Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2932	Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask: Mask << `10`));
2933	Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask: Mask << `20`));
2934	}
2935
2936	void SITargetLowering::allocateSpecialInputSGPRs(
2937	CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2938	SIMachineFunctionInfo &Info) const {
2939	auto &ArgInfo = Info.getArgInfo();
2940	const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2941
2942	// TODO: Unify handling with private memory pointers.
2943	if (UserSGPRInfo.hasDispatchPtr())
2944	allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchPtr);
2945
2946	if (UserSGPRInfo.hasQueuePtr())
2947	allocateSGPR64Input(CCInfo, Arg&: ArgInfo.QueuePtr);
2948
2949	// Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2950	// constant offset from the kernarg segment.
2951	if (Info.hasImplicitArgPtr())
2952	allocateSGPR64Input(CCInfo, Arg&: ArgInfo.ImplicitArgPtr);
2953
2954	if (UserSGPRInfo.hasDispatchID())
2955	allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchID);
2956
2957	// flat_scratch_init is not applicable for non-kernel functions.
2958
2959	if (Info.hasWorkGroupIDX())
2960	allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDX);
2961
2962	if (Info.hasWorkGroupIDY())
2963	allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDY);
2964
2965	if (Info.hasWorkGroupIDZ())
2966	allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDZ);
2967
2968	if (Info.hasLDSKernelId())
2969	allocateSGPR32Input(CCInfo, Arg&: ArgInfo.LDSKernelId);
2970	}
2971
2972	// Allocate special inputs passed in user SGPRs.
2973	void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2974	MachineFunction &MF,
2975	const SIRegisterInfo &TRI,
2976	SIMachineFunctionInfo &Info) const {
2977	const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2978	if (UserSGPRInfo.hasImplicitBufferPtr()) {
2979	Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2980	MF.addLiveIn(PReg: ImplicitBufferPtrReg, RC: &AMDGPU::SGPR_64RegClass);
2981	CCInfo.AllocateReg(Reg: ImplicitBufferPtrReg);
2982	}
2983
2984	// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2985	if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2986	Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2987	MF.addLiveIn(PReg: PrivateSegmentBufferReg, RC: &AMDGPU::SGPR_128RegClass);
2988	CCInfo.AllocateReg(Reg: PrivateSegmentBufferReg);
2989	}
2990
2991	if (UserSGPRInfo.hasDispatchPtr()) {
2992	Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2993	MF.addLiveIn(PReg: DispatchPtrReg, RC: &AMDGPU::SGPR_64RegClass);
2994	CCInfo.AllocateReg(Reg: DispatchPtrReg);
2995	}
2996
2997	if (UserSGPRInfo.hasQueuePtr()) {
2998	Register QueuePtrReg = Info.addQueuePtr(TRI);
2999	MF.addLiveIn(PReg: QueuePtrReg, RC: &AMDGPU::SGPR_64RegClass);
3000	CCInfo.AllocateReg(Reg: QueuePtrReg);
3001	}
3002
3003	if (UserSGPRInfo.hasKernargSegmentPtr()) {
3004	MachineRegisterInfo &MRI = MF.getRegInfo();
3005	Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
3006	CCInfo.AllocateReg(Reg: InputPtrReg);
3007
3008	Register VReg = MF.addLiveIn(PReg: InputPtrReg, RC: &AMDGPU::SGPR_64RegClass);
3009	MRI.setType(VReg, Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: `64`));
3010	}
3011
3012	if (UserSGPRInfo.hasDispatchID()) {
3013	Register DispatchIDReg = Info.addDispatchID(TRI);
3014	MF.addLiveIn(PReg: DispatchIDReg, RC: &AMDGPU::SGPR_64RegClass);
3015	CCInfo.AllocateReg(Reg: DispatchIDReg);
3016	}
3017
3018	if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
3019	Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
3020	MF.addLiveIn(PReg: FlatScratchInitReg, RC: &AMDGPU::SGPR_64RegClass);
3021	CCInfo.AllocateReg(Reg: FlatScratchInitReg);
3022	}
3023
3024	if (UserSGPRInfo.hasPrivateSegmentSize()) {
3025	Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
3026	MF.addLiveIn(PReg: PrivateSegmentSizeReg, RC: &AMDGPU::SGPR_32RegClass);
3027	CCInfo.AllocateReg(Reg: PrivateSegmentSizeReg);
3028	}
3029
3030	// TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
3031	// these from the dispatch pointer.
3032	}
3033
3034	// Allocate pre-loaded kernel arguemtns. Arguments to be preloading must be
3035	// sequential starting from the first argument.
3036	void SITargetLowering::allocatePreloadKernArgSGPRs(
3037	CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
3038	const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
3039	const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
3040	Function &F = MF.getFunction();
3041	unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
3042	GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
3043	bool InPreloadSequence = true;
3044	unsigned InIdx = `0`;
3045	bool AlignedForImplictArgs = false;
3046	unsigned ImplicitArgOffset = `0`;
3047	for (auto &Arg : F.args()) {
3048	if (!InPreloadSequence \|\| !Arg.hasInRegAttr())
3049	break;
3050
3051	unsigned ArgIdx = Arg.getArgNo();
3052	// Don't preload non-original args or parts not in the current preload
3053	// sequence.
3054	if (InIdx < Ins.size() &&
3055	(!Ins [InIdx].isOrigArg() \|\| Ins [InIdx].getOrigArgIndex() != ArgIdx))
3056	break;
3057
3058	for (; InIdx < Ins.size() && Ins [InIdx].isOrigArg() &&
3059	Ins [InIdx].getOrigArgIndex() == ArgIdx;
3060	InIdx++) {
3061	assert(ArgLocs[ArgIdx].isMemLoc());
3062	auto &ArgLoc = ArgLocs [InIdx];
3063	const Align KernelArgBaseAlign = Align (`16`);
3064	unsigned ArgOffset = ArgLoc.getLocMemOffset();
3065	Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset: ArgOffset);
3066	unsigned NumAllocSGPRs =
3067	alignTo(Value: ArgLoc.getLocVT().getFixedSizeInBits(), Align: `32`) / `32`;
3068
3069	// Fix alignment for hidden arguments.
3070	if (Arg.hasAttribute(Kind: "amdgpu-hidden-argument")) {
3071	if (!AlignedForImplictArgs) {
3072	ImplicitArgOffset =
3073	alignTo(Size: LastExplicitArgOffset,
3074	A: Subtarget->getAlignmentForImplicitArgPtr()) -
3075	LastExplicitArgOffset;
3076	AlignedForImplictArgs = true;
3077	}
3078	ArgOffset += ImplicitArgOffset;
3079	}
3080
3081	// Arg is preloaded into the previous SGPR.
3082	if (ArgLoc.getLocVT().getStoreSize() < `4` && Alignment < `4`) {
3083	assert(InIdx >= `1` && "No previous SGPR");
3084	Info.getArgInfo().PreloadKernArgs [InIdx].Regs.push_back(
3085	Elt: Info.getArgInfo().PreloadKernArgs [InIdx - `1`].Regs [`0`]);
3086	continue;
3087	}
3088
3089	unsigned Padding = ArgOffset - LastExplicitArgOffset;
3090	unsigned PaddingSGPRs = alignTo(Value: Padding, Align: `4`) / `4`;
3091	// Check for free user SGPRs for preloading.
3092	if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
3093	InPreloadSequence = false;
3094	break;
3095	}
3096
3097	// Preload this argument.
3098	const TargetRegisterClass *RC =
3099	TRI.getSGPRClassForBitWidth(BitWidth: NumAllocSGPRs * `32`);
3100	SmallVectorImpl<MCRegister> *PreloadRegs =
3101	Info.addPreloadedKernArg(TRI, RC, AllocSizeDWord: NumAllocSGPRs, KernArgIdx: InIdx, PaddingSGPRs);
3102
3103	if (PreloadRegs->size() > `1`)
3104	RC = &AMDGPU::SGPR_32RegClass;
3105	for (auto &Reg : *PreloadRegs) {
3106	assert(Reg);
3107	MF.addLiveIn(PReg: Reg, RC);
3108	CCInfo.AllocateReg(Reg);
3109	}
3110
3111	LastExplicitArgOffset = NumAllocSGPRs * `4` + ArgOffset;
3112	}
3113	}
3114	}
3115
3116	void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
3117	const SIRegisterInfo &TRI,
3118	SIMachineFunctionInfo &Info) const {
3119	// Always allocate this last since it is a synthetic preload.
3120	if (Info.hasLDSKernelId()) {
3121	Register Reg = Info.addLDSKernelId();
3122	MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
3123	CCInfo.AllocateReg(Reg);
3124	}
3125	}
3126
3127	// Allocate special input registers that are initialized per-wave.
3128	void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
3129	SIMachineFunctionInfo &Info,
3130	CallingConv::ID CallConv,
3131	bool IsShader) const {
3132	bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3133	if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {
3134	// Note: user SGPRs are handled by the front-end for graphics shaders
3135	// Pad up the used user SGPRs with dead inputs.
3136
3137	// TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3138	// before enabling architected SGPRs for workgroup IDs.
3139	assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3140
3141	unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3142	// Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3143	// rely on it to reach 16 since if we end up having no stack usage, it will
3144	// not really be added.
3145	unsigned NumRequiredSystemSGPRs =
3146	Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3147	Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3148	for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < `16`; ++i) {
3149	Register Reg = Info.addReservedUserSGPR();
3150	MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
3151	CCInfo.AllocateReg(Reg);
3152	}
3153	}
3154
3155	if (!HasArchitectedSGPRs) {
3156	if (Info.hasWorkGroupIDX()) {
3157	Register Reg = Info.addWorkGroupIDX();
3158	MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
3159	CCInfo.AllocateReg(Reg);
3160	}
3161
3162	if (Info.hasWorkGroupIDY()) {
3163	Register Reg = Info.addWorkGroupIDY();
3164	MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
3165	CCInfo.AllocateReg(Reg);
3166	}
3167
3168	if (Info.hasWorkGroupIDZ()) {
3169	Register Reg = Info.addWorkGroupIDZ();
3170	MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
3171	CCInfo.AllocateReg(Reg);
3172	}
3173	}
3174
3175	if (Info.hasWorkGroupInfo()) {
3176	Register Reg = Info.addWorkGroupInfo();
3177	MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
3178	CCInfo.AllocateReg(Reg);
3179	}
3180
3181	if (Info.hasPrivateSegmentWaveByteOffset()) {
3182	// Scratch wave offset passed in system SGPR.
3183	unsigned PrivateSegmentWaveByteOffsetReg;
3184
3185	if (IsShader) {
3186	PrivateSegmentWaveByteOffsetReg =
3187	Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3188
3189	// This is true if the scratch wave byte offset doesn't have a fixed
3190	// location.
3191	if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3192	PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3193	Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3194	}
3195	} else
3196	PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3197
3198	MF.addLiveIn(PReg: PrivateSegmentWaveByteOffsetReg, RC: &AMDGPU::SGPR_32RegClass);
3199	CCInfo.AllocateReg(Reg: PrivateSegmentWaveByteOffsetReg);
3200	}
3201
3202	assert(!Subtarget->hasUserSGPRInit16BugInWave32() \|\| IsShader \|\|
3203	Info.getNumPreloadedSGPRs() >= `16`);
3204	}
3205
3206	static void reservePrivateMemoryRegs(const TargetMachine &TM,
3207	MachineFunction &MF,
3208	const SIRegisterInfo &TRI,
3209	SIMachineFunctionInfo &Info) {
3210	// Now that we've figured out where the scratch register inputs are, see if
3211	// should reserve the arguments and use them directly.
3212	MachineFrameInfo &MFI = MF.getFrameInfo();
3213	bool HasStackObjects = MFI.hasStackObjects();
3214	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3215
3216	// Record that we know we have non-spill stack objects so we don't need to
3217	// check all stack objects later.
3218	if (HasStackObjects)
3219	Info.setHasNonSpillStackObjects(true);
3220
3221	// Everything live out of a block is spilled with fast regalloc, so it's
3222	// almost certain that spilling will be required.
3223	if (TM.getOptLevel() == CodeGenOptLevel::None)
3224	HasStackObjects = true;
3225
3226	// For now assume stack access is needed in any callee functions, so we need
3227	// the scratch registers to pass in.
3228	bool RequiresStackAccess = HasStackObjects \|\| MFI.hasCalls();
3229
3230	if (!ST.hasFlatScratchEnabled()) {
3231	if (RequiresStackAccess && ST.isAmdHsaOrMesa(F: MF.getFunction())) {
3232	// If we have stack objects, we unquestionably need the private buffer
3233	// resource. For the Code Object V2 ABI, this will be the first 4 user
3234	// SGPR inputs. We can reserve those and use them directly.
3235
3236	Register PrivateSegmentBufferReg =
3237	Info.getPreloadedReg(Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
3238	Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3239	} else {
3240	unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3241	// We tentatively reserve the last registers (skipping the last registers
3242	// which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3243	// we'll replace these with the ones immediately after those which were
3244	// really allocated. In the prologue copies will be inserted from the
3245	// argument to these reserved registers.
3246
3247	// Without HSA, relocations are used for the scratch pointer and the
3248	// buffer resource setup is always inserted in the prologue. Scratch wave
3249	// offset is still in an input SGPR.
3250	Info.setScratchRSrcReg(ReservedBufferReg);
3251	}
3252	}
3253
3254	MachineRegisterInfo &MRI = MF.getRegInfo();
3255
3256	// For entry functions we have to set up the stack pointer if we use it,
3257	// whereas non-entry functions get this "for free". This means there is no
3258	// intrinsic advantage to using S32 over S34 in cases where we do not have
3259	// calls but do need a frame pointer (i.e. if we are requested to have one
3260	// because frame pointer elimination is disabled). To keep things simple we
3261	// only ever use S32 as the call ABI stack pointer, and so using it does not
3262	// imply we need a separate frame pointer.
3263	//
3264	// Try to use s32 as the SP, but move it if it would interfere with input
3265	// arguments. This won't work with calls though.
3266	//
3267	// FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3268	// registers.
3269	if (!MRI.isLiveIn(Reg: AMDGPU::SGPR32)) {
3270	Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3271	} else {
3272	assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
3273
3274	if (MFI.hasCalls())
3275	report_fatal_error(reason: "call in graphics shader with too many input SGPRs");
3276
3277	for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3278	if (!MRI.isLiveIn(Reg)) {
3279	Info.setStackPtrOffsetReg(Reg);
3280	break;
3281	}
3282	}
3283
3284	if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3285	report_fatal_error(reason: "failed to find register for SP");
3286	}
3287
3288	// hasFP should be accurate for entry functions even before the frame is
3289	// finalized, because it does not rely on the known stack size, only
3290	// properties like whether variable sized objects are present.
3291	if (ST.getFrameLowering()->hasFP(MF)) {
3292	Info.setFrameOffsetReg(AMDGPU::SGPR33);
3293	}
3294	}
3295
3296	bool SITargetLowering::supportSplitCSR(MachineFunction MF) const* {
3297	const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3298	return !Info->isEntryFunction();
3299	}
3300
3301	void SITargetLowering::initializeSplitCSR(MachineBasicBlock Entry) const* {}
3302
3303	void SITargetLowering::insertCopiesSplitCSR(
3304	MachineBasicBlock *Entry,
3305	const SmallVectorImpl<MachineBasicBlock > &Exits) const* {
3306	const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3307
3308	const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(MF: Entry->getParent());
3309	if (!IStart)
3310	return;
3311
3312	const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3313	MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3314	MachineBasicBlock::iterator MBBI = Entry->begin();
3315	for (const MCPhysReg I = IStart; I; ++I) {
3316	const TargetRegisterClass RC = nullptr*;
3317	if (AMDGPU::SReg_64RegClass.contains(Reg: *I))
3318	RC = &AMDGPU::SGPR_64RegClass;
3319	else if (AMDGPU::SReg_32RegClass.contains(Reg: *I))
3320	RC = &AMDGPU::SGPR_32RegClass;
3321	else
3322	llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3323
3324	Register NewVR = MRI->createVirtualRegister(RegClass: RC);
3325	// Create copy from CSR to a virtual register.
3326	Entry->addLiveIn(PhysReg: *I);
3327	BuildMI(BB&: *Entry, I: MBBI, MIMD: DebugLoc (), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NewVR)
3328	.addReg(RegNo: *I);
3329
3330	// Insert the copy-back instructions right before the terminator.
3331	for (auto *Exit : Exits)
3332	BuildMI(BB&: *Exit, I: Exit->getFirstTerminator(), MIMD: DebugLoc (),
3333	MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: *I)
3334	.addReg(RegNo: NewVR);
3335	}
3336	}
3337
3338	SDValue SITargetLowering::LowerFormalArguments(
3339	SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3340	const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3341	SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3342	const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3343
3344	MachineFunction &MF = DAG.getMachineFunction();
3345	const Function &Fn = MF.getFunction();
3346	FunctionType *FType = MF.getFunction().getFunctionType();
3347	SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3348	bool IsError = false;
3349
3350	if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CC: CallConv)) {
3351	DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported (
3352	Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3353	IsError = true;
3354	}
3355
3356	SmallVector<ISD::InputArg, `16`> Splits;
3357	SmallVector<CCValAssign, `16`> ArgLocs;
3358	BitVector Skipped(Ins.size());
3359	CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3360	*DAG.getContext());
3361
3362	bool IsGraphics = AMDGPU::isGraphics(CC: CallConv);
3363	bool IsKernel = AMDGPU::isKernel(CC: CallConv);
3364	bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC: CallConv);
3365
3366	if (IsGraphics) {
3367	const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3368	assert(!UserSGPRInfo.hasDispatchPtr() &&
3369	!UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3370	!Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3371	!Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3372	(void)UserSGPRInfo;
3373	if (!Subtarget->hasFlatScratchEnabled())
3374	assert(!UserSGPRInfo.hasFlatScratchInit());
3375	if ((CallConv != CallingConv::AMDGPU_CS &&
3376	CallConv != CallingConv::AMDGPU_Gfx &&
3377	CallConv != CallingConv::AMDGPU_Gfx_WholeWave) \|\|
3378	!Subtarget->hasArchitectedSGPRs())
3379	assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3380	!Info->hasWorkGroupIDZ());
3381	}
3382
3383	bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3384
3385	if (CallConv == CallingConv::AMDGPU_PS) {
3386	processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3387
3388	// At least one interpolation mode must be enabled or else the GPU will
3389	// hang.
3390	//
3391	// Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3392	// set PSInputAddr, the user wants to enable some bits after the compilation
3393	// based on run-time states. Since we can't know what the final PSInputEna
3394	// will look like, so we shouldn't do anything here and the user should take
3395	// responsibility for the correct programming.
3396	//
3397	// Otherwise, the following restrictions apply:
3398	// - At least one of PERSP_ (0xF) or LINEAR_* (0x70) must be enabled.*
3399	// - If POS_W_FLOAT (11) is enabled, at least one of PERSP_ must be*
3400	// enabled too.
3401	if ((Info->getPSInputAddr() & `0x7F`) == `0` \|\|
3402	((Info->getPSInputAddr() & `0xF`) == `0` && Info->isPSInputAllocated(Index: `11`))) {
3403	CCInfo.AllocateReg(Reg: AMDGPU::VGPR0);
3404	CCInfo.AllocateReg(Reg: AMDGPU::VGPR1);
3405	Info->markPSInputAllocated(Index: `0`);
3406	Info->markPSInputEnabled(Index: `0`);
3407	}
3408	if (Subtarget->isAmdPalOS()) {
3409	// For isAmdPalOS, the user does not enable some bits after compilation
3410	// based on run-time states; the register values being generated here are
3411	// the final ones set in hardware. Therefore we need to apply the
3412	// workaround to PSInputAddr and PSInputEnable together. (The case where
3413	// a bit is set in PSInputAddr but not PSInputEnable is where the
3414	// frontend set up an input arg for a particular interpolation mode, but
3415	// nothing uses that input arg. Really we should have an earlier pass
3416	// that removes such an arg.)
3417	unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3418	if ((PsInputBits & `0x7F`) == `0` \|\|
3419	((PsInputBits & `0xF`) == `0` && (PsInputBits >> `11` & `1`)))
3420	Info->markPSInputEnabled(Index: llvm::countr_zero(Val: Info->getPSInputAddr()));
3421	}
3422	} else if (IsKernel) {
3423	assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3424	} else {
3425	Splits.append(in_start: IsWholeWaveFunc ? std::next(x: Ins.begin()) : Ins.begin(),
3426	in_end: Ins.end());
3427	}
3428
3429	if (IsKernel)
3430	analyzeFormalArgumentsCompute(State&: CCInfo, Ins);
3431
3432	if (IsEntryFunc) {
3433	allocateSpecialEntryInputVGPRs(CCInfo, MF, TRI: TRI, Info&: Info);
3434	allocateHSAUserSGPRs(CCInfo, MF, TRI: TRI, Info&: Info);
3435	if (IsKernel && Subtarget->hasKernargPreload())
3436	allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, TRI: TRI, Info&: Info);
3437
3438	allocateLDSKernelId(CCInfo, MF, TRI: TRI, Info&: Info);
3439	} else if (!IsGraphics) {
3440	// For the fixed ABI, pass workitem IDs in the last argument register.
3441	allocateSpecialInputVGPRsFixed(CCInfo, MF, TRI: TRI, Info&: Info);
3442
3443	// FIXME: Sink this into allocateSpecialInputSGPRs
3444	if (!Subtarget->hasFlatScratchEnabled())
3445	CCInfo.AllocateReg(Reg: Info->getScratchRSrcReg());
3446
3447	allocateSpecialInputSGPRs(CCInfo, MF, TRI: TRI, Info&: Info);
3448	}
3449
3450	if (!IsKernel) {
3451	CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg: isVarArg);
3452	CCInfo.AnalyzeFormalArguments(Ins: Splits, Fn: AssignFn);
3453
3454	// This assumes the registers are allocated by CCInfo in ascending order
3455	// with no gaps.
3456	Info->setNumWaveDispatchSGPRs(
3457	CCInfo.getFirstUnallocated(Regs: AMDGPU::SGPR_32RegClass.getRegisters()));
3458	Info->setNumWaveDispatchVGPRs(
3459	CCInfo.getFirstUnallocated(Regs: AMDGPU::VGPR_32RegClass.getRegisters()));
3460	} else if (Info->getNumKernargPreloadedSGPRs()) {
3461	Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3462	}
3463
3464	SmallVector<SDValue, `16`> Chains;
3465
3466	if (IsWholeWaveFunc) {
3467	SDValue Setup = DAG.getNode(Opcode: AMDGPUISD::WHOLE_WAVE_SETUP, DL,
3468	ResultTys: {MVT::i1, MVT::Other}, Ops: Chain);
3469	InVals.push_back(Elt: Setup.getValue(R: `0`));
3470	Chains.push_back(Elt: Setup.getValue(R: `1`));
3471	}
3472
3473	// FIXME: This is the minimum kernel argument alignment. We should improve
3474	// this to the maximum alignment of the arguments.
3475	//
3476	// FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3477	// kern arg offset.
3478	const Align KernelArgBaseAlign = Align (`16`);
3479
3480	for (unsigned i = IsWholeWaveFunc ? `1` : `0`, e = Ins.size(), ArgIdx = `0`; i != e;
3481	++i) {
3482	const ISD::InputArg &Arg = Ins [i];
3483	if ((Arg.isOrigArg() && Skipped [Arg.getOrigArgIndex()]) \|\| IsError) {
3484	InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
3485	continue;
3486	}
3487
3488	CCValAssign &VA = ArgLocs [ArgIdx++];
3489	MVT VT = VA.getLocVT();
3490
3491	if (IsEntryFunc && VA.isMemLoc()) {
3492	VT = Ins [i].VT;
3493	EVT MemVT = VA.getLocVT();
3494
3495	const uint64_t Offset = VA.getLocMemOffset();
3496	Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset);
3497
3498	if (Arg.Flags.isByRef()) {
3499	SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain, Offset);
3500
3501	const GCNTargetMachine &TM =
3502	static_cast<const GCNTargetMachine &>(getTargetMachine());
3503	if (!TM.isNoopAddrSpaceCast(SrcAS: AMDGPUAS::CONSTANT_ADDRESS,
3504	DestAS: Arg.Flags.getPointerAddrSpace())) {
3505	Ptr = DAG.getAddrSpaceCast(dl: DL, VT, Ptr, SrcAS: AMDGPUAS::CONSTANT_ADDRESS,
3506	DestAS: Arg.Flags.getPointerAddrSpace());
3507	}
3508
3509	InVals.push_back(Elt: Ptr);
3510	continue;
3511	}
3512
3513	SDValue NewArg;
3514	if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(Val: i)) {
3515	if (MemVT.getStoreSize() < `4` && Alignment < `4`) {
3516	// In this case the argument is packed into the previous preload SGPR.
3517	int64_t AlignDownOffset = alignDown(Value: Offset, Align: `4`);
3518	int64_t OffsetDiff = Offset - AlignDownOffset;
3519	EVT IntVT = MemVT.changeTypeToInteger();
3520
3521	const SIMachineFunctionInfo *Info =
3522	MF.getInfo<SIMachineFunctionInfo>();
3523	MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
3524	Register Reg =
3525	Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs [`0`];
3526
3527	assert(Reg);
3528	Register VReg = MRI.getLiveInVirtReg(PReg: Reg);
3529	SDValue Copy = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i32);
3530
3531	SDValue ShiftAmt = DAG.getConstant(Val: OffsetDiff * `8`, DL, VT: MVT::i32);
3532	SDValue Extract = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: Copy, N2: ShiftAmt);
3533
3534	SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: Extract);
3535	ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MemVT, Operand: ArgVal);
3536	NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: ArgVal,
3537	Signed: Ins [i].Flags.isSExt(), Arg: &Ins [i]);
3538
3539	NewArg = DAG.getMergeValues(Ops: {NewArg, Copy.getValue(R: `1`)}, dl: DL);
3540	} else {
3541	const SIMachineFunctionInfo *Info =
3542	MF.getInfo<SIMachineFunctionInfo>();
3543	MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
3544	const SmallVectorImpl<MCRegister> &PreloadRegs =
3545	Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs;
3546
3547	SDValue Copy;
3548	if (PreloadRegs.size() == `1`) {
3549	Register VReg = MRI.getLiveInVirtReg(PReg: PreloadRegs [`0`]);
3550	const TargetRegisterClass *RC = MRI.getRegClass(Reg: VReg);
3551	NewArg = DAG.getCopyFromReg(
3552	Chain, dl: DL, Reg: VReg,
3553	VT: EVT::getIntegerVT(Context&: *DAG.getContext(),
3554	BitWidth: TRI->getRegSizeInBits(RC: *RC)));
3555
3556	} else {
3557	// If the kernarg alignment does not match the alignment of the SGPR
3558	// tuple RC that can accommodate this argument, it will be built up
3559	// via copies from from the individual SGPRs that the argument was
3560	// preloaded to.
3561	SmallVector<SDValue, `4`> Elts;
3562	for (auto Reg : PreloadRegs) {
3563	Register VReg = MRI.getLiveInVirtReg(PReg: Reg);
3564	Copy = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i32);
3565	Elts.push_back(Elt: Copy);
3566	}
3567	NewArg =
3568	DAG.getBuildVector(VT: EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
3569	NumElements: PreloadRegs.size()),
3570	DL, Ops: Elts);
3571	}
3572
3573	// If the argument was preloaded to multiple consecutive 32-bit
3574	// registers because of misalignment between addressable SGPR tuples
3575	// and the argument size, we can still assume that because of kernarg
3576	// segment alignment restrictions that NewArg's size is the same as
3577	// MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3578	// truncate since we cannot preload to less than a single SGPR and the
3579	// MemVT may be smaller.
3580	EVT MemVTInt =
3581	EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
3582	if (MemVT.bitsLT(VT: NewArg.getSimpleValueType()))
3583	NewArg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVTInt, Operand: NewArg);
3584
3585	NewArg = DAG.getBitcast(VT: MemVT, V: NewArg);
3586	NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: NewArg,
3587	Signed: Ins [i].Flags.isSExt(), Arg: &Ins [i]);
3588	NewArg = DAG.getMergeValues(Ops: {NewArg, Chain}, dl: DL);
3589	}
3590	} else {
3591	// Hidden arguments that are in the kernel signature must be preloaded
3592	// to user SGPRs. Print a diagnostic error if a hidden argument is in
3593	// the argument list and is not preloaded.
3594	if (Arg.isOrigArg()) {
3595	Argument *OrigArg = Fn.getArg(i: Arg.getOrigArgIndex());
3596	if (OrigArg->hasAttribute(Kind: "amdgpu-hidden-argument")) {
3597	DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported (
3598	*OrigArg->getParent(),
3599	"hidden argument in kernel signature was not preloaded",
3600	DL.getDebugLoc()));
3601	}
3602	}
3603
3604	NewArg =
3605	lowerKernargMemParameter(DAG, VT, MemVT, SL: DL, Chain, Offset,
3606	Alignment, Signed: Ins [i].Flags.isSExt(), Arg: &Ins [i]);
3607	}
3608	Chains.push_back(Elt: NewArg.getValue(R: `1`));
3609
3610	auto *ParamTy =
3611	dyn_cast<PointerType>(Val: FType->getParamType(i: Ins [i].getOrigArgIndex()));
3612	if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3613	ParamTy &&
3614	(ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS \|\|
3615	ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3616	// On SI local pointers are just offsets into LDS, so they are always
3617	// less than 16-bits. On CI and newer they could potentially be
3618	// real pointers, so we can't guarantee their size.
3619	NewArg = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: NewArg.getValueType(), N1: NewArg,
3620	N2: DAG.getValueType(MVT::i16));
3621	}
3622
3623	InVals.push_back(Elt: NewArg);
3624	continue;
3625	}
3626	if (!IsEntryFunc && VA.isMemLoc()) {
3627	SDValue Val = lowerStackParameter(DAG, VA, SL: DL, Chain, Arg);
3628	InVals.push_back(Elt: Val);
3629	if (!Arg.Flags.isByVal())
3630	Chains.push_back(Elt: Val.getValue(R: `1`));
3631	continue;
3632	}
3633
3634	assert(VA.isRegLoc() && "Parameter must be in a register!");
3635
3636	Register Reg = VA.getLocReg();
3637	const TargetRegisterClass RC = nullptr*;
3638	if (AMDGPU::VGPR_32RegClass.contains(Reg))
3639	RC = &AMDGPU::VGPR_32RegClass;
3640	else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3641	RC = &AMDGPU::SGPR_32RegClass;
3642	else
3643	llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3644
3645	Reg = MF.addLiveIn(PReg: Reg, RC);
3646	SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT);
3647	if (Arg.Flags.isInReg() && RC == &AMDGPU::VGPR_32RegClass) {
3648	// FIXME: Need to forward the chains created by `CopyFromReg`s, make sure
3649	// they will read physical regs before any side effect instructions.
3650	SDValue ReadFirstLane =
3651	DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
3652	Val = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Val.getValueType(),
3653	N1: ReadFirstLane, N2: Val);
3654	}
3655
3656	if (Arg.Flags.isSRet()) {
3657	// The return object should be reasonably addressable.
3658
3659	// FIXME: This helps when the return is a real sret. If it is a
3660	// automatically inserted sret (i.e. CanLowerReturn returns false), an
3661	// extra copy is inserted in SelectionDAGBuilder which obscures this.
3662	unsigned NumBits =
3663	`32` - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3664	Val = DAG.getNode(
3665	Opcode: ISD::AssertZext, DL, VT, N1: Val,
3666	N2: DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: NumBits)));
3667	}
3668
3669	Val = convertABITypeToValueType(DAG, Val, VA, SL: DL);
3670	InVals.push_back(Elt: Val);
3671	}
3672
3673	// Start adding system SGPRs.
3674	if (IsEntryFunc)
3675	allocateSystemSGPRs(CCInfo, MF, Info&: *Info, CallConv, IsShader: IsGraphics);
3676
3677	unsigned StackArgSize = CCInfo.getStackSize();
3678	Info->setBytesInStackArgArea(StackArgSize);
3679
3680	return Chains.empty() ? Chain
3681	: DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: Chains);
3682	}
3683
3684	// TODO: If return values can't fit in registers, we should return as many as
3685	// possible in registers before passing on stack.
3686	bool SITargetLowering::CanLowerReturn(
3687	CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3688	const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3689	const Type RetTy) const* {
3690	// Replacing returns with sret/stack usage doesn't make sense for shaders.
3691	// FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3692	// for shaders. Vector types should be explicitly handled by CC.
3693	if (AMDGPU::isEntryFunctionCC(CC: CallConv))
3694	return true;
3695
3696	SmallVector<CCValAssign, `16`> RVLocs;
3697	CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3698	if (!CCInfo.CheckReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg)))
3699	return false;
3700
3701	// We must use the stack if return would require unavailable registers.
3702	unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3703	unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3704	for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3705	if (CCInfo.isAllocated(Reg: AMDGPU::VGPR_32RegClass.getRegister(i)))
3706	return false;
3707
3708	return true;
3709	}
3710
3711	SDValue
3712	SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3713	bool isVarArg,
3714	const SmallVectorImpl<ISD::OutputArg> &Outs,
3715	const SmallVectorImpl<SDValue> &OutVals,
3716	const SDLoc &DL, SelectionDAG &DAG) const {
3717	MachineFunction &MF = DAG.getMachineFunction();
3718	SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3719	const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3720
3721	if (AMDGPU::isKernel(CC: CallConv)) {
3722	return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3723	OutVals, DL, DAG);
3724	}
3725
3726	bool IsShader = AMDGPU::isShader(CC: CallConv);
3727
3728	Info->setIfReturnsVoid(Outs.empty());
3729	bool IsWaveEnd = Info->returnsVoid() && IsShader;
3730
3731	// CCValAssign - represent the assignment of the return value to a location.
3732	SmallVector<CCValAssign, `48`> RVLocs;
3733
3734	// CCState - Info about the registers and stack slots.
3735	CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3736	*DAG.getContext());
3737
3738	// Analyze outgoing return values.
3739	CCInfo.AnalyzeReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg: isVarArg));
3740
3741	SDValue Glue;
3742	SmallVector<SDValue, `48`> RetOps;
3743	RetOps.push_back(Elt: Chain); // Operand #0 = Chain (updated below)
3744
3745	SDValue ReadFirstLane =
3746	DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
3747	// Copy the result values into the output registers.
3748	for (unsigned I = `0`, RealRVLocIdx = `0`, E = RVLocs.size(); I != E;
3749	++I, ++RealRVLocIdx) {
3750	CCValAssign &VA = RVLocs [I];
3751	assert(VA.isRegLoc() && "Can only return in registers!");
3752	// TODO: Partially return in registers if return values don't fit.
3753	SDValue Arg = OutVals [RealRVLocIdx];
3754
3755	// Copied from other backends.
3756	switch (VA.getLocInfo()) {
3757	case CCValAssign::Full:
3758	break;
3759	case CCValAssign::BCvt:
3760	Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
3761	break;
3762	case CCValAssign::SExt:
3763	Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3764	break;
3765	case CCValAssign::ZExt:
3766	Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3767	break;
3768	case CCValAssign::AExt:
3769	Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3770	break;
3771	default:
3772	llvm_unreachable("Unknown loc info!");
3773	}
3774	if (TRI->isSGPRPhysReg(Reg: VA.getLocReg()))
3775	Arg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Arg.getValueType(),
3776	N1: ReadFirstLane, N2: Arg);
3777	Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: VA.getLocReg(), N: Arg, Glue);
3778	Glue = Chain.getValue(R: `1`);
3779	RetOps.push_back(Elt: DAG.getRegister(Reg: VA.getLocReg(), VT: VA.getLocVT()));
3780	}
3781
3782	// FIXME: Does sret work properly?
3783	if (!Info->isEntryFunction()) {
3784	const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3785	const MCPhysReg *I =
3786	TRI->getCalleeSavedRegsViaCopy(MF: &DAG.getMachineFunction());
3787	if (I) {
3788	for (; *I; ++I) {
3789	if (AMDGPU::SReg_64RegClass.contains(Reg: *I))
3790	RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i64));
3791	else if (AMDGPU::SReg_32RegClass.contains(Reg: *I))
3792	RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i32));
3793	else
3794	llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3795	}
3796	}
3797	}
3798
3799	// Update chain and glue.
3800	RetOps [`0`] = Chain;
3801	if (Glue.getNode())
3802	RetOps.push_back(Elt: Glue);
3803
3804	unsigned Opc = AMDGPUISD::ENDPGM;
3805	if (!IsWaveEnd)
3806	Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3807	: IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3808	: AMDGPUISD::RET_GLUE;
3809	return DAG.getNode(Opcode: Opc, DL, VT: MVT::Other, Ops: RetOps);
3810	}
3811
3812	SDValue SITargetLowering::LowerCallResult(
3813	SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3814	const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3815	SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3816	SDValue ThisVal) const {
3817	CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv, IsVarArg);
3818
3819	// Assign locations to each value returned by this call.
3820	SmallVector<CCValAssign, `16`> RVLocs;
3821	CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3822	*DAG.getContext());
3823	CCInfo.AnalyzeCallResult(Ins, Fn: RetCC);
3824
3825	// Copy all of the result registers out of their specified physreg.
3826	for (CCValAssign VA : RVLocs) {
3827	SDValue Val;
3828
3829	if (VA.isRegLoc()) {
3830	Val =
3831	DAG.getCopyFromReg(Chain, dl: DL, Reg: VA.getLocReg(), VT: VA.getLocVT(), Glue: InGlue);
3832	Chain = Val.getValue(R: `1`);
3833	InGlue = Val.getValue(R: `2`);
3834	} else if (VA.isMemLoc()) {
3835	report_fatal_error(reason: "TODO: return values in memory");
3836	} else
3837	llvm_unreachable("unknown argument location type");
3838
3839	switch (VA.getLocInfo()) {
3840	case CCValAssign::Full:
3841	break;
3842	case CCValAssign::BCvt:
3843	Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: Val);
3844	break;
3845	case CCValAssign::ZExt:
3846	Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: VA.getLocVT(), N1: Val,
3847	N2: DAG.getValueType(VA.getValVT()));
3848	Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3849	break;
3850	case CCValAssign::SExt:
3851	Val = DAG.getNode(Opcode: ISD::AssertSext, DL, VT: VA.getLocVT(), N1: Val,
3852	N2: DAG.getValueType(VA.getValVT()));
3853	Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3854	break;
3855	case CCValAssign::AExt:
3856	Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3857	break;
3858	default:
3859	llvm_unreachable("Unknown loc info!");
3860	}
3861
3862	InVals.push_back(Elt: Val);
3863	}
3864
3865	return Chain;
3866	}
3867
3868	// Add code to pass special inputs required depending on used features separate
3869	// from the explicit user arguments present in the IR.
3870	void SITargetLowering::passSpecialInputs(
3871	CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3872	SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3873	SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3874	// If we don't have a call site, this was a call inserted by
3875	// legalization. These can never use special inputs.
3876	if (!CLI.CB)
3877	return;
3878
3879	SelectionDAG &DAG = CLI.DAG;
3880	const SDLoc &DL = CLI.DL;
3881	const Function &F = DAG.getMachineFunction().getFunction();
3882
3883	const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3884	const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3885
3886	const AMDGPUFunctionArgInfo &CalleeArgInfo =
3887	AMDGPUFunctionArgInfo::FixedABIFunctionInfo;
3888
3889	// TODO: Unify with private memory register handling. This is complicated by
3890	// the fact that at least in kernels, the input argument is not necessarily
3891	// in the same location as the input.
3892	// clang-format off
3893	static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3894	std::array<StringLiteral, `2`>> ImplicitAttrs[] = {
3895	{AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3896	{AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3897	{AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3898	{AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3899	{AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3900	{AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3901	{AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3902	{AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3903	};
3904	// clang-format on
3905
3906	for (auto [InputID, Attrs] : ImplicitAttrs) {
3907	// If the callee does not use the attribute value, skip copying the value.
3908	if (all_of(Range&: Attrs, P: [&](StringRef Attr) {
3909	return Attr.empty() \|\| CLI.CB->hasFnAttr(Kind: Attr);
3910	}))
3911	continue;
3912
3913	const auto [OutgoingArg, ArgRC, ArgTy] =
3914	CalleeArgInfo.getPreloadedValue(Value: InputID);
3915	if (!OutgoingArg)
3916	continue;
3917
3918	const auto [IncomingArg, IncomingArgRC, Ty] =
3919	CallerArgInfo.getPreloadedValue(Value: InputID);
3920	assert(IncomingArgRC == ArgRC);
3921
3922	// All special arguments are ints for now.
3923	EVT ArgVT = TRI->getSpillSize(RC: *ArgRC) == `8` ? MVT::i64 : MVT::i32;
3924	SDValue InputReg;
3925
3926	if (IncomingArg) {
3927	InputReg = loadInputValue(DAG, RC: ArgRC, VT: ArgVT, SL: DL, Arg: *IncomingArg);
3928	} else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3929	// The implicit arg ptr is special because it doesn't have a corresponding
3930	// input for kernels, and is computed from the kernarg segment pointer.
3931	InputReg = getImplicitArgPtr(DAG, SL: DL);
3932	} else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3933	std::optional<uint32_t> Id =
3934	AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3935	if (Id.has_value()) {
3936	InputReg = DAG.getConstant(Val: *Id, DL, VT: ArgVT);
3937	} else {
3938	InputReg = DAG.getPOISON(VT: ArgVT);
3939	}
3940	} else {
3941	// We may have proven the input wasn't needed, although the ABI is
3942	// requiring it. We just need to allocate the register appropriately.
3943	InputReg = DAG.getPOISON(VT: ArgVT);
3944	}
3945
3946	if (OutgoingArg->isRegister()) {
3947	RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
3948	if (!CCInfo.AllocateReg(Reg: OutgoingArg->getRegister()))
3949	report_fatal_error(reason: "failed to allocate implicit input argument");
3950	} else {
3951	unsigned SpecialArgOffset =
3952	CCInfo.AllocateStack(Size: ArgVT.getStoreSize(), Alignment: Align (`4`));
3953	SDValue ArgStore =
3954	storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg, Offset: SpecialArgOffset);
3955	MemOpChains.push_back(Elt: ArgStore);
3956	}
3957	}
3958
3959	// Pack workitem IDs into a single register or pass it as is if already
3960	// packed.
3961
3962	auto [OutgoingArg, ArgRC, Ty] =
3963	CalleeArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3964	if (!OutgoingArg)
3965	std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
3966	CalleeArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3967	if (!OutgoingArg)
3968	std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
3969	CalleeArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3970	if (!OutgoingArg)
3971	return;
3972
3973	const ArgDescriptor *IncomingArgX = std::get<`0`>(
3974	t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X));
3975	const ArgDescriptor *IncomingArgY = std::get<`0`>(
3976	t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
3977	const ArgDescriptor *IncomingArgZ = std::get<`0`>(
3978	t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3979
3980	SDValue InputReg;
3981	SDLoc SL;
3982
3983	const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-x");
3984	const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-y");
3985	const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-z");
3986
3987	// If incoming ids are not packed we need to pack them.
3988	if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX &&
3989	NeedWorkItemIDX) {
3990	if (Subtarget->getMaxWorkitemID(Kernel: F, Dimension: `0`) != `0`) {
3991	InputReg = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgX);
3992	} else {
3993	InputReg = DAG.getConstant(Val: `0`, DL, VT: MVT::i32);
3994	}
3995	}
3996
3997	if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY &&
3998	NeedWorkItemIDY && Subtarget->getMaxWorkitemID(Kernel: F, Dimension: `1`) != `0`) {
3999	SDValue Y = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgY);
4000	Y = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Y,
4001	N2: DAG.getShiftAmountConstant(Val: `10`, VT: MVT::i32, DL: SL));
4002	InputReg = InputReg.getNode()
4003	? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: InputReg, N2: Y)
4004	: Y;
4005	}
4006
4007	if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ &&
4008	NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(Kernel: F, Dimension: `2`) != `0`) {
4009	SDValue Z = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgZ);
4010	Z = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Z,
4011	N2: DAG.getShiftAmountConstant(Val: `20`, VT: MVT::i32, DL: SL));
4012	InputReg = InputReg.getNode()
4013	? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: InputReg, N2: Z)
4014	: Z;
4015	}
4016
4017	if (!InputReg && (NeedWorkItemIDX \|\| NeedWorkItemIDY \|\| NeedWorkItemIDZ)) {
4018	if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
4019	// We're in a situation where the outgoing function requires the workitem
4020	// ID, but the calling function does not have it (e.g a graphics function
4021	// calling a C calling convention function). This is illegal, but we need
4022	// to produce something.
4023	InputReg = DAG.getPOISON(VT: MVT::i32);
4024	} else {
4025	// Workitem ids are already packed, any of present incoming arguments
4026	// will carry all required fields.
4027	ArgDescriptor IncomingArg =
4028	ArgDescriptor::createArg(Arg: IncomingArgX ? *IncomingArgX
4029	: IncomingArgY ? *IncomingArgY
4030	: *IncomingArgZ,
4031	Mask: ~`0u`);
4032	InputReg = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: IncomingArg);
4033	}
4034	}
4035
4036	if (OutgoingArg->isRegister()) {
4037	if (InputReg)
4038	RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
4039
4040	CCInfo.AllocateReg(Reg: OutgoingArg->getRegister());
4041	} else {
4042	unsigned SpecialArgOffset = CCInfo.AllocateStack(Size: `4`, Alignment: Align (`4`));
4043	if (InputReg) {
4044	SDValue ArgStore =
4045	storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg, Offset: SpecialArgOffset);
4046	MemOpChains.push_back(Elt: ArgStore);
4047	}
4048	}
4049	}
4050
4051	bool SITargetLowering::isEligibleForTailCallOptimization(
4052	SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
4053	const SmallVectorImpl<ISD::OutputArg> &Outs,
4054	const SmallVectorImpl<SDValue> &OutVals,
4055	const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4056	if (AMDGPU::isChainCC(CC: CalleeCC))
4057	return true;
4058
4059	if (!AMDGPU::mayTailCallThisCC(CC: CalleeCC))
4060	return false;
4061
4062	// For a divergent call target, we need to do a waterfall loop over the
4063	// possible callees which precludes us from using a simple jump.
4064	if (Callee ->isDivergent())
4065	return false;
4066
4067	MachineFunction &MF = DAG.getMachineFunction();
4068	const Function &CallerF = MF.getFunction();
4069	CallingConv::ID CallerCC = CallerF.getCallingConv();
4070	const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
4071	const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4072
4073	// Kernels aren't callable, and don't have a live in return address so it
4074	// doesn't make sense to do a tail call with entry functions.
4075	if (!CallerPreserved)
4076	return false;
4077
4078	bool CCMatch = CallerCC == CalleeCC;
4079
4080	if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4081	if (AMDGPU::canGuaranteeTCO(CC: CalleeCC) && CCMatch)
4082	return true;
4083	return false;
4084	}
4085
4086	// TODO: Can we handle var args?
4087	if (IsVarArg)
4088	return false;
4089
4090	for (const Argument &Arg : CallerF.args()) {
4091	if (Arg.hasByValAttr())
4092	return false;
4093	}
4094
4095	LLVMContext &Ctx = *DAG.getContext();
4096
4097	// Check that the call results are passed in the same way.
4098	if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C&: Ctx, Ins,
4099	CalleeFn: CCAssignFnForCall(CC: CalleeCC, IsVarArg),
4100	CallerFn: CCAssignFnForCall(CC: CallerCC, IsVarArg)))
4101	return false;
4102
4103	// The callee has to preserve all registers the caller needs to preserve.
4104	if (!CCMatch) {
4105	const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4106	if (!TRI->regmaskSubsetEqual(mask0: CallerPreserved, mask1: CalleePreserved))
4107	return false;
4108	}
4109
4110	// Nothing more to check if the callee is taking no arguments.
4111	if (Outs.empty())
4112	return true;
4113
4114	SmallVector<CCValAssign, `16`> ArgLocs;
4115	CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4116
4117	// FIXME: We are not allocating special input registers, so we will be
4118	// deciding based on incorrect register assignments.
4119	CCInfo.AnalyzeCallOperands(Outs, Fn: CCAssignFnForCall(CC: CalleeCC, IsVarArg));
4120
4121	const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4122	// If the stack arguments for this call do not fit into our own save area then
4123	// the call cannot be made tail.
4124	// TODO: Is this really necessary?
4125	if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4126	return false;
4127
4128	for (const auto &[CCVA, ArgVal] : zip_equal(t&: ArgLocs, u: OutVals)) {
4129	// FIXME: What about inreg arguments that end up passed in memory?
4130	if (!CCVA.isRegLoc())
4131	continue;
4132
4133	// If we are passing an argument in an SGPR, and the value is divergent,
4134	// this call requires a waterfall loop.
4135	if (ArgVal ->isDivergent() && TRI->isSGPRPhysReg(Reg: CCVA.getLocReg())) {
4136	LLVM_DEBUG(
4137	dbgs() << "Cannot tail call due to divergent outgoing argument in "
4138	<< printReg(CCVA.getLocReg(), TRI) << `'\n'`);
4139	return false;
4140	}
4141	}
4142
4143	const MachineRegisterInfo &MRI = MF.getRegInfo();
4144	return parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals);
4145	}
4146
4147	bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst CI) const* {
4148	if (!CI->isTailCall())
4149	return false;
4150
4151	const Function *ParentFn = CI->getFunction();
4152	if (AMDGPU::isEntryFunctionCC(CC: ParentFn->getCallingConv()))
4153	return false;
4154	return true;
4155	}
4156
namespace {
// Chain calls have special arguments that we need to handle. These are
// tagging along at the end of the arguments list(s), after the SGPR and VGPR
// arguments (index 0 and 1 respectively).
//
// The enumeration is deliberately unscoped: its values are used directly as
// argument-list indices and in integer arithmetic.
enum ChainCallArgIdx {
  Exec = 2,      // Value requested for EXEC.
  Flags,         // Flags selecting which further special arguments follow.
  NumVGPRs,      // First of the additional arguments.
  FallbackExec,
  FallbackCallee // Last of the additional arguments.
};
} // anonymous namespace
4169
4170	// The wave scratch offset register is used as the global base pointer.
4171	SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
4172	SmallVectorImpl<SDValue> &InVals) const {
4173	CallingConv::ID CallConv = CLI.CallConv;
4174	bool IsChainCallConv = AMDGPU::isChainCC(CC: CallConv);
4175
4176	SelectionDAG &DAG = CLI.DAG;
4177
4178	const SDLoc &DL = CLI.DL;
4179	SDValue Chain = CLI.Chain;
4180	SDValue Callee = CLI.Callee;
4181
4182	llvm::SmallVector<SDValue, `6`> ChainCallSpecialArgs;
4183	bool UsesDynamicVGPRs = false;
4184	if (IsChainCallConv) {
4185	// The last arguments should be the value that we need to put in EXEC,
4186	// followed by the flags and any other arguments with special meanings.
4187	// Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4188	// we don't treat them like the "real" arguments.
4189	auto RequestedExecIt =
4190	llvm::find_if(Range&: CLI.Outs, P: [](const ISD::OutputArg &Arg) {
4191	return Arg.OrigArgIndex == `2`;
4192	});
4193	assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4194
4195	size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4196	CLI.OutVals.erase(CS: CLI.OutVals.begin() + SpecialArgsBeginIdx,
4197	CE: CLI.OutVals.end());
4198	CLI.Outs.erase(CS: RequestedExecIt, CE: CLI.Outs.end());
4199
4200	assert(CLI.Outs.back().OrigArgIndex < `2` &&
4201	"Haven't popped all the special args");
4202
4203	TargetLowering::ArgListEntry RequestedExecArg =
4204	CLI.Args [ChainCallArgIdx::Exec];
4205	if (!RequestedExecArg.Ty->isIntegerTy(Bitwidth: Subtarget->getWavefrontSize()))
4206	return lowerUnhandledCall(CLI, InVals, Reason: "Invalid value for EXEC");
4207
4208	// Convert constants into TargetConstants, so they become immediate operands
4209	// instead of being selected into S_MOV.
4210	auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4211	if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Val&: Arg.Node)) {
4212	ChainCallSpecialArgs.push_back(Elt: DAG.getTargetConstant(
4213	Val: ArgNode->getAPIntValue(), DL, VT: ArgNode->getValueType(ResNo: `0`)));
4214	} else
4215	ChainCallSpecialArgs.push_back(Elt: Arg.Node);
4216	};
4217
4218	PushNodeOrTargetConstant (RequestedExecArg);
4219
4220	// Process any other special arguments depending on the value of the flags.
4221	TargetLowering::ArgListEntry Flags = CLI.Args [ChainCallArgIdx::Flags];
4222
4223	const APInt &FlagsValue = cast<ConstantSDNode>(Val&: Flags.Node)->getAPIntValue();
4224	if (FlagsValue.isZero()) {
4225	if (CLI.Args.size() > ChainCallArgIdx::Flags + `1`)
4226	return lowerUnhandledCall(CLI, InVals,
4227	Reason: "no additional args allowed if flags == 0");
4228	} else if (FlagsValue.isOneBitSet(BitNo: `0`)) {
4229	if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + `1`) {
4230	return lowerUnhandledCall(CLI, InVals, Reason: "expected 3 additional args");
4231	}
4232
4233	if (!Subtarget->isWave32()) {
4234	return lowerUnhandledCall(
4235	CLI, InVals, Reason: "dynamic VGPR mode is only supported for wave32");
4236	}
4237
4238	UsesDynamicVGPRs = true;
4239	std::for_each(first: CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4240	last: CLI.Args.end(), f: PushNodeOrTargetConstant);
4241	}
4242	}
4243
4244	SmallVector<ISD::OutputArg, `32`> &Outs = CLI.Outs;
4245	SmallVector<SDValue, `32`> &OutVals = CLI.OutVals;
4246	SmallVector<ISD::InputArg, `32`> &Ins = CLI.Ins;
4247	bool &IsTailCall = CLI.IsTailCall;
4248	bool IsVarArg = CLI.IsVarArg;
4249	bool IsSibCall = false;
4250	MachineFunction &MF = DAG.getMachineFunction();
4251
4252	if (Callee.isUndef() \|\| isNullConstant(V: Callee)) {
4253	if (!CLI.IsTailCall) {
4254	for (ISD::InputArg &Arg : CLI.Ins)
4255	InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
4256	}
4257
4258	return Chain;
4259	}
4260
4261	if (IsVarArg) {
4262	return lowerUnhandledCall(CLI, InVals,
4263	Reason: "unsupported call to variadic function ");
4264	}
4265
4266	if (!CLI.CB)
4267	return lowerUnhandledCall(CLI, InVals, Reason: "unsupported libcall legalization");
4268
4269	if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4270	return lowerUnhandledCall(CLI, InVals,
4271	Reason: "unsupported required tail call to function ");
4272	}
4273
4274	if (IsTailCall) {
4275	IsTailCall = isEligibleForTailCallOptimization(Callee, CalleeCC: CallConv, IsVarArg,
4276	Outs, OutVals, Ins, DAG);
4277	if (!IsTailCall &&
4278	((CLI.CB && CLI.CB->isMustTailCall()) \|\| IsChainCallConv)) {
4279	report_fatal_error(reason: "failed to perform tail call elimination on a call "
4280	"site marked musttail or on llvm.amdgcn.cs.chain");
4281	}
4282
4283	bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4284
4285	// A sibling call is one where we're under the usual C ABI and not planning
4286	// to change that but can still do a tail call:
4287	if (!TailCallOpt && IsTailCall)
4288	IsSibCall = true;
4289
4290	if (IsTailCall)
4291	++NumTailCalls;
4292	}
4293
4294	const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4295	SmallVector<std::pair<unsigned, SDValue>, `8`> RegsToPass;
4296	SmallVector<SDValue, `8`> MemOpChains;
4297
4298	// Analyze operands of the call, assigning locations to each operand.
4299	SmallVector<CCValAssign, `16`> ArgLocs;
4300	CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4301	CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg);
4302
4303	if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CC: CallConv) &&
4304	CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
4305	// With a fixed ABI, allocate fixed registers before user arguments.
4306	passSpecialInputs(CLI, CCInfo, Info: *Info, RegsToPass, MemOpChains, Chain);
4307	}
4308
4309	// Mark the scratch resource descriptor as allocated so the CC analysis
4310	// does not assign user arguments to these registers, matching the callee.
4311	if (!Subtarget->hasFlatScratchEnabled())
4312	CCInfo.AllocateReg(Reg: Info->getScratchRSrcReg());
4313
4314	CCInfo.AnalyzeCallOperands(Outs, Fn: AssignFn);
4315
4316	// Get a count of how many bytes are to be pushed on the stack.
4317	unsigned NumBytes = CCInfo.getStackSize();
4318
4319	if (IsSibCall) {
4320	// Since we're not changing the ABI to make this a tail call, the memory
4321	// operands are already available in the caller's incoming argument space.
4322	NumBytes = `0`;
4323	}
4324
4325	// FPDiff is the byte offset of the call's argument area from the callee's.
4326	// Stores to callee stack arguments will be placed in FixedStackSlots offset
4327	// by this amount for a tail call. In a sibling call it must be 0 because the
4328	// caller will deallocate the entire stack and the callee still expects its
4329	// arguments to begin at SP+0. Completely unused for non-tail calls.
4330	int32_t FPDiff = `0`;
4331	MachineFrameInfo &MFI = MF.getFrameInfo();
4332	auto *TRI = Subtarget->getRegisterInfo();
4333
4334	// Adjust the stack pointer for the new arguments...
4335	// These operations are automatically eliminated by the prolog/epilog pass
4336	if (!IsSibCall)
4337	Chain = DAG.getCALLSEQ_START(Chain, InSize: `0`, OutSize: `0`, DL);
4338
4339	if (!IsSibCall \|\| IsChainCallConv) {
4340	if (!Subtarget->hasFlatScratchEnabled()) {
4341	SmallVector<SDValue, `4`> CopyFromChains;
4342
4343	// In the HSA case, this should be an identity copy.
4344	SDValue ScratchRSrcReg =
4345	DAG.getCopyFromReg(Chain, dl: DL, Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);
4346	RegsToPass.emplace_back(Args: IsChainCallConv
4347	? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4348	: AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4349	Args&: ScratchRSrcReg);
4350	CopyFromChains.push_back(Elt: ScratchRSrcReg.getValue(R: `1`));
4351	Chain = DAG.getTokenFactor(DL, Vals&: CopyFromChains);
4352	}
4353	}
4354
4355	const unsigned NumSpecialInputs = RegsToPass.size();
4356
4357	MVT PtrVT = MVT::i32;
4358
4359	// Walk the register/memloc assignments, inserting copies/loads.
4360	for (unsigned i = `0`, e = ArgLocs.size(); i != e; ++i) {
4361	CCValAssign &VA = ArgLocs [i];
4362	SDValue Arg = OutVals [i];
4363
4364	// Promote the value if needed.
4365	switch (VA.getLocInfo()) {
4366	case CCValAssign::Full:
4367	break;
4368	case CCValAssign::BCvt:
4369	Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
4370	break;
4371	case CCValAssign::ZExt:
4372	Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
4373	break;
4374	case CCValAssign::SExt:
4375	Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
4376	break;
4377	case CCValAssign::AExt:
4378	Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
4379	break;
4380	case CCValAssign::FPExt:
4381	Arg = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
4382	break;
4383	default:
4384	llvm_unreachable("Unknown loc info!");
4385	}
4386
4387	if (VA.isRegLoc()) {
4388	RegsToPass.push_back(Elt: std::pair(VA.getLocReg(), Arg));
4389	} else {
4390	assert(VA.isMemLoc());
4391
4392	SDValue DstAddr;
4393	MachinePointerInfo DstInfo;
4394
4395	unsigned LocMemOffset = VA.getLocMemOffset();
4396	int32_t Offset = LocMemOffset;
4397
4398	SDValue PtrOff = DAG.getConstant(Val: Offset, DL, VT: PtrVT);
4399	MaybeAlign Alignment;
4400
4401	if (IsTailCall) {
4402	ISD::ArgFlagsTy Flags = Outs [i].Flags;
4403	unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4404	: VA.getValVT().getStoreSize();
4405
4406	// FIXME: We can have better than the minimum byval required alignment.
4407	Alignment =
4408	Flags.isByVal()
4409	? Flags.getNonZeroByValAlign()
4410	: commonAlignment(A: Subtarget->getStackAlignment(), Offset);
4411
4412	Offset = Offset + FPDiff;
4413	int FI = MFI.CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true);
4414
4415	DstAddr = DAG.getFrameIndex(FI, VT: PtrVT);
4416	DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4417
4418	// Make sure any stack arguments overlapping with where we're storing
4419	// are loaded before this eventual operation. Otherwise they'll be
4420	// clobbered.
4421
4422	// FIXME: Why is this really necessary? This seems to just result in a
4423	// lot of code to copy the stack and write them back to the same
4424	// locations, which are supposed to be immutable?
4425	Chain = addTokenForArgument(Chain, DAG, MFI, ClobberedFI: FI);
4426	} else {
4427	// Stores to the argument stack area are relative to the stack pointer.
4428	SDValue SP = DAG.getCopyFromReg(Chain, dl: DL, Reg: Info->getStackPtrOffsetReg(),
4429	VT: MVT::i32);
4430	DstAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SP, N2: PtrOff);
4431	DstInfo = MachinePointerInfo::getStack(MF, Offset: LocMemOffset);
4432	Alignment =
4433	commonAlignment(A: Subtarget->getStackAlignment(), Offset: LocMemOffset);
4434	}
4435
4436	if (Outs [i].Flags.isByVal()) {
4437	SDValue SizeNode =
4438	DAG.getConstant(Val: Outs [i].Flags.getByValSize(), DL, VT: MVT::i32);
4439	SDValue Cpy =
4440	DAG.getMemcpy(Chain, dl: DL, Dst: DstAddr, Src: Arg, Size: SizeNode,
4441	Alignment: Outs [i].Flags.getNonZeroByValAlign(),
4442	/isVol = / false, /AlwaysInline = / true,
4443	/CI=/nullptr, OverrideTailCall: std::nullopt, DstPtrInfo: DstInfo,
4444	SrcPtrInfo: MachinePointerInfo (AMDGPUAS::PRIVATE_ADDRESS));
4445
4446	MemOpChains.push_back(Elt: Cpy);
4447	} else {
4448	SDValue Store =
4449	DAG.getStore(Chain, dl: DL, Val: Arg, Ptr: DstAddr, PtrInfo: DstInfo, Alignment);
4450	MemOpChains.push_back(Elt: Store);
4451	}
4452	}
4453	}
4454
4455	if (!MemOpChains.empty())
4456	Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOpChains);
4457
4458	SDValue ReadFirstLaneID =
4459	DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
4460
4461	SDValue TokenGlue;
4462	if (CLI.ConvergenceControlToken) {
4463	TokenGlue = DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL, VT: MVT::Glue,
4464	Operand: CLI.ConvergenceControlToken);
4465	}
4466
4467	// Build a sequence of copy-to-reg nodes chained together with token chain
4468	// and flag operands which copy the outgoing args into the appropriate regs.
4469	SDValue InGlue;
4470
4471	unsigned ArgIdx = `0`;
4472	for (auto [Reg, Val] : RegsToPass) {
4473	if (ArgIdx++ >= NumSpecialInputs &&
4474	(IsChainCallConv \|\| !Val ->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4475	// For chain calls, the inreg arguments are required to be
4476	// uniform. Speculatively Insert a readfirstlane in case we cannot prove
4477	// they are uniform.
4478	//
4479	// For other calls, if an inreg arguments is known to be uniform,
4480	// speculatively insert a readfirstlane in case it is in a VGPR.
4481	//
4482	// FIXME: We need to execute this in a waterfall loop if it is a divergent
4483	// value, so let that continue to produce invalid code.
4484
4485	SmallVector<SDValue, `3`> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4486	if (TokenGlue)
4487	ReadfirstlaneArgs.push_back(Elt: TokenGlue);
4488	Val = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Val.getValueType(),
4489	Ops: ReadfirstlaneArgs);
4490	}
4491
4492	Chain = DAG.getCopyToReg(Chain, dl: DL, Reg, N: Val, Glue: InGlue);
4493	InGlue = Chain.getValue(R: `1`);
4494	}
4495
4496	// We don't usually want to end the call-sequence here because we would tidy
4497	// the frame up *after* the call, however in the ABI-changing tail-call case
4498	// we've carefully laid out the parameters so that when sp is reset they'll be
4499	// in the correct location.
4500	if (IsTailCall && !IsSibCall) {
4501	Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: `0`, Glue: InGlue, DL);
4502	InGlue = Chain.getValue(R: `1`);
4503	}
4504
4505	std::vector<SDValue> Ops({Chain});
4506
4507	// Add a redundant copy of the callee global which will not be legalized, as
4508	// we need direct access to the callee later.
4509	if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
4510	const GlobalValue *GV = GSD->getGlobal();
4511	Ops.push_back(x: Callee);
4512	Ops.push_back(x: DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i64));
4513	} else {
4514	if (IsTailCall) {
4515	// isEligibleForTailCallOptimization considered whether the call target is
4516	// divergent, but we may still end up with a uniform value in a VGPR.
4517	// Insert a readfirstlane just in case.
4518	SDValue ReadFirstLaneID =
4519	DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
4520
4521	SmallVector<SDValue, `3`> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4522	if (TokenGlue)
4523	ReadfirstlaneArgs.push_back(Elt: TokenGlue); // Wire up convergence token.
4524	Callee = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Callee.getValueType(),
4525	Ops: ReadfirstlaneArgs);
4526	}
4527
4528	Ops.push_back(x: Callee);
4529	Ops.push_back(x: DAG.getTargetConstant(Val: `0`, DL, VT: MVT::i64));
4530	}
4531
4532	if (IsTailCall) {
4533	// Each tail call may have to adjust the stack by a different amount, so
4534	// this information must travel along with the operation for eventual
4535	// consumption by emitEpilogue.
4536	Ops.push_back(x: DAG.getTargetConstant(Val: FPDiff, DL, VT: MVT::i32));
4537	}
4538
4539	if (IsChainCallConv)
4540	llvm::append_range(C&: Ops, R&: ChainCallSpecialArgs);
4541
4542	// Add argument registers to the end of the list so that they are known live
4543	// into the call.
4544	for (auto &[Reg, Val] : RegsToPass)
4545	Ops.push_back(x: DAG.getRegister(Reg, VT: Val.getValueType()));
4546
4547	// Add a register mask operand representing the call-preserved registers.
4548	const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4549	assert(Mask && "Missing call preserved mask for calling convention");
4550	Ops.push_back(x: DAG.getRegisterMask(RegMask: Mask));
4551
4552	if (SDValue Token = CLI.ConvergenceControlToken) {
4553	SmallVector<SDValue, `2`> GlueOps;
4554	GlueOps.push_back(Elt: Token);
4555	if (InGlue)
4556	GlueOps.push_back(Elt: InGlue);
4557
4558	InGlue = SDValue (DAG.getMachineNode(Opcode: TargetOpcode::CONVERGENCECTRL_GLUE, dl: DL,
4559	VT: MVT::Glue, Ops: GlueOps),
4560	`0`);
4561	}
4562
4563	if (InGlue)
4564	Ops.push_back(x: InGlue);
4565
4566	// If we're doing a tail call, use a TC_RETURN here rather than an
4567	// actual call instruction.
4568	if (IsTailCall) {
4569	MFI.setHasTailCall();
4570	unsigned OPC = AMDGPUISD::TC_RETURN;
4571	switch (CallConv) {
4572	case CallingConv::AMDGPU_Gfx:
4573	OPC = AMDGPUISD::TC_RETURN_GFX;
4574	break;
4575	case CallingConv::AMDGPU_CS_Chain:
4576	case CallingConv::AMDGPU_CS_ChainPreserve:
4577	OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4578	: AMDGPUISD::TC_RETURN_CHAIN;
4579	break;
4580	}
4581
4582	// If the caller is a whole wave function, we need to use a special opcode
4583	// so we can patch up EXEC.
4584	if (Info->isWholeWaveFunction())
4585	OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4586
4587	return DAG.getNode(Opcode: OPC, DL, VT: MVT::Other, Ops);
4588	}
4589
4590	// Returns a chain and a flag for retval copy to use.
4591	SDValue Call = DAG.getNode(Opcode: AMDGPUISD::CALL, DL, ResultTys: {MVT::Other, MVT::Glue}, Ops);
4592	Chain = Call.getValue(R: `0`);
4593	InGlue = Call.getValue(R: `1`);
4594
4595	uint64_t CalleePopBytes = NumBytes;
4596	Chain = DAG.getCALLSEQ_END(Chain, Size1: `0`, Size2: CalleePopBytes, Glue: InGlue, DL);
4597	if (!Ins.empty())
4598	InGlue = Chain.getValue(R: `1`);
4599
4600	// Handle result values, copying them out of physregs into vregs that we
4601	// return.
4602	return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4603	InVals, /IsThisReturn=/false, ThisVal: SDValue ());
4604	}
4605
4606	// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4607	// except for:
4608	// 1. Stack growth direction(default: downwards, AMDGPU: upwards), and
4609	// 2. Scale size where, scale = wave-reduction(alloca-size) wave-size*
4610	SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
4611	SelectionDAG &DAG) const {
4612	const MachineFunction &MF = DAG.getMachineFunction();
4613	const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4614
4615	SDLoc dl(Op);
4616	EVT VT = Op.getValueType();
4617	SDValue Chain = Op.getOperand(i: `0`);
4618	Register SPReg = Info->getStackPtrOffsetReg();
4619
4620	// Chain the dynamic stack allocation so that it doesn't modify the stack
4621	// pointer when other instructions are using the stack.
4622	Chain = DAG.getCALLSEQ_START(Chain, InSize: `0`, OutSize: `0`, DL: dl);
4623
4624	SDValue Size = Op.getOperand(i: `1`);
4625	SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, Reg: SPReg, VT);
4626	Align Alignment = cast<ConstantSDNode>(Val: Op.getOperand(i: `2`))->getAlignValue();
4627
4628	const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4629	assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
4630	"Stack grows upwards for AMDGPU");
4631
4632	Chain = BaseAddr.getValue(R: `1`);
4633	Align StackAlign = TFL->getStackAlign();
4634	if (Alignment > StackAlign) {
4635	uint64_t ScaledAlignment = Alignment.value()
4636	<< Subtarget->getWavefrontSizeLog2();
4637	uint64_t StackAlignMask = ScaledAlignment - `1`;
4638	SDValue TmpAddr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr,
4639	N2: DAG.getConstant(Val: StackAlignMask, DL: dl, VT));
4640	BaseAddr = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: TmpAddr,
4641	N2: DAG.getSignedConstant(Val: -ScaledAlignment, DL: dl, VT));
4642	}
4643
4644	assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4645	SDValue NewSP;
4646	if (isa<ConstantSDNode>(Val: Size)) {
4647	// For constant sized alloca, scale alloca size by wave-size
4648	SDValue ScaledSize = DAG.getNode(
4649	Opcode: ISD::SHL, DL: dl, VT, N1: Size,
4650	N2: DAG.getConstant(Val: Subtarget->getWavefrontSizeLog2(), DL: dl, VT: MVT::i32));
4651	NewSP = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr, N2: ScaledSize); // Value
4652	} else {
4653	// For dynamic sized alloca, perform wave-wide reduction to get max of
4654	// alloca size(divergent) and then scale it by wave-size
4655	SDValue WaveReduction =
4656	DAG.getTargetConstant(Val: Intrinsic::amdgcn_wave_reduce_umax, DL: dl, VT: MVT::i32);
4657	Size = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: MVT::i32, N1: WaveReduction,
4658	N2: Size, N3: DAG.getConstant(Val: `0`, DL: dl, VT: MVT::i32));
4659	SDValue ScaledSize = DAG.getNode(
4660	Opcode: ISD::SHL, DL: dl, VT, N1: Size,
4661	N2: DAG.getConstant(Val: Subtarget->getWavefrontSizeLog2(), DL: dl, VT: MVT::i32));
4662	NewSP =
4663	DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr, N2: ScaledSize); // Value in vgpr.
4664	SDValue ReadFirstLaneID =
4665	DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: dl, VT: MVT::i32);
4666	NewSP = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: MVT::i32, N1: ReadFirstLaneID,
4667	N2: NewSP);
4668	}
4669
4670	Chain = DAG.getCopyToReg(Chain, dl, Reg: SPReg, N: NewSP); // Output chain
4671	SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, Size1: `0`, Size2: `0`, Glue: SDValue (), DL: dl);
4672
4673	return DAG.getMergeValues(Ops: {BaseAddr, CallSeqEnd}, dl);
4674	}
4675
4676	SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4677	if (Op.getValueType() != MVT::i32)
4678	return Op; // Defer to cannot select error.
4679
4680	Register SP = getStackPointerRegisterToSaveRestore();
4681	SDLoc SL(Op);
4682
4683	SDValue CopyFromSP = DAG.getCopyFromReg(Chain: Op ->getOperand(Num: `0`), dl: SL, Reg: SP, VT: MVT::i32);
4684
4685	// Convert from wave uniform to swizzled vector address. This should protect
4686	// from any edge cases where the stacksave result isn't directly used with
4687	// stackrestore.
4688	SDValue VectorAddress =
4689	DAG.getNode(Opcode: AMDGPUISD::WAVE_ADDRESS, DL: SL, VT: MVT::i32, Operand: CopyFromSP);
4690	return DAG.getMergeValues(Ops: {VectorAddress, CopyFromSP.getValue(R: `1`)}, dl: SL);
4691	}
4692
4693	SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4694	SelectionDAG &DAG) const {
4695	SDLoc SL(Op);
4696	assert(Op.getValueType() == MVT::i32);
4697
4698	uint32_t BothRoundHwReg =
4699	AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: `0`, Values: `4`);
4700	SDValue GetRoundBothImm = DAG.getTargetConstant(Val: BothRoundHwReg, DL: SL, VT: MVT::i32);
4701
4702	SDValue IntrinID =
4703	DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_getreg, DL: SL, VT: MVT::i32);
4704	SDValue GetReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList: Op ->getVTList(),
4705	N1: Op.getOperand(i: `0`), N2: IntrinID, N3: GetRoundBothImm);
4706
4707	// There are two rounding modes, one for f32 and one for f64/f16. We only
4708	// report in the standard value range if both are the same.
4709	//
4710	// The raw values also differ from the expected FLT_ROUNDS values. Nearest
4711	// ties away from zero is not supported, and the other values are rotated by
4712	// 1.
4713	//
4714	// If the two rounding modes are not the same, report a target defined value.
4715
4716	// Mode register rounding mode fields:
4717	//
4718	// [1:0] Single-precision round mode.
4719	// [3:2] Double/Half-precision round mode.
4720	//
4721	// 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4722	//
4723	// Hardware Spec
4724	// Toward-0 3 0
4725	// Nearest Even 0 1
4726	// +Inf 1 2
4727	// -Inf 2 3
4728	// NearestAway0 N/A 4
4729	//
4730	// We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4731	// table we can index by the raw hardware mode.
4732	//
4733	// (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4734
4735	SDValue BitTable =
4736	DAG.getConstant(Val: AMDGPU::FltRoundConversionTable, DL: SL, VT: MVT::i64);
4737
4738	SDValue Two = DAG.getConstant(Val: `2`, DL: SL, VT: MVT::i32);
4739	SDValue RoundModeTimesNumBits =
4740	DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: GetReg, N2: Two);
4741
4742	// TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4743	// knew only one mode was demanded.
4744	SDValue TableValue =
4745	DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i64, N1: BitTable, N2: RoundModeTimesNumBits);
4746	SDValue TruncTable = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: TableValue);
4747
4748	SDValue EntryMask = DAG.getConstant(Val: `0xf`, DL: SL, VT: MVT::i32);
4749	SDValue TableEntry =
4750	DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: TruncTable, N2: EntryMask);
4751
4752	// There's a gap in the 4-bit encoded table and actual enum values, so offset
4753	// if it's an extended value.
4754	SDValue Four = DAG.getConstant(Val: `4`, DL: SL, VT: MVT::i32);
4755	SDValue IsStandardValue =
4756	DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: TableEntry, RHS: Four, Cond: ISD::SETULT);
4757	SDValue EnumOffset = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: TableEntry, N2: Four);
4758	SDValue Result = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i32, N1: IsStandardValue,
4759	N2: TableEntry, N3: EnumOffset);
4760
4761	return DAG.getMergeValues(Ops: {Result, GetReg.getValue(R: `1`)}, dl: SL);
4762	}
4763
4764	SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4765	SelectionDAG &DAG) const {
4766	SDLoc SL(Op);
4767
4768	SDValue NewMode = Op.getOperand(i: `1`);
4769	assert(NewMode.getValueType() == MVT::i32);
4770
4771	// Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4772	// hardware MODE.fp_round values.
4773	if (auto *ConstMode = dyn_cast<ConstantSDNode>(Val&: NewMode)) {
4774	uint32_t ClampedVal = std::min(
4775	a: static_cast<uint32_t>(ConstMode->getZExtValue()),
4776	b: static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
4777	NewMode = DAG.getConstant(
4778	Val: AMDGPU::decodeFltRoundToHWConversionTable(FltRounds: ClampedVal), DL: SL, VT: MVT::i32);
4779	} else {
4780	// If we know the input can only be one of the supported standard modes in
4781	// the range 0-3, we can use a simplified mapping to hardware values.
4782	KnownBits KB = DAG.computeKnownBits(Op: NewMode);
4783	const bool UseReducedTable = KB.countMinLeadingZeros() >= `30`;
4784	// The supported standard values are 0-3. The extended values start at 8. We
4785	// need to offset by 4 if the value is in the extended range.
4786
4787	if (UseReducedTable) {
4788	// Truncate to the low 32-bits.
4789	SDValue BitTable = DAG.getConstant(
4790	Val: AMDGPU::FltRoundToHWConversionTable & `0xffff`, DL: SL, VT: MVT::i32);
4791
4792	SDValue Two = DAG.getConstant(Val: `2`, DL: SL, VT: MVT::i32);
4793	SDValue RoundModeTimesNumBits =
4794	DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: NewMode, N2: Two);
4795
4796	NewMode =
4797	DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: BitTable, N2: RoundModeTimesNumBits);
4798
4799	// TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4800	// the table extracted bits into inline immediates.
4801	} else {
4802	// table_index = umin(value, value - 4)
4803	// MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4804	SDValue BitTable =
4805	DAG.getConstant(Val: AMDGPU::FltRoundToHWConversionTable, DL: SL, VT: MVT::i64);
4806
4807	SDValue Four = DAG.getConstant(Val: `4`, DL: SL, VT: MVT::i32);
4808	SDValue OffsetEnum = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: NewMode, N2: Four);
4809	SDValue IndexVal =
4810	DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewMode, N2: OffsetEnum);
4811
4812	SDValue Two = DAG.getConstant(Val: `2`, DL: SL, VT: MVT::i32);
4813	SDValue RoundModeTimesNumBits =
4814	DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: IndexVal, N2: Two);
4815
4816	SDValue TableValue =
4817	DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i64, N1: BitTable, N2: RoundModeTimesNumBits);
4818	SDValue TruncTable = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: TableValue);
4819
4820	// No need to mask out the high bits since the setreg will ignore them
4821	// anyway.
4822	NewMode = TruncTable;
4823	}
4824
4825	// Insert a readfirstlane in case the value is a VGPR. We could do this
4826	// earlier and keep more operations scalar, but that interferes with
4827	// combining the source.
4828	SDValue ReadFirstLaneID =
4829	DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: SL, VT: MVT::i32);
4830	NewMode = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
4831	N1: ReadFirstLaneID, N2: NewMode);
4832	}
4833
4834	// N.B. The setreg will be later folded into s_round_mode on supported
4835	// targets.
4836	SDValue IntrinID =
4837	DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_setreg, DL: SL, VT: MVT::i32);
4838	uint32_t BothRoundHwReg =
4839	AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: `0`, Values: `4`);
4840	SDValue RoundBothImm = DAG.getTargetConstant(Val: BothRoundHwReg, DL: SL, VT: MVT::i32);
4841
4842	SDValue SetReg =
4843	DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VTList: Op ->getVTList(), N1: Op.getOperand(i: `0`),
4844	N2: IntrinID, N3: RoundBothImm, N4: NewMode);
4845
4846	return SetReg;
4847	}
4848
4849	SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4850	if (Op ->isDivergent() &&
4851	(!Subtarget->hasVmemPrefInsts() \|\| !Op.getConstantOperandVal(i: `4`)))
4852	// Cannot do I$ prefetch with divergent pointer.
4853	return SDValue ();
4854
4855	switch (cast<MemSDNode>(Val&: Op)->getAddressSpace()) {
4856	case AMDGPUAS::FLAT_ADDRESS:
4857	case AMDGPUAS::GLOBAL_ADDRESS:
4858	case AMDGPUAS::CONSTANT_ADDRESS:
4859	break;
4860	case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4861	if (Subtarget->hasSafeSmemPrefetch())
4862	break;
4863	[[fallthrough]];
4864	default:
4865	return SDValue ();
4866	}
4867
4868	// I$ prefetch
4869	if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(i: `4`))
4870	return SDValue ();
4871
4872	return Op;
4873	}
4874
4875	// Work around DAG legality rules only based on the result type.
4876	SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4877	bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4878	SDValue Src = Op.getOperand(i: IsStrict ? `1` : `0`);
4879	EVT SrcVT = Src.getValueType();
4880
4881	if (SrcVT.getScalarType() != MVT::bf16)
4882	return Op;
4883
4884	SDLoc SL(Op);
4885	SDValue BitCast =
4886	DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: SrcVT.changeTypeToInteger(), Operand: Src);
4887
4888	EVT DstVT = Op.getValueType();
4889	if (IsStrict)
4890	llvm_unreachable("Need STRICT_BF16_TO_FP");
4891
4892	return DAG.getNode(Opcode: ISD::BF16_TO_FP, DL: SL, VT: DstVT, Operand: BitCast);
4893	}
4894
4895	SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4896	SDLoc SL(Op);
4897	if (Op.getValueType() != MVT::i64)
4898	return Op;
4899
4900	uint32_t ModeHwReg =
4901	AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: `0`, Values: `23`);
4902	SDValue ModeHwRegImm = DAG.getTargetConstant(Val: ModeHwReg, DL: SL, VT: MVT::i32);
4903	uint32_t TrapHwReg =
4904	AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: `0`, Values: `5`);
4905	SDValue TrapHwRegImm = DAG.getTargetConstant(Val: TrapHwReg, DL: SL, VT: MVT::i32);
4906
4907	SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other);
4908	SDValue IntrinID =
4909	DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_getreg, DL: SL, VT: MVT::i32);
4910	SDValue GetModeReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList,
4911	N1: Op.getOperand(i: `0`), N2: IntrinID, N3: ModeHwRegImm);
4912	SDValue GetTrapReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList,
4913	N1: Op.getOperand(i: `0`), N2: IntrinID, N3: TrapHwRegImm);
4914	SDValue TokenReg =
4915	DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: GetModeReg.getValue(R: `1`),
4916	N2: GetTrapReg.getValue(R: `1`));
4917
4918	SDValue CvtPtr =
4919	DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: GetModeReg, N2: GetTrapReg);
4920	SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);
4921
4922	return DAG.getMergeValues(Ops: {Result, TokenReg}, dl: SL);
4923	}
4924
4925	SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4926	SDLoc SL(Op);
4927	if (Op.getOperand(i: `1`).getValueType() != MVT::i64)
4928	return Op;
4929
4930	SDValue Input = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op.getOperand(i: `1`));
4931	SDValue NewModeReg = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Input,
4932	N2: DAG.getConstant(Val: `0`, DL: SL, VT: MVT::i32));
4933	SDValue NewTrapReg = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Input,
4934	N2: DAG.getConstant(Val: `1`, DL: SL, VT: MVT::i32));
4935
4936	SDValue ReadFirstLaneID =
4937	DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: SL, VT: MVT::i32);
4938	NewModeReg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
4939	N1: ReadFirstLaneID, N2: NewModeReg);
4940	NewTrapReg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
4941	N1: ReadFirstLaneID, N2: NewTrapReg);
4942
4943	unsigned ModeHwReg =
4944	AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: `0`, Values: `23`);
4945	SDValue ModeHwRegImm = DAG.getTargetConstant(Val: ModeHwReg, DL: SL, VT: MVT::i32);
4946	unsigned TrapHwReg =
4947	AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: `0`, Values: `5`);
4948	SDValue TrapHwRegImm = DAG.getTargetConstant(Val: TrapHwReg, DL: SL, VT: MVT::i32);
4949
4950	SDValue IntrinID =
4951	DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_setreg, DL: SL, VT: MVT::i32);
4952	SDValue SetModeReg =
4953	DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VT: MVT::Other, N1: Op.getOperand(i: `0`),
4954	N2: IntrinID, N3: ModeHwRegImm, N4: NewModeReg);
4955	SDValue SetTrapReg =
4956	DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VT: MVT::Other, N1: Op.getOperand(i: `0`),
4957	N2: IntrinID, N3: TrapHwRegImm, N4: NewTrapReg);
4958	return DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: SetTrapReg, N2: SetModeReg);
4959	}
4960
4961	Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4962	const MachineFunction &MF) const {
4963	const Function &Fn = MF.getFunction();
4964
4965	Register Reg = StringSwitch<Register>(RegName)
4966	.Case(S: "m0", Value: AMDGPU::M0)
4967	.Case(S: "exec", Value: AMDGPU::EXEC)
4968	.Case(S: "exec_lo", Value: AMDGPU::EXEC_LO)
4969	.Case(S: "exec_hi", Value: AMDGPU::EXEC_HI)
4970	.Case(S: "flat_scratch", Value: AMDGPU::FLAT_SCR)
4971	.Case(S: "flat_scratch_lo", Value: AMDGPU::FLAT_SCR_LO)
4972	.Case(S: "flat_scratch_hi", Value: AMDGPU::FLAT_SCR_HI)
4973	.Default(Value: Register ());
4974	if (!Reg)
4975	return Reg;
4976
4977	if (!Subtarget->hasFlatScrRegister() &&
4978	Subtarget->getRegisterInfo()->regsOverlap(RegA: Reg, RegB: AMDGPU::FLAT_SCR)) {
4979	Fn.getContext().emitError(ErrorStr: Twine("invalid register \"" + StringRef (RegName) +
4980	"\" for subtarget."));
4981	}
4982
4983	switch (Reg) {
4984	case AMDGPU::M0:
4985	case AMDGPU::EXEC_LO:
4986	case AMDGPU::EXEC_HI:
4987	case AMDGPU::FLAT_SCR_LO:
4988	case AMDGPU::FLAT_SCR_HI:
4989	if (VT.getSizeInBits() == `32`)
4990	return Reg;
4991	break;
4992	case AMDGPU::EXEC:
4993	case AMDGPU::FLAT_SCR:
4994	if (VT.getSizeInBits() == `64`)
4995	return Reg;
4996	break;
4997	default:
4998	llvm_unreachable("missing register type checking");
4999	}
5000
5001	report_fatal_error(
5002	reason: Twine("invalid type for register \"" + StringRef (RegName) + "\"."));
5003	}
5004
5005	// If kill is not the last instruction, split the block so kill is always a
5006	// proper terminator.
5007	MachineBasicBlock *
5008	SITargetLowering::splitKillBlock(MachineInstr &MI,
5009	MachineBasicBlock BB) const* {
5010	MachineBasicBlock SplitBB = BB->splitAt(SplitInst&: MI, /UpdateLiveIns=/*true);
5011	const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5012	MI.setDesc(TII->getKillTerminatorFromPseudo(Opcode: MI.getOpcode()));
5013	return SplitBB;
5014	}
5015
5016	// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
5017	// \p MI will be the only instruction in the loop body block. Otherwise, it will
5018	// be the first instruction in the remainder block.
5019	//
5020	/// \returns { LoopBody, Remainder }
5021	static std::pair<MachineBasicBlock , MachineBasicBlock >
5022	splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
5023	MachineFunction *MF = MBB.getParent();
5024	MachineBasicBlock::iterator I(&MI);
5025
5026	// To insert the loop we need to split the block. Move everything after this
5027	// point to a new block, and insert a new empty block between the two.
5028	MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
5029	MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
5030	MachineFunction::iterator MBBI(MBB);
5031	++MBBI;
5032
5033	MF->insert(MBBI, MBB: LoopBB);
5034	MF->insert(MBBI, MBB: RemainderBB);
5035
5036	LoopBB->addSuccessor(Succ: LoopBB);
5037	LoopBB->addSuccessor(Succ: RemainderBB);
5038
5039	// Move the rest of the block into a new block.
5040	RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
5041
5042	if (InstInLoop) {
5043	auto Next = std::next(x: I);
5044
5045	// Move instruction to loop body.
5046	LoopBB->splice(Where: LoopBB->begin(), Other: &MBB, From: I, To: Next);
5047
5048	// Move the rest of the block.
5049	RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: Next, To: MBB.end());
5050	} else {
5051	RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: I, To: MBB.end());
5052	}
5053
5054	MBB.addSuccessor(Succ: LoopBB);
5055
5056	return std::pair(LoopBB, RemainderBB);
5057	}
5058
5059	/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
5060	void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
5061	MachineBasicBlock *MBB = MI.getParent();
5062	const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5063	auto I = MI.getIterator();
5064	auto E = std::next(x: I);
5065
5066	// clang-format off
5067	BuildMI(BB&: *MBB, I: E, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_WAITCNT))
5068	.addImm(Val: `0`);
5069	// clang-format on
5070
5071	MIBundleBuilder Bundler(*MBB, I, E);
5072	finalizeBundle(MBB&: *MBB, FirstMI: Bundler.begin());
5073	}
5074
5075	MachineBasicBlock *
5076	SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
5077	MachineBasicBlock BB) const* {
5078	const DebugLoc &DL = MI.getDebugLoc();
5079
5080	MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5081
5082	const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5083
5084	// Apparently kill flags are only valid if the def is in the same block?
5085	if (MachineOperand *Src = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::data0))
5086	Src->setIsKill(false);
5087
5088	auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB&: BB, InstInLoop: true*);
5089
5090	MachineBasicBlock::iterator I = LoopBB->end();
5091
5092	const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
5093	Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: AMDGPU::Hwreg::OFFSET_MEM_VIOL, Values: `1`);
5094
5095	// Clear TRAP_STS.MEM_VIOL
5096	BuildMI(BB&: *LoopBB, I: LoopBB->begin(), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_IMM32_B32))
5097	.addImm(Val: `0`)
5098	.addImm(Val: EncodedReg);
5099
5100	bundleInstWithWaitcnt(MI);
5101
5102	Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5103
5104	// Load and check TRAP_STS.MEM_VIOL
5105	BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: Reg)
5106	.addImm(Val: EncodedReg);
5107
5108	// FIXME: Do we need to use an isel pseudo that may clobber scc?
5109	BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
5110	.addReg(RegNo: Reg, Flags: RegState::Kill)
5111	.addImm(Val: `0`);
5112	// clang-format off
5113	BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
5114	.addMBB(MBB: LoopBB);
5115	// clang-format on
5116
5117	return RemainderBB;
5118	}
5119
5120	// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5121	// wavefront. If the value is uniform and just happens to be in a VGPR, this
5122	// will only do one iteration. In the worst case, this will loop 64 times.
5123	//
5124	// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
5125	static MachineBasicBlock::iterator
5126	emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
5127	MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5128	const DebugLoc &DL, const MachineOperand &Idx,
5129	unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5130	unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5131	Register &SGPRIdxReg) {
5132
5133	MachineFunction *MF = OrigBB.getParent();
5134	const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5135	const SIRegisterInfo *TRI = ST.getRegisterInfo();
5136	const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
5137	MachineBasicBlock::iterator I = LoopBB.begin();
5138
5139	const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5140	Register PhiExec = MRI.createVirtualRegister(RegClass: BoolRC);
5141	Register NewExec = MRI.createVirtualRegister(RegClass: BoolRC);
5142	Register CurrentIdxReg =
5143	MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5144	Register CondReg = MRI.createVirtualRegister(RegClass: BoolRC);
5145
5146	BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiReg)
5147	.addReg(RegNo: InitReg)
5148	.addMBB(MBB: &OrigBB)
5149	.addReg(RegNo: ResultReg)
5150	.addMBB(MBB: &LoopBB);
5151
5152	BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiExec)
5153	.addReg(RegNo: InitSaveExecReg)
5154	.addMBB(MBB: &OrigBB)
5155	.addReg(RegNo: NewExec)
5156	.addMBB(MBB: &LoopBB);
5157
5158	// Read the next variant <- also loop target.
5159	BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurrentIdxReg)
5160	.addReg(RegNo: Idx.getReg(), Flags: getUndefRegState(B: Idx.isUndef()));
5161
5162	// Compare the just read M0 value to all possible Idx values.
5163	BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CMP_EQ_U32_e64), DestReg: CondReg)
5164	.addReg(RegNo: CurrentIdxReg)
5165	.addReg(RegNo: Idx.getReg(), Flags: {}, SubReg: Idx.getSubReg());
5166
5167	// Update EXEC, save the original EXEC value to VCC.
5168	BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: LMC.AndSaveExecOpc), DestReg: NewExec)
5169	.addReg(RegNo: CondReg, Flags: RegState::Kill);
5170
5171	MRI.setSimpleHint(VReg: NewExec, PrefReg: CondReg);
5172
5173	if (UseGPRIdxMode) {
5174	if (Offset == `0`) {
5175	SGPRIdxReg = CurrentIdxReg;
5176	} else {
5177	SGPRIdxReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
5178	BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: SGPRIdxReg)
5179	.addReg(RegNo: CurrentIdxReg, Flags: RegState::Kill)
5180	.addImm(Val: Offset);
5181	}
5182	} else {
5183	// Move index from VCC into M0
5184	if (Offset == `0`) {
5185	BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
5186	.addReg(RegNo: CurrentIdxReg, Flags: RegState::Kill);
5187	} else {
5188	BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: AMDGPU::M0)
5189	.addReg(RegNo: CurrentIdxReg, Flags: RegState::Kill)
5190	.addImm(Val: Offset);
5191	}
5192	}
5193
5194	// Update EXEC, switch all done bits to 0 and all todo bits to 1.
5195	MachineInstr *InsertPt =
5196	BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: LMC.XorTermOpc), DestReg: LMC.ExecReg)
5197	.addReg(RegNo: LMC.ExecReg)
5198	.addReg(RegNo: NewExec);
5199
5200	// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5201	// s_cbranch_scc0?
5202
5203	// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5204	// clang-format off
5205	BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
5206	.addMBB(MBB: &LoopBB);
5207	// clang-format on
5208
5209	return InsertPt->getIterator();
5210	}
5211
5212	// This has slightly sub-optimal regalloc when the source vector is killed by
5213	// the read. The register allocator does not understand that the kill is
5214	// per-workitem, so is kept alive for the whole loop so we end up not re-using a
5215	// subregister from it, using 1 more VGPR than necessary. This was saved when
5216	// this was expanded after register allocation.
5217	static MachineBasicBlock::iterator
5218	loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
5219	unsigned InitResultReg, unsigned PhiReg, int Offset,
5220	bool UseGPRIdxMode, Register &SGPRIdxReg) {
5221	MachineFunction *MF = MBB.getParent();
5222	const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5223	const SIRegisterInfo *TRI = ST.getRegisterInfo();
5224	MachineRegisterInfo &MRI = MF->getRegInfo();
5225	const DebugLoc &DL = MI.getDebugLoc();
5226	MachineBasicBlock::iterator I(&MI);
5227
5228	const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5229	Register DstReg = MI.getOperand(i: `0`).getReg();
5230	Register SaveExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
5231	Register TmpExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
5232	const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
5233
5234	BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: TmpExec);
5235
5236	// Save the EXEC mask
5237	// clang-format off
5238	BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: SaveExec)
5239	.addReg(RegNo: LMC.ExecReg);
5240	// clang-format on
5241
5242	auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, InstInLoop: false);
5243
5244	const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
5245
5246	auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, OrigBB&: MBB, LoopBB&: LoopBB, DL, Idx: Idx,
5247	InitReg: InitResultReg, ResultReg: DstReg, PhiReg, InitSaveExecReg: TmpExec,
5248	Offset, UseGPRIdxMode, SGPRIdxReg);
5249
5250	MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5251	MachineFunction::iterator MBBI(LoopBB);
5252	++MBBI;
5253	MF->insert(MBBI, MBB: LandingPad);
5254	LoopBB->removeSuccessor(Succ: RemainderBB);
5255	LandingPad->addSuccessor(Succ: RemainderBB);
5256	LoopBB->addSuccessor(Succ: LandingPad);
5257	MachineBasicBlock::iterator First = LandingPad->begin();
5258	// clang-format off
5259	BuildMI(BB&: *LandingPad, I: First, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg)
5260	.addReg(RegNo: SaveExec);
5261	// clang-format on
5262
5263	return InsPt;
5264	}
5265
5266	// Returns subreg index, offset
5267	static std::pair<unsigned, int>
5268	computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
5269	const TargetRegisterClass SuperRC, unsigned* VecReg,
5270	int Offset) {
5271	int NumElts = TRI.getRegSizeInBits(RC: *SuperRC) / `32`;
5272
5273	// Skip out of bounds offsets, or else we would end up using an undefined
5274	// register.
5275	if (Offset >= NumElts \|\| Offset < `0`)
5276	return std::pair(AMDGPU::sub0, Offset);
5277
5278	return std::pair(SIRegisterInfo::getSubRegFromChannel(Channel: Offset), `0`);
5279	}
5280
5281	static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
5282	MachineRegisterInfo &MRI, MachineInstr &MI,
5283	int Offset) {
5284	MachineBasicBlock *MBB = MI.getParent();
5285	const DebugLoc &DL = MI.getDebugLoc();
5286	MachineBasicBlock::iterator I(&MI);
5287
5288	const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
5289
5290	assert(Idx->getReg() != AMDGPU::NoRegister);
5291
5292	if (Offset == `0`) {
5293	// clang-format off
5294	BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
5295	.add(MO: *Idx);
5296	// clang-format on
5297	} else {
5298	BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: AMDGPU::M0)
5299	.add(MO: *Idx)
5300	.addImm(Val: Offset);
5301	}
5302	}
5303
5304	static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
5305	MachineRegisterInfo &MRI, MachineInstr &MI,
5306	int Offset) {
5307	MachineBasicBlock *MBB = MI.getParent();
5308	const DebugLoc &DL = MI.getDebugLoc();
5309	MachineBasicBlock::iterator I(&MI);
5310
5311	const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
5312
5313	if (Offset == `0`)
5314	return Idx->getReg();
5315
5316	Register Tmp = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5317	BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: Tmp)
5318	.add(MO: *Idx)
5319	.addImm(Val: Offset);
5320	return Tmp;
5321	}
5322
5323	static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
5324	MachineBasicBlock &MBB,
5325	const GCNSubtarget &ST) {
5326	const SIInstrInfo *TII = ST.getInstrInfo();
5327	const SIRegisterInfo &TRI = TII->getRegisterInfo();
5328	MachineFunction *MF = MBB.getParent();
5329	MachineRegisterInfo &MRI = MF->getRegInfo();
5330
5331	Register Dst = MI.getOperand(i: `0`).getReg();
5332	const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
5333	Register SrcReg = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src)->getReg();
5334	int Offset = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm();
5335
5336	const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcReg);
5337	const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg());
5338
5339	unsigned SubReg;
5340	std::tie(args&: SubReg, args&: Offset) =
5341	computeIndirectRegAndOffset(TRI, SuperRC: VecRC, VecReg: SrcReg, Offset);
5342
5343	const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5344
5345	// Check for a SGPR index.
5346	if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) {
5347	MachineBasicBlock::iterator I(&MI);
5348	const DebugLoc &DL = MI.getDebugLoc();
5349
5350	if (UseGPRIdxMode) {
5351	// TODO: Look at the uses to avoid the copy. This may require rescheduling
5352	// to avoid interfering with other uses, so probably requires a new
5353	// optimization pass.
5354	Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5355
5356	const MCInstrDesc &GPRIDXDesc =
5357	TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: VecRC), IsIndirectSrc: true*);
5358	BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
5359	.addReg(RegNo: SrcReg)
5360	.addReg(RegNo: Idx)
5361	.addImm(Val: SubReg);
5362	} else {
5363	setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5364
5365	BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: Dst)
5366	.addReg(RegNo: SrcReg, Flags: {}, SubReg)
5367	.addReg(RegNo: SrcReg, Flags: RegState::Implicit);
5368	}
5369
5370	MI.eraseFromParent();
5371
5372	return &MBB;
5373	}
5374
5375	// Control flow needs to be inserted if indexing with a VGPR.
5376	const DebugLoc &DL = MI.getDebugLoc();
5377	MachineBasicBlock::iterator I(&MI);
5378
5379	Register PhiReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5380	Register InitReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5381
5382	BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: InitReg);
5383
5384	Register SGPRIdxReg;
5385	auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: InitReg, PhiReg, Offset,
5386	UseGPRIdxMode, SGPRIdxReg);
5387
5388	MachineBasicBlock *LoopBB = InsPt ->getParent();
5389
5390	if (UseGPRIdxMode) {
5391	const MCInstrDesc &GPRIDXDesc =
5392	TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: VecRC), IsIndirectSrc: true*);
5393
5394	BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
5395	.addReg(RegNo: SrcReg)
5396	.addReg(RegNo: SGPRIdxReg)
5397	.addImm(Val: SubReg);
5398	} else {
5399	BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: Dst)
5400	.addReg(RegNo: SrcReg, Flags: {}, SubReg)
5401	.addReg(RegNo: SrcReg, Flags: RegState::Implicit);
5402	}
5403
5404	MI.eraseFromParent();
5405
5406	return LoopBB;
5407	}
5408
5409	static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
5410	MachineBasicBlock &MBB,
5411	const GCNSubtarget &ST) {
5412	const SIInstrInfo *TII = ST.getInstrInfo();
5413	const SIRegisterInfo &TRI = TII->getRegisterInfo();
5414	MachineFunction *MF = MBB.getParent();
5415	MachineRegisterInfo &MRI = MF->getRegInfo();
5416
5417	Register Dst = MI.getOperand(i: `0`).getReg();
5418	const MachineOperand *SrcVec = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src);
5419	const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
5420	const MachineOperand *Val = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::val);
5421	int Offset = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm();
5422	const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcVec->getReg());
5423	const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg());
5424
5425	// This can be an immediate, but will be folded later.
5426	assert(Val->getReg());
5427
5428	unsigned SubReg;
5429	std::tie(args&: SubReg, args&: Offset) =
5430	computeIndirectRegAndOffset(TRI, SuperRC: VecRC, VecReg: SrcVec->getReg(), Offset);
5431	const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5432
5433	if (Idx->getReg() == AMDGPU::NoRegister) {
5434	MachineBasicBlock::iterator I(&MI);
5435	const DebugLoc &DL = MI.getDebugLoc();
5436
5437	assert(Offset == `0`);
5438
5439	BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: Dst)
5440	.add(MO: *SrcVec)
5441	.add(MO: *Val)
5442	.addImm(Val: SubReg);
5443
5444	MI.eraseFromParent();
5445	return &MBB;
5446	}
5447
5448	// Check for a SGPR index.
5449	if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) {
5450	MachineBasicBlock::iterator I(&MI);
5451	const DebugLoc &DL = MI.getDebugLoc();
5452
5453	if (UseGPRIdxMode) {
5454	Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5455
5456	const MCInstrDesc &GPRIDXDesc =
5457	TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: VecRC), IsIndirectSrc: false*);
5458	BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
5459	.addReg(RegNo: SrcVec->getReg())
5460	.add(MO: *Val)
5461	.addReg(RegNo: Idx)
5462	.addImm(Val: SubReg);
5463	} else {
5464	setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5465
5466	const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5467	VecSize: TRI.getRegSizeInBits(RC: VecRC), EltSize: `32`, IsSGPR: false*);
5468	BuildMI(BB&: MBB, I, MIMD: DL, MCID: MovRelDesc, DestReg: Dst)
5469	.addReg(RegNo: SrcVec->getReg())
5470	.add(MO: *Val)
5471	.addImm(Val: SubReg);
5472	}
5473	MI.eraseFromParent();
5474	return &MBB;
5475	}
5476
5477	// Control flow needs to be inserted if indexing with a VGPR.
5478	if (Val->isReg())
5479	MRI.clearKillFlags(Reg: Val->getReg());
5480
5481	const DebugLoc &DL = MI.getDebugLoc();
5482
5483	Register PhiReg = MRI.createVirtualRegister(RegClass: VecRC);
5484
5485	Register SGPRIdxReg;
5486	auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: SrcVec->getReg(), PhiReg, Offset,
5487	UseGPRIdxMode, SGPRIdxReg);
5488	MachineBasicBlock *LoopBB = InsPt ->getParent();
5489
5490	if (UseGPRIdxMode) {
5491	const MCInstrDesc &GPRIDXDesc =
5492	TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: VecRC), IsIndirectSrc: false*);
5493
5494	BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
5495	.addReg(RegNo: PhiReg)
5496	.add(MO: *Val)
5497	.addReg(RegNo: SGPRIdxReg)
5498	.addImm(Val: SubReg);
5499	} else {
5500	const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5501	VecSize: TRI.getRegSizeInBits(RC: VecRC), EltSize: `32`, IsSGPR: false*);
5502	BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: MovRelDesc, DestReg: Dst)
5503	.addReg(RegNo: PhiReg)
5504	.add(MO: *Val)
5505	.addImm(Val: SubReg);
5506	}
5507
5508	MI.eraseFromParent();
5509	return LoopBB;
5510	}
5511
5512	static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
5513	MachineBasicBlock *BB) {
5514	// For targets older than GFX12, we emit a sequence of 32-bit operations.
5515	// For GFX12, we emit s_add_u64 and s_sub_u64.
5516	MachineFunction *MF = BB->getParent();
5517	const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5518	const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5519	MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5520	const DebugLoc &DL = MI.getDebugLoc();
5521	MachineOperand &Dest = MI.getOperand(i: `0`);
5522	MachineOperand &Src0 = MI.getOperand(i: `1`);
5523	MachineOperand &Src1 = MI.getOperand(i: `2`);
5524	bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5525	if (ST.hasScalarAddSub64()) {
5526	unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5527	// clang-format off
5528	BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest.getReg())
5529	.add(MO: Src0)
5530	.add(MO: Src1);
5531	// clang-format on
5532	} else {
5533	const SIRegisterInfo *TRI = ST.getRegisterInfo();
5534	const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5535
5536	Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5537	Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5538
5539	MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5540	MI, MRI, SuperReg: Src0, SuperRC: BoolRC, SubIdx: AMDGPU::sub0, SubRC: &AMDGPU::SReg_32RegClass);
5541	MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5542	MI, MRI, SuperReg: Src0, SuperRC: BoolRC, SubIdx: AMDGPU::sub1, SubRC: &AMDGPU::SReg_32RegClass);
5543
5544	MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5545	MI, MRI, SuperReg: Src1, SuperRC: BoolRC, SubIdx: AMDGPU::sub0, SubRC: &AMDGPU::SReg_32RegClass);
5546	MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5547	MI, MRI, SuperReg: Src1, SuperRC: BoolRC, SubIdx: AMDGPU::sub1, SubRC: &AMDGPU::SReg_32RegClass);
5548
5549	unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5550	unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5551	BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: LoOpc), DestReg: DestSub0).add(MO: Src0Sub0).add(MO: Src1Sub0);
5552	BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: HiOpc), DestReg: DestSub1).add(MO: Src0Sub1).add(MO: Src1Sub1);
5553	BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: Dest.getReg())
5554	.addReg(RegNo: DestSub0)
5555	.addImm(Val: AMDGPU::sub0)
5556	.addReg(RegNo: DestSub1)
5557	.addImm(Val: AMDGPU::sub1);
5558	}
5559	MI.eraseFromParent();
5560	return BB;
5561	}
5562
5563	static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
5564	switch (Opc) {
5565	case AMDGPU::S_MIN_U32:
5566	return std::numeric_limits<uint32_t>::max();
5567	case AMDGPU::S_MIN_I32:
5568	return std::numeric_limits<int32_t>::max();
5569	case AMDGPU::S_MAX_U32:
5570	return std::numeric_limits<uint32_t>::min();
5571	case AMDGPU::S_MAX_I32:
5572	return std::numeric_limits<int32_t>::min();
5573	case AMDGPU::V_ADD_F32_e64: // -0.0
5574	return `0x80000000`;
5575	case AMDGPU::V_SUB_F32_e64: // +0.0
5576	return `0x0`;
5577	case AMDGPU::S_ADD_I32:
5578	case AMDGPU::S_SUB_I32:
5579	case AMDGPU::S_OR_B32:
5580	case AMDGPU::S_XOR_B32:
5581	return std::numeric_limits<uint32_t>::min();
5582	case AMDGPU::S_AND_B32:
5583	return std::numeric_limits<uint32_t>::max();
5584	case AMDGPU::V_MIN_F32_e64:
5585	case AMDGPU::V_MAX_F32_e64:
5586	return `0x7fc00000`; // qNAN
5587	default:
5588	llvm_unreachable(
5589	"Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5590	}
5591	}
5592
5593	static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
5594	switch (Opc) {
5595	case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5596	return std::numeric_limits<uint64_t>::max();
5597	case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5598	return std::numeric_limits<int64_t>::max();
5599	case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5600	return std::numeric_limits<uint64_t>::min();
5601	case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5602	return std::numeric_limits<int64_t>::min();
5603	case AMDGPU::V_MIN_F64_e64:
5604	case AMDGPU::V_MAX_F64_e64:
5605	case AMDGPU::V_MIN_NUM_F64_e64:
5606	case AMDGPU::V_MAX_NUM_F64_e64:
5607	return `0x7FF8000000000000`; // qNAN
5608	case AMDGPU::S_ADD_U64_PSEUDO:
5609	case AMDGPU::S_SUB_U64_PSEUDO:
5610	case AMDGPU::S_OR_B64:
5611	case AMDGPU::S_XOR_B64:
5612	return std::numeric_limits<uint64_t>::min();
5613	case AMDGPU::S_AND_B64:
5614	return std::numeric_limits<uint64_t>::max();
5615	case AMDGPU::V_ADD_F64_e64:
5616	case AMDGPU::V_ADD_F64_pseudo_e64:
5617	return `0x8000000000000000`; // -0.0
5618	default:
5619	llvm_unreachable(
5620	"Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5621	}
5622	}
5623
5624	static bool is32bitWaveReduceOperation(unsigned Opc) {
5625	return Opc == AMDGPU::S_MIN_U32 \|\| Opc == AMDGPU::S_MIN_I32 \|\|
5626	Opc == AMDGPU::S_MAX_U32 \|\| Opc == AMDGPU::S_MAX_I32 \|\|
5627	Opc == AMDGPU::S_ADD_I32 \|\| Opc == AMDGPU::S_SUB_I32 \|\|
5628	Opc == AMDGPU::S_AND_B32 \|\| Opc == AMDGPU::S_OR_B32 \|\|
5629	Opc == AMDGPU::S_XOR_B32 \|\| Opc == AMDGPU::V_MIN_F32_e64 \|\|
5630	Opc == AMDGPU::V_MAX_F32_e64 \|\| Opc == AMDGPU::V_ADD_F32_e64 \|\|
5631	Opc == AMDGPU::V_SUB_F32_e64;
5632	}
5633
5634	static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
5635	return Opc == AMDGPU::V_MIN_F32_e64 \|\| Opc == AMDGPU::V_MAX_F32_e64 \|\|
5636	Opc == AMDGPU::V_ADD_F32_e64 \|\| Opc == AMDGPU::V_SUB_F32_e64 \|\|
5637	Opc == AMDGPU::V_MIN_F64_e64 \|\| Opc == AMDGPU::V_MAX_F64_e64 \|\|
5638	Opc == AMDGPU::V_MIN_NUM_F64_e64 \|\| Opc == AMDGPU::V_MAX_NUM_F64_e64 \|\|
5639	Opc == AMDGPU::V_ADD_F64_e64 \|\| Opc == AMDGPU::V_ADD_F64_pseudo_e64;
5640	}
5641
5642	static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5643	MachineBasicBlock &BB,
5644	const GCNSubtarget &ST,
5645	unsigned Opc) {
5646	MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
5647	const SIRegisterInfo *TRI = ST.getRegisterInfo();
5648	const DebugLoc &DL = MI.getDebugLoc();
5649	const SIInstrInfo *TII = ST.getInstrInfo();
5650
5651	// Reduction operations depend on whether the input operand is SGPR or VGPR.
5652	Register SrcReg = MI.getOperand(i: `1`).getReg();
5653	bool isSGPR = TRI->isSGPRClass(RC: MRI.getRegClass(Reg: SrcReg));
5654	Register DstReg = MI.getOperand(i: `0`).getReg();
5655	MachineBasicBlock RetBB = nullptr*;
5656	if (isSGPR) {
5657	switch (Opc) {
5658	case AMDGPU::S_MIN_U32:
5659	case AMDGPU::S_MIN_I32:
5660	case AMDGPU::V_MIN_F32_e64:
5661	case AMDGPU::S_MAX_U32:
5662	case AMDGPU::S_MAX_I32:
5663	case AMDGPU::V_MAX_F32_e64:
5664	case AMDGPU::S_AND_B32:
5665	case AMDGPU::S_OR_B32: {
5666	// Idempotent operations.
5667	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstReg).addReg(RegNo: SrcReg);
5668	RetBB = &BB;
5669	break;
5670	}
5671	case AMDGPU::V_CMP_LT_U64_e64: // umin
5672	case AMDGPU::V_CMP_LT_I64_e64: // min
5673	case AMDGPU::V_CMP_GT_U64_e64: // umax
5674	case AMDGPU::V_CMP_GT_I64_e64: // max
5675	case AMDGPU::V_MIN_F64_e64:
5676	case AMDGPU::V_MIN_NUM_F64_e64:
5677	case AMDGPU::V_MAX_F64_e64:
5678	case AMDGPU::V_MAX_NUM_F64_e64:
5679	case AMDGPU::S_AND_B64:
5680	case AMDGPU::S_OR_B64: {
5681	// Idempotent operations.
5682	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B64), DestReg: DstReg).addReg(RegNo: SrcReg);
5683	RetBB = &BB;
5684	break;
5685	}
5686	case AMDGPU::S_XOR_B32:
5687	case AMDGPU::S_XOR_B64:
5688	case AMDGPU::S_ADD_I32:
5689	case AMDGPU::S_ADD_U64_PSEUDO:
5690	case AMDGPU::V_ADD_F32_e64:
5691	case AMDGPU::V_ADD_F64_e64:
5692	case AMDGPU::V_ADD_F64_pseudo_e64:
5693	case AMDGPU::S_SUB_I32:
5694	case AMDGPU::S_SUB_U64_PSEUDO:
5695	case AMDGPU::V_SUB_F32_e64: {
5696	const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5697	const TargetRegisterClass *DstRegClass = MRI.getRegClass(Reg: DstReg);
5698	Register ExecMask = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5699	Register NumActiveLanes =
5700	MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5701
5702	bool IsWave32 = ST.isWave32();
5703	unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5704	MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5705	unsigned BitCountOpc =
5706	IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5707
5708	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: ExecMask).addReg(RegNo: ExecReg);
5709
5710	auto NewAccumulator =
5711	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: BitCountOpc), DestReg: NumActiveLanes)
5712	.addReg(RegNo: ExecMask);
5713
5714	switch (Opc) {
5715	case AMDGPU::S_XOR_B32:
5716	case AMDGPU::S_XOR_B64: {
5717	// Performing an XOR operation on a uniform value
5718	// depends on the parity of the number of active lanes.
5719	// For even parity, the result will be 0, for odd
5720	// parity the result will be the same as the input value.
5721	Register ParityRegister =
5722	MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5723
5724	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_AND_B32), DestReg: ParityRegister)
5725	.addReg(RegNo: NewAccumulator ->getOperand(i: `0`).getReg())
5726	.addImm(Val: `1`)
5727	.setOperandDead(`3`); // Dead scc
5728	if (Opc == AMDGPU::S_XOR_B32) {
5729	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
5730	.addReg(RegNo: SrcReg)
5731	.addReg(RegNo: ParityRegister);
5732	} else {
5733	Register DestSub0 =
5734	MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5735	Register DestSub1 =
5736	MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5737
5738	const TargetRegisterClass *SrcRC = MRI.getRegClass(Reg: SrcReg);
5739	const TargetRegisterClass *SrcSubRC =
5740	TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5741
5742	MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5743	MI, MRI, SuperReg: MI.getOperand(i: `1`), SuperRC: SrcRC, SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
5744	MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5745	MI, MRI, SuperReg: MI.getOperand(i: `1`), SuperRC: SrcRC, SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
5746
5747	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DestSub0)
5748	.add(MO: Op1L)
5749	.addReg(RegNo: ParityRegister);
5750
5751	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DestSub1)
5752	.add(MO: Op1H)
5753	.addReg(RegNo: ParityRegister);
5754
5755	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: DstReg)
5756	.addReg(RegNo: DestSub0)
5757	.addImm(Val: AMDGPU::sub0)
5758	.addReg(RegNo: DestSub1)
5759	.addImm(Val: AMDGPU::sub1);
5760	}
5761	break;
5762	}
5763	case AMDGPU::S_SUB_I32: {
5764	Register NegatedVal = MRI.createVirtualRegister(RegClass: DstRegClass);
5765
5766	// Take the negation of the source operand.
5767	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SUB_I32), DestReg: NegatedVal)
5768	.addImm(Val: `0`)
5769	.addReg(RegNo: SrcReg);
5770	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
5771	.addReg(RegNo: NegatedVal)
5772	.addReg(RegNo: NewAccumulator ->getOperand(i: `0`).getReg());
5773	break;
5774	}
5775	case AMDGPU::S_ADD_I32: {
5776	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
5777	.addReg(RegNo: SrcReg)
5778	.addReg(RegNo: NewAccumulator ->getOperand(i: `0`).getReg());
5779	break;
5780	}
5781	case AMDGPU::S_ADD_U64_PSEUDO:
5782	case AMDGPU::S_SUB_U64_PSEUDO: {
5783	Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5784	Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5785	Register Op1H_Op0L_Reg =
5786	MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5787	Register Op1L_Op0H_Reg =
5788	MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5789	Register CarryReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5790	Register AddReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5791	Register NegatedValLo =
5792	MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5793	Register NegatedValHi =
5794	MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5795
5796	const TargetRegisterClass *Src1RC = MRI.getRegClass(Reg: SrcReg);
5797	const TargetRegisterClass *Src1SubRC =
5798	TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5799
5800	MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5801	MI, MRI, SuperReg: MI.getOperand(i: `1`), SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
5802	MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5803	MI, MRI, SuperReg: MI.getOperand(i: `1`), SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
5804
5805	if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5806	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SUB_I32), DestReg: NegatedValLo)
5807	.addImm(Val: `0`)
5808	.addReg(RegNo: NewAccumulator ->getOperand(i: `0`).getReg())
5809	.setOperandDead(`3`); // Dead scc
5810	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ASHR_I32), DestReg: NegatedValHi)
5811	.addReg(RegNo: NegatedValLo)
5812	.addImm(Val: `31`)
5813	.setOperandDead(`3`); // Dead scc
5814	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: Op1L_Op0H_Reg)
5815	.add(MO: Op1L)
5816	.addReg(RegNo: NegatedValHi);
5817	}
5818	Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5819	? NegatedValLo
5820	: NewAccumulator ->getOperand(i: `0`).getReg();
5821	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DestSub0)
5822	.add(MO: Op1L)
5823	.addReg(RegNo: LowOpcode);
5824	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_HI_U32), DestReg: CarryReg)
5825	.add(MO: Op1L)
5826	.addReg(RegNo: LowOpcode);
5827	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: Op1H_Op0L_Reg)
5828	.add(MO: Op1H)
5829	.addReg(RegNo: LowOpcode);
5830
5831	Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5832	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: HiVal)
5833	.addReg(RegNo: CarryReg)
5834	.addReg(RegNo: Op1H_Op0L_Reg)
5835	.setOperandDead(`3`); // Dead scc
5836
5837	if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5838	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: DestSub1)
5839	.addReg(RegNo: HiVal)
5840	.addReg(RegNo: Op1L_Op0H_Reg)
5841	.setOperandDead(`3`); // Dead scc
5842	}
5843	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: DstReg)
5844	.addReg(RegNo: DestSub0)
5845	.addImm(Val: AMDGPU::sub0)
5846	.addReg(RegNo: DestSub1)
5847	.addImm(Val: AMDGPU::sub1);
5848	break;
5849	}
5850	case AMDGPU::V_ADD_F32_e64:
5851	case AMDGPU::V_ADD_F64_e64:
5852	case AMDGPU::V_ADD_F64_pseudo_e64:
5853	case AMDGPU::V_SUB_F32_e64: {
5854	bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5855	const TargetRegisterClass *VregRC = TII->getRegClass(MCID: TII->get(Opcode: Opc), OpNum: `0`);
5856	Register ActiveLanesVreg = MRI.createVirtualRegister(RegClass: VregRC);
5857	Register DstVreg = MRI.createVirtualRegister(RegClass: VregRC);
5858	// Get number of active lanes as a float val.
5859	BuildMI(BB, I&: MI, MIMD: DL,
5860	MCID: TII->get(Opcode: is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
5861	: AMDGPU::V_CVT_F64_I32_e64),
5862	DestReg: ActiveLanesVreg)
5863	.addReg(RegNo: NewAccumulator ->getOperand(i: `0`).getReg())
5864	.addImm(Val: `0`) // clamp
5865	.addImm(Val: `0`); // output-modifier
5866
5867	// Take negation of input for SUB reduction
5868	unsigned srcMod =
5869	(Opc == AMDGPU::V_SUB_F32_e64 \|\|
5870	MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)
5871	? SISrcMods::NEG
5872	: SISrcMods::NONE;
5873	unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
5874	: ST.getGeneration() >= AMDGPUSubtarget::GFX12
5875	? AMDGPU::V_MUL_F64_pseudo_e64
5876	: AMDGPU::V_MUL_F64_e64;
5877	auto DestVregInst = BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: MulOpc),
5878	DestReg: DstVreg)
5879	.addImm(Val: srcMod) // src0 modifier
5880	.addReg(RegNo: SrcReg)
5881	.addImm(Val: SISrcMods::NONE) // src1 modifier
5882	.addReg(RegNo: ActiveLanesVreg)
5883	.addImm(Val: SISrcMods::NONE) // clamp
5884	.addImm(Val: SISrcMods::NONE); // output-mod
5885	if (is32BitOpc) {
5886	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg)
5887	.addReg(RegNo: DstVreg);
5888	} else {
5889	Register LaneValueLoReg =
5890	MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5891	Register LaneValueHiReg =
5892	MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5893	const TargetRegisterClass *VregSubRC =
5894	TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
5895	MachineOperand Op1L =
5896	TII->buildExtractSubRegOrImm(MI, MRI, SuperReg: DestVregInst ->getOperand(i: `0`),
5897	SuperRC: VregRC, SubIdx: AMDGPU::sub0, SubRC: VregSubRC);
5898	MachineOperand Op1H =
5899	TII->buildExtractSubRegOrImm(MI, MRI, SuperReg: DestVregInst ->getOperand(i: `0`),
5900	SuperRC: VregRC, SubIdx: AMDGPU::sub1, SubRC: VregSubRC);
5901	// lane value input should be in an sgpr
5902	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32),
5903	DestReg: LaneValueLoReg)
5904	.add(MO: Op1L);
5905	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32),
5906	DestReg: LaneValueHiReg)
5907	.add(MO: Op1H);
5908	NewAccumulator =
5909	BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: DstReg)
5910	.addReg(RegNo: LaneValueLoReg)
5911	.addImm(Val: AMDGPU::sub0)
5912	.addReg(RegNo: LaneValueHiReg)
5913	.addImm(Val: AMDGPU::sub1);
5914	}
5915	}
5916	}
5917	RetBB = &BB;
5918	}
5919	}
5920	} else {
5921	// TODO: Implement DPP Strategy and switch based on immediate strategy
5922	// operand. For now, for all the cases (default, Iterative and DPP we use
5923	// iterative approach by default.)
5924
5925	// To reduce the VGPR using iterative approach, we need to iterate
5926	// over all the active lanes. Lowering consists of ComputeLoop,
5927	// which iterate over only active lanes. We use copy of EXEC register
5928	// as induction variable and every active lane modifies it using bitset0
5929	// so that we will get the next active lane for next iteration.
5930	MachineBasicBlock::iterator I = BB.end();
5931	Register SrcReg = MI.getOperand(i: `1`).getReg();
5932	bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5933	bool isFPOp = isFloatingPointWaveReduceOperation(Opc);
5934
5935	// Create Control flow for loop
5936	// Split MI's Machine Basic block into For loop
5937	auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, MBB&: BB, InstInLoop: true);
5938
5939	// Create virtual registers required for lowering.
5940	const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5941	const TargetRegisterClass *DstRegClass = MRI.getRegClass(Reg: DstReg);
5942	Register LoopIterator = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5943	Register IdentityValReg = MRI.createVirtualRegister(RegClass: DstRegClass);
5944	Register AccumulatorReg = MRI.createVirtualRegister(RegClass: DstRegClass);
5945	Register ActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5946	Register NewActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5947	Register FF1Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5948	Register LaneValueReg = MRI.createVirtualRegister(RegClass: DstRegClass);
5949
5950	bool IsWave32 = ST.isWave32();
5951	unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5952	unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5953
5954	// Create initial values of induction variable from Exec, Accumulator and
5955	// insert branch instr to newly created ComputeBlock
5956	BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: MovOpcForExec), DestReg: LoopIterator).addReg(RegNo: ExecReg);
5957	if (is32BitOpc) {
5958	uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
5959	BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: IdentityValReg)
5960	.addImm(Val: IdentityValue);
5961	} else {
5962	uint64_t IdentityValue =
5963	MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
5964	? `0x0` // +0.0 for double sub reduction
5965	: getIdentityValueFor64BitWaveReduction(Opc);
5966	BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B64_IMM_PSEUDO), DestReg: IdentityValReg)
5967	.addImm(Val: IdentityValue);
5968	}
5969	// clang-format off
5970	BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_BRANCH))
5971	.addMBB(MBB: ComputeLoop);
5972	// clang-format on
5973
5974	// Start constructing ComputeLoop
5975	I = ComputeLoop->begin();
5976	auto Accumulator =
5977	BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::PHI), DestReg: AccumulatorReg)
5978	.addReg(RegNo: IdentityValReg)
5979	.addMBB(MBB: &BB);
5980	auto ActiveBits =
5981	BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::PHI), DestReg: ActiveBitsReg)
5982	.addReg(RegNo: LoopIterator)
5983	.addMBB(MBB: &BB);
5984
5985	I = ComputeLoop->end();
5986	MachineInstr *NewAccumulator;
5987	// Perform the computations
5988	unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5989	BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: SFFOpc), DestReg: FF1Reg)
5990	.addReg(RegNo: ActiveBitsReg);
5991	if (is32BitOpc) {
5992	BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32),
5993	DestReg: LaneValueReg)
5994	.addReg(RegNo: SrcReg)
5995	.addReg(RegNo: FF1Reg);
5996	if (isFPOp) {
5997	Register LaneValVreg =
5998	MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: SrcReg));
5999	Register DstVreg = MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: SrcReg));
6000	// Get the Lane Value in VGPR to avoid the Constant Bus Restriction
6001	BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32),
6002	DestReg: LaneValVreg)
6003	.addReg(RegNo: LaneValueReg);
6004	BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstVreg)
6005	.addImm(Val: `0`) // src0 modifier
6006	.addReg(RegNo: Accumulator ->getOperand(i: `0`).getReg())
6007	.addImm(Val: `0`) // src1 modifier
6008	.addReg(RegNo: LaneValVreg)
6009	.addImm(Val: `0`) // clamp
6010	.addImm(Val: `0`); // omod
6011	NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
6012	MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg)
6013	.addReg(RegNo: DstVreg);
6014	} else {
6015	NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstReg)
6016	.addReg(RegNo: Accumulator ->getOperand(i: `0`).getReg())
6017	.addReg(RegNo: LaneValueReg);
6018	}
6019	} else {
6020	Register LaneValueLoReg =
6021	MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6022	Register LaneValueHiReg =
6023	MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6024	Register LaneValReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
6025	const TargetRegisterClass *SrcRC = MRI.getRegClass(Reg: SrcReg);
6026	const TargetRegisterClass *SrcSubRC =
6027	TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
6028	MachineOperand Op1L = TII->buildExtractSubRegOrImm(
6029	MI, MRI, SuperReg: MI.getOperand(i: `1`), SuperRC: SrcRC, SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
6030	MachineOperand Op1H = TII->buildExtractSubRegOrImm(
6031	MI, MRI, SuperReg: MI.getOperand(i: `1`), SuperRC: SrcRC, SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
6032	// lane value input should be in an sgpr
6033	BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32),
6034	DestReg: LaneValueLoReg)
6035	.add(MO: Op1L)
6036	.addReg(RegNo: FF1Reg);
6037	BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32),
6038	DestReg: LaneValueHiReg)
6039	.add(MO: Op1H)
6040	.addReg(RegNo: FF1Reg);
6041	auto LaneValue = BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
6042	MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: LaneValReg)
6043	.addReg(RegNo: LaneValueLoReg)
6044	.addImm(Val: AMDGPU::sub0)
6045	.addReg(RegNo: LaneValueHiReg)
6046	.addImm(Val: AMDGPU::sub1);
6047	switch (Opc) {
6048	case AMDGPU::S_OR_B64:
6049	case AMDGPU::S_AND_B64:
6050	case AMDGPU::S_XOR_B64: {
6051	NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstReg)
6052	.addReg(RegNo: Accumulator ->getOperand(i: `0`).getReg())
6053	.addReg(RegNo: LaneValue ->getOperand(i: `0`).getReg())
6054	.setOperandDead(`3`); // Dead scc
6055	break;
6056	}
6057	case AMDGPU::V_CMP_GT_I64_e64:
6058	case AMDGPU::V_CMP_GT_U64_e64:
6059	case AMDGPU::V_CMP_LT_I64_e64:
6060	case AMDGPU::V_CMP_LT_U64_e64: {
6061	Register LaneMaskReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
6062	Register ComparisonResultReg =
6063	MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
6064	int SrcIdx =
6065	AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src);
6066	const TargetRegisterClass *VregClass =
6067	TRI->getAllocatableClass(RC: TII->getRegClass(MCID: MI.getDesc(), OpNum: SrcIdx));
6068	const TargetRegisterClass *VSubRegClass =
6069	TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
6070	Register AccumulatorVReg = MRI.createVirtualRegister(RegClass: VregClass);
6071	MachineOperand SrcReg0Sub0 =
6072	TII->buildExtractSubRegOrImm(MI, MRI, SuperReg: Accumulator ->getOperand(i: `0`),
6073	SuperRC: VregClass, SubIdx: AMDGPU::sub0, SubRC: VSubRegClass);
6074	MachineOperand SrcReg0Sub1 =
6075	TII->buildExtractSubRegOrImm(MI, MRI, SuperReg: Accumulator ->getOperand(i: `0`),
6076	SuperRC: VregClass, SubIdx: AMDGPU::sub1, SubRC: VSubRegClass);
6077	BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE),
6078	DestReg: AccumulatorVReg)
6079	.add(MO: SrcReg0Sub0)
6080	.addImm(Val: AMDGPU::sub0)
6081	.add(MO: SrcReg0Sub1)
6082	.addImm(Val: AMDGPU::sub1);
6083	BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: LaneMaskReg)
6084	.addReg(RegNo: LaneValue ->getOperand(i: `0`).getReg())
6085	.addReg(RegNo: AccumulatorVReg);
6086
6087	unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6088	BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AndOpc), DestReg: ComparisonResultReg)
6089	.addReg(RegNo: LaneMaskReg)
6090	.addReg(RegNo: ActiveBitsReg);
6091
6092	NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
6093	MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B64), DestReg: DstReg)
6094	.addReg(RegNo: LaneValue ->getOperand(i: `0`).getReg())
6095	.addReg(RegNo: Accumulator ->getOperand(i: `0`).getReg());
6096	break;
6097	}
6098	case AMDGPU::V_MIN_F64_e64:
6099	case AMDGPU::V_MIN_NUM_F64_e64:
6100	case AMDGPU::V_MAX_F64_e64:
6101	case AMDGPU::V_MAX_NUM_F64_e64:
6102	case AMDGPU::V_ADD_F64_e64:
6103	case AMDGPU::V_ADD_F64_pseudo_e64: {
6104	int SrcIdx =
6105	AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src);
6106	const TargetRegisterClass *VregRC =
6107	TRI->getAllocatableClass(RC: TII->getRegClass(MCID: MI.getDesc(), OpNum: SrcIdx));
6108	const TargetRegisterClass *VregSubRC =
6109	TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
6110	Register AccumulatorVReg = MRI.createVirtualRegister(RegClass: VregRC);
6111	Register DstVreg = MRI.createVirtualRegister(RegClass: VregRC);
6112	Register LaneValLo =
6113	MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6114	Register LaneValHi =
6115	MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6116	BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AccumulatorVReg)
6117	.addReg(RegNo: Accumulator ->getOperand(i: `0`).getReg());
6118	unsigned Modifier =
6119	MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6120	? SISrcMods::NEG
6121	: SISrcMods::NONE;
6122	auto DstVregInst = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstVreg)
6123	.addImm(Val: Modifier) // src0 modifiers
6124	.addReg(RegNo: LaneValue ->getOperand(i: `0`).getReg())
6125	.addImm(Val: SISrcMods::NONE) // src1 modifiers
6126	.addReg(RegNo: AccumulatorVReg)
6127	.addImm(Val: SISrcMods::NONE) // clamp
6128	.addImm(Val: SISrcMods::NONE); // omod
6129	auto ReadLaneLo =
6130	BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32),
6131	DestReg: LaneValLo);
6132	auto ReadLaneHi =
6133	BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32),
6134	DestReg: LaneValHi);
6135	MachineBasicBlock::iterator Iters = *ReadLaneLo;
6136	MachineOperand Op1L =
6137	TII->buildExtractSubRegOrImm(MI: Iters, MRI, SuperReg: DstVregInst ->getOperand(i: `0`),
6138	SuperRC: VregRC, SubIdx: AMDGPU::sub0, SubRC: VregSubRC);
6139	MachineOperand Op1H =
6140	TII->buildExtractSubRegOrImm(MI: Iters, MRI, SuperReg: DstVregInst ->getOperand(i: `0`),
6141	SuperRC: VregRC, SubIdx: AMDGPU::sub1, SubRC: VregSubRC);
6142	ReadLaneLo.add(MO: Op1L);
6143	ReadLaneHi.add(MO: Op1H);
6144	NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
6145	MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: DstReg)
6146	.addReg(RegNo: LaneValLo)
6147	.addImm(Val: AMDGPU::sub0)
6148	.addReg(RegNo: LaneValHi)
6149	.addImm(Val: AMDGPU::sub1);
6150	break;
6151	}
6152	case AMDGPU::S_ADD_U64_PSEUDO:
6153	case AMDGPU::S_SUB_U64_PSEUDO: {
6154	NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstReg)
6155	.addReg(RegNo: Accumulator ->getOperand(i: `0`).getReg())
6156	.addReg(RegNo: LaneValue ->getOperand(i: `0`).getReg());
6157	ComputeLoop = Expand64BitScalarArithmetic(MI&: *NewAccumulator, BB: ComputeLoop);
6158	break;
6159	}
6160	}
6161	}
6162	// Manipulate the iterator to get the next active lane
6163	unsigned BITSETOpc =
6164	IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
6165	BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: BITSETOpc), DestReg: NewActiveBitsReg)
6166	.addReg(RegNo: FF1Reg)
6167	.addReg(RegNo: ActiveBitsReg);
6168
6169	// Add phi nodes
6170	Accumulator.addReg(RegNo: DstReg).addMBB(MBB: ComputeLoop);
6171	ActiveBits.addReg(RegNo: NewActiveBitsReg).addMBB(MBB: ComputeLoop);
6172
6173	// Creating branching
6174	unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
6175	BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: CMPOpc))
6176	.addReg(RegNo: NewActiveBitsReg)
6177	.addImm(Val: `0`);
6178	BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
6179	.addMBB(MBB: ComputeLoop);
6180
6181	RetBB = ComputeEnd;
6182	}
6183	MI.eraseFromParent();
6184	return RetBB;
6185	}
6186
6187	MachineBasicBlock *
6188	SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
6189	MachineBasicBlock BB) const* {
6190	MachineFunction *MF = BB->getParent();
6191	SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
6192	const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6193	const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
6194	const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
6195	MachineRegisterInfo &MRI = MF->getRegInfo();
6196	const DebugLoc &DL = MI.getDebugLoc();
6197
6198	switch (MI.getOpcode()) {
6199	case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
6200	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::S_MIN_U32);
6201	case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
6202	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::V_CMP_LT_U64_e64);
6203	case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
6204	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::S_MIN_I32);
6205	case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6206	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::V_CMP_LT_I64_e64);
6207	case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6208	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::V_MIN_F32_e64);
6209	case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
6210	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(),
6211	Opc: ST.getGeneration() >= AMDGPUSubtarget::GFX12
6212	? AMDGPU::V_MIN_NUM_F64_e64
6213	: AMDGPU::V_MIN_F64_e64);
6214	case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6215	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::S_MAX_U32);
6216	case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6217	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::V_CMP_GT_U64_e64);
6218	case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6219	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::S_MAX_I32);
6220	case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6221	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::V_CMP_GT_I64_e64);
6222	case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6223	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::V_MAX_F32_e64);
6224	case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
6225	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(),
6226	Opc: ST.getGeneration() >= AMDGPUSubtarget::GFX12
6227	? AMDGPU::V_MAX_NUM_F64_e64
6228	: AMDGPU::V_MAX_F64_e64);
6229	case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6230	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::S_ADD_I32);
6231	case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6232	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::S_ADD_U64_PSEUDO);
6233	case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6234	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::V_ADD_F32_e64);
6235	case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
6236	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(),
6237	Opc: ST.getGeneration() >= AMDGPUSubtarget::GFX12
6238	? AMDGPU::V_ADD_F64_pseudo_e64
6239	: AMDGPU::V_ADD_F64_e64);
6240	case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6241	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::S_SUB_I32);
6242	case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6243	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::S_SUB_U64_PSEUDO);
6244	case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6245	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::V_SUB_F32_e64);
6246	case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
6247	// There is no S/V_SUB_F64 opcode. Double type subtraction is expanded as
6248	// fadd + neg, by setting the NEG bit in the instruction.
6249	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(),
6250	Opc: ST.getGeneration() >= AMDGPUSubtarget::GFX12
6251	? AMDGPU::V_ADD_F64_pseudo_e64
6252	: AMDGPU::V_ADD_F64_e64);
6253	case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6254	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::S_AND_B32);
6255	case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6256	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::S_AND_B64);
6257	case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6258	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::S_OR_B32);
6259	case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6260	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::S_OR_B64);
6261	case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6262	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::S_XOR_B32);
6263	case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6264	return lowerWaveReduce(MI, BB&: BB, ST: getSubtarget(), Opc: AMDGPU::S_XOR_B64);
6265	case AMDGPU::S_UADDO_PSEUDO:
6266	case AMDGPU::S_USUBO_PSEUDO: {
6267	MachineOperand &Dest0 = MI.getOperand(i: `0`);
6268	MachineOperand &Dest1 = MI.getOperand(i: `1`);
6269	MachineOperand &Src0 = MI.getOperand(i: `2`);
6270	MachineOperand &Src1 = MI.getOperand(i: `3`);
6271
6272	unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6273	? AMDGPU::S_ADD_U32
6274	: AMDGPU::S_SUB_U32;
6275	// clang-format off
6276	BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest0.getReg())
6277	.add(MO: Src0)
6278	.add(MO: Src1);
6279	// clang-format on
6280
6281	unsigned SelOpc =
6282	Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6283	BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: SelOpc), DestReg: Dest1.getReg()).addImm(Val: -`1`).addImm(Val: `0`);
6284
6285	MI.eraseFromParent();
6286	return BB;
6287	}
6288	case AMDGPU::S_ADD_U64_PSEUDO:
6289	case AMDGPU::S_SUB_U64_PSEUDO: {
6290	return Expand64BitScalarArithmetic(MI, BB);
6291	}
6292	case AMDGPU::V_ADD_U64_PSEUDO:
6293	case AMDGPU::V_SUB_U64_PSEUDO: {
6294	bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6295
6296	MachineOperand &Dest = MI.getOperand(i: `0`);
6297	MachineOperand &Src0 = MI.getOperand(i: `1`);
6298	MachineOperand &Src1 = MI.getOperand(i: `2`);
6299
6300	if (ST.hasAddSubU64Insts()) {
6301	auto I = BuildMI(BB&: *BB, I&: MI, MIMD: DL,
6302	MCID: TII->get(Opcode: IsAdd ? AMDGPU::V_ADD_U64_e64
6303	: AMDGPU::V_SUB_U64_e64),
6304	DestReg: Dest.getReg())
6305	.add(MO: Src0)
6306	.add(MO: Src1)
6307	.addImm(Val: `0`); // clamp
6308	TII->legalizeOperands(MI&: *I);
6309	MI.eraseFromParent();
6310	return BB;
6311	}
6312
6313	if (IsAdd && ST.hasLshlAddU64Inst()) {
6314	auto Add = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_LSHL_ADD_U64_e64),
6315	DestReg: Dest.getReg())
6316	.add(MO: Src0)
6317	.addImm(Val: `0`)
6318	.add(MO: Src1);
6319	TII->legalizeOperands(MI&: *Add);
6320	MI.eraseFromParent();
6321	return BB;
6322	}
6323
6324	const auto *CarryRC = TRI->getWaveMaskRegClass();
6325
6326	Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6327	Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6328
6329	Register CarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
6330	Register DeadCarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
6331
6332	const TargetRegisterClass *Src0RC = Src0.isReg()
6333	? MRI.getRegClass(Reg: Src0.getReg())
6334	: &AMDGPU::VReg_64RegClass;
6335	const TargetRegisterClass *Src1RC = Src1.isReg()
6336	? MRI.getRegClass(Reg: Src1.getReg())
6337	: &AMDGPU::VReg_64RegClass;
6338
6339	const TargetRegisterClass *Src0SubRC =
6340	TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6341	const TargetRegisterClass *Src1SubRC =
6342	TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6343
6344	MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6345	MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
6346	MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6347	MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
6348
6349	MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6350	MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
6351	MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6352	MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
6353
6354	unsigned LoOpc =
6355	IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6356	MachineInstr LoHalf = BuildMI(BB&: BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: LoOpc), DestReg: DestSub0)
6357	.addReg(RegNo: CarryReg, Flags: RegState::Define)
6358	.add(MO: SrcReg0Sub0)
6359	.add(MO: SrcReg1Sub0)
6360	.addImm(Val: `0`); // clamp bit
6361
6362	unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6363	MachineInstr *HiHalf =
6364	BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: HiOpc), DestReg: DestSub1)
6365	.addReg(RegNo: DeadCarryReg, Flags: RegState::Define \| RegState::Dead)
6366	.add(MO: SrcReg0Sub1)
6367	.add(MO: SrcReg1Sub1)
6368	.addReg(RegNo: CarryReg, Flags: RegState::Kill)
6369	.addImm(Val: `0`); // clamp bit
6370
6371	BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: Dest.getReg())
6372	.addReg(RegNo: DestSub0)
6373	.addImm(Val: AMDGPU::sub0)
6374	.addReg(RegNo: DestSub1)
6375	.addImm(Val: AMDGPU::sub1);
6376	TII->legalizeOperands(MI&: *LoHalf);
6377	TII->legalizeOperands(MI&: *HiHalf);
6378	MI.eraseFromParent();
6379	return BB;
6380	}
6381	case AMDGPU::S_ADD_CO_PSEUDO:
6382	case AMDGPU::S_SUB_CO_PSEUDO: {
6383	// This pseudo can only be selected from a
6384	// uniform add/subcarry node. All the VGPR operands
6385	// are therefore assumed to be splat vectors.
6386	MachineBasicBlock::iterator MII = MI;
6387	MachineOperand &Dest = MI.getOperand(i: `0`);
6388	MachineOperand &CarryDest = MI.getOperand(i: `1`);
6389	MachineOperand &Src0 = MI.getOperand(i: `2`);
6390	MachineOperand &Src1 = MI.getOperand(i: `3`);
6391	MachineOperand &Src2 = MI.getOperand(i: `4`);
6392	if (Src0.isReg() && TRI->isVectorRegister(MRI, Reg: Src0.getReg())) {
6393	Register RegOp0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6394	BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp0)
6395	.addReg(RegNo: Src0.getReg());
6396	Src0.setReg(RegOp0);
6397	}
6398	if (Src1.isReg() && TRI->isVectorRegister(MRI, Reg: Src1.getReg())) {
6399	Register RegOp1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6400	BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp1)
6401	.addReg(RegNo: Src1.getReg());
6402	Src1.setReg(RegOp1);
6403	}
6404	Register RegOp2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6405	if (TRI->isVectorRegister(MRI, Reg: Src2.getReg())) {
6406	BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp2)
6407	.addReg(RegNo: Src2.getReg());
6408	Src2.setReg(RegOp2);
6409	}
6410
6411	if (ST.isWave64()) {
6412	if (ST.hasScalarCompareEq64()) {
6413	BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U64))
6414	.addReg(RegNo: Src2.getReg())
6415	.addImm(Val: `0`);
6416	} else {
6417	const TargetRegisterClass *Src2RC = MRI.getRegClass(Reg: Src2.getReg());
6418	const TargetRegisterClass *SubRC =
6419	TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6420	MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6421	MI: MII, MRI, SuperReg: Src2, SuperRC: Src2RC, SubIdx: AMDGPU::sub0, SubRC);
6422	MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6423	MI: MII, MRI, SuperReg: Src2, SuperRC: Src2RC, SubIdx: AMDGPU::sub1, SubRC);
6424	Register Src2_32 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6425
6426	BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_OR_B32), DestReg: Src2_32)
6427	.add(MO: Src2Sub0)
6428	.add(MO: Src2Sub1);
6429
6430	BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
6431	.addReg(RegNo: Src2_32, Flags: RegState::Kill)
6432	.addImm(Val: `0`);
6433	}
6434	} else {
6435	BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
6436	.addReg(RegNo: Src2.getReg())
6437	.addImm(Val: `0`);
6438	}
6439
6440	unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6441	? AMDGPU::S_ADDC_U32
6442	: AMDGPU::S_SUBB_U32;
6443
6444	BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest.getReg()).add(MO: Src0).add(MO: Src1);
6445
6446	unsigned SelOpc =
6447	ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6448
6449	BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: SelOpc), DestReg: CarryDest.getReg())
6450	.addImm(Val: -`1`)
6451	.addImm(Val: `0`);
6452
6453	MI.eraseFromParent();
6454	return BB;
6455	}
6456	case AMDGPU::SI_INIT_M0: {
6457	MachineOperand &M0Init = MI.getOperand(i: `0`);
6458	BuildMI(BB&: *BB, I: MI.getIterator(), MIMD: MI.getDebugLoc(),
6459	MCID: TII->get(Opcode: M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6460	DestReg: AMDGPU::M0)
6461	.add(MO: M0Init);
6462	MI.eraseFromParent();
6463	return BB;
6464	}
6465	case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6466	// Set SCC to true, in case the barrier instruction gets converted to a NOP.
6467	BuildMI(BB&: *BB, I: MI.getIterator(), MIMD: MI.getDebugLoc(),
6468	MCID: TII->get(Opcode: AMDGPU::S_CMP_EQ_U32))
6469	.addImm(Val: `0`)
6470	.addImm(Val: `0`);
6471	return BB;
6472	}
6473	case AMDGPU::GET_GROUPSTATICSIZE: {
6474	assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA \|\|
6475	getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6476	BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32))
6477	.add(MO: MI.getOperand(i: `0`))
6478	.addImm(Val: MFI->getLDSSize());
6479	MI.eraseFromParent();
6480	return BB;
6481	}
6482	case AMDGPU::GET_SHADERCYCLESHILO: {
6483	assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
6484	// The algorithm is:
6485	//
6486	// hi1 = getreg(SHADER_CYCLES_HI)
6487	// lo1 = getreg(SHADER_CYCLES_LO)
6488	// hi2 = getreg(SHADER_CYCLES_HI)
6489	//
6490	// If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6491	// Otherwise there was overflow and the result is hi2:0. In both cases the
6492	// result should represent the actual time at some point during the sequence
6493	// of three getregs.
6494	using namespace AMDGPU::Hwreg;
6495	Register RegHi1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6496	BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegHi1)
6497	.addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES_HI, Values: `0`, Values: `32`));
6498	Register RegLo1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6499	BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegLo1)
6500	.addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES, Values: `0`, Values: `32`));
6501	Register RegHi2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6502	BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegHi2)
6503	.addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES_HI, Values: `0`, Values: `32`));
6504	BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_EQ_U32))
6505	.addReg(RegNo: RegHi1)
6506	.addReg(RegNo: RegHi2);
6507	Register RegLo = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6508	BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: RegLo)
6509	.addReg(RegNo: RegLo1)
6510	.addImm(Val: `0`);
6511	BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE))
6512	.add(MO: MI.getOperand(i: `0`))
6513	.addReg(RegNo: RegLo)
6514	.addImm(Val: AMDGPU::sub0)
6515	.addReg(RegNo: RegHi2)
6516	.addImm(Val: AMDGPU::sub1);
6517	MI.eraseFromParent();
6518	return BB;
6519	}
6520	case AMDGPU::SI_INDIRECT_SRC_V1:
6521	case AMDGPU::SI_INDIRECT_SRC_V2:
6522	case AMDGPU::SI_INDIRECT_SRC_V3:
6523	case AMDGPU::SI_INDIRECT_SRC_V4:
6524	case AMDGPU::SI_INDIRECT_SRC_V5:
6525	case AMDGPU::SI_INDIRECT_SRC_V6:
6526	case AMDGPU::SI_INDIRECT_SRC_V7:
6527	case AMDGPU::SI_INDIRECT_SRC_V8:
6528	case AMDGPU::SI_INDIRECT_SRC_V9:
6529	case AMDGPU::SI_INDIRECT_SRC_V10:
6530	case AMDGPU::SI_INDIRECT_SRC_V11:
6531	case AMDGPU::SI_INDIRECT_SRC_V12:
6532	case AMDGPU::SI_INDIRECT_SRC_V16:
6533	case AMDGPU::SI_INDIRECT_SRC_V32:
6534	return emitIndirectSrc(MI, MBB&: BB, ST: getSubtarget());
6535	case AMDGPU::SI_INDIRECT_DST_V1:
6536	case AMDGPU::SI_INDIRECT_DST_V2:
6537	case AMDGPU::SI_INDIRECT_DST_V3:
6538	case AMDGPU::SI_INDIRECT_DST_V4:
6539	case AMDGPU::SI_INDIRECT_DST_V5:
6540	case AMDGPU::SI_INDIRECT_DST_V6:
6541	case AMDGPU::SI_INDIRECT_DST_V7:
6542	case AMDGPU::SI_INDIRECT_DST_V8:
6543	case AMDGPU::SI_INDIRECT_DST_V9:
6544	case AMDGPU::SI_INDIRECT_DST_V10:
6545	case AMDGPU::SI_INDIRECT_DST_V11:
6546	case AMDGPU::SI_INDIRECT_DST_V12:
6547	case AMDGPU::SI_INDIRECT_DST_V16:
6548	case AMDGPU::SI_INDIRECT_DST_V32:
6549	return emitIndirectDst(MI, MBB&: BB, ST: getSubtarget());
6550	case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6551	case AMDGPU::SI_KILL_I1_PSEUDO:
6552	return splitKillBlock(MI, BB);
6553	case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6554	Register Dst = MI.getOperand(i: `0`).getReg();
6555	const MachineOperand &Src0 = MI.getOperand(i: `1`);
6556	const MachineOperand &Src1 = MI.getOperand(i: `2`);
6557	Register SrcCond = MI.getOperand(i: `3`).getReg();
6558
6559	Register DstLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6560	Register DstHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6561	const auto *CondRC = TRI->getWaveMaskRegClass();
6562	Register SrcCondCopy = MRI.createVirtualRegister(RegClass: CondRC);
6563
6564	const TargetRegisterClass *Src0RC = Src0.isReg()
6565	? MRI.getRegClass(Reg: Src0.getReg())
6566	: &AMDGPU::VReg_64RegClass;
6567	const TargetRegisterClass *Src1RC = Src1.isReg()
6568	? MRI.getRegClass(Reg: Src1.getReg())
6569	: &AMDGPU::VReg_64RegClass;
6570
6571	const TargetRegisterClass *Src0SubRC =
6572	TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6573	const TargetRegisterClass *Src1SubRC =
6574	TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6575
6576	MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6577	MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
6578	MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6579	MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
6580
6581	MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6582	MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
6583	MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6584	MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
6585
6586	BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: SrcCondCopy).addReg(RegNo: SrcCond);
6587	BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstLo)
6588	.addImm(Val: `0`)
6589	.add(MO: Src0Sub0)
6590	.addImm(Val: `0`)
6591	.add(MO: Src1Sub0)
6592	.addReg(RegNo: SrcCondCopy);
6593	BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstHi)
6594	.addImm(Val: `0`)
6595	.add(MO: Src0Sub1)
6596	.addImm(Val: `0`)
6597	.add(MO: Src1Sub1)
6598	.addReg(RegNo: SrcCondCopy);
6599
6600	BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
6601	.addReg(RegNo: DstLo)
6602	.addImm(Val: AMDGPU::sub0)
6603	.addReg(RegNo: DstHi)
6604	.addImm(Val: AMDGPU::sub1);
6605	MI.eraseFromParent();
6606	return BB;
6607	}
6608	case AMDGPU::SI_BR_UNDEF: {
6609	MachineInstr Br = BuildMI(BB&: BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
6610	.add(MO: MI.getOperand(i: `0`));
6611	Br->getOperand(i: `1`).setIsUndef(); // read undef SCC
6612	MI.eraseFromParent();
6613	return BB;
6614	}
6615	case AMDGPU::ADJCALLSTACKUP:
6616	case AMDGPU::ADJCALLSTACKDOWN: {
6617	const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6618	MachineInstrBuilder MIB(*MF, &MI);
6619	MIB.addReg(RegNo: Info->getStackPtrOffsetReg(), Flags: RegState::ImplicitDefine)
6620	.addReg(RegNo: Info->getStackPtrOffsetReg(), Flags: RegState::Implicit);
6621	return BB;
6622	}
6623	case AMDGPU::SI_CALL_ISEL: {
6624	unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(MF: *MF);
6625
6626	MachineInstrBuilder MIB;
6627	MIB = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_CALL), DestReg: ReturnAddrReg);
6628
6629	for (const MachineOperand &MO : MI.operands())
6630	MIB.add(MO);
6631
6632	MIB.cloneMemRefs(OtherMI: MI);
6633	MI.eraseFromParent();
6634	return BB;
6635	}
6636	case AMDGPU::V_ADD_CO_U32_e32:
6637	case AMDGPU::V_SUB_CO_U32_e32:
6638	case AMDGPU::V_SUBREV_CO_U32_e32: {
6639	// TODO: Define distinct V_*_I32_Pseudo instructions instead.
6640	unsigned Opc = MI.getOpcode();
6641
6642	bool NeedClampOperand = false;
6643	if (TII->pseudoToMCOpcode(Opcode: Opc) == -`1`) {
6644	Opc = AMDGPU::getVOPe64(Opcode: Opc);
6645	NeedClampOperand = true;
6646	}
6647
6648	auto I = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: MI.getOperand(i: `0`).getReg());
6649	if (TII->isVOP3(MI: *I)) {
6650	I.addReg(RegNo: TRI->getVCC(), Flags: RegState::Define);
6651	}
6652	I.add(MO: MI.getOperand(i: `1`)).add(MO: MI.getOperand(i: `2`));
6653	if (NeedClampOperand)
6654	I.addImm(Val: `0`); // clamp bit for e64 encoding
6655
6656	TII->legalizeOperands(MI&: *I);
6657
6658	MI.eraseFromParent();
6659	return BB;
6660	}
6661	case AMDGPU::V_ADDC_U32_e32:
6662	case AMDGPU::V_SUBB_U32_e32:
6663	case AMDGPU::V_SUBBREV_U32_e32:
6664	// These instructions have an implicit use of vcc which counts towards the
6665	// constant bus limit.
6666	TII->legalizeOperands(MI);
6667	return BB;
6668	case AMDGPU::DS_GWS_INIT:
6669	case AMDGPU::DS_GWS_SEMA_BR:
6670	case AMDGPU::DS_GWS_BARRIER:
6671	case AMDGPU::DS_GWS_SEMA_V:
6672	case AMDGPU::DS_GWS_SEMA_P:
6673	case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6674	// A s_waitcnt 0 is required to be the instruction immediately following.
6675	if (getSubtarget()->hasGWSAutoReplay()) {
6676	bundleInstWithWaitcnt(MI);
6677	return BB;
6678	}
6679
6680	return emitGWSMemViolTestLoop(MI, BB);
6681	case AMDGPU::S_SETREG_B32: {
6682	// Try to optimize cases that only set the denormal mode or rounding mode.
6683	//
6684	// If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6685	// denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6686	// instead.
6687	//
6688	// FIXME: This could be predicates on the immediate, but tablegen doesn't
6689	// allow you to have a no side effect instruction in the output of a
6690	// sideeffecting pattern.
6691	auto [ID, Offset, Width] =
6692	AMDGPU::Hwreg::HwregEncoding::decode(Encoded: MI.getOperand(i: `1`).getImm());
6693	if (ID != AMDGPU::Hwreg::ID_MODE)
6694	return BB;
6695
6696	const unsigned WidthMask = maskTrailingOnes<unsigned>(N: Width);
6697	const unsigned SetMask = WidthMask << Offset;
6698
6699	if (getSubtarget()->hasDenormModeInst()) {
6700	unsigned SetDenormOp = `0`;
6701	unsigned SetRoundOp = `0`;
6702
6703	// The dedicated instructions can only set the whole denorm or round mode
6704	// at once, not a subset of bits in either.
6705	if (SetMask ==
6706	(AMDGPU::Hwreg::FP_ROUND_MASK \| AMDGPU::Hwreg::FP_DENORM_MASK)) {
6707	// If this fully sets both the round and denorm mode, emit the two
6708	// dedicated instructions for these.
6709	SetRoundOp = AMDGPU::S_ROUND_MODE;
6710	SetDenormOp = AMDGPU::S_DENORM_MODE;
6711	} else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6712	SetRoundOp = AMDGPU::S_ROUND_MODE;
6713	} else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6714	SetDenormOp = AMDGPU::S_DENORM_MODE;
6715	}
6716
6717	if (SetRoundOp \|\| SetDenormOp) {
6718	MachineInstr *Def = MRI.getVRegDef(Reg: MI.getOperand(i: `0`).getReg());
6719	if (Def && Def->isMoveImmediate() && Def->getOperand(i: `1`).isImm()) {
6720	unsigned ImmVal = Def->getOperand(i: `1`).getImm();
6721	if (SetRoundOp) {
6722	BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: SetRoundOp))
6723	.addImm(Val: ImmVal & `0xf`);
6724
6725	// If we also have the denorm mode, get just the denorm mode bits.
6726	ImmVal >>= `4`;
6727	}
6728
6729	if (SetDenormOp) {
6730	BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: SetDenormOp))
6731	.addImm(Val: ImmVal & `0xf`);
6732	}
6733
6734	MI.eraseFromParent();
6735	return BB;
6736	}
6737	}
6738	}
6739
6740	// If only FP bits are touched, use the no side effects pseudo.
6741	if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK \|
6742	AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6743	MI.setDesc(TII->get(Opcode: AMDGPU::S_SETREG_B32_mode));
6744
6745	return BB;
6746	}
6747	case AMDGPU::S_INVERSE_BALLOT_U32:
6748	case AMDGPU::S_INVERSE_BALLOT_U64:
6749	// These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6750	// necessary. After that they are equivalent to a COPY.
6751	MI.setDesc(TII->get(Opcode: AMDGPU::COPY));
6752	return BB;
6753	case AMDGPU::ENDPGM_TRAP: {
6754	if (BB->succ_empty() && std::next(x: MI.getIterator()) == BB->end()) {
6755	MI.setDesc(TII->get(Opcode: AMDGPU::S_ENDPGM));
6756	MI.addOperand(Op: MachineOperand::CreateImm(Val: `0`));
6757	return BB;
6758	}
6759
6760	// We need a block split to make the real endpgm a terminator. We also don't
6761	// want to break phis in successor blocks, so we can't just delete to the
6762	// end of the block.
6763
6764	MachineBasicBlock SplitBB = BB->splitAt(SplitInst&: MI, UpdateLiveIns: false* /UpdateLiveIns/);
6765	MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6766	MF->push_back(MBB: TrapBB);
6767	// clang-format off
6768	BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ENDPGM))
6769	.addImm(Val: `0`);
6770	BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
6771	.addMBB(MBB: TrapBB);
6772	// clang-format on
6773
6774	BB->addSuccessor(Succ: TrapBB);
6775	MI.eraseFromParent();
6776	return SplitBB;
6777	}
6778	case AMDGPU::SIMULATED_TRAP: {
6779	assert(Subtarget->hasPrivEnabledTrap2NopBug());
6780	MachineBasicBlock *SplitBB =
6781	TII->insertSimulatedTrap(MRI, MBB&: *BB, MI, DL: MI.getDebugLoc());
6782	MI.eraseFromParent();
6783	return SplitBB;
6784	}
6785	case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6786	case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6787	assert(MFI->isWholeWaveFunction());
6788
6789	// During ISel, it's difficult to propagate the original EXEC mask to use as
6790	// an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6791	MachineInstr Setup = TII->getWholeWaveFunctionSetup(MF&: BB->getParent());
6792	assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6793	Register OriginalExec = Setup->getOperand(i: `0`).getReg();
6794	MF->getRegInfo().clearKillFlags(Reg: OriginalExec);
6795	MI.getOperand(i: `0`).setReg(OriginalExec);
6796	return BB;
6797	}
6798	default:
6799	if (TII->isImage(MI) \|\| TII->isMUBUF(MI)) {
6800	if (!MI.mayStore())
6801	AddMemOpInit(MI);
6802	return BB;
6803	}
6804	return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, MBB: BB);
6805	}
6806	}
6807
6808	bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
6809	// This currently forces unfolding various combinations of fsub into fma with
6810	// free fneg'd operands. As long as we have fast FMA (controlled by
6811	// isFMAFasterThanFMulAndFAdd), we should perform these.
6812
6813	// When fma is quarter rate, for f64 where add / sub are at best half rate,
6814	// most of these combines appear to be cycle neutral but save on instruction
6815	// count / code size.
6816	return true;
6817	}
6818
6819	bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
6820
6821	EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
6822	EVT VT) const {
6823	if (!VT.isVector()) {
6824	return MVT::i1;
6825	}
6826	return EVT::getVectorVT(Context&: Ctx, VT: MVT::i1, NumElements: VT.getVectorNumElements());
6827	}
6828
6829	MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
6830	// TODO: Should i16 be used always if legal? For now it would force VALU
6831	// shifts.
6832	return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6833	}
6834
6835	LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
6836	return (Ty.getScalarSizeInBits() <= `16` && Subtarget->has16BitInsts())
6837	? Ty.changeElementSize(NewEltSize: `16`)
6838	: Ty.changeElementSize(NewEltSize: `32`);
6839	}
6840
6841	// Answering this is somewhat tricky and depends on the specific device which
6842	// have different rates for fma or all f64 operations.
6843	//
6844	// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6845	// regardless of which device (although the number of cycles differs between
6846	// devices), so it is always profitable for f64.
6847	//
6848	// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6849	// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6850	// which we can always do even without fused FP ops since it returns the same
6851	// result as the separate operations and since it is always full
6852	// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6853	// however does not support denormals, so we do report fma as faster if we have
6854	// a fast fma device and require denormals.
6855	//
6856	bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6857	EVT VT) const {
6858	VT = VT.getScalarType();
6859
6860	switch (VT.getSimpleVT().SimpleTy) {
6861	case MVT::f32: {
6862	// If mad is not available this depends only on if f32 fma is full rate.
6863	if (!Subtarget->hasMadMacF32Insts())
6864	return Subtarget->hasFastFMAF32();
6865
6866	// Otherwise f32 mad is always full rate and returns the same result as
6867	// the separate operations so should be preferred over fma.
6868	// However does not support denormals.
6869	if (!denormalModeIsFlushAllF32(MF))
6870	return Subtarget->hasFastFMAF32() \|\| Subtarget->hasDLInsts();
6871
6872	// If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6873	return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6874	}
6875	case MVT::f64:
6876	return true;
6877	case MVT::f16:
6878	case MVT::bf16:
6879	return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6880	default:
6881	break;
6882	}
6883
6884	return false;
6885	}
6886
6887	bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6888	LLT Ty) const {
6889	switch (Ty.getScalarSizeInBits()) {
6890	case `16`:
6891	return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f16);
6892	case `32`:
6893	return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f32);
6894	case `64`:
6895	return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f64);
6896	default:
6897	break;
6898	}
6899
6900	return false;
6901	}
6902
6903	bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
6904	if (!Ty.isScalar())
6905	return false;
6906
6907	if (Ty.getScalarSizeInBits() == `16`)
6908	return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(MF: *MI.getMF());
6909	if (Ty.getScalarSizeInBits() == `32`)
6910	return Subtarget->hasMadMacF32Insts() &&
6911	denormalModeIsFlushAllF32(MF: *MI.getMF());
6912
6913	return false;
6914	}
6915
6916	bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
6917	const SDNode N) const* {
6918	// TODO: Check future ftz flag
6919	// v_mad_f32/v_mac_f32 do not support denormals.
6920	EVT VT = N->getValueType(ResNo: `0`);
6921	if (VT == MVT::f32)
6922	return Subtarget->hasMadMacF32Insts() &&
6923	denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
6924	if (VT == MVT::f16) {
6925	return Subtarget->hasMadF16() &&
6926	denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction());
6927	}
6928
6929	return false;
6930	}
6931
6932	//===----------------------------------------------------------------------===//
6933	// Custom DAG Lowering Operations
6934	//===----------------------------------------------------------------------===//
6935
6936	// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6937	// wider vector type is legal.
6938	SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
6939	SelectionDAG &DAG) const {
6940	unsigned Opc = Op.getOpcode();
6941	EVT VT = Op.getValueType();
6942	assert(VT == MVT::v4i16 \|\| VT == MVT::v4f16 \|\| VT == MVT::v4bf16 \|\|
6943	VT == MVT::v4f32 \|\| VT == MVT::v8i16 \|\| VT == MVT::v8f16 \|\|
6944	VT == MVT::v8bf16 \|\| VT == MVT::v16i16 \|\| VT == MVT::v16f16 \|\|
6945	VT == MVT::v16bf16 \|\| VT == MVT::v8f32 \|\| VT == MVT::v16f32 \|\|
6946	VT == MVT::v32f32 \|\| VT == MVT::v32i16 \|\| VT == MVT::v32f16 \|\|
6947	VT == MVT::v32bf16);
6948
6949	auto [Lo, Hi] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: `0`);
6950
6951	SDLoc SL(Op);
6952	SDValue OpLo = DAG.getNode(Opcode: Opc, DL: SL, VT: Lo.getValueType(), Operand: Lo, Flags: Op ->getFlags());
6953	SDValue OpHi = DAG.getNode(Opcode: Opc, DL: SL, VT: Hi.getValueType(), Operand: Hi, Flags: Op ->getFlags());
6954
6955	return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc (Op), VT, N1: OpLo, N2: OpHi);
6956	}
6957
6958	// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6959	// regression whereby extra unnecessary instructions were added to codegen
6960	// for rotr operations, casued by legalising v2i32 or. This resulted in extra
6961	// instructions to extract the result from the vector.
6962	SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
6963	[[maybe_unused]] EVT VT = Op.getValueType();
6964
6965	assert((VT == MVT::v2i32 \|\| VT == MVT::v4i32 \|\| VT == MVT::v8i32 \|\|
6966	VT == MVT::v16i32) &&
6967	"Unexpected ValueType.");
6968
6969	return DAG.UnrollVectorOp(N: Op.getNode());
6970	}
6971
6972	// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6973	// wider vector type is legal.
6974	SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
6975	SelectionDAG &DAG) const {
6976	unsigned Opc = Op.getOpcode();
6977	EVT VT = Op.getValueType();
6978	assert(VT == MVT::v4i16 \|\| VT == MVT::v4f16 \|\| VT == MVT::v4bf16 \|\|
6979	VT == MVT::v4f32 \|\| VT == MVT::v8i16 \|\| VT == MVT::v8f16 \|\|
6980	VT == MVT::v8bf16 \|\| VT == MVT::v16i16 \|\| VT == MVT::v16f16 \|\|
6981	VT == MVT::v16bf16 \|\| VT == MVT::v8f32 \|\| VT == MVT::v16f32 \|\|
6982	VT == MVT::v32f32 \|\| VT == MVT::v32i16 \|\| VT == MVT::v32f16 \|\|
6983	VT == MVT::v32bf16);
6984
6985	auto [Lo0, Hi0] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: `0`);
6986	auto [Lo1, Hi1] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: `1`);
6987
6988	SDLoc SL(Op);
6989
6990	SDValue OpLo =
6991	DAG.getNode(Opcode: Opc, DL: SL, VT: Lo0.getValueType(), N1: Lo0, N2: Lo1, Flags: Op ->getFlags());
6992	SDValue OpHi =
6993	DAG.getNode(Opcode: Opc, DL: SL, VT: Hi0.getValueType(), N1: Hi0, N2: Hi1, Flags: Op ->getFlags());
6994
6995	return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc (Op), VT, N1: OpLo, N2: OpHi);
6996	}
6997
6998	SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6999	SelectionDAG &DAG) const {
7000	unsigned Opc = Op.getOpcode();
7001	EVT VT = Op.getValueType();
7002	assert(VT == MVT::v4i16 \|\| VT == MVT::v4f16 \|\| VT == MVT::v8i16 \|\|
7003	VT == MVT::v8f16 \|\| VT == MVT::v4f32 \|\| VT == MVT::v16i16 \|\|
7004	VT == MVT::v16f16 \|\| VT == MVT::v8f32 \|\| VT == MVT::v16f32 \|\|
7005	VT == MVT::v32f32 \|\| VT == MVT::v32f16 \|\| VT == MVT::v32i16 \|\|
7006	VT == MVT::v4bf16 \|\| VT == MVT::v8bf16 \|\| VT == MVT::v16bf16 \|\|
7007	VT == MVT::v32bf16);
7008
7009	SDValue Op0 = Op.getOperand(i: `0`);
7010	auto [Lo0, Hi0] = Op0.getValueType().isVector()
7011	? DAG.SplitVectorOperand(N: Op.getNode(), OpNo: `0`)
7012	: std::pair(Op0, Op0);
7013
7014	auto [Lo1, Hi1] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: `1`);
7015	auto [Lo2, Hi2] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: `2`);
7016
7017	SDLoc SL(Op);
7018	auto ResVT = DAG.GetSplitDestVTs(VT);
7019
7020	SDValue OpLo =
7021	DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.first, N1: Lo0, N2: Lo1, N3: Lo2, Flags: Op ->getFlags());
7022	SDValue OpHi =
7023	DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.second, N1: Hi0, N2: Hi1, N3: Hi2, Flags: Op ->getFlags());
7024
7025	return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc (Op), VT, N1: OpLo, N2: OpHi);
7026	}
7027
7028	SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
7029	switch (Op.getOpcode()) {
7030	default:
7031	return AMDGPUTargetLowering::LowerOperation(Op, DAG);
7032	case ISD::BRCOND:
7033	return LowerBRCOND(Op, DAG);
7034	case ISD::RETURNADDR:
7035	return LowerRETURNADDR(Op, DAG);
7036	case ISD::SPONENTRY:
7037	return LowerSPONENTRY(Op, DAG);
7038	case ISD::LOAD: {
7039	SDValue Result = LowerLOAD(Op, DAG);
7040	assert((!Result.getNode() \|\| Result.getNode()->getNumValues() == `2`) &&
7041	"Load should return a value and a chain");
7042	return Result;
7043	}
7044	case ISD::FSQRT: {
7045	EVT VT = Op.getValueType();
7046	if (VT == MVT::f32)
7047	return lowerFSQRTF32(Op, DAG);
7048	if (VT == MVT::f64)
7049	return lowerFSQRTF64(Op, DAG);
7050	return SDValue ();
7051	}
7052	case ISD::FSIN:
7053	case ISD::FCOS:
7054	return LowerTrig(Op, DAG);
7055	case ISD::SELECT:
7056	return LowerSELECT(Op, DAG);
7057	case ISD::FDIV:
7058	return LowerFDIV(Op, DAG);
7059	case ISD::FFREXP:
7060	return LowerFFREXP(Op, DAG);
7061	case ISD::ATOMIC_CMP_SWAP:
7062	return LowerATOMIC_CMP_SWAP(Op, DAG);
7063	case ISD::STORE:
7064	return LowerSTORE(Op, DAG);
7065	case ISD::GlobalAddress: {
7066	MachineFunction &MF = DAG.getMachineFunction();
7067	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
7068	return LowerGlobalAddress(MFI, Op, DAG);
7069	}
7070	case ISD::ExternalSymbol:
7071	return LowerExternalSymbol(Op, DAG);
7072	case ISD::INTRINSIC_WO_CHAIN:
7073	return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7074	case ISD::INTRINSIC_W_CHAIN:
7075	return LowerINTRINSIC_W_CHAIN(Op, DAG);
7076	case ISD::INTRINSIC_VOID:
7077	return LowerINTRINSIC_VOID(Op, DAG);
7078	case ISD::ADDRSPACECAST:
7079	return lowerADDRSPACECAST(Op, DAG);
7080	case ISD::INSERT_SUBVECTOR:
7081	return lowerINSERT_SUBVECTOR(Op, DAG);
7082	case ISD::INSERT_VECTOR_ELT:
7083	return lowerINSERT_VECTOR_ELT(Op, DAG);
7084	case ISD::EXTRACT_VECTOR_ELT:
7085	return lowerEXTRACT_VECTOR_ELT(Op, DAG);
7086	case ISD::VECTOR_SHUFFLE:
7087	return lowerVECTOR_SHUFFLE(Op, DAG);
7088	case ISD::SCALAR_TO_VECTOR:
7089	return lowerSCALAR_TO_VECTOR(Op, DAG);
7090	case ISD::BUILD_VECTOR:
7091	return lowerBUILD_VECTOR(Op, DAG);
7092	case ISD::FP_ROUND:
7093	case ISD::STRICT_FP_ROUND:
7094	return lowerFP_ROUND(Op, DAG);
7095	case ISD::TRAP:
7096	return lowerTRAP(Op, DAG);
7097	case ISD::DEBUGTRAP:
7098	return lowerDEBUGTRAP(Op, DAG);
7099	case ISD::ABS:
7100	case ISD::FABS:
7101	case ISD::FNEG:
7102	case ISD::FCANONICALIZE:
7103	case ISD::BSWAP:
7104	return splitUnaryVectorOp(Op, DAG);
7105	case ISD::FMINNUM:
7106	case ISD::FMAXNUM:
7107	return lowerFMINNUM_FMAXNUM(Op, DAG);
7108	case ISD::FMINIMUMNUM:
7109	case ISD::FMAXIMUMNUM:
7110	return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
7111	case ISD::FMINIMUM:
7112	case ISD::FMAXIMUM:
7113	return lowerFMINIMUM_FMAXIMUM(Op, DAG);
7114	case ISD::FLDEXP:
7115	case ISD::STRICT_FLDEXP:
7116	return lowerFLDEXP(Op, DAG);
7117	case ISD::FMA:
7118	return splitTernaryVectorOp(Op, DAG);
7119	case ISD::FP_TO_SINT:
7120	case ISD::FP_TO_UINT:
7121	if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&
7122	Op.getValueType() == MVT::i16 &&
7123	Op.getOperand(i: `0`).getValueType() == MVT::f32) {
7124	// Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
7125	return Op;
7126	}
7127	return LowerFP_TO_INT(Op, DAG);
7128	case ISD::SHL:
7129	case ISD::SRA:
7130	case ISD::SRL:
7131	case ISD::ADD:
7132	case ISD::SUB:
7133	case ISD::SMIN:
7134	case ISD::SMAX:
7135	case ISD::UMIN:
7136	case ISD::UMAX:
7137	case ISD::FADD:
7138	case ISD::FMUL:
7139	case ISD::FMINNUM_IEEE:
7140	case ISD::FMAXNUM_IEEE:
7141	case ISD::UADDSAT:
7142	case ISD::USUBSAT:
7143	case ISD::SADDSAT:
7144	case ISD::SSUBSAT:
7145	return splitBinaryVectorOp(Op, DAG);
7146	case ISD::FCOPYSIGN:
7147	return lowerFCOPYSIGN(Op, DAG);
7148	case ISD::MUL:
7149	return lowerMUL(Op, DAG);
7150	case ISD::SMULO:
7151	case ISD::UMULO:
7152	return lowerXMULO(Op, DAG);
7153	case ISD::SMUL_LOHI:
7154	case ISD::UMUL_LOHI:
7155	return lowerXMUL_LOHI(Op, DAG);
7156	case ISD::DYNAMIC_STACKALLOC:
7157	return LowerDYNAMIC_STACKALLOC(Op, DAG);
7158	case ISD::STACKSAVE:
7159	return LowerSTACKSAVE(Op, DAG);
7160	case ISD::GET_ROUNDING:
7161	return lowerGET_ROUNDING(Op, DAG);
7162	case ISD::SET_ROUNDING:
7163	return lowerSET_ROUNDING(Op, DAG);
7164	case ISD::PREFETCH:
7165	return lowerPREFETCH(Op, DAG);
7166	case ISD::FP_EXTEND:
7167	case ISD::STRICT_FP_EXTEND:
7168	return lowerFP_EXTEND(Op, DAG);
7169	case ISD::GET_FPENV:
7170	return lowerGET_FPENV(Op, DAG);
7171	case ISD::SET_FPENV:
7172	return lowerSET_FPENV(Op, DAG);
7173	case ISD::ROTR:
7174	return lowerROTR(Op, DAG);
7175	}
7176	return SDValue ();
7177	}
7178
7179	// Used for D16: Casts the result of an instruction into the right vector,
7180	// packs values if loads return unpacked values.
7181	static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
7182	const SDLoc &DL, SelectionDAG &DAG,
7183	bool Unpacked) {
7184	if (!LoadVT.isVector())
7185	return Result;
7186
7187	// Cast back to the original packed type or to a larger type that is a
7188	// multiple of 32 bit for D16. Widening the return type is a required for
7189	// legalization.
7190	EVT FittingLoadVT = LoadVT;
7191	if ((LoadVT.getVectorNumElements() % `2`) == `1`) {
7192	FittingLoadVT =
7193	EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(),
7194	NumElements: LoadVT.getVectorNumElements() + `1`);
7195	}
7196
7197	if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
7198	// Truncate to v2i16/v4i16.
7199	EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
7200
7201	// Workaround legalizer not scalarizing truncate after vector op
7202	// legalization but not creating intermediate vector trunc.
7203	SmallVector<SDValue, `4`> Elts;
7204	DAG.ExtractVectorElements(Op: Result, Args&: Elts);
7205	for (SDValue &Elt : Elts)
7206	Elt = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Elt);
7207
7208	// Pad illegal v1i16/v3fi6 to v4i16
7209	if ((LoadVT.getVectorNumElements() % `2`) == `1`)
7210	Elts.push_back(Elt: DAG.getPOISON(VT: MVT::i16));
7211
7212	Result = DAG.getBuildVector(VT: IntLoadVT, DL, Ops: Elts);
7213
7214	// Bitcast to original type (v2f16/v4f16).
7215	return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result);
7216	}
7217
7218	// Cast back to the original packed type.
7219	return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result);
7220	}
7221
7222	SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
7223	SelectionDAG &DAG,
7224	ArrayRef<SDValue> Ops,
7225	bool IsIntrinsic) const {
7226	SDLoc DL(M);
7227
7228	bool Unpacked = Subtarget->hasUnpackedD16VMem();
7229	EVT LoadVT = M->getValueType(ResNo: `0`);
7230
7231	EVT EquivLoadVT = LoadVT;
7232	if (LoadVT.isVector()) {
7233	if (Unpacked) {
7234	EquivLoadVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
7235	NumElements: LoadVT.getVectorNumElements());
7236	} else if ((LoadVT.getVectorNumElements() % `2`) == `1`) {
7237	// Widen v3f16 to legal type
7238	EquivLoadVT =
7239	EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(),
7240	NumElements: LoadVT.getVectorNumElements() + `1`);
7241	}
7242	}
7243
7244	// Change from v4f16/v2f16 to EquivLoadVT.
7245	SDVTList VTList = DAG.getVTList(VT1: EquivLoadVT, VT2: MVT::Other);
7246
7247	SDValue Load = DAG.getMemIntrinsicNode(
7248	Opcode: IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, dl: DL, VTList, Ops,
7249	MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
7250
7251	SDValue Adjusted = adjustLoadValueTypeImpl(Result: Load, LoadVT, DL, DAG, Unpacked);
7252
7253	return DAG.getMergeValues(Ops: {Adjusted, Load.getValue(R: `1`)}, dl: DL);
7254	}
7255
7256	SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode M, bool* IsFormat,
7257	SelectionDAG &DAG,
7258	ArrayRef<SDValue> Ops) const {
7259	SDLoc DL(M);
7260	EVT LoadVT = M->getValueType(ResNo: `0`);
7261	EVT EltType = LoadVT.getScalarType();
7262	EVT IntVT = LoadVT.changeTypeToInteger();
7263
7264	bool IsD16 = IsFormat && (EltType.getSizeInBits() == `16`);
7265
7266	assert(M->getNumValues() == `2` \|\| M->getNumValues() == `3`);
7267	bool IsTFE = M->getNumValues() == `3`;
7268
7269	unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7270	: AMDGPUISD::BUFFER_LOAD_FORMAT)
7271	: IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7272	: AMDGPUISD::BUFFER_LOAD;
7273
7274	if (IsD16) {
7275	return adjustLoadValueType(Opcode: AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
7276	}
7277
7278	// Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7279	if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < `32`)
7280	return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, MMO: M->getMemOperand(),
7281	IsTFE);
7282
7283	if (isTypeLegal(VT: LoadVT)) {
7284	return getMemIntrinsicNode(Opcode: Opc, DL, VTList: M->getVTList(), Ops, MemVT: IntVT,
7285	MMO: M->getMemOperand(), DAG);
7286	}
7287
7288	EVT CastVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: LoadVT);
7289	SDVTList VTList = DAG.getVTList(VT1: CastVT, VT2: MVT::Other);
7290	SDValue MemNode = getMemIntrinsicNode(Opcode: Opc, DL, VTList, Ops, MemVT: CastVT,
7291	MMO: M->getMemOperand(), DAG);
7292	return DAG.getMergeValues(
7293	Ops: {DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: MemNode), MemNode.getValue(R: `1`)},
7294	dl: DL);
7295	}
7296
7297	static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
7298	SelectionDAG &DAG) {
7299	EVT VT = N->getValueType(ResNo: `0`);
7300	unsigned CondCode = N->getConstantOperandVal(Num: `3`);
7301	if (!ICmpInst::isIntPredicate(P: static_cast<ICmpInst::Predicate>(CondCode)))
7302	return DAG.getPOISON(VT);
7303
7304	ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7305
7306	SDValue LHS = N->getOperand(Num: `1`);
7307	SDValue RHS = N->getOperand(Num: `2`);
7308
7309	SDLoc DL(N);
7310
7311	EVT CmpVT = LHS.getValueType();
7312	if (CmpVT == MVT::i16 && !TLI.isTypeLegal(VT: MVT::i16)) {
7313	unsigned PromoteOp =
7314	ICmpInst::isSigned(Pred: IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7315	LHS = DAG.getNode(Opcode: PromoteOp, DL, VT: MVT::i32, Operand: LHS);
7316	RHS = DAG.getNode(Opcode: PromoteOp, DL, VT: MVT::i32, Operand: RHS);
7317	}
7318
7319	ISD::CondCode CCOpcode = getICmpCondCode(Pred: IcInput);
7320
7321	unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7322	EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize);
7323
7324	SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL, VT: CCVT, N1: LHS, N2: RHS,
7325	N3: DAG.getCondCode(Cond: CCOpcode));
7326	if (VT.bitsEq(VT: CCVT))
7327	return SetCC;
7328	return DAG.getZExtOrTrunc(Op: SetCC, DL, VT);
7329	}
7330
7331	static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
7332	SelectionDAG &DAG) {
7333	EVT VT = N->getValueType(ResNo: `0`);
7334
7335	unsigned CondCode = N->getConstantOperandVal(Num: `3`);
7336	if (!FCmpInst::isFPPredicate(P: static_cast<FCmpInst::Predicate>(CondCode)))
7337	return DAG.getPOISON(VT);
7338
7339	SDValue Src0 = N->getOperand(Num: `1`);
7340	SDValue Src1 = N->getOperand(Num: `2`);
7341	EVT CmpVT = Src0.getValueType();
7342	SDLoc SL(N);
7343
7344	if (CmpVT == MVT::f16 && !TLI.isTypeLegal(VT: CmpVT)) {
7345	Src0 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src0);
7346	Src1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src1);
7347	}
7348
7349	FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7350	ISD::CondCode CCOpcode = getFCmpCondCode(Pred: IcInput);
7351	unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7352	EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize);
7353	SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT: CCVT, N1: Src0, N2: Src1,
7354	N3: DAG.getCondCode(Cond: CCOpcode));
7355	if (VT.bitsEq(VT: CCVT))
7356	return SetCC;
7357	return DAG.getZExtOrTrunc(Op: SetCC, DL: SL, VT);
7358	}
7359
7360	static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
7361	SelectionDAG &DAG) {
7362	EVT VT = N->getValueType(ResNo: `0`);
7363	SDValue Src = N->getOperand(Num: `1`);
7364	SDLoc SL(N);
7365
7366	if (Src.getOpcode() == ISD::SETCC) {
7367	SDValue Op0 = Src.getOperand(i: `0`);
7368	SDValue Op1 = Src.getOperand(i: `1`);
7369	// Need to expand bfloat to float for comparison (setcc).
7370	if (Op0.getValueType() == MVT::bf16) {
7371	Op0 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Op0);
7372	Op1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Op1);
7373	}
7374	// (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7375	return DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT, N1: Op0, N2: Op1, N3: Src.getOperand(i: `2`));
7376	}
7377	if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Val&: Src)) {
7378	// (ballot 0) -> 0
7379	if (Arg->isZero())
7380	return DAG.getConstant(Val: `0`, DL: SL, VT);
7381
7382	// (ballot 1) -> EXEC/EXEC_LO
7383	if (Arg->isOne()) {
7384	Register Exec;
7385	if (VT.getScalarSizeInBits() == `32`)
7386	Exec = AMDGPU::EXEC_LO;
7387	else if (VT.getScalarSizeInBits() == `64`)
7388	Exec = AMDGPU::EXEC;
7389	else
7390	return SDValue ();
7391
7392	return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: Exec, VT);
7393	}
7394	}
7395
7396	// (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7397	// ISD::SETNE)
7398	return DAG.getNode(
7399	Opcode: AMDGPUISD::SETCC, DL: SL, VT, N1: DAG.getZExtOrTrunc(Op: Src, DL: SL, VT: MVT::i32),
7400	N2: DAG.getConstant(Val: `0`, DL: SL, VT: MVT::i32), N3: DAG.getCondCode(Cond: ISD::SETNE));
7401	}
7402
7403	static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
7404	SelectionDAG &DAG) {
7405	EVT VT = N->getValueType(ResNo: `0`);
7406	unsigned ValSize = VT.getSizeInBits();
7407	unsigned IID = N->getConstantOperandVal(Num: `0`);
7408	bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 \|\|
7409	IID == Intrinsic::amdgcn_permlanex16;
7410	bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive \|\|
7411	IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7412	SDLoc SL(N);
7413	MVT IntVT = MVT::getIntegerVT(BitWidth: ValSize);
7414	const GCNSubtarget *ST = TLI.getSubtarget();
7415	unsigned SplitSize = `32`;
7416	if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % `64` == `0`) &&
7417	ST->hasDPALU_DPP() &&
7418	AMDGPU::isLegalDPALU_DPPControl(ST: *ST, DC: N->getConstantOperandVal(Num: `3`)))
7419	SplitSize = `64`;
7420
7421	auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7422	SDValue Src2, MVT ValT) -> SDValue {
7423	SmallVector<SDValue, `8`> Operands;
7424	switch (IID) {
7425	case Intrinsic::amdgcn_permlane16:
7426	case Intrinsic::amdgcn_permlanex16:
7427	case Intrinsic::amdgcn_update_dpp:
7428	Operands.push_back(Elt: N->getOperand(Num: `6`));
7429	Operands.push_back(Elt: N->getOperand(Num: `5`));
7430	Operands.push_back(Elt: N->getOperand(Num: `4`));
7431	[[fallthrough]];
7432	case Intrinsic::amdgcn_writelane:
7433	Operands.push_back(Elt: Src2);
7434	[[fallthrough]];
7435	case Intrinsic::amdgcn_readlane:
7436	case Intrinsic::amdgcn_set_inactive:
7437	case Intrinsic::amdgcn_set_inactive_chain_arg:
7438	case Intrinsic::amdgcn_mov_dpp8:
7439	Operands.push_back(Elt: Src1);
7440	[[fallthrough]];
7441	case Intrinsic::amdgcn_readfirstlane:
7442	case Intrinsic::amdgcn_permlane64:
7443	Operands.push_back(Elt: Src0);
7444	break;
7445	default:
7446	llvm_unreachable("unhandled lane op");
7447	}
7448
7449	Operands.push_back(Elt: DAG.getTargetConstant(Val: IID, DL: SL, VT: MVT::i32));
7450	std::reverse(first: Operands.begin(), last: Operands.end());
7451
7452	if (SDNode *GL = N->getGluedNode()) {
7453	assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7454	GL = GL->getOperand(Num: `0`).getNode();
7455	Operands.push_back(Elt: DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL: SL, VT: MVT::Glue,
7456	Operand: SDValue (GL, `0`)));
7457	}
7458
7459	return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: ValT, Ops: Operands);
7460	};
7461
7462	SDValue Src0 = N->getOperand(Num: `1`);
7463	SDValue Src1, Src2;
7464	if (IID == Intrinsic::amdgcn_readlane \|\| IID == Intrinsic::amdgcn_writelane \|\|
7465	IID == Intrinsic::amdgcn_mov_dpp8 \|\|
7466	IID == Intrinsic::amdgcn_update_dpp \|\| IsSetInactive \|\| IsPermLane16) {
7467	Src1 = N->getOperand(Num: `2`);
7468	if (IID == Intrinsic::amdgcn_writelane \|\|
7469	IID == Intrinsic::amdgcn_update_dpp \|\| IsPermLane16)
7470	Src2 = N->getOperand(Num: `3`);
7471	}
7472
7473	if (ValSize == SplitSize) {
7474	// Already legal
7475	return SDValue ();
7476	}
7477
7478	if (ValSize < `32`) {
7479	bool IsFloat = VT.isFloatingPoint();
7480	Src0 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src0) : Src0,
7481	DL: SL, VT: MVT::i32);
7482
7483	if (IID == Intrinsic::amdgcn_update_dpp \|\| IsSetInactive \|\| IsPermLane16) {
7484	Src1 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src1) : Src1,
7485	DL: SL, VT: MVT::i32);
7486	}
7487
7488	if (IID == Intrinsic::amdgcn_writelane) {
7489	Src2 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src2) : Src2,
7490	DL: SL, VT: MVT::i32);
7491	}
7492
7493	SDValue LaneOp = createLaneOp (Src0, Src1, Src2, MVT::i32);
7494	SDValue Trunc = DAG.getAnyExtOrTrunc(Op: LaneOp, DL: SL, VT: IntVT);
7495	return IsFloat ? DAG.getBitcast(VT, V: Trunc) : Trunc;
7496	}
7497
7498	if (ValSize % SplitSize != `0`)
7499	return SDValue ();
7500
7501	auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7502	EVT VT = N->getValueType(ResNo: `0`);
7503	unsigned NE = VT.getVectorNumElements();
7504	EVT EltVT = VT.getVectorElementType();
7505	SmallVector<SDValue, `8`> Scalars;
7506	unsigned NumOperands = N->getNumOperands();
7507	SmallVector<SDValue, `4`> Operands(NumOperands);
7508	SDNode *GL = N->getGluedNode();
7509
7510	// only handle convergencectrl_glue
7511	assert(!GL \|\| GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7512
7513	for (unsigned i = `0`; i != NE; ++i) {
7514	for (unsigned j = `0`, e = GL ? NumOperands - `1` : NumOperands; j != e;
7515	++j) {
7516	SDValue Operand = N->getOperand(Num: j);
7517	EVT OperandVT = Operand.getValueType();
7518	if (OperandVT.isVector()) {
7519	// A vector operand; extract a single element.
7520	EVT OperandEltVT = OperandVT.getVectorElementType();
7521	Operands [j] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: OperandEltVT,
7522	N1: Operand, N2: DAG.getVectorIdxConstant(Val: i, DL: SL));
7523	} else {
7524	// A scalar operand; just use it as is.
7525	Operands [j] = Operand;
7526	}
7527	}
7528
7529	if (GL)
7530	Operands [NumOperands - `1`] =
7531	DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL: SL, VT: MVT::Glue,
7532	Operand: SDValue (GL->getOperand(Num: `0`).getNode(), `0`));
7533
7534	Scalars.push_back(Elt: DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: EltVT, Ops: Operands));
7535	}
7536
7537	EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NE);
7538	return DAG.getBuildVector(VT: VecVT, DL: SL, Ops: Scalars);
7539	};
7540
7541	if (VT.isVector()) {
7542	switch (MVT::SimpleValueType EltTy =
7543	VT.getVectorElementType().getSimpleVT().SimpleTy) {
7544	case MVT::i32:
7545	case MVT::f32:
7546	if (SplitSize == `32`) {
7547	SDValue LaneOp = createLaneOp (Src0, Src1, Src2, VT.getSimpleVT());
7548	return unrollLaneOp (LaneOp.getNode());
7549	}
7550	[[fallthrough]];
7551	case MVT::i16:
7552	case MVT::f16:
7553	case MVT::bf16: {
7554	unsigned SubVecNumElt =
7555	SplitSize / VT.getVectorElementType().getSizeInBits();
7556	MVT SubVecVT = MVT::getVectorVT(VT: EltTy, NumElements: SubVecNumElt);
7557	SmallVector<SDValue, `4`> Pieces;
7558	SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7559	for (unsigned i = `0`, EltIdx = `0`; i < ValSize / SplitSize; i++) {
7560	Src0SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src0,
7561	N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
7562
7563	if (IID == Intrinsic::amdgcn_update_dpp \|\| IsSetInactive \|\|
7564	IsPermLane16)
7565	Src1SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src1,
7566	N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
7567
7568	if (IID == Intrinsic::amdgcn_writelane)
7569	Src2SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src2,
7570	N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
7571
7572	Pieces.push_back(
7573	Elt: IID == Intrinsic::amdgcn_update_dpp \|\| IsSetInactive \|\| IsPermLane16
7574	? createLaneOp (Src0SubVec, Src1SubVec, Src2, SubVecVT)
7575	: createLaneOp (Src0SubVec, Src1, Src2SubVec, SubVecVT));
7576	EltIdx += SubVecNumElt;
7577	}
7578	return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT, Ops: Pieces);
7579	}
7580	default:
7581	// Handle all other cases by bitcasting to i32 vectors
7582	break;
7583	}
7584	}
7585
7586	MVT VecVT =
7587	MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: SplitSize), NumElements: ValSize / SplitSize);
7588	Src0 = DAG.getBitcast(VT: VecVT, V: Src0);
7589
7590	if (IID == Intrinsic::amdgcn_update_dpp \|\| IsSetInactive \|\| IsPermLane16)
7591	Src1 = DAG.getBitcast(VT: VecVT, V: Src1);
7592
7593	if (IID == Intrinsic::amdgcn_writelane)
7594	Src2 = DAG.getBitcast(VT: VecVT, V: Src2);
7595
7596	SDValue LaneOp = createLaneOp (Src0, Src1, Src2, VecVT);
7597	SDValue UnrolledLaneOp = unrollLaneOp (LaneOp.getNode());
7598	return DAG.getBitcast(VT, V: UnrolledLaneOp);
7599	}
7600
7601	static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N,
7602	SelectionDAG &DAG) {
7603	EVT VT = N->getValueType(ResNo: `0`);
7604
7605	if (VT.getSizeInBits() != `32`)
7606	return SDValue ();
7607
7608	SDLoc SL(N);
7609
7610	SDValue Value = N->getOperand(Num: `1`);
7611	SDValue Index = N->getOperand(Num: `2`);
7612
7613	// ds_bpermute requires index to be multiplied by 4
7614	SDValue ShiftAmount = DAG.getShiftAmountConstant(Val: `2`, VT: MVT::i32, DL: SL);
7615	SDValue ShiftedIndex =
7616	DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: Index.getValueType(), N1: Index, N2: ShiftAmount);
7617
7618	// Intrinsics will require i32 to operate on
7619	SDValue ValueI32 = DAG.getBitcast(VT: MVT::i32, V: Value);
7620
7621	auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
7622	SmallVector<SDValue> IntrinArgs) -> SDValue {
7623	SmallVector<SDValue> Operands(`1`);
7624	Operands [`0`] = DAG.getTargetConstant(Val: IID, DL: SL, VT: MVT::i32);
7625	Operands.append(RHS: IntrinArgs);
7626	return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: RetVT, Ops: Operands);
7627	};
7628
7629	// If we can bpermute across the whole wave, then just do that
7630	if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
7631	SDValue BPermute = MakeIntrinsic (Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7632	{ShiftedIndex, ValueI32});
7633	return DAG.getBitcast(VT, V: BPermute);
7634	}
7635
7636	assert(TLI.getSubtarget()->isWave64());
7637
7638	// Otherwise, we need to make use of whole wave mode
7639	SDValue PoisonVal = DAG.getPOISON(VT: ValueI32 ->getValueType(ResNo: `0`));
7640
7641	// Set inactive lanes to poison
7642	SDValue WWMValue = MakeIntrinsic (Intrinsic::amdgcn_set_inactive, MVT::i32,
7643	{ValueI32, PoisonVal});
7644	SDValue WWMIndex = MakeIntrinsic (Intrinsic::amdgcn_set_inactive, MVT::i32,
7645	{ShiftedIndex, PoisonVal});
7646
7647	SDValue Swapped =
7648	MakeIntrinsic (Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
7649
7650	// Get permutation of each half, then we'll select which one to use
7651	SDValue BPermSameHalf = MakeIntrinsic (Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7652	{WWMIndex, WWMValue});
7653	SDValue BPermOtherHalf = MakeIntrinsic (Intrinsic::amdgcn_ds_bpermute,
7654	MVT::i32, {WWMIndex, Swapped});
7655	SDValue BPermOtherHalfWWM =
7656	MakeIntrinsic (Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
7657
7658	// Select which side to take the permute from
7659	SDValue ThreadIDMask = DAG.getAllOnesConstant(DL: SL, VT: MVT::i32);
7660	// We can get away with only using mbcnt_lo here since we're only
7661	// trying to detect which side of 32 each lane is on, and mbcnt_lo
7662	// returns 32 for lanes 32-63.
7663	SDValue ThreadID =
7664	MakeIntrinsic (Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
7665	{ThreadIDMask, DAG.getTargetConstant(Val: `0`, DL: SL, VT: MVT::i32)});
7666
7667	SDValue SameOrOtherHalf =
7668	DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32,
7669	N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: ThreadID, N2: Index),
7670	N2: DAG.getTargetConstant(Val: `32`, DL: SL, VT: MVT::i32));
7671	SDValue UseSameHalf =
7672	DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: SameOrOtherHalf,
7673	RHS: DAG.getConstant(Val: `0`, DL: SL, VT: MVT::i32), Cond: ISD::SETEQ);
7674	SDValue Result = DAG.getSelect(DL: SL, VT: MVT::i32, Cond: UseSameHalf, LHS: BPermSameHalf,
7675	RHS: BPermOtherHalfWWM);
7676	return DAG.getBitcast(VT, V: Result);
7677	}
7678
7679	void SITargetLowering::ReplaceNodeResults(SDNode *N,
7680	SmallVectorImpl<SDValue> &Results,
7681	SelectionDAG &DAG) const {
7682	switch (N->getOpcode()) {
7683	case ISD::INSERT_VECTOR_ELT: {
7684	if (SDValue Res = lowerINSERT_VECTOR_ELT(Op: SDValue (N, `0`), DAG))
7685	Results.push_back(Elt: Res);
7686	return;
7687	}
7688	case ISD::EXTRACT_VECTOR_ELT: {
7689	if (SDValue Res = lowerEXTRACT_VECTOR_ELT(Op: SDValue (N, `0`), DAG))
7690	Results.push_back(Elt: Res);
7691	return;
7692	}
7693	case ISD::INTRINSIC_WO_CHAIN: {
7694	unsigned IID = N->getConstantOperandVal(Num: `0`);
7695	switch (IID) {
7696	case Intrinsic::amdgcn_make_buffer_rsrc:
7697	Results.push_back(Elt: lowerPointerAsRsrcIntrin(Op: N, DAG));
7698	return;
7699	case Intrinsic::amdgcn_cvt_pkrtz: {
7700	SDValue Src0 = N->getOperand(Num: `1`);
7701	SDValue Src1 = N->getOperand(Num: `2`);
7702	SDLoc SL(N);
7703	SDValue Cvt =
7704	DAG.getNode(Opcode: AMDGPUISD::CVT_PKRTZ_F16_F32, DL: SL, VT: MVT::i32, N1: Src0, N2: Src1);
7705	Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Cvt));
7706	return;
7707	}
7708	case Intrinsic::amdgcn_cvt_pknorm_i16:
7709	case Intrinsic::amdgcn_cvt_pknorm_u16:
7710	case Intrinsic::amdgcn_cvt_pk_i16:
7711	case Intrinsic::amdgcn_cvt_pk_u16: {
7712	SDValue Src0 = N->getOperand(Num: `1`);
7713	SDValue Src1 = N->getOperand(Num: `2`);
7714	SDLoc SL(N);
7715	unsigned Opcode;
7716
7717	if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7718	Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7719	else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7720	Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7721	else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7722	Opcode = AMDGPUISD::CVT_PK_I16_I32;
7723	else
7724	Opcode = AMDGPUISD::CVT_PK_U16_U32;
7725
7726	EVT VT = N->getValueType(ResNo: `0`);
7727	if (isTypeLegal(VT))
7728	Results.push_back(Elt: DAG.getNode(Opcode, DL: SL, VT, N1: Src0, N2: Src1));
7729	else {
7730	SDValue Cvt = DAG.getNode(Opcode, DL: SL, VT: MVT::i32, N1: Src0, N2: Src1);
7731	Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: Cvt));
7732	}
7733	return;
7734	}
7735	case Intrinsic::amdgcn_s_buffer_load: {
7736	// Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7737	// s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
7738	// combiner tries to merge the s_buffer_load_u8 with a sext instruction
7739	// (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
7740	// s_buffer_load_i8.
7741	if (!Subtarget->hasScalarSubwordLoads())
7742	return;
7743	SDValue Op = SDValue (N, `0`);
7744	SDValue Rsrc = Op.getOperand(i: `1`);
7745	SDValue Offset = Op.getOperand(i: `2`);
7746	SDValue CachePolicy = Op.getOperand(i: `3`);
7747	EVT VT = Op.getValueType();
7748	assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7749	SDLoc DL(Op);
7750	MachineFunction &MF = DAG.getMachineFunction();
7751	const DataLayout &DataLayout = DAG.getDataLayout();
7752	Align Alignment =
7753	DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext()));
7754	MachineMemOperand *MMO = MF.getMachineMemOperand(
7755	PtrInfo: MachinePointerInfo (),
7756	F: MachineMemOperand::MOLoad \| MachineMemOperand::MODereferenceable \|
7757	MachineMemOperand::MOInvariant,
7758	Size: VT.getStoreSize(), BaseAlignment: Alignment);
7759	SDValue LoadVal;
7760	if (!Offset ->isDivergent()) {
7761	SDValue Ops[] = {Rsrc, // source register
7762	Offset, CachePolicy};
7763	SDValue BufferLoad =
7764	DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD_UBYTE, dl: DL,
7765	VTList: DAG.getVTList(VT: MVT::i32), Ops, MemVT: VT, MMO);
7766	LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
7767	} else {
7768	SDValue Ops[] = {
7769	DAG.getEntryNode(), // Chain
7770	Rsrc, // rsrc
7771	DAG.getConstant(Val: `0`, DL, VT: MVT::i32), // vindex
7772	{}, // voffset
7773	{}, // soffset
7774	{}, // offset
7775	CachePolicy, // cachepolicy
7776	DAG.getTargetConstant(Val: `0`, DL, VT: MVT::i1), // idxen
7777	};
7778	setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[`3`], Alignment: Align (`4`));
7779	LoadVal = handleByteShortBufferLoads(DAG, LoadVT: VT, DL, Ops, MMO);
7780	}
7781	Results.push_back(Elt: LoadVal);
7782	return;
7783	}
7784	case Intrinsic::amdgcn_dead: {
7785	for (unsigned I = `0`, E = N->getNumValues(); I < E; ++I)
7786	Results.push_back(Elt: DAG.getPOISON(VT: N->getValueType(ResNo: I)));
7787	return;
7788	}
7789	}
7790	break;
7791	}
7792	case ISD::INTRINSIC_W_CHAIN: {
7793	if (SDValue Res = LowerINTRINSIC_W_CHAIN(Op: SDValue (N, `0`), DAG)) {
7794	if (Res.getOpcode() == ISD::MERGE_VALUES) {
7795	// FIXME: Hacky
7796	for (unsigned I = `0`; I < Res.getNumOperands(); I++) {
7797	Results.push_back(Elt: Res.getOperand(i: I));
7798	}
7799	} else {
7800	Results.push_back(Elt: Res);
7801	Results.push_back(Elt: Res.getValue(R: `1`));
7802	}
7803	return;
7804	}
7805
7806	break;
7807	}
7808	case ISD::SELECT: {
7809	SDLoc SL(N);
7810	EVT VT = N->getValueType(ResNo: `0`);
7811	EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT);
7812	SDValue LHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: `1`));
7813	SDValue RHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: `2`));
7814
7815	EVT SelectVT = NewVT;
7816	if (NewVT.bitsLT(VT: MVT::i32)) {
7817	LHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: LHS);
7818	RHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: RHS);
7819	SelectVT = MVT::i32;
7820	}
7821
7822	SDValue NewSelect =
7823	DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: SelectVT, N1: N->getOperand(Num: `0`), N2: LHS, N3: RHS);
7824
7825	if (NewVT != SelectVT)
7826	NewSelect = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: NewVT, Operand: NewSelect);
7827	Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewSelect));
7828	return;
7829	}
7830	case ISD::FNEG: {
7831	if (N->getValueType(ResNo: `0`) != MVT::v2f16)
7832	break;
7833
7834	SDLoc SL(N);
7835	SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: N->getOperand(Num: `0`));
7836
7837	SDValue Op = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: BC,
7838	N2: DAG.getConstant(Val: `0x80008000`, DL: SL, VT: MVT::i32));
7839	Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Op));
7840	return;
7841	}
7842	case ISD::FABS: {
7843	if (N->getValueType(ResNo: `0`) != MVT::v2f16)
7844	break;
7845
7846	SDLoc SL(N);
7847	SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: N->getOperand(Num: `0`));
7848
7849	SDValue Op = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: BC,
7850	N2: DAG.getConstant(Val: `0x7fff7fff`, DL: SL, VT: MVT::i32));
7851	Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Op));
7852	return;
7853	}
7854	case ISD::FSQRT: {
7855	if (N->getValueType(ResNo: `0`) != MVT::f16)
7856	break;
7857	Results.push_back(Elt: lowerFSQRTF16(Op: SDValue (N, `0`), DAG));
7858	break;
7859	}
7860	default:
7861	AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
7862	break;
7863	}
7864	}
7865
7866	/// Helper function for LowerBRCOND
7867	static SDNode findUser(SDValue Value, unsigned* Opcode) {
7868
7869	for (SDUse &U : Value ->uses()) {
7870	if (U.get() != Value)
7871	continue;
7872
7873	if (U.getUser()->getOpcode() == Opcode)
7874	return U.getUser();
7875	}
7876	return nullptr;
7877	}
7878
7879	unsigned SITargetLowering::isCFIntrinsic(const SDNode Intr) const* {
7880	if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7881	switch (Intr->getConstantOperandVal(Num: `1`)) {
7882	case Intrinsic::amdgcn_if:
7883	return AMDGPUISD::IF;
7884	case Intrinsic::amdgcn_else:
7885	return AMDGPUISD::ELSE;
7886	case Intrinsic::amdgcn_loop:
7887	return AMDGPUISD::LOOP;
7888	case Intrinsic::amdgcn_end_cf:
7889	llvm_unreachable("should not occur");
7890	default:
7891	return `0`;
7892	}
7893	}
7894
7895	// break, if_break, else_break are all only used as inputs to loop, not
7896	// directly as branch conditions.
7897	return `0`;
7898	}
7899
7900	bool SITargetLowering::shouldEmitFixup(const GlobalValue GV) const* {
7901	const Triple &TT = getTargetMachine().getTargetTriple();
7902	return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS \|\|
7903	GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7904	AMDGPU::shouldEmitConstantsToTextSection(TT);
7905	}
7906
7907	bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue GV) const* {
7908	if (Subtarget->isAmdPalOS() \|\| Subtarget->isMesa3DOS())
7909	return false;
7910
7911	// FIXME: Either avoid relying on address space here or change the default
7912	// address space for functions to avoid the explicit check.
7913	return (GV->getValueType()->isFunctionTy() \|\|
7914	!isNonGlobalAddrSpace(AS: GV->getAddressSpace())) &&
7915	!shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
7916	}
7917
7918	bool SITargetLowering::shouldEmitPCReloc(const GlobalValue GV) const* {
7919	return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7920	}
7921
7922	bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue GV) const* {
7923	if (!GV->hasExternalLinkage())
7924	return true;
7925
7926	const auto OS = getTargetMachine().getTargetTriple().getOS();
7927	return OS == Triple::AMDHSA \|\| OS == Triple::AMDPAL;
7928	}
7929
7930	/// This transforms the control flow intrinsics to get the branch destination as
7931	/// last parameter, also switches branch target with BR if the need arise
7932	SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7933	SDLoc DL(BRCOND);
7934
7935	SDNode *Intr = BRCOND.getOperand(i: `1`).getNode();
7936	SDValue Target = BRCOND.getOperand(i: `2`);
7937	SDNode BR = nullptr*;
7938	SDNode SetCC = nullptr*;
7939
7940	switch (Intr->getOpcode()) {
7941	case ISD::SETCC: {
7942	// As long as we negate the condition everything is fine
7943	SetCC = Intr;
7944	Intr = SetCC->getOperand(Num: `0`).getNode();
7945	break;
7946	}
7947	case ISD::XOR: {
7948	// Similar to SETCC, if we have (xor c, -1), we will be fine.
7949	SDValue LHS = Intr->getOperand(Num: `0`);
7950	SDValue RHS = Intr->getOperand(Num: `1`);
7951	if (auto *C = dyn_cast<ConstantSDNode>(Val&: RHS); C && C->getZExtValue()) {
7952	Intr = LHS.getNode();
7953	break;
7954	}
7955	[[fallthrough]];
7956	}
7957	default: {
7958	// Get the target from BR if we don't negate the condition
7959	BR = findUser(Value: BRCOND, Opcode: ISD::BR);
7960	assert(BR && "brcond missing unconditional branch user");
7961	Target = BR->getOperand(Num: `1`);
7962	}
7963	}
7964
7965	unsigned CFNode = isCFIntrinsic(Intr);
7966	if (CFNode == `0`) {
7967	// This is a uniform branch so we don't need to legalize.
7968	return BRCOND;
7969	}
7970
7971	bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID \|\|
7972	Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
7973
7974	assert(!SetCC \|\|
7975	(SetCC->getConstantOperandVal(`1`) == `1` &&
7976	cast<CondCodeSDNode>(SetCC->getOperand(`2`).getNode())->get() ==
7977	ISD::SETNE));
7978
7979	// operands of the new intrinsic call
7980	SmallVector<SDValue, `4`> Ops;
7981	if (HaveChain)
7982	Ops.push_back(Elt: BRCOND.getOperand(i: `0`));
7983
7984	Ops.append(in_start: Intr->op_begin() + (HaveChain ? `2` : `1`), in_end: Intr->op_end());
7985	Ops.push_back(Elt: Target);
7986
7987	ArrayRef<EVT> Res(Intr->value_begin() + `1`, Intr->value_end());
7988
7989	// build the new intrinsic call
7990	SDNode *Result = DAG.getNode(Opcode: CFNode, DL, VTList: DAG.getVTList(VTs: Res), Ops).getNode();
7991
7992	if (!HaveChain) {
7993	SDValue Ops[] = {SDValue (Result, `0`), BRCOND.getOperand(i: `0`)};
7994
7995	Result = DAG.getMergeValues(Ops, dl: DL).getNode();
7996	}
7997
7998	if (BR) {
7999	// Give the branch instruction our target
8000	SDValue Ops[] = {BR->getOperand(Num: `0`), BRCOND.getOperand(i: `2`)};
8001	SDValue NewBR = DAG.getNode(Opcode: ISD::BR, DL, VTList: BR->getVTList(), Ops);
8002	DAG.ReplaceAllUsesWith(From: BR, To: NewBR.getNode());
8003	}
8004
8005	SDValue Chain = SDValue (Result, Result->getNumValues() - `1`);
8006
8007	// Copy the intrinsic results to registers
8008	for (unsigned i = `1`, e = Intr->getNumValues() - `1`; i != e; ++i) {
8009	SDNode *CopyToReg = findUser(Value: SDValue (Intr, i), Opcode: ISD::CopyToReg);
8010	if (!CopyToReg)
8011	continue;
8012
8013	Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: CopyToReg->getOperand(Num: `1`),
8014	N: SDValue (Result, i - `1`), Glue: SDValue ());
8015
8016	DAG.ReplaceAllUsesWith(From: SDValue (CopyToReg, `0`), To: CopyToReg->getOperand(Num: `0`));
8017	}
8018
8019	// Remove the old intrinsic from the chain
8020	DAG.ReplaceAllUsesOfValueWith(From: SDValue (Intr, Intr->getNumValues() - `1`),
8021	To: Intr->getOperand(Num: `0`));
8022
8023	return Chain;
8024	}
8025
8026	SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
8027	MVT VT = Op.getSimpleValueType();
8028	SDLoc DL(Op);
8029	// Checking the depth
8030	if (Op.getConstantOperandVal(i: `0`) != `0`)
8031	return DAG.getConstant(Val: `0`, DL, VT);
8032
8033	MachineFunction &MF = DAG.getMachineFunction();
8034	const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8035	// Check for kernel and shader functions
8036	if (Info->isEntryFunction())
8037	return DAG.getConstant(Val: `0`, DL, VT);
8038
8039	MachineFrameInfo &MFI = MF.getFrameInfo();
8040	// There is a call to @llvm.returnaddress in this function
8041	MFI.setReturnAddressIsTaken(true);
8042
8043	const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
8044	// Get the return address reg and mark it as an implicit live-in
8045	Register Reg = MF.addLiveIn(PReg: TRI->getReturnAddressReg(MF),
8046	RC: getRegClassFor(VT, isDivergent: Op.getNode()->isDivergent()));
8047
8048	return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg, VT);
8049	}
8050
8051	SDValue SITargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
8052	MachineFunction &MF = DAG.getMachineFunction();
8053	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
8054
8055	// For functions that set up their own stack, select the GET_STACK_BASE
8056	// pseudo.
8057	if (MFI->isBottomOfStack())
8058	return Op;
8059
8060	// For everything else, create a dummy stack object.
8061	int FI = MF.getFrameInfo().CreateFixedObject(Size: `1`, SPOffset: `0`, /IsImmutable=/false);
8062	return DAG.getFrameIndex(FI, VT: Op.getValueType());
8063	}
8064
8065	SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
8066	const SDLoc &DL, EVT VT) const {
8067	return Op.getValueType().bitsLE(VT)
8068	? DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT, Operand: Op)
8069	: DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Op,
8070	N2: DAG.getTargetConstant(Val: `0`, DL, VT: MVT::i32));
8071	}
8072
8073	SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
8074	SelectionDAG &DAG) const {
8075	EVT DstVT = Op.getValueType();
8076	unsigned NumElts = DstVT.getVectorNumElements();
8077	assert(NumElts > `2` && isPowerOf2_32(NumElts));
8078
8079	auto [Lo, Hi] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: `0`);
8080
8081	SDLoc DL(Op);
8082	unsigned Opc = Op.getOpcode();
8083	SDValue Flags = Op.getOperand(i: `1`);
8084	EVT HalfDstVT =
8085	EVT::getVectorVT(Context&: *DAG.getContext(), VT: DstVT.getScalarType(), NumElements: NumElts / `2`);
8086	SDValue OpLo = DAG.getNode(Opcode: Opc, DL, VT: HalfDstVT, N1: Lo, N2: Flags);
8087	SDValue OpHi = DAG.getNode(Opcode: Opc, DL, VT: HalfDstVT, N1: Hi, N2: Flags);
8088
8089	return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: DstVT, N1: OpLo, N2: OpHi);
8090	}
8091
8092	SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
8093	SDValue Src = Op.getOperand(i: `0`);
8094	EVT SrcVT = Src.getValueType();
8095	EVT DstVT = Op.getValueType();
8096
8097	if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
8098	assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
8099	if (SrcVT.getScalarType() != MVT::f32)
8100	return SDValue ();
8101	return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
8102	}
8103
8104	if (SrcVT.getScalarType() != MVT::f64)
8105	return Op;
8106
8107	SDLoc DL(Op);
8108	if (DstVT == MVT::f16) {
8109	// TODO: Handle strictfp
8110	if (Op.getOpcode() != ISD::FP_ROUND)
8111	return Op;
8112
8113	if (!Subtarget->has16BitInsts()) {
8114	SDValue FpToFp16 = DAG.getNode(Opcode: ISD::FP_TO_FP16, DL, VT: MVT::i32, Operand: Src);
8115	SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToFp16);
8116	return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f16, Operand: Trunc);
8117	}
8118	if (Op ->getFlags().hasApproximateFuncs()) {
8119	SDValue Flags = Op.getOperand(i: `1`);
8120	SDValue Src32 = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f32, N1: Src, N2: Flags);
8121	return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: Src32, N2: Flags);
8122	}
8123	SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
8124	SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToFp16);
8125	return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f16, Operand: Trunc);
8126	}
8127
8128	assert(DstVT.getScalarType() == MVT::bf16 &&
8129	"custom lower FP_ROUND for f16 or bf16");
8130	assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
8131
8132	// Round-inexact-to-odd f64 to f32, then do the final rounding using the
8133	// hardware f32 -> bf16 instruction.
8134	EVT F32VT = SrcVT.changeElementType(Context&: *DAG.getContext(), EltVT: MVT::f32);
8135	SDValue Rod = expandRoundInexactToOdd(ResultVT: F32VT, Op: Src, DL, DAG);
8136	return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: DstVT, N1: Rod,
8137	N2: DAG.getTargetConstant(Val: `0`, DL, VT: MVT::i32));
8138	}
8139
8140	SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
8141	SelectionDAG &DAG) const {
8142	EVT VT = Op.getValueType();
8143	const MachineFunction &MF = DAG.getMachineFunction();
8144	const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8145	bool IsIEEEMode = Info->getMode().IEEE;
8146
8147	// FIXME: Assert during selection that this is only selected for
8148	// ieee_mode. Currently a combine can produce the ieee version for non-ieee
8149	// mode functions, but this happens to be OK since it's only done in cases
8150	// where there is known no sNaN.
8151	if (IsIEEEMode)
8152	return expandFMINNUM_FMAXNUM(N: Op.getNode(), DAG);
8153
8154	if (VT == MVT::v4f16 \|\| VT == MVT::v8f16 \|\| VT == MVT::v16f16 \|\|
8155	VT == MVT::v16bf16)
8156	return splitBinaryVectorOp(Op, DAG);
8157	return Op;
8158	}
8159
8160	SDValue
8161	SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
8162	SelectionDAG &DAG) const {
8163	EVT VT = Op.getValueType();
8164	const MachineFunction &MF = DAG.getMachineFunction();
8165	const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8166	bool IsIEEEMode = Info->getMode().IEEE;
8167
8168	if (IsIEEEMode)
8169	return expandFMINIMUMNUM_FMAXIMUMNUM(N: Op.getNode(), DAG);
8170
8171	if (VT == MVT::v4f16 \|\| VT == MVT::v8f16 \|\| VT == MVT::v16f16 \|\|
8172	VT == MVT::v16bf16)
8173	return splitBinaryVectorOp(Op, DAG);
8174	return Op;
8175	}
8176
8177	SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
8178	SelectionDAG &DAG) const {
8179	EVT VT = Op.getValueType();
8180	if (VT.isVector())
8181	return splitBinaryVectorOp(Op, DAG);
8182
8183	assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
8184	!Subtarget->hasMinimum3Maximum3F16() &&
8185	Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
8186	"should not need to widen f16 minimum/maximum to v2f16");
8187
8188	// Widen f16 operation to v2f16
8189
8190	// fminimum f16:x, f16:y ->
8191	// extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
8192	// (v2f16 (scalar_to_vector y))), 0
8193	SDLoc SL(Op);
8194	SDValue WideSrc0 =
8195	DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: SL, VT: MVT::v2f16, Operand: Op.getOperand(i: `0`));
8196	SDValue WideSrc1 =
8197	DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: SL, VT: MVT::v2f16, Operand: Op.getOperand(i: `1`));
8198
8199	SDValue Widened =
8200	DAG.getNode(Opcode: Op.getOpcode(), DL: SL, VT: MVT::v2f16, N1: WideSrc0, N2: WideSrc1);
8201
8202	return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::f16, N1: Widened,
8203	N2: DAG.getConstant(Val: `0`, DL: SL, VT: MVT::i32));
8204	}
8205
8206	SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
8207	bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
8208	EVT VT = Op.getValueType();
8209	assert(VT == MVT::f16);
8210
8211	SDValue Exp = Op.getOperand(i: IsStrict ? `2` : `1`);
8212	EVT ExpVT = Exp.getValueType();
8213	if (ExpVT == MVT::i16)
8214	return Op;
8215
8216	SDLoc DL(Op);
8217
8218	// Correct the exponent type for f16 to i16.
8219	// Clamp the range of the exponent to the instruction's range.
8220
8221	// TODO: This should be a generic narrowing legalization, and can easily be
8222	// for GlobalISel.
8223
8224	SDValue MinExp = DAG.getSignedConstant(Val: minIntN(N: `16`), DL, VT: ExpVT);
8225	SDValue ClampMin = DAG.getNode(Opcode: ISD::SMAX, DL, VT: ExpVT, N1: Exp, N2: MinExp);
8226
8227	SDValue MaxExp = DAG.getSignedConstant(Val: maxIntN(N: `16`), DL, VT: ExpVT);
8228	SDValue Clamp = DAG.getNode(Opcode: ISD::SMIN, DL, VT: ExpVT, N1: ClampMin, N2: MaxExp);
8229
8230	SDValue TruncExp = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Clamp);
8231
8232	if (IsStrict) {
8233	return DAG.getNode(Opcode: ISD::STRICT_FLDEXP, DL, ResultTys: {VT, MVT::Other},
8234	Ops: {Op.getOperand(i: `0`), Op.getOperand(i: `1`), TruncExp});
8235	}
8236
8237	return DAG.getNode(Opcode: ISD::FLDEXP, DL, VT, N1: Op.getOperand(i: `0`), N2: TruncExp);
8238	}
8239
8240	static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
8241	switch (Op ->getOpcode()) {
8242	case ISD::SRA:
8243	case ISD::SMIN:
8244	case ISD::SMAX:
8245	return ISD::SIGN_EXTEND;
8246	case ISD::SRL:
8247	case ISD::UMIN:
8248	case ISD::UMAX:
8249	return ISD::ZERO_EXTEND;
8250	case ISD::ADD:
8251	case ISD::SUB:
8252	case ISD::AND:
8253	case ISD::OR:
8254	case ISD::XOR:
8255	case ISD::SHL:
8256	case ISD::SELECT:
8257	case ISD::MUL:
8258	// operation result won't be influenced by garbage high bits.
8259	// TODO: are all of those cases correct, and are there more?
8260	return ISD::ANY_EXTEND;
8261	case ISD::SETCC: {
8262	ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: `2`))->get();
8263	return ISD::isSignedIntSetCC(Code: CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
8264	}
8265	default:
8266	llvm_unreachable("unexpected opcode!");
8267	}
8268	}
8269
8270	SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
8271	DAGCombinerInfo &DCI) const {
8272	const unsigned Opc = Op.getOpcode();
8273	assert(Opc == ISD::ADD \|\| Opc == ISD::SUB \|\| Opc == ISD::SHL \|\|
8274	Opc == ISD::SRL \|\| Opc == ISD::SRA \|\| Opc == ISD::AND \|\|
8275	Opc == ISD::OR \|\| Opc == ISD::XOR \|\| Opc == ISD::MUL \|\|
8276	Opc == ISD::SETCC \|\| Opc == ISD::SELECT \|\| Opc == ISD::SMIN \|\|
8277	Opc == ISD::SMAX \|\| Opc == ISD::UMIN \|\| Opc == ISD::UMAX);
8278
8279	EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
8280	: Op ->getOperand(Num: `0`).getValueType();
8281	auto &DAG = DCI.DAG;
8282	auto ExtTy = OpTy.changeElementType(Context&: *DAG.getContext(), EltVT: MVT::i32);
8283
8284	if (DCI.isBeforeLegalizeOps() \|\|
8285	isNarrowingProfitable(N: Op.getNode(), SrcVT: ExtTy, DestVT: OpTy))
8286	return SDValue ();
8287
8288	SDLoc DL(Op);
8289	SDValue LHS;
8290	SDValue RHS;
8291	if (Opc == ISD::SELECT) {
8292	LHS = Op ->getOperand(Num: `1`);
8293	RHS = Op ->getOperand(Num: `2`);
8294	} else {
8295	LHS = Op ->getOperand(Num: `0`);
8296	RHS = Op ->getOperand(Num: `1`);
8297	}
8298
8299	const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
8300	LHS = DAG.getNode(Opcode: ExtOp, DL, VT: ExtTy, Operand: {LHS});
8301
8302	// Special case: for shifts, the RHS always needs a zext.
8303	if (Opc == ISD::SHL \|\| Opc == ISD::SRL \|\| Opc == ISD::SRA)
8304	RHS = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: ExtTy, Operand: {RHS});
8305	else
8306	RHS = DAG.getNode(Opcode: ExtOp, DL, VT: ExtTy, Operand: {RHS});
8307
8308	// setcc always return i1/i1 vec so no need to truncate after.
8309	if (Opc == ISD::SETCC) {
8310	ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: `2`))->get();
8311	return DAG.getSetCC(DL, VT: Op.getValueType(), LHS, RHS, Cond: CC);
8312	}
8313
8314	// For other ops, we extend the operation's return type as well so we need to
8315	// truncate back to the original type.
8316	SDValue NewVal;
8317	if (Opc == ISD::SELECT)
8318	NewVal = DAG.getNode(Opcode: ISD::SELECT, DL, VT: ExtTy, Ops: {Op ->getOperand(Num: `0`), LHS, RHS});
8319	else
8320	NewVal = DAG.getNode(Opcode: Opc, DL, VT: ExtTy, Ops: {LHS, RHS});
8321
8322	return DAG.getZExtOrTrunc(Op: NewVal, DL, VT: OpTy);
8323	}
8324
8325	SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8326	SDValue Mag = Op.getOperand(i: `0`);
8327	EVT MagVT = Mag.getValueType();
8328
8329	if (MagVT.getVectorNumElements() > `2`)
8330	return splitBinaryVectorOp(Op, DAG);
8331
8332	SDValue Sign = Op.getOperand(i: `1`);
8333	EVT SignVT = Sign.getValueType();
8334
8335	if (MagVT == SignVT)
8336	return Op;
8337
8338	// fcopysign v2f16:mag, v2f32:sign ->
8339	// fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
8340
8341	SDLoc SL(Op);
8342	SDValue SignAsInt32 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Sign);
8343	SDValue SignAsInt16 = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::v2i16, Operand: SignAsInt32);
8344
8345	SDValue SignAsHalf16 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MagVT, Operand: SignAsInt16);
8346
8347	return DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT: MagVT, N1: Mag, N2: SignAsHalf16);
8348	}
8349
8350	// Custom lowering for vector multiplications and s_mul_u64.
8351	SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
8352	EVT VT = Op.getValueType();
8353
8354	// Split vector operands.
8355	if (VT.isVector())
8356	return splitBinaryVectorOp(Op, DAG);
8357
8358	assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
8359
8360	// There are four ways to lower s_mul_u64:
8361	//
8362	// 1. If all the operands are uniform, then we lower it as it is.
8363	//
8364	// 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
8365	// multiplications because there is not a vector equivalent of s_mul_u64.
8366	//
8367	// 3. If the cost model decides that it is more efficient to use vector
8368	// registers, then we have to split s_mul_u64 in 32-bit multiplications.
8369	// This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
8370	//
8371	// 4. If the cost model decides to use vector registers and both of the
8372	// operands are zero-extended/sign-extended from 32-bits, then we split the
8373	// s_mul_u64 in two 32-bit multiplications. The problem is that it is not
8374	// possible to check if the operands are zero-extended or sign-extended in
8375	// SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
8376	// s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
8377	// s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
8378	// If the cost model decides that we have to use vector registers, then
8379	// splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
8380	// s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
8381	// decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
8382	// s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
8383	// SIInstrInfo.cpp .
8384
8385	if (Op ->isDivergent())
8386	return SDValue ();
8387
8388	SDValue Op0 = Op.getOperand(i: `0`);
8389	SDValue Op1 = Op.getOperand(i: `1`);
8390	// If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64
8391	// with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
8392	// 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
8393	KnownBits Op0KnownBits = DAG.computeKnownBits(Op: Op0);
8394	unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
8395	KnownBits Op1KnownBits = DAG.computeKnownBits(Op: Op1);
8396	unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
8397	SDLoc SL(Op);
8398	if (Op0LeadingZeros >= `32` && Op1LeadingZeros >= `32`)
8399	return SDValue (
8400	DAG.getMachineNode(Opcode: AMDGPU::S_MUL_U64_U32_PSEUDO, dl: SL, VT, Op1: Op0, Op2: Op1), `0`);
8401	unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op0);
8402	unsigned Op1SignBits = DAG.ComputeNumSignBits(Op: Op1);
8403	if (Op0SignBits >= `33` && Op1SignBits >= `33`)
8404	return SDValue (
8405	DAG.getMachineNode(Opcode: AMDGPU::S_MUL_I64_I32_PSEUDO, dl: SL, VT, Op1: Op0, Op2: Op1), `0`);
8406	// If all the operands are uniform, then we lower s_mul_u64 as it is.
8407	return Op;
8408	}
8409
8410	SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8411	EVT VT = Op.getValueType();
8412	SDLoc SL(Op);
8413	SDValue LHS = Op.getOperand(i: `0`);
8414	SDValue RHS = Op.getOperand(i: `1`);
8415	bool isSigned = Op.getOpcode() == ISD::SMULO;
8416
8417	if (ConstantSDNode *RHSC = isConstOrConstSplat(N: RHS)) {
8418	const APInt &C = RHSC->getAPIntValue();
8419	// mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
8420	if (C.isPowerOf2()) {
8421	// smulo(x, signed_min) is same as umulo(x, signed_min).
8422	bool UseArithShift = isSigned && !C.isMinSignedValue();
8423	SDValue ShiftAmt = DAG.getConstant(Val: C.logBase2(), DL: SL, VT: MVT::i32);
8424	SDValue Result = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: LHS, N2: ShiftAmt);
8425	SDValue Overflow =
8426	DAG.getSetCC(DL: SL, VT: MVT::i1,
8427	LHS: DAG.getNode(Opcode: UseArithShift ? ISD::SRA : ISD::SRL, DL: SL, VT,
8428	N1: Result, N2: ShiftAmt),
8429	RHS: LHS, Cond: ISD::SETNE);
8430	return DAG.getMergeValues(Ops: {Result, Overflow}, dl: SL);
8431	}
8432	}
8433
8434	SDValue Result = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT, N1: LHS, N2: RHS);
8435	SDValue Top =
8436	DAG.getNode(Opcode: isSigned ? ISD::MULHS : ISD::MULHU, DL: SL, VT, N1: LHS, N2: RHS);
8437
8438	SDValue Sign = isSigned
8439	? DAG.getNode(Opcode: ISD::SRA, DL: SL, VT, N1: Result,
8440	N2: DAG.getConstant(Val: VT.getScalarSizeInBits() - `1`,
8441	DL: SL, VT: MVT::i32))
8442	: DAG.getConstant(Val: `0`, DL: SL, VT);
8443	SDValue Overflow = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Top, RHS: Sign, Cond: ISD::SETNE);
8444
8445	return DAG.getMergeValues(Ops: {Result, Overflow}, dl: SL);
8446	}
8447
8448	SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8449	if (Op ->isDivergent()) {
8450	// Select to V_MAD_[IU]64_[IU]32.
8451	return Op;
8452	}
8453	if (Subtarget->hasSMulHi()) {
8454	// Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8455	return SDValue ();
8456	}
8457	// The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8458	// calculate the high part, so we might as well do the whole thing with
8459	// V_MAD_[IU]64_[IU]32.
8460	return Op;
8461	}
8462
8463	SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8464	if (!Subtarget->hasTrapHandler() \|\|
8465	Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8466	return lowerTrapEndpgm(Op, DAG);
8467
8468	return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8469	: lowerTrapHsaQueuePtr(Op, DAG);
8470	}
8471
8472	SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8473	SDLoc SL(Op);
8474	SDValue Chain = Op.getOperand(i: `0`);
8475	return DAG.getNode(Opcode: AMDGPUISD::ENDPGM_TRAP, DL: SL, VT: MVT::Other, Operand: Chain);
8476	}
8477
8478	SDValue
8479	SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8480	const SDLoc &DL, Align Alignment,
8481	ImplicitParameter Param) const {
8482	MachineFunction &MF = DAG.getMachineFunction();
8483	uint64_t Offset = getImplicitParameterOffset(MF, Param);
8484	SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain: DAG.getEntryNode(), Offset);
8485	MachinePointerInfo PtrInfo =
8486	getKernargSegmentPtrInfo(MF&: DAG.getMachineFunction());
8487	return DAG.getLoad(
8488	VT, dl: DL, Chain: DAG.getEntryNode(), Ptr, PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment,
8489	MMOFlags: MachineMemOperand::MODereferenceable \| MachineMemOperand::MOInvariant);
8490	}
8491
8492	SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8493	SelectionDAG &DAG) const {
8494	SDLoc SL(Op);
8495	SDValue Chain = Op.getOperand(i: `0`);
8496
8497	SDValue QueuePtr;
8498	// For code object version 5, QueuePtr is passed through implicit kernarg.
8499	const Module *M = DAG.getMachineFunction().getFunction().getParent();
8500	if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) {
8501	QueuePtr =
8502	loadImplicitKernelArgument(DAG, VT: MVT::i64, DL: SL, Alignment: Align (`8`), Param: QUEUE_PTR);
8503	} else {
8504	MachineFunction &MF = DAG.getMachineFunction();
8505	SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8506	Register UserSGPR = Info->getQueuePtrUserSGPR();
8507
8508	if (UserSGPR == AMDGPU::NoRegister) {
8509	// We probably are in a function incorrectly marked with
8510	// amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8511	// trap, so just use a null pointer.
8512	QueuePtr = DAG.getConstant(Val: `0`, DL: SL, VT: MVT::i64);
8513	} else {
8514	QueuePtr = CreateLiveInRegister(DAG, RC: &AMDGPU::SReg_64RegClass, Reg: UserSGPR,
8515	VT: MVT::i64);
8516	}
8517	}
8518
8519	SDValue SGPR01 = DAG.getRegister(Reg: AMDGPU::SGPR0_SGPR1, VT: MVT::i64);
8520	SDValue ToReg = DAG.getCopyToReg(Chain, dl: SL, Reg: SGPR01, N: QueuePtr, Glue: SDValue ());
8521
8522	uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8523	SDValue Ops[] = {ToReg, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16), SGPR01,
8524	ToReg.getValue(R: `1`)};
8525	return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
8526	}
8527
8528	SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8529	SDLoc SL(Op);
8530	SDValue Chain = Op.getOperand(i: `0`);
8531
8532	// We need to simulate the 's_trap 2' instruction on targets that run in
8533	// PRIV=1 (where it is treated as a nop).
8534	if (Subtarget->hasPrivEnabledTrap2NopBug())
8535	return DAG.getNode(Opcode: AMDGPUISD::SIMULATED_TRAP, DL: SL, VT: MVT::Other, Operand: Chain);
8536
8537	uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8538	SDValue Ops[] = {Chain, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16)};
8539	return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
8540	}
8541
8542	SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8543	SDLoc SL(Op);
8544	SDValue Chain = Op.getOperand(i: `0`);
8545	MachineFunction &MF = DAG.getMachineFunction();
8546
8547	if (!Subtarget->hasTrapHandler() \|\|
8548	Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8549	LLVMContext &Ctx = MF.getFunction().getContext();
8550	Ctx.diagnose(DI: DiagnosticInfoUnsupported (MF.getFunction(),
8551	"debugtrap handler not supported",
8552	Op.getDebugLoc(), DS_Warning));
8553	return Chain;
8554	}
8555
8556	uint64_t TrapID =
8557	static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8558	SDValue Ops[] = {Chain, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16)};
8559	return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
8560	}
8561
8562	SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8563	SelectionDAG &DAG) const {
8564	if (Subtarget->hasApertureRegs()) {
8565	const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
8566	? AMDGPU::SRC_SHARED_BASE
8567	: AMDGPU::SRC_PRIVATE_BASE;
8568	assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE \|\|
8569	!Subtarget->hasGloballyAddressableScratch()) &&
8570	"Cannot use src_private_base with globally addressable scratch!");
8571	// Note: this feature (register) is broken. When used as a 32-bit operand,
8572	// it returns a wrong value (all zeroes?). The real value is in the upper 32
8573	// bits.
8574	//
8575	// To work around the issue, emit a 64 bit copy from this register
8576	// then extract the high bits. Note that this shouldn't even result in a
8577	// shift being emitted and simply become a pair of registers (e.g.):
8578	// s_mov_b64 s[6:7], src_shared_base
8579	// v_mov_b32_e32 v1, s7
8580	SDValue Copy =
8581	DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: ApertureRegNo, VT: MVT::v2i32);
8582	return DAG.getExtractVectorElt(DL, VT: MVT::i32, Vec: Copy, Idx: `1`);
8583	}
8584
8585	// For code object version 5, private_base and shared_base are passed through
8586	// implicit kernargs.
8587	const Module *M = DAG.getMachineFunction().getFunction().getParent();
8588	if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) {
8589	ImplicitParameter Param =
8590	(AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
8591	return loadImplicitKernelArgument(DAG, VT: MVT::i32, DL, Alignment: Align (`4`), Param);
8592	}
8593
8594	MachineFunction &MF = DAG.getMachineFunction();
8595	SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8596	Register UserSGPR = Info->getQueuePtrUserSGPR();
8597	if (UserSGPR == AMDGPU::NoRegister) {
8598	// We probably are in a function incorrectly marked with
8599	// amdgpu-no-queue-ptr. This is undefined.
8600	return DAG.getPOISON(VT: MVT::i32);
8601	}
8602
8603	SDValue QueuePtr =
8604	CreateLiveInRegister(DAG, RC: &AMDGPU::SReg_64RegClass, Reg: UserSGPR, VT: MVT::i64);
8605
8606	// Offset into amd_queue_t for group_segment_aperture_base_hi /
8607	// private_segment_aperture_base_hi.
8608	uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? `0x40` : `0x44`;
8609
8610	SDValue Ptr =
8611	DAG.getObjectPtrOffset(SL: DL, Ptr: QueuePtr, Offset: TypeSize::getFixed(ExactSize: StructOffset));
8612
8613	// TODO: Use custom target PseudoSourceValue.
8614	// TODO: We should use the value from the IR intrinsic call, but it might not
8615	// be available and how do we get it?
8616	MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8617	return DAG.getLoad(VT: MVT::i32, dl: DL, Chain: QueuePtr.getValue(R: `1`), Ptr, PtrInfo,
8618	Alignment: commonAlignment(A: Align (`64`), Offset: StructOffset),
8619	MMOFlags: MachineMemOperand::MODereferenceable \|
8620	MachineMemOperand::MOInvariant);
8621	}
8622
8623	/// Return true if the value is a known valid address, such that a null check is
8624	/// not necessary.
8625	static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
8626	const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8627	if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
8628	return true;
8629
8630	if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8631	return ConstVal->getSExtValue() != AMDGPU::getNullPointerValue(AS: AddrSpace);
8632
8633	// TODO: Search through arithmetic, handle arguments and loads
8634	// marked nonnull.
8635	return false;
8636	}
8637
8638	SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
8639	SelectionDAG &DAG) const {
8640	SDLoc SL(Op);
8641
8642	const AMDGPUTargetMachine &TM =
8643	static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
8644
8645	unsigned DestAS, SrcAS;
8646	SDValue Src;
8647	bool IsNonNull = false;
8648	if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Val&: Op)) {
8649	SrcAS = ASC->getSrcAddressSpace();
8650	Src = ASC->getOperand(Num: `0`);
8651	DestAS = ASC->getDestAddressSpace();
8652	} else {
8653	assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
8654	Op.getConstantOperandVal(`0`) ==
8655	Intrinsic::amdgcn_addrspacecast_nonnull);
8656	Src = Op ->getOperand(Num: `1`);
8657	SrcAS = Op ->getConstantOperandVal(Num: `2`);
8658	DestAS = Op ->getConstantOperandVal(Num: `3`);
8659	IsNonNull = true;
8660	}
8661
8662	SDValue FlatNullPtr = DAG.getConstant(Val: `0`, DL: SL, VT: MVT::i64);
8663
8664	// flat -> local/private
8665	if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
8666	if (DestAS == AMDGPUAS::LOCAL_ADDRESS \|\|
8667	DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
8668	SDValue Ptr = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Src);
8669
8670	if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
8671	Subtarget->hasGloballyAddressableScratch()) {
8672	// flat -> private with globally addressable scratch: subtract
8673	// src_flat_scratch_base_lo.
8674	SDValue FlatScratchBaseLo(
8675	DAG.getMachineNode(
8676	Opcode: AMDGPU::S_MOV_B32, dl: SL, VT: MVT::i32,
8677	Op1: DAG.getRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, VT: MVT::i32)),
8678	`0`);
8679	Ptr = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: Ptr, N2: FlatScratchBaseLo);
8680	}
8681
8682	if (IsNonNull \|\| isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS))
8683	return Ptr;
8684
8685	unsigned NullVal = AMDGPU::getNullPointerValue(AS: DestAS);
8686	SDValue SegmentNullPtr = DAG.getConstant(Val: NullVal, DL: SL, VT: MVT::i32);
8687	SDValue NonNull = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Src, RHS: FlatNullPtr, Cond: ISD::SETNE);
8688
8689	return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i32, N1: NonNull, N2: Ptr,
8690	N3: SegmentNullPtr);
8691	}
8692	}
8693
8694	// local/private -> flat
8695	if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
8696	if (SrcAS == AMDGPUAS::LOCAL_ADDRESS \|\|
8697	SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
8698	SDValue CvtPtr;
8699	if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
8700	Subtarget->hasGloballyAddressableScratch()) {
8701	// For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
8702	// For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
8703	SDValue AllOnes = DAG.getSignedTargetConstant(Val: -`1`, DL: SL, VT: MVT::i32);
8704	SDValue ThreadID = DAG.getConstant(Val: `0`, DL: SL, VT: MVT::i32);
8705	ThreadID = DAG.getNode(
8706	Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
8707	N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_mbcnt_lo, DL: SL, VT: MVT::i32),
8708	N2: AllOnes, N3: ThreadID);
8709	if (Subtarget->isWave64())
8710	ThreadID = DAG.getNode(
8711	Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
8712	N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_mbcnt_hi, DL: SL, VT: MVT::i32),
8713	N2: AllOnes, N3: ThreadID);
8714	SDValue ShAmt = DAG.getShiftAmountConstant(
8715	Val: `57` - `32` - Subtarget->getWavefrontSizeLog2(), VT: MVT::i32, DL: SL);
8716	SDValue SrcHi = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: ThreadID, N2: ShAmt);
8717	CvtPtr = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: SrcHi);
8718	CvtPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);
8719	// Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
8720	// 64-bit hi:lo value.
8721	SDValue FlatScratchBase = {
8722	DAG.getMachineNode(
8723	Opcode: AMDGPU::S_MOV_B64, dl: SL, VT: MVT::i64,
8724	Op1: DAG.getRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE, VT: MVT::i64)),
8725	`0`};
8726	CvtPtr = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i64, N1: CvtPtr, N2: FlatScratchBase);
8727	} else {
8728	SDValue Aperture = getSegmentAperture(AS: SrcAS, DL: SL, DAG);
8729	CvtPtr = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: Aperture);
8730	CvtPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);
8731	}
8732
8733	if (IsNonNull \|\| isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS))
8734	return CvtPtr;
8735
8736	unsigned NullVal = AMDGPU::getNullPointerValue(AS: SrcAS);
8737	SDValue SegmentNullPtr = DAG.getConstant(Val: NullVal, DL: SL, VT: MVT::i32);
8738
8739	SDValue NonNull =
8740	DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Src, RHS: SegmentNullPtr, Cond: ISD::SETNE);
8741
8742	return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: NonNull, N2: CvtPtr,
8743	N3: FlatNullPtr);
8744	}
8745	}
8746
8747	if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8748	Op.getValueType() == MVT::i64) {
8749	const SIMachineFunctionInfo *Info =
8750	DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
8751	if (Info->get32BitAddressHighBits() == `0`)
8752	return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i64, Operand: Src);
8753
8754	SDValue Hi = DAG.getConstant(Val: Info->get32BitAddressHighBits(), DL: SL, VT: MVT::i32);
8755	SDValue Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: Hi);
8756	return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
8757	}
8758
8759	if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8760	Src.getValueType() == MVT::i64)
8761	return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Src);
8762
8763	// global <-> flat are no-ops and never emitted.
8764
8765	// Invalid casts are poison.
8766	return DAG.getPOISON(VT: Op ->getValueType(ResNo: `0`));
8767	}
8768
8769	// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8770	// the small vector and inserting them into the big vector. That is better than
8771	// the default expansion of doing it via a stack slot. Even though the use of
8772	// the stack slot would be optimized away afterwards, the stack slot itself
8773	// remains.
8774	SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8775	SelectionDAG &DAG) const {
8776	SDValue Vec = Op.getOperand(i: `0`);
8777	SDValue Ins = Op.getOperand(i: `1`);
8778	SDValue Idx = Op.getOperand(i: `2`);
8779	EVT VecVT = Vec.getValueType();
8780	EVT InsVT = Ins.getValueType();
8781	EVT EltVT = VecVT.getVectorElementType();
8782	unsigned InsNumElts = InsVT.getVectorNumElements();
8783	unsigned IdxVal = Idx ->getAsZExtVal();
8784	SDLoc SL(Op);
8785
8786	if (EltVT.getScalarSizeInBits() == `16` && IdxVal % `2` == `0`) {
8787	// Insert 32-bit registers at a time.
8788	assert(InsNumElts % `2` == `0` && "expect legal vector types");
8789
8790	unsigned VecNumElts = VecVT.getVectorNumElements();
8791	EVT NewVecVT =
8792	EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: VecNumElts / `2`);
8793	EVT NewInsVT = InsNumElts == `2` ? MVT::i32
8794	: EVT::getVectorVT(Context&: *DAG.getContext(),
8795	VT: MVT::i32, NumElements: InsNumElts / `2`);
8796
8797	Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVecVT, Operand: Vec);
8798	Ins = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewInsVT, Operand: Ins);
8799
8800	for (unsigned I = `0`; I != InsNumElts / `2`; ++I) {
8801	SDValue Elt;
8802	if (InsNumElts == `2`) {
8803	Elt = Ins;
8804	} else {
8805	Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Ins,
8806	N2: DAG.getConstant(Val: I, DL: SL, VT: MVT::i32));
8807	}
8808	Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: NewVecVT, N1: Vec, N2: Elt,
8809	N3: DAG.getConstant(Val: IdxVal / `2` + I, DL: SL, VT: MVT::i32));
8810	}
8811
8812	return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Vec);
8813	}
8814
8815	for (unsigned I = `0`; I != InsNumElts; ++I) {
8816	SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Ins,
8817	N2: DAG.getConstant(Val: I, DL: SL, VT: MVT::i32));
8818	Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: VecVT, N1: Vec, N2: Elt,
8819	N3: DAG.getConstant(Val: IdxVal + I, DL: SL, VT: MVT::i32));
8820	}
8821	return Vec;
8822	}
8823
8824	SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8825	SelectionDAG &DAG) const {
8826	SDValue Vec = Op.getOperand(i: `0`);
8827	SDValue InsVal = Op.getOperand(i: `1`);
8828	SDValue Idx = Op.getOperand(i: `2`);
8829	EVT VecVT = Vec.getValueType();
8830	EVT EltVT = VecVT.getVectorElementType();
8831	unsigned VecSize = VecVT.getSizeInBits();
8832	unsigned EltSize = EltVT.getSizeInBits();
8833	SDLoc SL(Op);
8834
8835	// Specially handle the case of v4i16 with static indexing.
8836	unsigned NumElts = VecVT.getVectorNumElements();
8837	auto *KIdx = dyn_cast<ConstantSDNode>(Val&: Idx);
8838	if (NumElts == `4` && EltSize == `16` && KIdx) {
8839	SDValue BCVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Vec);
8840
8841	SDValue LoHalf = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: BCVec,
8842	N2: DAG.getConstant(Val: `0`, DL: SL, VT: MVT::i32));
8843	SDValue HiHalf = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: BCVec,
8844	N2: DAG.getConstant(Val: `1`, DL: SL, VT: MVT::i32));
8845
8846	SDValue LoVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: LoHalf);
8847	SDValue HiVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: HiHalf);
8848
8849	unsigned Idx = KIdx->getZExtValue();
8850	bool InsertLo = Idx < `2`;
8851	SDValue InsHalf = DAG.getNode(
8852	Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: MVT::v2i16, N1: InsertLo ? LoVec : HiVec,
8853	N2: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: InsVal),
8854	N3: DAG.getConstant(Val: InsertLo ? Idx : (Idx - `2`), DL: SL, VT: MVT::i32));
8855
8856	InsHalf = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: InsHalf);
8857
8858	SDValue Concat =
8859	InsertLo ? DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {InsHalf, HiHalf})
8860	: DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {LoHalf, InsHalf});
8861
8862	return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Concat);
8863	}
8864
8865	// Static indexing does not lower to stack access, and hence there is no need
8866	// for special custom lowering to avoid stack access.
8867	if (isa<ConstantSDNode>(Val: Idx))
8868	return SDValue ();
8869
8870	// Avoid stack access for dynamic indexing by custom lowering to
8871	// v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
8872
8873	assert(VecSize <= `64` && "Expected target vector size to be <= 64 bits");
8874
8875	MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize);
8876
8877	// Convert vector index to bit-index and get the required bit mask.
8878	assert(isPowerOf2_32(EltSize));
8879	const auto EltMask = maskTrailingOnes<uint64_t>(N: EltSize);
8880	SDValue ScaleFactor = DAG.getConstant(Val: Log2_32(Value: EltSize), DL: SL, VT: MVT::i32);
8881	SDValue ScaledIdx = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Idx, N2: ScaleFactor);
8882	SDValue BFM = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: IntVT,
8883	N1: DAG.getConstant(Val: EltMask, DL: SL, VT: IntVT), N2: ScaledIdx);
8884
8885	// 1. Create a congruent vector with the target value in each element.
8886	SDValue ExtVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT,
8887	Operand: DAG.getSplatBuildVector(VT: VecVT, DL: SL, Op: InsVal));
8888
8889	// 2. Mask off all other indices except the required index within (1).
8890	SDValue LHS = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT, N1: BFM, N2: ExtVal);
8891
8892	// 3. Mask off the required index within the target vector.
8893	SDValue BCVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec);
8894	SDValue RHS =
8895	DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT, N1: DAG.getNOT(DL: SL, Val: BFM, VT: IntVT), N2: BCVec);
8896
8897	// 4. Get (2) and (3) ORed into the target vector.
8898	SDValue BFI =
8899	DAG.getNode(Opcode: ISD::OR, DL: SL, VT: IntVT, N1: LHS, N2: RHS, Flags: SDNodeFlags::Disjoint);
8900
8901	return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: BFI);
8902	}
8903
8904	SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8905	SelectionDAG &DAG) const {
8906	SDLoc SL(Op);
8907
8908	EVT ResultVT = Op.getValueType();
8909	SDValue Vec = Op.getOperand(i: `0`);
8910	SDValue Idx = Op.getOperand(i: `1`);
8911	EVT VecVT = Vec.getValueType();
8912	unsigned VecSize = VecVT.getSizeInBits();
8913	EVT EltVT = VecVT.getVectorElementType();
8914
8915	DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8916
8917	// Make sure we do any optimizations that will make it easier to fold
8918	// source modifiers before obscuring it with bit operations.
8919
8920	// XXX - Why doesn't this get called when vector_shuffle is expanded?
8921	if (SDValue Combined = performExtractVectorEltCombine(N: Op.getNode(), DCI))
8922	return Combined;
8923
8924	if (VecSize == `128` \|\| VecSize == `256` \|\| VecSize == `512`) {
8925	SDValue Lo, Hi;
8926	auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VT: VecVT);
8927
8928	if (VecSize == `128`) {
8929	SDValue V2 = DAG.getBitcast(VT: MVT::v2i64, V: Vec);
8930	Lo = DAG.getBitcast(VT: LoVT,
8931	V: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
8932	N2: DAG.getConstant(Val: `0`, DL: SL, VT: MVT::i32)));
8933	Hi = DAG.getBitcast(VT: HiVT,
8934	V: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
8935	N2: DAG.getConstant(Val: `1`, DL: SL, VT: MVT::i32)));
8936	} else if (VecSize == `256`) {
8937	SDValue V2 = DAG.getBitcast(VT: MVT::v4i64, V: Vec);
8938	SDValue Parts[`4`];
8939	for (unsigned P = `0`; P < `4`; ++P) {
8940	Parts[P] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
8941	N2: DAG.getConstant(Val: P, DL: SL, VT: MVT::i32));
8942	}
8943
8944	Lo = DAG.getBitcast(VT: LoVT, V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i64,
8945	N1: Parts[`0`], N2: Parts[`1`]));
8946	Hi = DAG.getBitcast(VT: HiVT, V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i64,
8947	N1: Parts[`2`], N2: Parts[`3`]));
8948	} else {
8949	assert(VecSize == `512`);
8950
8951	SDValue V2 = DAG.getBitcast(VT: MVT::v8i64, V: Vec);
8952	SDValue Parts[`8`];
8953	for (unsigned P = `0`; P < `8`; ++P) {
8954	Parts[P] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
8955	N2: DAG.getConstant(Val: P, DL: SL, VT: MVT::i32));
8956	}
8957
8958	Lo = DAG.getBitcast(VT: LoVT,
8959	V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v4i64,
8960	N1: Parts[`0`], N2: Parts[`1`], N3: Parts[`2`], N4: Parts[`3`]));
8961	Hi = DAG.getBitcast(VT: HiVT,
8962	V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v4i64,
8963	N1: Parts[`4`], N2: Parts[`5`], N3: Parts[`6`], N4: Parts[`7`]));
8964	}
8965
8966	EVT IdxVT = Idx.getValueType();
8967	unsigned NElem = VecVT.getVectorNumElements();
8968	assert(isPowerOf2_32(NElem));
8969	SDValue IdxMask = DAG.getConstant(Val: NElem / `2` - `1`, DL: SL, VT: IdxVT);
8970	SDValue NewIdx = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IdxVT, N1: Idx, N2: IdxMask);
8971	SDValue Half = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IdxMask, True: Hi, False: Lo, Cond: ISD::SETUGT);
8972	return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Half, N2: NewIdx);
8973	}
8974
8975	assert(VecSize <= `64`);
8976
8977	MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize);
8978
8979	// If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8980	SDValue VecBC = peekThroughBitcasts(V: Vec);
8981	if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8982	SDValue Src = VecBC.getOperand(i: `0`);
8983	Src = DAG.getBitcast(VT: Src.getValueType().changeTypeToInteger(), V: Src);
8984	Vec = DAG.getAnyExtOrTrunc(Op: Src, DL: SL, VT: IntVT);
8985	}
8986
8987	unsigned EltSize = EltVT.getSizeInBits();
8988	assert(isPowerOf2_32(EltSize));
8989
8990	SDValue ScaleFactor = DAG.getConstant(Val: Log2_32(Value: EltSize), DL: SL, VT: MVT::i32);
8991
8992	// Convert vector index to bit-index ( EltSize)*
8993	SDValue ScaledIdx = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Idx, N2: ScaleFactor);
8994
8995	SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec);
8996	SDValue Elt = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: IntVT, N1: BC, N2: ScaledIdx);
8997
8998	if (ResultVT == MVT::f16 \|\| ResultVT == MVT::bf16) {
8999	SDValue Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i16, Operand: Elt);
9000	return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: ResultVT, Operand: Result);
9001	}
9002
9003	return DAG.getAnyExtOrTrunc(Op: Elt, DL: SL, VT: ResultVT);
9004	}
9005
9006	static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
9007	assert(Elt % `2` == `0`);
9008	return Mask [Elt + `1`] == Mask [Elt] + `1` && (Mask [Elt] % `2` == `0`);
9009	}
9010
9011	static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
9012	assert(Elt % `2` == `0`);
9013	return Mask [Elt] >= `0` && Mask [Elt + `1`] >= `0` && (Mask [Elt] & `1`) &&
9014	!(Mask [Elt + `1`] & `1`);
9015	}
9016
9017	SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
9018	SelectionDAG &DAG) const {
9019	SDLoc SL(Op);
9020	EVT ResultVT = Op.getValueType();
9021	ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val&: Op);
9022	MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
9023	const int NewSrcNumElts = `2`;
9024	MVT PackVT = MVT::getVectorVT(VT: EltVT, NumElements: NewSrcNumElts);
9025	int SrcNumElts = Op.getOperand(i: `0`).getValueType().getVectorNumElements();
9026
9027	// Break up the shuffle into registers sized pieces.
9028	//
9029	// We're trying to form sub-shuffles that the register allocation pipeline
9030	// won't be able to figure out, like how to use v_pk_mov_b32 to do a register
9031	// blend or 16-bit op_sel. It should be able to figure out how to reassemble a
9032	// pair of copies into a consecutive register copy, so use the ordinary
9033	// extract_vector_elt lowering unless we can use the shuffle.
9034	//
9035	// TODO: This is a bit of hack, and we should probably always use
9036	// extract_subvector for the largest possible subvector we can (or at least
9037	// use it for PackVT aligned pieces). However we have worse support for
9038	// combines on them don't directly treat extract_subvector / insert_subvector
9039	// as legal. The DAG scheduler also ends up doing a worse job with the
9040	// extract_subvectors.
9041	const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == `16`;
9042
9043	// vector_shuffle <0,1,6,7> lhs, rhs
9044	// -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
9045	//
9046	// vector_shuffle <6,7,2,3> lhs, rhs
9047	// -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
9048	//
9049	// vector_shuffle <6,7,0,1> lhs, rhs
9050	// -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
9051
9052	// Avoid scalarizing when both halves are reading from consecutive elements.
9053
9054	// If we're treating 2 element shuffles as legal, also create odd-to-even
9055	// shuffles of neighboring pairs.
9056	//
9057	// vector_shuffle <3,2,7,6> lhs, rhs
9058	// -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
9059	// vector_shuffle <1, 0> (extract_subvector rhs, 2)
9060
9061	SmallVector<SDValue, `16`> Pieces;
9062	for (int I = `0`, N = ResultVT.getVectorNumElements(); I != N; I += `2`) {
9063	if (ShouldUseConsecutiveExtract &&
9064	elementPairIsContiguous(Mask: SVN->getMask(), Elt: I)) {
9065	const int Idx = SVN->getMaskElt(Idx: I);
9066	int VecIdx = Idx < SrcNumElts ? `0` : `1`;
9067	int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
9068	SDValue SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT,
9069	N1: SVN->getOperand(Num: VecIdx),
9070	N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
9071	Pieces.push_back(Elt: SubVec);
9072	} else if (elementPairIsOddToEven(Mask: SVN->getMask(), Elt: I) &&
9073	isOperationLegal(Op: ISD::VECTOR_SHUFFLE, VT: PackVT)) {
9074	int Idx0 = SVN->getMaskElt(Idx: I);
9075	int Idx1 = SVN->getMaskElt(Idx: I + `1`);
9076
9077	SDValue SrcOp0 = SVN->getOperand(Num: `0`);
9078	SDValue SrcOp1 = SrcOp0;
9079	if (Idx0 >= SrcNumElts) {
9080	SrcOp0 = SVN->getOperand(Num: `1`);
9081	Idx0 -= SrcNumElts;
9082	}
9083
9084	if (Idx1 >= SrcNumElts) {
9085	SrcOp1 = SVN->getOperand(Num: `1`);
9086	Idx1 -= SrcNumElts;
9087	}
9088
9089	int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - `1`);
9090	int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - `1`);
9091
9092	// Extract nearest even aligned piece.
9093	SDValue SubVec0 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT, N1: SrcOp0,
9094	N2: DAG.getConstant(Val: AlignedIdx0, DL: SL, VT: MVT::i32));
9095	SDValue SubVec1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT, N1: SrcOp1,
9096	N2: DAG.getConstant(Val: AlignedIdx1, DL: SL, VT: MVT::i32));
9097
9098	int NewMaskIdx0 = Idx0 - AlignedIdx0;
9099	int NewMaskIdx1 = Idx1 - AlignedIdx1;
9100
9101	SDValue Result0 = SubVec0;
9102	SDValue Result1 = SubVec0;
9103
9104	if (SubVec0 != SubVec1) {
9105	NewMaskIdx1 += NewSrcNumElts;
9106	Result1 = SubVec1;
9107	} else {
9108	Result1 = DAG.getPOISON(VT: PackVT);
9109	}
9110
9111	SDValue Shuf = DAG.getVectorShuffle(VT: PackVT, dl: SL, N1: Result0, N2: Result1,
9112	Mask: {NewMaskIdx0, NewMaskIdx1});
9113	Pieces.push_back(Elt: Shuf);
9114	} else {
9115	const int Idx0 = SVN->getMaskElt(Idx: I);
9116	const int Idx1 = SVN->getMaskElt(Idx: I + `1`);
9117	int VecIdx0 = Idx0 < SrcNumElts ? `0` : `1`;
9118	int VecIdx1 = Idx1 < SrcNumElts ? `0` : `1`;
9119	int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
9120	int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
9121
9122	SDValue Vec0 = SVN->getOperand(Num: VecIdx0);
9123	SDValue Elt0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec0,
9124	N2: DAG.getSignedConstant(Val: EltIdx0, DL: SL, VT: MVT::i32));
9125
9126	SDValue Vec1 = SVN->getOperand(Num: VecIdx1);
9127	SDValue Elt1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec1,
9128	N2: DAG.getSignedConstant(Val: EltIdx1, DL: SL, VT: MVT::i32));
9129	Pieces.push_back(Elt: DAG.getBuildVector(VT: PackVT, DL: SL, Ops: {Elt0, Elt1}));
9130	}
9131	}
9132
9133	return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT: ResultVT, Ops: Pieces);
9134	}
9135
9136	SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
9137	SelectionDAG &DAG) const {
9138	SDValue SVal = Op.getOperand(i: `0`);
9139	EVT ResultVT = Op.getValueType();
9140	EVT SValVT = SVal.getValueType();
9141	SDValue UndefVal = DAG.getPOISON(VT: SValVT);
9142	SDLoc SL(Op);
9143
9144	SmallVector<SDValue, `8`> VElts;
9145	VElts.push_back(Elt: SVal);
9146	for (int I = `1`, E = ResultVT.getVectorNumElements(); I < E; ++I)
9147	VElts.push_back(Elt: UndefVal);
9148
9149	return DAG.getBuildVector(VT: ResultVT, DL: SL, Ops: VElts);
9150	}
9151
9152	SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
9153	SelectionDAG &DAG) const {
9154	SDLoc SL(Op);
9155	EVT VT = Op.getValueType();
9156
9157	if (VT == MVT::v2f16 \|\| VT == MVT::v2i16 \|\| VT == MVT::v2bf16) {
9158	assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
9159
9160	SDValue Lo = Op.getOperand(i: `0`);
9161	SDValue Hi = Op.getOperand(i: `1`);
9162
9163	// Avoid adding defined bits with the zero_extend.
9164	if (Hi.isUndef()) {
9165	Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Lo);
9166	SDValue ExtLo = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Lo);
9167	return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ExtLo);
9168	}
9169
9170	Hi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Hi);
9171	Hi = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Hi);
9172
9173	SDValue ShlHi = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Hi,
9174	N2: DAG.getConstant(Val: `16`, DL: SL, VT: MVT::i32));
9175	if (Lo.isUndef())
9176	return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ShlHi);
9177
9178	Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Lo);
9179	Lo = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Lo);
9180
9181	SDValue Or =
9182	DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Lo, N2: ShlHi, Flags: SDNodeFlags::Disjoint);
9183	return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Or);
9184	}
9185
9186	// Split into 2-element chunks.
9187	const unsigned NumParts = VT.getVectorNumElements() / `2`;
9188	EVT PartVT = MVT::getVectorVT(VT: VT.getVectorElementType().getSimpleVT(), NumElements: `2`);
9189	MVT PartIntVT = MVT::getIntegerVT(BitWidth: PartVT.getSizeInBits());
9190
9191	SmallVector<SDValue> Casts;
9192	for (unsigned P = `0`; P < NumParts; ++P) {
9193	SDValue Vec = DAG.getBuildVector(
9194	VT: PartVT, DL: SL, Ops: {Op.getOperand(i: P * `2`), Op.getOperand(i: P * `2` + `1`)});
9195	Casts.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: PartIntVT, Operand: Vec));
9196	}
9197
9198	SDValue Blend =
9199	DAG.getBuildVector(VT: MVT::getVectorVT(VT: PartIntVT, NumElements: NumParts), DL: SL, Ops: Casts);
9200	return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Blend);
9201	}
9202
9203	bool SITargetLowering::isOffsetFoldingLegal(
9204	const GlobalAddressSDNode GA) const* {
9205	// OSes that use ELF REL relocations (instead of RELA) can only store a
9206	// 32-bit addend in the instruction, so it is not safe to allow offset folding
9207	// which can create arbitrary 64-bit addends. (This is only a problem for
9208	// R_AMDGPU_32_HI relocations since other relocation types are unaffected by*
9209	// the high 32 bits of the addend.)
9210	//
9211	// This should be kept in sync with how HasRelocationAddend is initialized in
9212	// the constructor of ELFAMDGPUAsmBackend.
9213	if (!Subtarget->isAmdHsaOS())
9214	return false;
9215
9216	// We can fold offsets for anything that doesn't require a GOT relocation.
9217	return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS \|\|
9218	GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS \|\|
9219	GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
9220	!shouldEmitGOTReloc(GV: GA->getGlobal());
9221	}
9222
9223	static SDValue
9224	buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
9225	const SDLoc &DL, int64_t Offset, EVT PtrVT,
9226	unsigned GAFlags = SIInstrInfo::MO_NONE) {
9227	assert(isInt<`32`>(Offset + `4`) && "32-bit offset is expected!");
9228	// In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
9229	// lowered to the following code sequence:
9230	//
9231	// For constant address space:
9232	// s_getpc_b64 s[0:1]
9233	// s_add_u32 s0, s0, $symbol
9234	// s_addc_u32 s1, s1, 0
9235	//
9236	// s_getpc_b64 returns the address of the s_add_u32 instruction and then
9237	// a fixup or relocation is emitted to replace $symbol with a literal
9238	// constant, which is a pc-relative offset from the encoding of the $symbol
9239	// operand to the global variable.
9240	//
9241	// For global address space:
9242	// s_getpc_b64 s[0:1]
9243	// s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
9244	// s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
9245	//
9246	// s_getpc_b64 returns the address of the s_add_u32 instruction and then
9247	// fixups or relocations are emitted to replace $symbol@@lo and*
9248	// $symbol@@hi with lower 32 bits and higher 32 bits of a literal constant,*
9249	// which is a 64-bit pc-relative offset from the encoding of the $symbol
9250	// operand to the global variable.
9251	if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
9252	assert(GAFlags != SIInstrInfo::MO_NONE);
9253
9254	SDValue Ptr =
9255	DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i64, offset: Offset, TargetFlags: GAFlags + `2`);
9256	return DAG.getNode(Opcode: AMDGPUISD::PC_ADD_REL_OFFSET64, DL, VT: PtrVT, Operand: Ptr);
9257	}
9258
9259	SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: Offset, TargetFlags: GAFlags);
9260	SDValue PtrHi;
9261	if (GAFlags == SIInstrInfo::MO_NONE)
9262	PtrHi = DAG.getTargetConstant(Val: `0`, DL, VT: MVT::i32);
9263	else
9264	PtrHi = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: Offset, TargetFlags: GAFlags + `1`);
9265	return DAG.getNode(Opcode: AMDGPUISD::PC_ADD_REL_OFFSET, DL, VT: PtrVT, N1: PtrLo, N2: PtrHi);
9266	}
9267
9268	SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
9269	SDValue Op,
9270	SelectionDAG &DAG) const {
9271	GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Val&: Op);
9272	SDLoc DL(GSD);
9273	EVT PtrVT = Op.getValueType();
9274
9275	const GlobalValue *GV = GSD->getGlobal();
9276	if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
9277	shouldUseLDSConstAddress(GV)) \|\|
9278	GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS \|\|
9279	GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
9280	if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
9281	GV->hasExternalLinkage()) {
9282	const GlobalVariable &GVar = *cast<GlobalVariable>(Val: GV);
9283	// HIP uses an unsized array `extern __shared__ T s[]` or similar
9284	// zero-sized type in other languages to declare the dynamic shared
9285	// memory which size is not known at the compile time. They will be
9286	// allocated by the runtime and placed directly after the static
9287	// allocated ones. They all share the same offset.
9288	if (GVar.getGlobalSize(DL: GVar.getDataLayout()) == `0`) {
9289	assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
9290	// Adjust alignment for that dynamic shared memory array.
9291	Function &F = DAG.getMachineFunction().getFunction();
9292	MFI->setDynLDSAlign(F, GV: GVar);
9293	MFI->setUsesDynamicLDS(true);
9294	return SDValue (
9295	DAG.getMachineNode(Opcode: AMDGPU::GET_GROUPSTATICSIZE, dl: DL, VT: PtrVT), `0`);
9296	}
9297	}
9298	return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
9299	}
9300
9301	if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
9302	SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: GSD->getOffset(),
9303	TargetFlags: SIInstrInfo::MO_ABS32_LO);
9304	return DAG.getNode(Opcode: AMDGPUISD::LDS, DL, VT: MVT::i32, Operand: GA);
9305	}
9306
9307	if (Subtarget->isAmdPalOS() \|\| Subtarget->isMesa3DOS()) {
9308	if (Subtarget->has64BitLiterals()) {
9309	SDValue Addr = DAG.getTargetGlobalAddress(
9310	GV, DL, VT: MVT::i64, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS64);
9311	return SDValue (DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B64, dl: DL, VT: MVT::i64, Op1: Addr),
9312	`0`);
9313	}
9314
9315	SDValue AddrLo = DAG.getTargetGlobalAddress(
9316	GV, DL, VT: MVT::i32, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS32_LO);
9317	AddrLo = {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: AddrLo), `0`};
9318
9319	SDValue AddrHi = DAG.getTargetGlobalAddress(
9320	GV, DL, VT: MVT::i32, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS32_HI);
9321	AddrHi = {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: AddrHi), `0`};
9322
9323	return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL, VT: MVT::i64, N1: AddrLo, N2: AddrHi);
9324	}
9325
9326	if (shouldEmitFixup(GV))
9327	return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT);
9328
9329	if (shouldEmitPCReloc(GV))
9330	return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT,
9331	GAFlags: SIInstrInfo::MO_REL32);
9332
9333	SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, Offset: `0`, PtrVT,
9334	GAFlags: SIInstrInfo::MO_GOTPCREL32);
9335	PointerType *PtrTy =
9336	PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::CONSTANT_ADDRESS);
9337	const DataLayout &DataLayout = DAG.getDataLayout();
9338	Align Alignment = DataLayout.getABITypeAlign(Ty: PtrTy);
9339	MachinePointerInfo PtrInfo =
9340	MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction());
9341
9342	return DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: GOTAddr, PtrInfo, Alignment,
9343	MMOFlags: MachineMemOperand::MODereferenceable \|
9344	MachineMemOperand::MOInvariant);
9345	}
9346
9347	SDValue SITargetLowering::LowerExternalSymbol(SDValue Op,
9348	SelectionDAG &DAG) const {
9349	// TODO: Handle this. It should be mostly the same as LowerGlobalAddress.
9350	const Function &Fn = DAG.getMachineFunction().getFunction();
9351	DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported (
9352	Fn, "unsupported external symbol", Op.getDebugLoc()));
9353	return DAG.getPOISON(VT: Op.getValueType());
9354	}
9355
9356	SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
9357	const SDLoc &DL, SDValue V) const {
9358	// We can't use S_MOV_B32 directly, because there is no way to specify m0 as
9359	// the destination register.
9360	//
9361	// We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
9362	// so we will end up with redundant moves to m0.
9363	//
9364	// We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
9365
9366	// A Null SDValue creates a glue result.
9367	SDNode *M0 = DAG.getMachineNode(Opcode: AMDGPU::SI_INIT_M0, dl: DL, VT1: MVT::Other, VT2: MVT::Glue,
9368	Op1: V, Op2: Chain);
9369	return SDValue (M0, `0`);
9370	}
9371
9372	SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
9373	MVT VT,
9374	unsigned Offset) const {
9375	SDLoc SL(Op);
9376	SDValue Param = lowerKernargMemParameter(
9377	DAG, VT: MVT::i32, MemVT: MVT::i32, SL, Chain: DAG.getEntryNode(), Offset, Alignment: Align (`4`), Signed: false);
9378	// The local size values will have the hi 16-bits as zero.
9379	return DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: MVT::i32, N1: Param,
9380	N2: DAG.getValueType(VT));
9381	}
9382
9383	static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
9384	EVT VT) {
9385	DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported (
9386	DAG.getMachineFunction().getFunction(),
9387	"non-hsa intrinsic with hsa target", DL.getDebugLoc()));
9388	return DAG.getPOISON(VT);
9389	}
9390
9391	static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
9392	EVT VT) {
9393	DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported (
9394	DAG.getMachineFunction().getFunction(),
9395	"intrinsic not supported on subtarget", DL.getDebugLoc()));
9396	return DAG.getPOISON(VT);
9397	}
9398
9399	static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
9400	ArrayRef<SDValue> Elts) {
9401	assert(!Elts.empty());
9402	MVT Type;
9403	unsigned NumElts = Elts.size();
9404
9405	if (NumElts <= `12`) {
9406	Type = MVT::getVectorVT(VT: MVT::f32, NumElements: NumElts);
9407	} else {
9408	assert(Elts.size() <= `16`);
9409	Type = MVT::v16f32;
9410	NumElts = `16`;
9411	}
9412
9413	SmallVector<SDValue, `16`> VecElts(NumElts);
9414	for (unsigned i = `0`; i < Elts.size(); ++i) {
9415	SDValue Elt = Elts [i];
9416	if (Elt.getValueType() != MVT::f32)
9417	Elt = DAG.getBitcast(VT: MVT::f32, V: Elt);
9418	VecElts [i] = Elt;
9419	}
9420	for (unsigned i = Elts.size(); i < NumElts; ++i)
9421	VecElts [i] = DAG.getPOISON(VT: MVT::f32);
9422
9423	if (NumElts == `1`)
9424	return VecElts [`0`];
9425	return DAG.getBuildVector(VT: Type, DL, Ops: VecElts);
9426	}
9427
9428	static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9429	SDValue Src, int ExtraElts) {
9430	EVT SrcVT = Src.getValueType();
9431
9432	SmallVector<SDValue, `8`> Elts;
9433
9434	if (SrcVT.isVector())
9435	DAG.ExtractVectorElements(Op: Src, Args&: Elts);
9436	else
9437	Elts.push_back(Elt: Src);
9438
9439	SDValue Undef = DAG.getPOISON(VT: SrcVT.getScalarType());
9440	while (ExtraElts--)
9441	Elts.push_back(Elt: Undef);
9442
9443	return DAG.getBuildVector(VT: CastVT, DL, Ops: Elts);
9444	}
9445
9446	// Re-construct the required return value for a image load intrinsic.
9447	// This is more complicated due to the optional use TexFailCtrl which means the
9448	// required return type is an aggregate
9449	static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
9450	ArrayRef<EVT> ResultTypes, bool IsTexFail,
9451	bool Unpacked, bool IsD16, int DMaskPop,
9452	int NumVDataDwords, bool IsAtomicPacked16Bit,
9453	const SDLoc &DL) {
9454	// Determine the required return type. This is the same regardless of
9455	// IsTexFail flag
9456	EVT ReqRetVT = ResultTypes [`0`];
9457	int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : `1`;
9458	int NumDataDwords = ((IsD16 && !Unpacked) \|\| IsAtomicPacked16Bit)
9459	? (ReqRetNumElts + `1`) / `2`
9460	: ReqRetNumElts;
9461
9462	int MaskPopDwords = (!IsD16 \|\| Unpacked) ? DMaskPop : (DMaskPop + `1`) / `2`;
9463
9464	MVT DataDwordVT =
9465	NumDataDwords == `1` ? MVT::i32 : MVT::getVectorVT(VT: MVT::i32, NumElements: NumDataDwords);
9466
9467	MVT MaskPopVT =
9468	MaskPopDwords == `1` ? MVT::i32 : MVT::getVectorVT(VT: MVT::i32, NumElements: MaskPopDwords);
9469
9470	SDValue Data(Result, `0`);
9471	SDValue TexFail;
9472
9473	if (DMaskPop > `0` && Data.getValueType() != MaskPopVT) {
9474	SDValue ZeroIdx = DAG.getConstant(Val: `0`, DL, VT: MVT::i32);
9475	if (MaskPopVT.isVector()) {
9476	Data = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MaskPopVT,
9477	N1: SDValue (Result, `0`), N2: ZeroIdx);
9478	} else {
9479	Data = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MaskPopVT,
9480	N1: SDValue (Result, `0`), N2: ZeroIdx);
9481	}
9482	}
9483
9484	if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9485	Data = padEltsToUndef(DAG, DL, CastVT: DataDwordVT, Src: Data,
9486	ExtraElts: NumDataDwords - MaskPopDwords);
9487
9488	if (IsD16)
9489	Data = adjustLoadValueTypeImpl(Result: Data, LoadVT: ReqRetVT, DL, DAG, Unpacked);
9490
9491	EVT LegalReqRetVT = ReqRetVT;
9492	if (!ReqRetVT.isVector()) {
9493	if (!Data.getValueType().isInteger())
9494	Data = DAG.getNode(Opcode: ISD::BITCAST, DL,
9495	VT: Data.getValueType().changeTypeToInteger(), Operand: Data);
9496	Data = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ReqRetVT.changeTypeToInteger(), Operand: Data);
9497	} else {
9498	// We need to widen the return vector to a legal type
9499	if ((ReqRetVT.getVectorNumElements() % `2`) == `1` &&
9500	ReqRetVT.getVectorElementType().getSizeInBits() == `16`) {
9501	LegalReqRetVT =
9502	EVT::getVectorVT(Context&: *DAG.getContext(), VT: ReqRetVT.getVectorElementType(),
9503	NumElements: ReqRetVT.getVectorNumElements() + `1`);
9504	}
9505	}
9506	Data = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LegalReqRetVT, Operand: Data);
9507
9508	if (IsTexFail) {
9509	TexFail =
9510	DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: SDValue (Result, `0`),
9511	N2: DAG.getConstant(Val: MaskPopDwords, DL, VT: MVT::i32));
9512
9513	return DAG.getMergeValues(Ops: {Data, TexFail, SDValue (Result, `1`)}, dl: DL);
9514	}
9515
9516	if (Result->getNumValues() == `1`)
9517	return Data;
9518
9519	return DAG.getMergeValues(Ops: {Data, SDValue (Result, `1`)}, dl: DL);
9520	}
9521
9522	static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9523	SDValue LWE, bool* &IsTexFail) {
9524	auto *TexFailCtrlConst = cast<ConstantSDNode>(Val: TexFailCtrl.getNode());
9525
9526	uint64_t Value = TexFailCtrlConst->getZExtValue();
9527	if (Value) {
9528	IsTexFail = true;
9529	}
9530
9531	SDLoc DL(TexFailCtrlConst);
9532	*TFE = DAG.getTargetConstant(Val: (Value & `0x1`) ? `1` : `0`, DL, VT: MVT::i32);
9533	Value &= ~(uint64_t)`0x1`;
9534	*LWE = DAG.getTargetConstant(Val: (Value & `0x2`) ? `1` : `0`, DL, VT: MVT::i32);
9535	Value &= ~(uint64_t)`0x2`;
9536
9537	return Value == `0`;
9538	}
9539
9540	static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
9541	MVT PackVectorVT,
9542	SmallVectorImpl<SDValue> &PackedAddrs,
9543	unsigned DimIdx, unsigned EndIdx,
9544	unsigned NumGradients) {
9545	SDLoc DL(Op);
9546	for (unsigned I = DimIdx; I < EndIdx; I++) {
9547	SDValue Addr = Op.getOperand(i: I);
9548
9549	// Gradients are packed with undef for each coordinate.
9550	// In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9551	// 1D: undef,dx/dh; undef,dx/dv
9552	// 2D: dy/dh,dx/dh; dy/dv,dx/dv
9553	// 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9554	if (((I + `1`) >= EndIdx) \|\|
9555	((NumGradients / `2`) % `2` == `1` && (I == DimIdx + (NumGradients / `2`) - `1` \|\|
9556	I == DimIdx + NumGradients - `1`))) {
9557	if (Addr.getValueType() != MVT::i16)
9558	Addr = DAG.getBitcast(VT: MVT::i16, V: Addr);
9559	Addr = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Addr);
9560	} else {
9561	Addr = DAG.getBuildVector(VT: PackVectorVT, DL, Ops: {Addr, Op.getOperand(i: I + `1`)});
9562	I++;
9563	}
9564	Addr = DAG.getBitcast(VT: MVT::f32, V: Addr);
9565	PackedAddrs.push_back(Elt: Addr);
9566	}
9567	}
9568
9569	SDValue SITargetLowering::lowerImage(SDValue Op,
9570	const AMDGPU::ImageDimIntrinsicInfo *Intr,
9571	SelectionDAG &DAG, bool WithChain) const {
9572	SDLoc DL(Op);
9573	MachineFunction &MF = DAG.getMachineFunction();
9574	const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9575	unsigned IntrOpcode = Intr->BaseOpcode;
9576	// For image atomic: use no-return opcode if result is unused.
9577	if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
9578	!Op.getNode()->hasAnyUseOfValue(Value: `0`))
9579	IntrOpcode = Intr->AtomicNoRetBaseOpcode;
9580	const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9581	AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: IntrOpcode);
9582	const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(DimEnum: Intr->Dim);
9583	bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI: *Subtarget);
9584	bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: *Subtarget);
9585	bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
9586
9587	SmallVector<EVT, `3`> ResultTypes(Op ->values());
9588	SmallVector<EVT, `3`> OrigResultTypes(Op ->values());
9589	if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
9590	ResultTypes.erase(CI: &ResultTypes [`0`]);
9591
9592	bool IsD16 = false;
9593	bool IsG16 = false;
9594	bool IsA16 = false;
9595	SDValue VData;
9596	int NumVDataDwords = `0`;
9597	bool AdjustRetType = false;
9598	bool IsAtomicPacked16Bit = false;
9599
9600	// Offset of intrinsic arguments
9601	const unsigned ArgOffset = WithChain ? `2` : `1`;
9602
9603	unsigned DMask;
9604	unsigned DMaskLanes = `0`;
9605
9606	if (BaseOpcode->Atomic) {
9607	VData = Op.getOperand(i: `2`);
9608
9609	IsAtomicPacked16Bit =
9610	(IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 \|\|
9611	IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN \|\|
9612	IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 \|\|
9613	IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9614
9615	bool Is64Bit = VData.getValueSizeInBits() == `64`;
9616	if (BaseOpcode->AtomicX2) {
9617	SDValue VData2 = Op.getOperand(i: `3`);
9618	VData = DAG.getBuildVector(VT: Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9619	Ops: {VData, VData2});
9620	if (Is64Bit)
9621	VData = DAG.getBitcast(VT: MVT::v4i32, V: VData);
9622
9623	if (!BaseOpcode->NoReturn)
9624	ResultTypes [`0`] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9625
9626	DMask = Is64Bit ? `0xf` : `0x3`;
9627	NumVDataDwords = Is64Bit ? `4` : `2`;
9628	} else {
9629	DMask = Is64Bit ? `0x3` : `0x1`;
9630	NumVDataDwords = Is64Bit ? `2` : `1`;
9631	}
9632	} else {
9633	DMask = Op ->getConstantOperandVal(Num: ArgOffset + Intr->DMaskIndex);
9634	DMaskLanes = BaseOpcode->Gather4 ? `4` : llvm::popcount(Value: DMask);
9635
9636	if (BaseOpcode->Store) {
9637	VData = Op.getOperand(i: `2`);
9638
9639	MVT StoreVT = VData.getSimpleValueType();
9640	if (StoreVT.getScalarType() == MVT::f16) {
9641	if (!Subtarget->hasD16Images() \|\| !BaseOpcode->HasD16)
9642	return Op; // D16 is unsupported for this instruction
9643
9644	IsD16 = true;
9645	VData = handleD16VData(VData, DAG, ImageStore: true);
9646	}
9647
9648	NumVDataDwords = (VData.getValueType().getSizeInBits() + `31`) / `32`;
9649	} else if (!BaseOpcode->NoReturn) {
9650	// Work out the num dwords based on the dmask popcount and underlying type
9651	// and whether packing is supported.
9652	MVT LoadVT = ResultTypes [`0`].getSimpleVT();
9653	if (LoadVT.getScalarType() == MVT::f16) {
9654	if (!Subtarget->hasD16Images() \|\| !BaseOpcode->HasD16)
9655	return Op; // D16 is unsupported for this instruction
9656
9657	IsD16 = true;
9658	}
9659
9660	// Confirm that the return type is large enough for the dmask specified
9661	if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) \|\|
9662	(!LoadVT.isVector() && DMaskLanes > `1`))
9663	return Op;
9664
9665	// The sq block of gfx8 and gfx9 do not estimate register use correctly
9666	// for d16 image_gather4, image_gather4_l, and image_gather4_lz
9667	// instructions.
9668	if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9669	!(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9670	NumVDataDwords = (DMaskLanes + `1`) / `2`;
9671	else
9672	NumVDataDwords = DMaskLanes;
9673
9674	AdjustRetType = true;
9675	}
9676	}
9677
9678	unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9679	SmallVector<SDValue, `4`> VAddrs;
9680
9681	// Check for 16 bit addresses or derivatives and pack if true.
9682	MVT VAddrVT =
9683	Op.getOperand(i: ArgOffset + Intr->GradientStart).getSimpleValueType();
9684	MVT VAddrScalarVT = VAddrVT.getScalarType();
9685	MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9686	IsG16 = VAddrScalarVT == MVT::f16 \|\| VAddrScalarVT == MVT::i16;
9687
9688	VAddrVT = Op.getOperand(i: ArgOffset + Intr->CoordStart).getSimpleValueType();
9689	VAddrScalarVT = VAddrVT.getScalarType();
9690	MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9691	IsA16 = VAddrScalarVT == MVT::f16 \|\| VAddrScalarVT == MVT::i16;
9692
9693	// Push back extra arguments.
9694	for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9695	if (IsA16 && (Op.getOperand(i: ArgOffset + I).getValueType() == MVT::f16)) {
9696	assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9697	// Special handling of bias when A16 is on. Bias is of type half but
9698	// occupies full 32-bit.
9699	SDValue Bias = DAG.getBuildVector(
9700	VT: MVT::v2f16, DL,
9701	Ops: {Op.getOperand(i: ArgOffset + I), DAG.getPOISON(VT: MVT::f16)});
9702	VAddrs.push_back(Elt: Bias);
9703	} else {
9704	assert((!IsA16 \|\| Intr->NumBiasArgs == `0` \|\| I != Intr->BiasIndex) &&
9705	"Bias needs to be converted to 16 bit in A16 mode");
9706	VAddrs.push_back(Elt: Op.getOperand(i: ArgOffset + I));
9707	}
9708	}
9709
9710	if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9711	// 16 bit gradients are supported, but are tied to the A16 control
9712	// so both gradients and addresses must be 16 bit
9713	LLVM_DEBUG(
9714	dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9715	"require 16 bit args for both gradients and addresses");
9716	return Op;
9717	}
9718
9719	if (IsA16) {
9720	if (!ST->hasA16()) {
9721	LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9722	"support 16 bit addresses\n");
9723	return Op;
9724	}
9725	}
9726
9727	// We've dealt with incorrect input so we know that if IsA16, IsG16
9728	// are set then we have to compress/pack operands (either address,
9729	// gradient or both)
9730	// In the case where a16 and gradients are tied (no G16 support) then we
9731	// have already verified that both IsA16 and IsG16 are true
9732	if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9733	// Activate g16
9734	const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9735	AMDGPU::getMIMGG16MappingInfo(G: Intr->BaseOpcode);
9736	IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9737	}
9738
9739	// Add gradients (packed or unpacked)
9740	if (IsG16) {
9741	// Pack the gradients
9742	// const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9743	packImage16bitOpsToDwords(DAG, Op, PackVectorVT: GradPackVectorVT, PackedAddrs&: VAddrs,
9744	DimIdx: ArgOffset + Intr->GradientStart,
9745	EndIdx: ArgOffset + Intr->CoordStart, NumGradients: Intr->NumGradients);
9746	} else {
9747	for (unsigned I = ArgOffset + Intr->GradientStart;
9748	I < ArgOffset + Intr->CoordStart; I++)
9749	VAddrs.push_back(Elt: Op.getOperand(i: I));
9750	}
9751
9752	// Add addresses (packed or unpacked)
9753	if (IsA16) {
9754	packImage16bitOpsToDwords(DAG, Op, PackVectorVT: AddrPackVectorVT, PackedAddrs&: VAddrs,
9755	DimIdx: ArgOffset + Intr->CoordStart, EndIdx: VAddrEnd,
9756	NumGradients: `0` / No gradients /);
9757	} else {
9758	// Add uncompressed address
9759	for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9760	VAddrs.push_back(Elt: Op.getOperand(i: I));
9761	}
9762
9763	// If the register allocator cannot place the address registers contiguously
9764	// without introducing moves, then using the non-sequential address encoding
9765	// is always preferable, since it saves VALU instructions and is usually a
9766	// wash in terms of code size or even better.
9767	//
9768	// However, we currently have no way of hinting to the register allocator that
9769	// MIMG addresses should be placed contiguously when it is possible to do so,
9770	// so force non-NSA for the common 2-address case as a heuristic.
9771	//
9772	// SIShrinkInstructions will convert NSA encodings to non-NSA after register
9773	// allocation when possible.
9774	//
9775	// Partial NSA is allowed on GFX11+ where the final register is a contiguous
9776	// set of the remaining addresses.
9777	const unsigned NSAMaxSize = ST->getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
9778	const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9779	const bool UseNSA = ST->hasNSAEncoding() &&
9780	VAddrs.size() >= ST->getNSAThreshold(MF) &&
9781	(VAddrs.size() <= NSAMaxSize \|\| HasPartialNSAEncoding);
9782	const bool UsePartialNSA =
9783	UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9784
9785	SDValue VAddr;
9786	if (UsePartialNSA) {
9787	VAddr = getBuildDwordsVector(DAG, DL,
9788	Elts: ArrayRef(VAddrs).drop_front(N: NSAMaxSize - `1`));
9789	} else if (!UseNSA) {
9790	VAddr = getBuildDwordsVector(DAG, DL, Elts: VAddrs);
9791	}
9792
9793	SDValue True = DAG.getTargetConstant(Val: `1`, DL, VT: MVT::i1);
9794	SDValue False = DAG.getTargetConstant(Val: `0`, DL, VT: MVT::i1);
9795	SDValue Unorm;
9796	if (!BaseOpcode->Sampler) {
9797	Unorm = True;
9798	} else {
9799	uint64_t UnormConst =
9800	Op.getConstantOperandVal(i: ArgOffset + Intr->UnormIndex);
9801
9802	Unorm = UnormConst ? True : False;
9803	}
9804
9805	SDValue TFE;
9806	SDValue LWE;
9807	SDValue TexFail = Op.getOperand(i: ArgOffset + Intr->TexFailCtrlIndex);
9808	bool IsTexFail = false;
9809	if (!parseTexFail(TexFailCtrl: TexFail, DAG, TFE: &TFE, LWE: &LWE, IsTexFail))
9810	return Op;
9811
9812	if (IsTexFail) {
9813	if (!DMaskLanes) {
9814	// Expecting to get an error flag since TFC is on - and dmask is 0
9815	// Force dmask to be at least 1 otherwise the instruction will fail
9816	DMask = `0x1`;
9817	DMaskLanes = `1`;
9818	NumVDataDwords = `1`;
9819	}
9820	NumVDataDwords += `1`;
9821	AdjustRetType = true;
9822	}
9823
9824	// Has something earlier tagged that the return type needs adjusting
9825	// This happens if the instruction is a load or has set TexFailCtrl flags
9826	if (AdjustRetType) {
9827	// NumVDataDwords reflects the true number of dwords required in the return
9828	// type
9829	if (DMaskLanes == `0` && !BaseOpcode->Store) {
9830	// This is a no-op load. This can be eliminated
9831	SDValue Undef = DAG.getPOISON(VT: Op.getValueType());
9832	if (isa<MemSDNode>(Val: Op))
9833	return DAG.getMergeValues(Ops: {Undef, Op.getOperand(i: `0`)}, dl: DL);
9834	return Undef;
9835	}
9836
9837	EVT NewVT = NumVDataDwords > `1` ? EVT::getVectorVT(Context&: *DAG.getContext(),
9838	VT: MVT::i32, NumElements: NumVDataDwords)
9839	: MVT::i32;
9840
9841	ResultTypes [`0`] = NewVT;
9842	if (ResultTypes.size() == `3`) {
9843	// Original result was aggregate type used for TexFailCtrl results
9844	// The actual instruction returns as a vector type which has now been
9845	// created. Remove the aggregate result.
9846	ResultTypes.erase(CI: &ResultTypes [`1`]);
9847	}
9848	}
9849
9850	unsigned CPol = Op.getConstantOperandVal(i: ArgOffset + Intr->CachePolicyIndex);
9851	// Keep GLC only when the atomic's result is actually used.
9852	if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
9853	CPol \|= AMDGPU::CPol::GLC;
9854	if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) \|
9855	AMDGPU::CPol::VOLATILE))
9856	return Op;
9857
9858	SmallVector<SDValue, `26`> Ops;
9859	if (BaseOpcode->Store \|\| BaseOpcode->Atomic)
9860	Ops.push_back(Elt: VData); // vdata
9861	if (UsePartialNSA) {
9862	append_range(C&: Ops, R: ArrayRef(VAddrs).take_front(N: NSAMaxSize - `1`));
9863	Ops.push_back(Elt: VAddr);
9864	} else if (UseNSA)
9865	append_range(C&: Ops, R&: VAddrs);
9866	else
9867	Ops.push_back(Elt: VAddr);
9868	SDValue Rsrc = Op.getOperand(i: ArgOffset + Intr->RsrcIndex);
9869	EVT RsrcVT = Rsrc.getValueType();
9870	if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9871	return Op;
9872	Ops.push_back(Elt: Rsrc);
9873	if (BaseOpcode->Sampler) {
9874	SDValue Samp = Op.getOperand(i: ArgOffset + Intr->SampIndex);
9875	if (Samp.getValueType() != MVT::v4i32)
9876	return Op;
9877	Ops.push_back(Elt: Samp);
9878	}
9879	Ops.push_back(Elt: DAG.getTargetConstant(Val: DMask, DL, VT: MVT::i32));
9880	if (IsGFX10Plus)
9881	Ops.push_back(Elt: DAG.getTargetConstant(Val: DimInfo->Encoding, DL, VT: MVT::i32));
9882	if (!IsGFX12Plus \|\| BaseOpcode->Sampler \|\| BaseOpcode->MSAA)
9883	Ops.push_back(Elt: Unorm);
9884	Ops.push_back(Elt: DAG.getTargetConstant(Val: CPol, DL, VT: MVT::i32));
9885	Ops.push_back(Elt: IsA16 && // r128, a16 for gfx9
9886	ST->hasFeature(Feature: AMDGPU::FeatureR128A16)
9887	? True
9888	: False);
9889	if (IsGFX10Plus)
9890	Ops.push_back(Elt: IsA16 ? True : False);
9891
9892	if (!Subtarget->hasGFX90AInsts())
9893	Ops.push_back(Elt: TFE); // tfe
9894	else if (TFE ->getAsZExtVal()) {
9895	DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported (
9896	DAG.getMachineFunction().getFunction(),
9897	"TFE is not supported on this GPU", DL.getDebugLoc()));
9898	}
9899
9900	if (!IsGFX12Plus \|\| BaseOpcode->Sampler \|\| BaseOpcode->MSAA)
9901	Ops.push_back(Elt: LWE); // lwe
9902	if (!IsGFX10Plus)
9903	Ops.push_back(Elt: DimInfo->DA ? True : False);
9904	if (BaseOpcode->HasD16)
9905	Ops.push_back(Elt: IsD16 ? True : False);
9906	if (isa<MemSDNode>(Val: Op))
9907	Ops.push_back(Elt: Op.getOperand(i: `0`)); // chain
9908
9909	int NumVAddrDwords =
9910	UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / `32`;
9911	int Opcode = -`1`;
9912
9913	if (IsGFX12Plus) {
9914	Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx12,
9915	VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9916	} else if (IsGFX11Plus) {
9917	Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
9918	MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx11NSA
9919	: AMDGPU::MIMGEncGfx11Default,
9920	VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9921	} else if (IsGFX10Plus) {
9922	Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
9923	MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx10NSA
9924	: AMDGPU::MIMGEncGfx10Default,
9925	VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9926	} else {
9927	if (Subtarget->hasGFX90AInsts()) {
9928	Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx90a,
9929	VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9930	if (Opcode == -`1`) {
9931	DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported (
9932	DAG.getMachineFunction().getFunction(),
9933	"requested image instruction is not supported on this GPU",
9934	DL.getDebugLoc()));
9935
9936	unsigned Idx = `0`;
9937	SmallVector<SDValue, `3`> RetValues(OrigResultTypes.size());
9938	for (EVT VT : OrigResultTypes) {
9939	if (VT == MVT::Other)
9940	RetValues [Idx++] = Op.getOperand(i: `0`); // Chain
9941	else
9942	RetValues [Idx++] = DAG.getPOISON(VT);
9943	}
9944
9945	return DAG.getMergeValues(Ops: RetValues, dl: DL);
9946	}
9947	}
9948	if (Opcode == -`1` &&
9949	Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9950	Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx8,
9951	VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9952	if (Opcode == -`1`)
9953	Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx6,
9954	VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9955	}
9956	if (Opcode == -`1`)
9957	return Op;
9958
9959	MachineSDNode *NewNode = DAG.getMachineNode(Opcode, dl: DL, ResultTys: ResultTypes, Ops);
9960	if (auto *MemOp = dyn_cast<MemSDNode>(Val&: Op)) {
9961	MachineMemOperand *MemRef = MemOp->getMemOperand();
9962	DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
9963	}
9964
9965	if (BaseOpcode->NoReturn) {
9966	if (BaseOpcode->Atomic)
9967	return DAG.getMergeValues(
9968	Ops: {DAG.getPOISON(VT: OrigResultTypes [`0`]), SDValue (NewNode, `0`)}, dl: DL);
9969
9970	return SDValue (NewNode, `0`);
9971	}
9972
9973	if (BaseOpcode->AtomicX2) {
9974	SmallVector<SDValue, `1`> Elt;
9975	DAG.ExtractVectorElements(Op: SDValue (NewNode, `0`), Args&: Elt, Start: `0`, Count: `1`);
9976	return DAG.getMergeValues(Ops: {Elt [`0`], SDValue (NewNode, `1`)}, dl: DL);
9977	}
9978
9979	return constructRetValue(DAG, Result: NewNode, ResultTypes: OrigResultTypes, IsTexFail,
9980	Unpacked: Subtarget->hasUnpackedD16VMem(), IsD16, DMaskPop: DMaskLanes,
9981	NumVDataDwords, IsAtomicPacked16Bit, DL);
9982	}
9983
9984	SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9985	SDValue Offset, SDValue CachePolicy,
9986	SelectionDAG &DAG) const {
9987	MachineFunction &MF = DAG.getMachineFunction();
9988
9989	const DataLayout &DataLayout = DAG.getDataLayout();
9990	Align Alignment =
9991	DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext()));
9992
9993	MachineMemOperand *MMO = MF.getMachineMemOperand(
9994	PtrInfo: MachinePointerInfo (),
9995	F: MachineMemOperand::MOLoad \| MachineMemOperand::MODereferenceable \|
9996	MachineMemOperand::MOInvariant,
9997	Size: VT.getStoreSize(), BaseAlignment: Alignment);
9998
9999	if (!Offset ->isDivergent()) {
10000	SDValue Ops[] = {Rsrc, Offset, CachePolicy};
10001
10002	// Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
10003	// s_buffer_load_u16 instruction is emitted for both signed and unsigned
10004	// loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
10005	// and generates s_buffer_load_i16 (performSignExtendInRegCombine).
10006	if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10007	SDValue BufferLoad =
10008	DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD_USHORT, dl: DL,
10009	VTList: DAG.getVTList(VT: MVT::i32), Ops, MemVT: VT, MMO);
10010	return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
10011	}
10012
10013	// Widen vec3 load to vec4.
10014	if (VT.isVector() && VT.getVectorNumElements() == `3` &&
10015	!Subtarget->hasScalarDwordx3Loads()) {
10016	EVT WidenedVT =
10017	EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: `4`);
10018	auto WidenedOp = DAG.getMemIntrinsicNode(
10019	Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL, VTList: DAG.getVTList(VT: WidenedVT), Ops, MemVT: WidenedVT,
10020	MMO: MF.getMachineMemOperand(MMO, Offset: `0`, Size: WidenedVT.getStoreSize()));
10021	auto Subvector = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: WidenedOp,
10022	N2: DAG.getVectorIdxConstant(Val: `0`, DL));
10023	return Subvector;
10024	}
10025
10026	return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL,
10027	VTList: DAG.getVTList(VT), Ops, MemVT: VT, MMO);
10028	}
10029
10030	// We have a divergent offset. Emit a MUBUF buffer load instead. We can
10031	// assume that the buffer is unswizzled.
10032	SDValue Ops[] = {
10033	DAG.getEntryNode(), // Chain
10034	Rsrc, // rsrc
10035	DAG.getConstant(Val: `0`, DL, VT: MVT::i32), // vindex
10036	{}, // voffset
10037	{}, // soffset
10038	{}, // offset
10039	CachePolicy, // cachepolicy
10040	DAG.getTargetConstant(Val: `0`, DL, VT: MVT::i1), // idxen
10041	};
10042	if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10043	setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[`3`], Alignment: Align (`4`));
10044	return handleByteShortBufferLoads(DAG, LoadVT: VT, DL, Ops, MMO);
10045	}
10046
10047	SmallVector<SDValue, `4`> Loads;
10048	unsigned NumLoads = `1`;
10049	MVT LoadVT = VT.getSimpleVT();
10050	unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : `1`;
10051	assert((LoadVT.getScalarType() == MVT::i32 \|\|
10052	LoadVT.getScalarType() == MVT::f32));
10053
10054	if (NumElts == `8` \|\| NumElts == `16`) {
10055	NumLoads = NumElts / `4`;
10056	LoadVT = MVT::getVectorVT(VT: LoadVT.getScalarType(), NumElements: `4`);
10057	}
10058
10059	SDVTList VTList = DAG.getVTList(VTs: {LoadVT, MVT::Other});
10060
10061	// Use the alignment to ensure that the required offsets will fit into the
10062	// immediate offsets.
10063	setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[`3`],
10064	Alignment: NumLoads > `1` ? Align (`16` * NumLoads) : Align (`4`));
10065
10066	uint64_t InstOffset = Ops[`5`]->getAsZExtVal();
10067	for (unsigned i = `0`; i < NumLoads; ++i) {
10068	Ops[`5`] = DAG.getTargetConstant(Val: InstOffset + `16` * i, DL, VT: MVT::i32);
10069	Loads.push_back(Elt: getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
10070	MemVT: LoadVT, MMO, DAG));
10071	}
10072
10073	if (NumElts == `8` \|\| NumElts == `16`)
10074	return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, Ops: Loads);
10075
10076	return Loads [`0`];
10077	}
10078
10079	SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
10080	// With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
10081	if (!Subtarget->hasArchitectedSGPRs())
10082	return {};
10083	SDLoc SL(Op);
10084	MVT VT = MVT::i32;
10085	SDValue TTMP8 = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: AMDGPU::TTMP8, VT);
10086	return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT, N1: TTMP8,
10087	N2: DAG.getConstant(Val: `25`, DL: SL, VT), N3: DAG.getConstant(Val: `5`, DL: SL, VT));
10088	}
10089
10090	SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
10091	AMDGPU::Hwreg::Id HwReg,
10092	unsigned LowBit,
10093	unsigned Width) const {
10094	SDLoc SL(Op);
10095	using namespace AMDGPU::Hwreg;
10096	return {DAG.getMachineNode(
10097	Opcode: AMDGPU::S_GETREG_B32_const, dl: SL, VT: MVT::i32,
10098	Op1: DAG.getTargetConstant(Val: HwregEncoding::encode(Values: HwReg, Values: LowBit, Values: Width),
10099	DL: SL, VT: MVT::i32)),
10100	`0`};
10101	}
10102
10103	SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
10104	unsigned Dim,
10105	const ArgDescriptor &Arg) const {
10106	SDLoc SL(Op);
10107	MachineFunction &MF = DAG.getMachineFunction();
10108	unsigned MaxID = Subtarget->getMaxWorkitemID(Kernel: MF.getFunction(), Dimension: Dim);
10109	if (MaxID == `0`)
10110	return DAG.getConstant(Val: `0`, DL: SL, VT: MVT::i32);
10111
10112	// It's undefined behavior if a function marked with the amdgpu-no-*
10113	// attributes uses the corresponding intrinsic.
10114	if (!Arg)
10115	return DAG.getPOISON(VT: Op ->getValueType(ResNo: `0`));
10116
10117	SDValue Val = loadInputValue(DAG, RC: &AMDGPU::VGPR_32RegClass, VT: MVT::i32,
10118	SL: SDLoc (DAG.getEntryNode()), Arg);
10119
10120	// Don't bother inserting AssertZext for packed IDs since we're emitting the
10121	// masking operations anyway.
10122	//
10123	// TODO: We could assert the top bit is 0 for the source copy.
10124	if (Arg.isMasked())
10125	return Val;
10126
10127	// Preserve the known bits after expansion to a copy.
10128	EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: llvm::bit_width(Value: MaxID));
10129	return DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: MVT::i32, N1: Val,
10130	N2: DAG.getValueType(SmallVT));
10131	}
10132
10133	SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
10134	SelectionDAG &DAG) const {
10135	MachineFunction &MF = DAG.getMachineFunction();
10136	auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
10137
10138	EVT VT = Op.getValueType();
10139	SDLoc DL(Op);
10140	unsigned IntrinsicID = Op.getConstantOperandVal(i: `0`);
10141
10142	// TODO: Should this propagate fast-math-flags?
10143
10144	switch (IntrinsicID) {
10145	case Intrinsic::amdgcn_implicit_buffer_ptr: {
10146	if (getSubtarget()->isAmdHsaOrMesa(F: MF.getFunction()))
10147	return emitNonHSAIntrinsicError(DAG, DL, VT);
10148	return getPreloadedValue(DAG, MFI: *MFI, VT,
10149	PVID: AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
10150	}
10151	case Intrinsic::amdgcn_dispatch_ptr:
10152	case Intrinsic::amdgcn_queue_ptr: {
10153	if (!Subtarget->isAmdHsaOrMesa(F: MF.getFunction())) {
10154	DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported (
10155	MF.getFunction(), "unsupported hsa intrinsic without hsa target",
10156	DL.getDebugLoc()));
10157	return DAG.getPOISON(VT);
10158	}
10159
10160	auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
10161	? AMDGPUFunctionArgInfo::DISPATCH_PTR
10162	: AMDGPUFunctionArgInfo::QUEUE_PTR;
10163	return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: RegID);
10164	}
10165	case Intrinsic::amdgcn_implicitarg_ptr: {
10166	if (MFI->isEntryFunction())
10167	return getImplicitArgPtr(DAG, SL: DL);
10168	return getPreloadedValue(DAG, MFI: *MFI, VT,
10169	PVID: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
10170	}
10171	case Intrinsic::amdgcn_kernarg_segment_ptr: {
10172	if (!AMDGPU::isKernel(F: MF.getFunction())) {
10173	// This only makes sense to call in a kernel, so just lower to null.
10174	return DAG.getConstant(Val: `0`, DL, VT);
10175	}
10176
10177	return getPreloadedValue(DAG, MFI: *MFI, VT,
10178	PVID: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
10179	}
10180	case Intrinsic::amdgcn_dispatch_id: {
10181	return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: AMDGPUFunctionArgInfo::DISPATCH_ID);
10182	}
10183	case Intrinsic::amdgcn_rcp:
10184	return DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT, Operand: Op.getOperand(i: `1`));
10185	case Intrinsic::amdgcn_rsq:
10186	return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: `1`));
10187	case Intrinsic::amdgcn_rsq_legacy:
10188	if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10189	return emitRemovedIntrinsicError(DAG, DL, VT);
10190	return SDValue ();
10191	case Intrinsic::amdgcn_rcp_legacy:
10192	if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10193	return emitRemovedIntrinsicError(DAG, DL, VT);
10194	return DAG.getNode(Opcode: AMDGPUISD::RCP_LEGACY, DL, VT, Operand: Op.getOperand(i: `1`));
10195	case Intrinsic::amdgcn_rsq_clamp: {
10196	if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10197	return DAG.getNode(Opcode: AMDGPUISD::RSQ_CLAMP, DL, VT, Operand: Op.getOperand(i: `1`));
10198
10199	Type Type = VT.getTypeForEVT(Context&: DAG.getContext());
10200	APFloat Max = APFloat::getLargest(Sem: Type->getFltSemantics());
10201	APFloat Min = APFloat::getLargest(Sem: Type->getFltSemantics(), Negative: true);
10202
10203	SDValue Rsq = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: `1`));
10204	SDValue Tmp =
10205	DAG.getNode(Opcode: ISD::FMINNUM, DL, VT, N1: Rsq, N2: DAG.getConstantFP(Val: Max, DL, VT));
10206	return DAG.getNode(Opcode: ISD::FMAXNUM, DL, VT, N1: Tmp,
10207	N2: DAG.getConstantFP(Val: Min, DL, VT));
10208	}
10209	case Intrinsic::r600_read_ngroups_x:
10210	if (Subtarget->isAmdHsaOS())
10211	return emitNonHSAIntrinsicError(DAG, DL, VT);
10212
10213	return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
10214	Offset: SI::KernelInputOffsets::NGROUPS_X, Alignment: Align (`4`),
10215	Signed: false);
10216	case Intrinsic::r600_read_ngroups_y:
10217	if (Subtarget->isAmdHsaOS())
10218	return emitNonHSAIntrinsicError(DAG, DL, VT);
10219
10220	return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
10221	Offset: SI::KernelInputOffsets::NGROUPS_Y, Alignment: Align (`4`),
10222	Signed: false);
10223	case Intrinsic::r600_read_ngroups_z:
10224	if (Subtarget->isAmdHsaOS())
10225	return emitNonHSAIntrinsicError(DAG, DL, VT);
10226
10227	return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
10228	Offset: SI::KernelInputOffsets::NGROUPS_Z, Alignment: Align (`4`),
10229	Signed: false);
10230	case Intrinsic::r600_read_local_size_x:
10231	if (Subtarget->isAmdHsaOS())
10232	return emitNonHSAIntrinsicError(DAG, DL, VT);
10233
10234	return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
10235	Offset: SI::KernelInputOffsets::LOCAL_SIZE_X);
10236	case Intrinsic::r600_read_local_size_y:
10237	if (Subtarget->isAmdHsaOS())
10238	return emitNonHSAIntrinsicError(DAG, DL, VT);
10239
10240	return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
10241	Offset: SI::KernelInputOffsets::LOCAL_SIZE_Y);
10242	case Intrinsic::r600_read_local_size_z:
10243	if (Subtarget->isAmdHsaOS())
10244	return emitNonHSAIntrinsicError(DAG, DL, VT);
10245
10246	return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
10247	Offset: SI::KernelInputOffsets::LOCAL_SIZE_Z);
10248	case Intrinsic::amdgcn_workgroup_id_x:
10249	return lowerWorkGroupId(DAG, MFI: *MFI, VT,
10250	WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
10251	ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X,
10252	ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
10253	case Intrinsic::amdgcn_workgroup_id_y:
10254	return lowerWorkGroupId(DAG, MFI: *MFI, VT,
10255	WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
10256	ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y,
10257	ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
10258	case Intrinsic::amdgcn_workgroup_id_z:
10259	return lowerWorkGroupId(DAG, MFI: *MFI, VT,
10260	WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
10261	ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z,
10262	ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
10263	case Intrinsic::amdgcn_cluster_id_x:
10264	return Subtarget->hasClusters()
10265	? getPreloadedValue(DAG, MFI: *MFI, VT,
10266	PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_X)
10267	: DAG.getPOISON(VT);
10268	case Intrinsic::amdgcn_cluster_id_y:
10269	return Subtarget->hasClusters()
10270	? getPreloadedValue(DAG, MFI: *MFI, VT,
10271	PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y)
10272	: DAG.getPOISON(VT);
10273	case Intrinsic::amdgcn_cluster_id_z:
10274	return Subtarget->hasClusters()
10275	? getPreloadedValue(DAG, MFI: *MFI, VT,
10276	PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z)
10277	: DAG.getPOISON(VT);
10278	case Intrinsic::amdgcn_cluster_workgroup_id_x:
10279	return Subtarget->hasClusters()
10280	? getPreloadedValue(
10281	DAG, MFI: *MFI, VT,
10282	PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X)
10283	: DAG.getPOISON(VT);
10284	case Intrinsic::amdgcn_cluster_workgroup_id_y:
10285	return Subtarget->hasClusters()
10286	? getPreloadedValue(
10287	DAG, MFI: *MFI, VT,
10288	PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y)
10289	: DAG.getPOISON(VT);
10290	case Intrinsic::amdgcn_cluster_workgroup_id_z:
10291	return Subtarget->hasClusters()
10292	? getPreloadedValue(
10293	DAG, MFI: *MFI, VT,
10294	PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z)
10295	: DAG.getPOISON(VT);
10296	case Intrinsic::amdgcn_cluster_workgroup_flat_id:
10297	return Subtarget->hasClusters()
10298	? lowerConstHwRegRead(DAG, Op, HwReg: AMDGPU::Hwreg::ID_IB_STS2, LowBit: `21`, Width: `4`)
10299	: SDValue ();
10300	case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
10301	return Subtarget->hasClusters()
10302	? getPreloadedValue(
10303	DAG, MFI: *MFI, VT,
10304	PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X)
10305	: DAG.getPOISON(VT);
10306	case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
10307	return Subtarget->hasClusters()
10308	? getPreloadedValue(
10309	DAG, MFI: *MFI, VT,
10310	PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y)
10311	: DAG.getPOISON(VT);
10312	case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
10313	return Subtarget->hasClusters()
10314	? getPreloadedValue(
10315	DAG, MFI: *MFI, VT,
10316	PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z)
10317	: DAG.getPOISON(VT);
10318	case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
10319	return Subtarget->hasClusters()
10320	? getPreloadedValue(
10321	DAG, MFI: *MFI, VT,
10322	PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID)
10323	: DAG.getPOISON(VT);
10324	case Intrinsic::amdgcn_wave_id:
10325	return lowerWaveID(DAG, Op);
10326	case Intrinsic::amdgcn_lds_kernel_id: {
10327	if (MFI->isEntryFunction())
10328	return getLDSKernelId(DAG, SL: DL);
10329	return getPreloadedValue(DAG, MFI: *MFI, VT,
10330	PVID: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
10331	}
10332	case Intrinsic::amdgcn_workitem_id_x:
10333	return lowerWorkitemID(DAG, Op, Dim: `0`, Arg: MFI->getArgInfo().WorkItemIDX);
10334	case Intrinsic::amdgcn_workitem_id_y:
10335	return lowerWorkitemID(DAG, Op, Dim: `1`, Arg: MFI->getArgInfo().WorkItemIDY);
10336	case Intrinsic::amdgcn_workitem_id_z:
10337	return lowerWorkitemID(DAG, Op, Dim: `2`, Arg: MFI->getArgInfo().WorkItemIDZ);
10338	case Intrinsic::amdgcn_wavefrontsize:
10339	return DAG.getConstant(Val: MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
10340	DL: SDLoc (Op), VT: MVT::i32);
10341	case Intrinsic::amdgcn_s_buffer_load: {
10342	unsigned CPol = Op.getConstantOperandVal(i: `3`);
10343	// s_buffer_load, because of how it's optimized, can't be volatile
10344	// so reject ones with the volatile bit set.
10345	if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
10346	? AMDGPU::CPol::ALL
10347	: AMDGPU::CPol::ALL_pregfx12))
10348	return Op;
10349	return lowerSBuffer(VT, DL, Rsrc: Op.getOperand(i: `1`), Offset: Op.getOperand(i: `2`),
10350	CachePolicy: Op.getOperand(i: `3`), DAG);
10351	}
10352	case Intrinsic::amdgcn_fdiv_fast:
10353	return lowerFDIV_FAST(Op, DAG);
10354	case Intrinsic::amdgcn_sin:
10355	return DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL, VT, Operand: Op.getOperand(i: `1`));
10356
10357	case Intrinsic::amdgcn_cos:
10358	return DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL, VT, Operand: Op.getOperand(i: `1`));
10359
10360	case Intrinsic::amdgcn_mul_u24:
10361	return DAG.getNode(Opcode: AMDGPUISD::MUL_U24, DL, VT, N1: Op.getOperand(i: `1`),
10362	N2: Op.getOperand(i: `2`));
10363	case Intrinsic::amdgcn_mul_i24:
10364	return DAG.getNode(Opcode: AMDGPUISD::MUL_I24, DL, VT, N1: Op.getOperand(i: `1`),
10365	N2: Op.getOperand(i: `2`));
10366
10367	case Intrinsic::amdgcn_log_clamp: {
10368	if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10369	return SDValue ();
10370
10371	return emitRemovedIntrinsicError(DAG, DL, VT);
10372	}
10373	case Intrinsic::amdgcn_fract:
10374	return DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: Op.getOperand(i: `1`));
10375
10376	case Intrinsic::amdgcn_class:
10377	return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT, N1: Op.getOperand(i: `1`),
10378	N2: Op.getOperand(i: `2`));
10379	case Intrinsic::amdgcn_div_fmas:
10380	return DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL, VT, N1: Op.getOperand(i: `1`),
10381	N2: Op.getOperand(i: `2`), N3: Op.getOperand(i: `3`), N4: Op.getOperand(i: `4`));
10382
10383	case Intrinsic::amdgcn_div_fixup:
10384	return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL, VT, N1: Op.getOperand(i: `1`),
10385	N2: Op.getOperand(i: `2`), N3: Op.getOperand(i: `3`));
10386
10387	case Intrinsic::amdgcn_div_scale: {
10388	const ConstantSDNode *Param = cast<ConstantSDNode>(Val: Op.getOperand(i: `3`));
10389
10390	// Translate to the operands expected by the machine instruction. The
10391	// first parameter must be the same as the first instruction.
10392	SDValue Numerator = Op.getOperand(i: `1`);
10393	SDValue Denominator = Op.getOperand(i: `2`);
10394
10395	// Note this order is opposite of the machine instruction's operations,
10396	// which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
10397	// intrinsic has the numerator as the first operand to match a normal
10398	// division operation.
10399
10400	SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
10401
10402	return DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL, VTList: Op ->getVTList(), N1: Src0,
10403	N2: Denominator, N3: Numerator);
10404	}
10405	case Intrinsic::amdgcn_icmp: {
10406	// There is a Pat that handles this variant, so return it as-is.
10407	if (Op.getOperand(i: `1`).getValueType() == MVT::i1 &&
10408	Op.getConstantOperandVal(i: `2`) == `0` &&
10409	Op.getConstantOperandVal(i: `3`) == ICmpInst::Predicate::ICMP_NE)
10410	return Op;
10411	return lowerICMPIntrinsic(TLI: *this, N: Op.getNode(), DAG);
10412	}
10413	case Intrinsic::amdgcn_fcmp: {
10414	return lowerFCMPIntrinsic(TLI: *this, N: Op.getNode(), DAG);
10415	}
10416	case Intrinsic::amdgcn_ballot:
10417	return lowerBALLOTIntrinsic(TLI: *this, N: Op.getNode(), DAG);
10418	case Intrinsic::amdgcn_fmed3:
10419	return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL, VT, N1: Op.getOperand(i: `1`),
10420	N2: Op.getOperand(i: `2`), N3: Op.getOperand(i: `3`));
10421	case Intrinsic::amdgcn_fdot2:
10422	return DAG.getNode(Opcode: AMDGPUISD::FDOT2, DL, VT, N1: Op.getOperand(i: `1`),
10423	N2: Op.getOperand(i: `2`), N3: Op.getOperand(i: `3`), N4: Op.getOperand(i: `4`));
10424	case Intrinsic::amdgcn_fmul_legacy:
10425	return DAG.getNode(Opcode: AMDGPUISD::FMUL_LEGACY, DL, VT, N1: Op.getOperand(i: `1`),
10426	N2: Op.getOperand(i: `2`));
10427	case Intrinsic::amdgcn_sffbh:
10428	return DAG.getNode(Opcode: AMDGPUISD::FFBH_I32, DL, VT, Operand: Op.getOperand(i: `1`));
10429	case Intrinsic::amdgcn_sbfe:
10430	return DAG.getNode(Opcode: AMDGPUISD::BFE_I32, DL, VT, N1: Op.getOperand(i: `1`),
10431	N2: Op.getOperand(i: `2`), N3: Op.getOperand(i: `3`));
10432	case Intrinsic::amdgcn_ubfe:
10433	return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL, VT, N1: Op.getOperand(i: `1`),
10434	N2: Op.getOperand(i: `2`), N3: Op.getOperand(i: `3`));
10435	case Intrinsic::amdgcn_cvt_pkrtz:
10436	case Intrinsic::amdgcn_cvt_pknorm_i16:
10437	case Intrinsic::amdgcn_cvt_pknorm_u16:
10438	case Intrinsic::amdgcn_cvt_pk_i16:
10439	case Intrinsic::amdgcn_cvt_pk_u16: {
10440	// FIXME: Stop adding cast if v2f16/v2i16 are legal.
10441	EVT VT = Op.getValueType();
10442	unsigned Opcode;
10443
10444	if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10445	Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10446	else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10447	Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10448	else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10449	Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10450	else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10451	Opcode = AMDGPUISD::CVT_PK_I16_I32;
10452	else
10453	Opcode = AMDGPUISD::CVT_PK_U16_U32;
10454
10455	if (isTypeLegal(VT))
10456	return DAG.getNode(Opcode, DL, VT, N1: Op.getOperand(i: `1`), N2: Op.getOperand(i: `2`));
10457
10458	SDValue Node =
10459	DAG.getNode(Opcode, DL, VT: MVT::i32, N1: Op.getOperand(i: `1`), N2: Op.getOperand(i: `2`));
10460	return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Node);
10461	}
10462	case Intrinsic::amdgcn_fmad_ftz:
10463	return DAG.getNode(Opcode: AMDGPUISD::FMAD_FTZ, DL, VT, N1: Op.getOperand(i: `1`),
10464	N2: Op.getOperand(i: `2`), N3: Op.getOperand(i: `3`));
10465
10466	case Intrinsic::amdgcn_if_break:
10467	return SDValue (DAG.getMachineNode(Opcode: AMDGPU::SI_IF_BREAK, dl: DL, VT,
10468	Op1: Op ->getOperand(Num: `1`), Op2: Op ->getOperand(Num: `2`)),
10469	`0`);
10470
10471	case Intrinsic::amdgcn_groupstaticsize: {
10472	Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
10473	if (OS == Triple::AMDHSA \|\| OS == Triple::AMDPAL)
10474	return Op;
10475
10476	const Module *M = MF.getFunction().getParent();
10477	const GlobalValue *GV =
10478	Intrinsic::getDeclarationIfExists(M, id: Intrinsic::amdgcn_groupstaticsize);
10479	SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: `0`,
10480	TargetFlags: SIInstrInfo::MO_ABS32_LO);
10481	return {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: GA), `0`};
10482	}
10483	case Intrinsic::amdgcn_is_shared:
10484	case Intrinsic::amdgcn_is_private: {
10485	SDLoc SL(Op);
10486	SDValue SrcVec =
10487	DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: `1`));
10488	SDValue SrcHi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: SrcVec,
10489	N2: DAG.getConstant(Val: `1`, DL: SL, VT: MVT::i32));
10490
10491	unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10492	? AMDGPUAS::LOCAL_ADDRESS
10493	: AMDGPUAS::PRIVATE_ADDRESS;
10494	if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10495	Subtarget->hasGloballyAddressableScratch()) {
10496	SDValue FlatScratchBaseHi(
10497	DAG.getMachineNode(
10498	Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
10499	Op1: DAG.getRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, VT: MVT::i32)),
10500	`0`);
10501	// Test bits 63..58 against the aperture address.
10502	return DAG.getSetCC(
10503	DL: SL, VT: MVT::i1,
10504	LHS: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: SrcHi, N2: FlatScratchBaseHi),
10505	RHS: DAG.getConstant(Val: `1u` << `26`, DL: SL, VT: MVT::i32), Cond: ISD::SETULT);
10506	}
10507
10508	SDValue Aperture = getSegmentAperture(AS, DL: SL, DAG);
10509	return DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: SrcHi, RHS: Aperture, Cond: ISD::SETEQ);
10510	}
10511	case Intrinsic::amdgcn_perm:
10512	return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: Op.getOperand(i: `1`),
10513	N2: Op.getOperand(i: `2`), N3: Op.getOperand(i: `3`));
10514	case Intrinsic::amdgcn_reloc_constant: {
10515	Module *M = MF.getFunction().getParent();
10516	const MDNode *Metadata = cast<MDNodeSDNode>(Val: Op.getOperand(i: `1`))->getMD();
10517	auto SymbolName = cast<MDString>(Val: Metadata->getOperand(I: `0`))->getString();
10518	auto *RelocSymbol = cast<GlobalVariable>(
10519	Val: M->getOrInsertGlobal(Name: SymbolName, Ty: Type::getInt32Ty(C&: M->getContext())));
10520	SDValue GA = DAG.getTargetGlobalAddress(GV: RelocSymbol, DL, VT: MVT::i32, offset: `0`,
10521	TargetFlags: SIInstrInfo::MO_ABS32_LO);
10522	return {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: GA), `0`};
10523	}
10524	case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10525	case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10526	case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10527	case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10528	case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10529	case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10530	case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10531	case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10532	if (Op.getOperand(i: `4`).getValueType() == MVT::i32)
10533	return SDValue ();
10534
10535	SDLoc SL(Op);
10536	auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: `4`), DL: SL, VT: MVT::i32);
10537	return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
10538	N1: Op.getOperand(i: `0`), N2: Op.getOperand(i: `1`), N3: Op.getOperand(i: `2`),
10539	N4: Op.getOperand(i: `3`), N5: IndexKeyi32);
10540	}
10541	case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10542	case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10543	case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10544	case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10545	case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10546	case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10547	case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10548	case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10549	if (Op.getOperand(i: `4`).getValueType() == MVT::i64)
10550	return SDValue ();
10551
10552	SDLoc SL(Op);
10553	auto IndexKeyi64 =
10554	Op.getOperand(i: `4`).getValueType() == MVT::v2i32
10555	? DAG.getBitcast(VT: MVT::i64, V: Op.getOperand(i: `4`))
10556	: DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: `4`), DL: SL, VT: MVT::i64);
10557	return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
10558	Ops: {Op.getOperand(i: `0`), Op.getOperand(i: `1`), Op.getOperand(i: `2`),
10559	Op.getOperand(i: `3`), IndexKeyi64, Op.getOperand(i: `5`),
10560	Op.getOperand(i: `6`)});
10561	}
10562	case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10563	case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10564	case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10565	case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10566	case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10567	case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10568	EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10569	? MVT::i64
10570	: MVT::i32;
10571	if (Op.getOperand(i: `6`).getValueType() == IndexKeyTy)
10572	return SDValue ();
10573
10574	SDLoc SL(Op);
10575	auto IndexKey =
10576	Op.getOperand(i: `6`).getValueType().isVector()
10577	? DAG.getBitcast(VT: IndexKeyTy, V: Op.getOperand(i: `6`))
10578	: DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: `6`), DL: SL, VT: IndexKeyTy);
10579	SmallVector<SDValue> Args{
10580	Op.getOperand(i: `0`), Op.getOperand(i: `1`), Op.getOperand(i: `2`),
10581	Op.getOperand(i: `3`), Op.getOperand(i: `4`), Op.getOperand(i: `5`),
10582	IndexKey, Op.getOperand(i: `7`), Op.getOperand(i: `8`)};
10583	if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
10584	Args.push_back(Elt: Op.getOperand(i: `9`));
10585	return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(), Ops: Args);
10586	}
10587	case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10588	case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10589	case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10590	if (Op.getOperand(i: `6`).getValueType() == MVT::i32)
10591	return SDValue ();
10592
10593	SDLoc SL(Op);
10594	auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: `6`), DL: SL, VT: MVT::i32);
10595	return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
10596	Ops: {Op.getOperand(i: `0`), Op.getOperand(i: `1`), Op.getOperand(i: `2`),
10597	Op.getOperand(i: `3`), Op.getOperand(i: `4`), Op.getOperand(i: `5`),
10598	IndexKeyi32, Op.getOperand(i: `7`)});
10599	}
10600	case Intrinsic::amdgcn_addrspacecast_nonnull:
10601	return lowerADDRSPACECAST(Op, DAG);
10602	case Intrinsic::amdgcn_readlane:
10603	case Intrinsic::amdgcn_readfirstlane:
10604	case Intrinsic::amdgcn_writelane:
10605	case Intrinsic::amdgcn_permlane16:
10606	case Intrinsic::amdgcn_permlanex16:
10607	case Intrinsic::amdgcn_permlane64:
10608	case Intrinsic::amdgcn_set_inactive:
10609	case Intrinsic::amdgcn_set_inactive_chain_arg:
10610	case Intrinsic::amdgcn_mov_dpp8:
10611	case Intrinsic::amdgcn_update_dpp:
10612	return lowerLaneOp(TLI: *this, N: Op.getNode(), DAG);
10613	case Intrinsic::amdgcn_dead: {
10614	SmallVector<SDValue, `8`> Poisons;
10615	for (const EVT ValTy : Op.getNode()->values())
10616	Poisons.push_back(Elt: DAG.getPOISON(VT: ValTy));
10617	return DAG.getMergeValues(Ops: Poisons, dl: SDLoc (Op));
10618	}
10619	case Intrinsic::amdgcn_wave_shuffle:
10620	return lowerWaveShuffle(TLI: *this, N: Op.getNode(), DAG);
10621	default:
10622	if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10623	AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID))
10624	return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: false);
10625
10626	return Op;
10627	}
10628	}
10629
10630	// On targets that do not support a constant in the soffset field, turn a zero
10631	// soffset into SGPR_NULL to avoid generating an extra s_mov with zero.
//
// SOffset   - candidate soffset operand for a buffer operation.
// DAG       - used only to create the SGPR_NULL register node.
// Subtarget - queried through hasRestrictedSOffset().
//
// Returns an i32 register node for AMDGPU::SGPR_NULL when the subtarget
// reports a restricted soffset and SOffset is a null constant; in every other
// case SOffset is returned unchanged.
10632	static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
10633	const GCNSubtarget *Subtarget) {
10634	if (Subtarget->hasRestrictedSOffset() && isNullConstant(V: SOffset))
10635	return DAG.getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32);
10636	return SOffset;
10637	}
10638
// Rewrite a raw buffer atomic intrinsic node into a memory intrinsic node with
// opcode NewOpcode.
//
// Operands read from Op:
//   0 - chain
//   2 - vdata (its value type also becomes the memory VT)
//   3 - rsrc, passed through bufferRsrcPtrToVector
//   4 - combined offset, split into a (voffset, offset) pair by
//       splitBufferOffsets
//   5 - soffset, passed through selectSOffset
//   6 - cachepolicy
// Operand 1 is not read here.
//
// The raw form has no index operand: vindex is the constant 0 and idxen is 0.
// Op must be a MemSDNode; its memory operand and VT list are reused for the
// new node.
10639	SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10640	SelectionDAG &DAG,
10641	unsigned NewOpcode) const {
10642	SDLoc DL(Op);
10643
10644	SDValue VData = Op.getOperand(i: `2`);
10645	SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: `3`), DAG);
10646	auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: `4`), DAG);
10647	auto SOffset = selectSOffset(SOffset: Op.getOperand(i: `5`), DAG, Subtarget);
// Operand list for the new node; the trailing comments name each slot.
10648	SDValue Ops[] = {
10649	Op.getOperand(i: `0`), // Chain
10650	VData, // vdata
10651	Rsrc, // rsrc
10652	DAG.getConstant(Val: `0`, DL, VT: MVT::i32), // vindex
10653	VOffset, // voffset
10654	SOffset, // soffset
10655	Offset, // offset
10656	Op.getOperand(i: `6`), // cachepolicy
10657	DAG.getTargetConstant(Val: `0`, DL, VT: MVT::i1), // idxen
10658	};
10659
// The original node supplies the memory operand for the replacement.
10660	auto *M = cast<MemSDNode>(Val&: Op);
10661
// The memory VT is taken from the data operand rather than from the node.
10662	EVT MemVT = VData.getValueType();
10663	return DAG.getMemIntrinsicNode(Opcode: NewOpcode, dl: DL, VTList: Op ->getVTList(), Ops, MemVT,
10664	MMO: M->getMemOperand());
10665	}
10666
// Rewrite a struct buffer atomic intrinsic node into a memory intrinsic node
// with opcode NewOpcode.
//
// Operands read from Op:
//   0 - chain
//   2 - vdata (its value type also becomes the memory VT)
//   3 - rsrc, passed through bufferRsrcPtrToVector
//   4 - vindex, forwarded unchanged
//   5 - combined offset, split into a (voffset, offset) pair by
//       splitBufferOffsets
//   6 - soffset, passed through selectSOffset
//   7 - cachepolicy
// Operand 1 is not read here.
//
// The struct form carries an explicit index operand, so idxen is 1.
// Op must be a MemSDNode; its memory operand and VT list are reused for the
// new node.
10667	SDValue
10668	SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10669	unsigned NewOpcode) const {
10670	SDLoc DL(Op);
10671
10672	SDValue VData = Op.getOperand(i: `2`);
10673	SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: `3`), DAG);
10674	auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: `5`), DAG);
10675	auto SOffset = selectSOffset(SOffset: Op.getOperand(i: `6`), DAG, Subtarget);
// Operand list for the new node; the trailing comments name each slot.
10676	SDValue Ops[] = {
10677	Op.getOperand(i: `0`), // Chain
10678	VData, // vdata
10679	Rsrc, // rsrc
10680	Op.getOperand(i: `4`), // vindex
10681	VOffset, // voffset
10682	SOffset, // soffset
10683	Offset, // offset
10684	Op.getOperand(i: `7`), // cachepolicy
10685	DAG.getTargetConstant(Val: `1`, DL, VT: MVT::i1), // idxen
10686	};
10687
// The original node supplies the memory operand for the replacement.
10688	auto *M = cast<MemSDNode>(Val&: Op);
10689
// The memory VT is taken from the data operand rather than from the node.
10690	EVT MemVT = VData.getValueType();
10691	return DAG.getMemIntrinsicNode(Opcode: NewOpcode, dl: DL, VTList: Op ->getVTList(), Ops, MemVT,
10692	MMO: M->getMemOperand());
10693	}
10694
10695	SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10696	SelectionDAG &DAG) const {
10697	unsigned IntrID = Op.getConstantOperandVal(i: `1`);
10698	SDLoc DL(Op);
10699
10700	switch (IntrID) {
10701	case Intrinsic::amdgcn_ds_ordered_add:
10702	case Intrinsic::amdgcn_ds_ordered_swap: {
10703	MemSDNode *M = cast<MemSDNode>(Val&: Op);
10704	SDValue Chain = M->getOperand(Num: `0`);
10705	SDValue M0 = M->getOperand(Num: `2`);
10706	SDValue Value = M->getOperand(Num: `3`);
10707	unsigned IndexOperand = M->getConstantOperandVal(Num: `7`);
10708	unsigned WaveRelease = M->getConstantOperandVal(Num: `8`);
10709	unsigned WaveDone = M->getConstantOperandVal(Num: `9`);
10710
10711	unsigned OrderedCountIndex = IndexOperand & `0x3f`;
10712	IndexOperand &= ~`0x3f`;
10713	unsigned CountDw = `0`;
10714
10715	if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10716	CountDw = (IndexOperand >> `24`) & `0xf`;
10717	IndexOperand &= ~(`0xf` << `24`);
10718
10719	if (CountDw < `1` \|\| CountDw > `4`) {
10720	const Function &Fn = DAG.getMachineFunction().getFunction();
10721	DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported (
10722	Fn, "ds_ordered_count: dword count must be between 1 and 4",
10723	DL.getDebugLoc()));
10724	CountDw = `1`;
10725	}
10726	}
10727
10728	if (IndexOperand) {
10729	const Function &Fn = DAG.getMachineFunction().getFunction();
10730	DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported (
10731	Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10732	}
10733
10734	if (WaveDone && !WaveRelease) {
10735	// TODO: Move this to IR verifier
10736	const Function &Fn = DAG.getMachineFunction().getFunction();
10737	DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported (
10738	Fn, "ds_ordered_count: wave_done requires wave_release",
10739	DL.getDebugLoc()));
10740	}
10741
10742	unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? `0` : `1`;
10743	unsigned ShaderType =
10744	SIInstrInfo::getDSShaderTypeValue(MF: DAG.getMachineFunction());
10745	unsigned Offset0 = OrderedCountIndex << `2`;
10746	unsigned Offset1 = WaveRelease \| (WaveDone << `1`) \| (Instruction << `4`);
10747
10748	if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10749	Offset1 \|= (CountDw - `1`) << `6`;
10750
10751	if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10752	Offset1 \|= ShaderType << `2`;
10753
10754	unsigned Offset = Offset0 \| (Offset1 << `8`);
10755
10756	SDValue Ops[] = {
10757	Chain, Value, DAG.getTargetConstant(Val: Offset, DL, VT: MVT::i16),
10758	copyToM0(DAG, Chain, DL, V: M0).getValue(R: `1`), // Glue
10759	};
10760	return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::DS_ORDERED_COUNT, dl: DL,
10761	VTList: M->getVTList(), Ops, MemVT: M->getMemoryVT(),
10762	MMO: M->getMemOperand());
10763	}
10764	case Intrinsic::amdgcn_raw_buffer_load:
10765	case Intrinsic::amdgcn_raw_ptr_buffer_load:
10766	case Intrinsic::amdgcn_raw_atomic_buffer_load:
10767	case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10768	case Intrinsic::amdgcn_raw_buffer_load_format:
10769	case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10770	const bool IsFormat =
10771	IntrID == Intrinsic::amdgcn_raw_buffer_load_format \|\|
10772	IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10773
10774	SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: `2`), DAG);
10775	auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: `3`), DAG);
10776	auto SOffset = selectSOffset(SOffset: Op.getOperand(i: `4`), DAG, Subtarget);
10777	SDValue Ops[] = {
10778	Op.getOperand(i: `0`), // Chain
10779	Rsrc, // rsrc
10780	DAG.getConstant(Val: `0`, DL, VT: MVT::i32), // vindex
10781	VOffset, // voffset
10782	SOffset, // soffset
10783	Offset, // offset
10784	Op.getOperand(i: `5`), // cachepolicy, swizzled buffer
10785	DAG.getTargetConstant(Val: `0`, DL, VT: MVT::i1), // idxen
10786	};
10787
10788	auto *M = cast<MemSDNode>(Val&: Op);
10789	return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10790	}
10791	case Intrinsic::amdgcn_struct_buffer_load:
10792	case Intrinsic::amdgcn_struct_ptr_buffer_load:
10793	case Intrinsic::amdgcn_struct_buffer_load_format:
10794	case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10795	case Intrinsic::amdgcn_struct_atomic_buffer_load:
10796	case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10797	const bool IsFormat =
10798	IntrID == Intrinsic::amdgcn_struct_buffer_load_format \|\|
10799	IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10800
10801	SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: `2`), DAG);
10802	auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: `4`), DAG);
10803	auto SOffset = selectSOffset(SOffset: Op.getOperand(i: `5`), DAG, Subtarget);
10804	SDValue Ops[] = {
10805	Op.getOperand(i: `0`), // Chain
10806	Rsrc, // rsrc
10807	Op.getOperand(i: `3`), // vindex
10808	VOffset, // voffset
10809	SOffset, // soffset
10810	Offset, // offset
10811	Op.getOperand(i: `6`), // cachepolicy, swizzled buffer
10812	DAG.getTargetConstant(Val: `1`, DL, VT: MVT::i1), // idxen
10813	};
10814
10815	return lowerIntrinsicLoad(M: cast<MemSDNode>(Val&: Op), IsFormat, DAG, Ops);
10816	}
10817	case Intrinsic::amdgcn_raw_tbuffer_load:
10818	case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10819	MemSDNode *M = cast<MemSDNode>(Val&: Op);
10820	EVT LoadVT = Op.getValueType();
10821	SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: `2`), DAG);
10822	auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: `3`), DAG);
10823	auto SOffset = selectSOffset(SOffset: Op.getOperand(i: `4`), DAG, Subtarget);
10824
10825	SDValue Ops[] = {
10826	Op.getOperand(i: `0`), // Chain
10827	Rsrc, // rsrc
10828	DAG.getConstant(Val: `0`, DL, VT: MVT::i32), // vindex
10829	VOffset, // voffset
10830	SOffset, // soffset
10831	Offset, // offset
10832	Op.getOperand(i: `5`), // format
10833	Op.getOperand(i: `6`), // cachepolicy, swizzled buffer
10834	DAG.getTargetConstant(Val: `0`, DL, VT: MVT::i1), // idxen
10835	};
10836
10837	if (LoadVT.getScalarType() == MVT::f16)
10838	return adjustLoadValueType(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10839	Ops);
10840	return getMemIntrinsicNode(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10841	VTList: Op ->getVTList(), Ops, MemVT: LoadVT, MMO: M->getMemOperand(),
10842	DAG);
10843	}
10844	case Intrinsic::amdgcn_struct_tbuffer_load:
10845	case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10846	MemSDNode *M = cast<MemSDNode>(Val&: Op);
10847	EVT LoadVT = Op.getValueType();
10848	SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: `2`), DAG);
10849	auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: `4`), DAG);
10850	auto SOffset = selectSOffset(SOffset: Op.getOperand(i: `5`), DAG, Subtarget);
10851
10852	SDValue Ops[] = {
10853	Op.getOperand(i: `0`), // Chain
10854	Rsrc, // rsrc
10855	Op.getOperand(i: `3`), // vindex
10856	VOffset, // voffset
10857	SOffset, // soffset
10858	Offset, // offset
10859	Op.getOperand(i: `6`), // format
10860	Op.getOperand(i: `7`), // cachepolicy, swizzled buffer
10861	DAG.getTargetConstant(Val: `1`, DL, VT: MVT::i1), // idxen
10862	};
10863
10864	if (LoadVT.getScalarType() == MVT::f16)
10865	return adjustLoadValueType(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10866	Ops);
10867	return getMemIntrinsicNode(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10868	VTList: Op ->getVTList(), Ops, MemVT: LoadVT, MMO: M->getMemOperand(),
10869	DAG);
10870	}
10871	case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10872	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10873	return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD);
10874	case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10875	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10876	return lowerStructBufferAtomicIntrin(Op, DAG,
10877	NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD);
10878	case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10879	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10880	return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN);
10881	case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10882	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10883	return lowerStructBufferAtomicIntrin(Op, DAG,
10884	NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN);
10885	case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10886	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10887	return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX);
10888	case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10889	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10890	return lowerStructBufferAtomicIntrin(Op, DAG,
10891	NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX);
10892	case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10893	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10894	return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP);
10895	case Intrinsic::amdgcn_raw_buffer_atomic_add:
10896	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10897	return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD);
10898	case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10899	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10900	return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB);
10901	case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10902	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10903	return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN);
10904	case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10905	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10906	return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN);
10907	case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10908	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10909	return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX);
10910	case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10911	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10912	return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX);
10913	case Intrinsic::amdgcn_raw_buffer_atomic_and:
10914	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10915	return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND);
10916	case Intrinsic::amdgcn_raw_buffer_atomic_or:
10917	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10918	return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR);
10919	case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10920	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10921	return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR);
10922	case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10923	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10924	return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC);
10925	case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10926	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10927	return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC);
10928	case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10929	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10930	return lowerStructBufferAtomicIntrin(Op, DAG,
10931	NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP);
10932	case Intrinsic::amdgcn_struct_buffer_atomic_add:
10933	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10934	return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD);
10935	case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10936	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10937	return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB);
10938	case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10939	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10940	return lowerStructBufferAtomicIntrin(Op, DAG,
10941	NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN);
10942	case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10943	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10944	return lowerStructBufferAtomicIntrin(Op, DAG,
10945	NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN);
10946	case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10947	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10948	return lowerStructBufferAtomicIntrin(Op, DAG,
10949	NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX);
10950	case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10951	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10952	return lowerStructBufferAtomicIntrin(Op, DAG,
10953	NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX);
10954	case Intrinsic::amdgcn_struct_buffer_atomic_and:
10955	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10956	return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND);
10957	case Intrinsic::amdgcn_struct_buffer_atomic_or:
10958	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10959	return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR);
10960	case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10961	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10962	return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR);
10963	case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10964	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10965	return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC);
10966	case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10967	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10968	return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC);
10969	case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
10970	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
10971	return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_CSUB);
10972	case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
10973	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
10974	return lowerStructBufferAtomicIntrin(Op, DAG,
10975	NewOpcode: AMDGPUISD::BUFFER_ATOMIC_CSUB);
10976	case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10977	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
10978	return lowerRawBufferAtomicIntrin(Op, DAG,
10979	NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10980	case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10981	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
10982	return lowerStructBufferAtomicIntrin(Op, DAG,
10983	NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10984	case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10985	case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10986	SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: `4`), DAG);
10987	auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: `5`), DAG);
10988	auto SOffset = selectSOffset(SOffset: Op.getOperand(i: `6`), DAG, Subtarget);
10989	SDValue Ops[] = {
10990	Op.getOperand(i: `0`), // Chain
10991	Op.getOperand(i: `2`), // src
10992	Op.getOperand(i: `3`), // cmp
10993	Rsrc, // rsrc
10994	DAG.getConstant(Val: `0`, DL, VT: MVT::i32), // vindex
10995	VOffset, // voffset
10996	SOffset, // soffset
10997	Offset, // offset
10998	Op.getOperand(i: `7`), // cachepolicy
10999	DAG.getTargetConstant(Val: `0`, DL, VT: MVT::i1), // idxen
11000	};
11001	EVT VT = Op.getValueType();
11002	auto *M = cast<MemSDNode>(Val&: Op);
11003
11004	return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, dl: DL,
11005	VTList: Op ->getVTList(), Ops, MemVT: VT,
11006	MMO: M->getMemOperand());
11007	}
11008	case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
11009	case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
11010	SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op ->getOperand(Num: `4`), DAG);
11011	auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: `6`), DAG);
11012	auto SOffset = selectSOffset(SOffset: Op.getOperand(i: `7`), DAG, Subtarget);
11013	SDValue Ops[] = {
11014	Op.getOperand(i: `0`), // Chain
11015	Op.getOperand(i: `2`), // src
11016	Op.getOperand(i: `3`), // cmp
11017	Rsrc, // rsrc
11018	Op.getOperand(i: `5`), // vindex
11019	VOffset, // voffset
11020	SOffset, // soffset
11021	Offset, // offset
11022	Op.getOperand(i: `8`), // cachepolicy
11023	DAG.getTargetConstant(Val: `1`, DL, VT: MVT::i1), // idxen
11024	};
11025	EVT VT = Op.getValueType();
11026	auto *M = cast<MemSDNode>(Val&: Op);
11027
11028	return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, dl: DL,
11029	VTList: Op ->getVTList(), Ops, MemVT: VT,
11030	MMO: M->getMemOperand());
11031	}
11032	case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
11033	case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
11034	MemSDNode *M = cast<MemSDNode>(Val&: Op);
11035	SDValue NodePtr = M->getOperand(Num: `2`);
11036	SDValue RayExtent = M->getOperand(Num: `3`);
11037	SDValue InstanceMask = M->getOperand(Num: `4`);
11038	SDValue RayOrigin = M->getOperand(Num: `5`);
11039	SDValue RayDir = M->getOperand(Num: `6`);
11040	SDValue Offsets = M->getOperand(Num: `7`);
11041	SDValue TDescr = M->getOperand(Num: `8`);
11042
11043	assert(NodePtr.getValueType() == MVT::i64);
11044	assert(RayDir.getValueType() == MVT::v3f32);
11045
11046	if (!Subtarget->hasBVHDualAndBVH8Insts()) {
11047	emitRemovedIntrinsicError(DAG, DL, VT: Op.getValueType());
11048	return SDValue ();
11049	}
11050
11051	bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
11052	const unsigned NumVDataDwords = `10`;
11053	const unsigned NumVAddrDwords = IsBVH8 ? `11` : `12`;
11054	int Opcode = AMDGPU::getMIMGOpcode(
11055	BaseOpcode: IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
11056	: AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
11057	MIMGEncoding: AMDGPU::MIMGEncGfx12, VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
11058	assert(Opcode != -`1`);
11059
11060	SmallVector<SDValue, `7`> Ops;
11061	Ops.push_back(Elt: NodePtr);
11062	Ops.push_back(Elt: DAG.getBuildVector(
11063	VT: MVT::v2i32, DL,
11064	Ops: {DAG.getBitcast(VT: MVT::i32, V: RayExtent),
11065	DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: InstanceMask)}));
11066	Ops.push_back(Elt: RayOrigin);
11067	Ops.push_back(Elt: RayDir);
11068	Ops.push_back(Elt: Offsets);
11069	Ops.push_back(Elt: TDescr);
11070	Ops.push_back(Elt: M->getChain());
11071
11072	auto *NewNode = DAG.getMachineNode(Opcode, dl: DL, VTs: M->getVTList(), Ops);
11073	MachineMemOperand *MemRef = M->getMemOperand();
11074	DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
11075	return SDValue (NewNode, `0`);
11076	}
11077	case Intrinsic::amdgcn_image_bvh_intersect_ray: {
11078	MemSDNode *M = cast<MemSDNode>(Val&: Op);
11079	SDValue NodePtr = M->getOperand(Num: `2`);
11080	SDValue RayExtent = M->getOperand(Num: `3`);
11081	SDValue RayOrigin = M->getOperand(Num: `4`);
11082	SDValue RayDir = M->getOperand(Num: `5`);
11083	SDValue RayInvDir = M->getOperand(Num: `6`);
11084	SDValue TDescr = M->getOperand(Num: `7`);
11085
11086	assert(NodePtr.getValueType() == MVT::i32 \|\|
11087	NodePtr.getValueType() == MVT::i64);
11088	assert(RayDir.getValueType() == MVT::v3f16 \|\|
11089	RayDir.getValueType() == MVT::v3f32);
11090
11091	if (!Subtarget->hasGFX10_AEncoding()) {
11092	emitRemovedIntrinsicError(DAG, DL, VT: Op.getValueType());
11093	return SDValue ();
11094	}
11095
11096	const bool IsGFX11 = AMDGPU::isGFX11(STI: *Subtarget);
11097	const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: *Subtarget);
11098	const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
11099	const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
11100	const bool Is64 = NodePtr.getValueType() == MVT::i64;
11101	const unsigned NumVDataDwords = `4`;
11102	const unsigned NumVAddrDwords = IsA16 ? (Is64 ? `9` : `8`) : (Is64 ? `12` : `11`);
11103	const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? `4` : `5`) : NumVAddrDwords;
11104	const bool UseNSA = (Subtarget->hasNSAEncoding() &&
11105	NumVAddrs <= Subtarget->getNSAMaxSize()) \|\|
11106	IsGFX12Plus;
11107	const unsigned BaseOpcodes[`2`][`2`] = {
11108	{AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
11109	{AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
11110	AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
11111	int Opcode;
11112	if (UseNSA) {
11113	Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
11114	MIMGEncoding: IsGFX12Plus ? AMDGPU::MIMGEncGfx12
11115	: IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
11116	: AMDGPU::MIMGEncGfx10NSA,
11117	VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
11118	} else {
11119	assert(!IsGFX12Plus);
11120	Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
11121	MIMGEncoding: IsGFX11 ? AMDGPU::MIMGEncGfx11Default
11122	: AMDGPU::MIMGEncGfx10Default,
11123	VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
11124	}
11125	assert(Opcode != -`1`);
11126
11127	SmallVector<SDValue, `16`> Ops;
11128
11129	auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
11130	SmallVector<SDValue, `3`> Lanes;
11131	DAG.ExtractVectorElements(Op, Args&: Lanes, Start: `0`, Count: `3`);
11132	if (Lanes [`0`].getValueSizeInBits() == `32`) {
11133	for (unsigned I = `0`; I < `3`; ++I)
11134	Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: Lanes [I]));
11135	} else {
11136	if (IsAligned) {
11137	Ops.push_back(Elt: DAG.getBitcast(
11138	VT: MVT::i32,
11139	V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Lanes [`0`], Lanes [`1`]})));
11140	Ops.push_back(Elt: Lanes [`2`]);
11141	} else {
11142	SDValue Elt0 = Ops.pop_back_val();
11143	Ops.push_back(Elt: DAG.getBitcast(
11144	VT: MVT::i32, V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Elt0, Lanes [`0`]})));
11145	Ops.push_back(Elt: DAG.getBitcast(
11146	VT: MVT::i32,
11147	V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Lanes [`1`], Lanes [`2`]})));
11148	}
11149	}
11150	};
11151
11152	if (UseNSA && IsGFX11Plus) {
11153	Ops.push_back(Elt: NodePtr);
11154	Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: RayExtent));
11155	Ops.push_back(Elt: RayOrigin);
11156	if (IsA16) {
11157	SmallVector<SDValue, `3`> DirLanes, InvDirLanes, MergedLanes;
11158	DAG.ExtractVectorElements(Op: RayDir, Args&: DirLanes, Start: `0`, Count: `3`);
11159	DAG.ExtractVectorElements(Op: RayInvDir, Args&: InvDirLanes, Start: `0`, Count: `3`);
11160	for (unsigned I = `0`; I < `3`; ++I) {
11161	MergedLanes.push_back(Elt: DAG.getBitcast(
11162	VT: MVT::i32, V: DAG.getBuildVector(VT: MVT::v2f16, DL,
11163	Ops: {DirLanes [I], InvDirLanes [I]})));
11164	}
11165	Ops.push_back(Elt: DAG.getBuildVector(VT: MVT::v3i32, DL, Ops: MergedLanes));
11166	} else {
11167	Ops.push_back(Elt: RayDir);
11168	Ops.push_back(Elt: RayInvDir);
11169	}
11170	} else {
11171	if (Is64)
11172	DAG.ExtractVectorElements(Op: DAG.getBitcast(VT: MVT::v2i32, V: NodePtr), Args&: Ops, Start: `0`,
11173	Count: `2`);
11174	else
11175	Ops.push_back(Elt: NodePtr);
11176
11177	Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: RayExtent));
11178	packLanes (RayOrigin, true);
11179	packLanes (RayDir, true);
11180	packLanes (RayInvDir, false);
11181	}
11182
11183	if (!UseNSA) {
11184	// Build a single vector containing all the operands so far prepared.
11185	if (NumVAddrDwords > `12`) {
11186	SDValue Undef = DAG.getPOISON(VT: MVT::i32);
11187	Ops.append(NumInputs: `16` - Ops.size(), Elt: Undef);
11188	}
11189	assert(Ops.size() >= `8` && Ops.size() <= `12`);
11190	SDValue MergedOps =
11191	DAG.getBuildVector(VT: MVT::getVectorVT(VT: MVT::i32, NumElements: Ops.size()), DL, Ops);
11192	Ops.clear();
11193	Ops.push_back(Elt: MergedOps);
11194	}
11195
11196	Ops.push_back(Elt: TDescr);
11197	Ops.push_back(Elt: DAG.getTargetConstant(Val: IsA16, DL, VT: MVT::i1));
11198	Ops.push_back(Elt: M->getChain());
11199
11200	auto *NewNode = DAG.getMachineNode(Opcode, dl: DL, VTs: M->getVTList(), Ops);
11201	MachineMemOperand *MemRef = M->getMemOperand();
11202	DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
11203	return SDValue (NewNode, `0`);
11204	}
11205	case Intrinsic::amdgcn_global_atomic_fmin_num:
11206	case Intrinsic::amdgcn_global_atomic_fmax_num:
11207	case Intrinsic::amdgcn_flat_atomic_fmin_num:
11208	case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11209	MemSDNode *M = cast<MemSDNode>(Val&: Op);
11210	SDValue Ops[] = {
11211	M->getOperand(Num: `0`), // Chain
11212	M->getOperand(Num: `2`), // Ptr
11213	M->getOperand(Num: `3`) // Value
11214	};
11215	unsigned Opcode = `0`;
11216	switch (IntrID) {
11217	case Intrinsic::amdgcn_global_atomic_fmin_num:
11218	case Intrinsic::amdgcn_flat_atomic_fmin_num: {
11219	Opcode = ISD::ATOMIC_LOAD_FMIN;
11220	break;
11221	}
11222	case Intrinsic::amdgcn_global_atomic_fmax_num:
11223	case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11224	Opcode = ISD::ATOMIC_LOAD_FMAX;
11225	break;
11226	}
11227	default:
11228	llvm_unreachable("unhandled atomic opcode");
11229	}
11230	return DAG.getAtomic(Opcode, dl: SDLoc (Op), MemVT: M->getMemoryVT(), VTList: M->getVTList(),
11231	Ops, MMO: M->getMemOperand());
11232	}
11233	case Intrinsic::amdgcn_s_alloc_vgpr: {
11234	SDValue NumVGPRs = Op.getOperand(i: `2`);
11235	if (!NumVGPRs ->isDivergent())
11236	return Op;
11237
11238	SDValue ReadFirstLaneID =
11239	DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
11240	NumVGPRs = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::i32,
11241	N1: ReadFirstLaneID, N2: NumVGPRs);
11242
11243	return DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL, VTList: Op ->getVTList(),
11244	N1: Op.getOperand(i: `0`), N2: Op.getOperand(i: `1`), N3: NumVGPRs);
11245	}
11246	case Intrinsic::amdgcn_s_get_barrier_state:
11247	case Intrinsic::amdgcn_s_get_named_barrier_state: {
11248	SDValue Chain = Op ->getOperand(Num: `0`);
11249	SmallVector<SDValue, `2`> Ops;
11250	unsigned Opc;
11251
11252	if (isa<ConstantSDNode>(Val: Op ->getOperand(Num: `2`))) {
11253	uint64_t BarID = cast<ConstantSDNode>(Val: Op ->getOperand(Num: `2`))->getZExtValue();
11254	if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
11255	BarID = (BarID >> `4`) & `0x3F`;
11256	Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
11257	SDValue K = DAG.getTargetConstant(Val: BarID, DL, VT: MVT::i32);
11258	Ops.push_back(Elt: K);
11259	Ops.push_back(Elt: Chain);
11260	} else {
11261	Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
11262	if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
11263	SDValue M0Val;
11264	M0Val = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: Op ->getOperand(Num: `2`),
11265	N2: DAG.getShiftAmountConstant(Val: `4`, VT: MVT::i32, DL));
11266	M0Val = SDValue (
11267	DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: M0Val,
11268	Op2: DAG.getTargetConstant(Val: `0x3F`, DL, VT: MVT::i32)),
11269	`0`);
11270	Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: `0`));
11271	} else
11272	Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: Op ->getOperand(Num: `2`)).getValue(R: `0`));
11273	}
11274
11275	auto *NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op ->getVTList(), Ops);
11276	return SDValue (NewMI, `0`);
11277	}
11278	case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
11279	case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
11280	case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
11281	MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Val&: Op);
11282	SDValue Chain = Op ->getOperand(Num: `0`);
11283	SDValue Ptr = Op ->getOperand(Num: `2`);
11284	EVT VT = Op ->getValueType(ResNo: `0`);
11285	return DAG.getAtomicLoad(ExtType: ISD::NON_EXTLOAD, dl: DL, MemVT: MII->getMemoryVT(), VT,
11286	Chain, Ptr, MMO: MII->getMemOperand());
11287	}
11288	case Intrinsic::amdgcn_flat_load_monitor_b32:
11289	case Intrinsic::amdgcn_flat_load_monitor_b64:
11290	case Intrinsic::amdgcn_flat_load_monitor_b128: {
11291	MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Val&: Op);
11292	SDValue Chain = Op ->getOperand(Num: `0`);
11293	SDValue Ptr = Op ->getOperand(Num: `2`);
11294	return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::FLAT_LOAD_MONITOR, dl: DL,
11295	VTList: Op ->getVTList(), Ops: {Chain, Ptr},
11296	MemVT: MII->getMemoryVT(), MMO: MII->getMemOperand());
11297	}
11298	case Intrinsic::amdgcn_global_load_monitor_b32:
11299	case Intrinsic::amdgcn_global_load_monitor_b64:
11300	case Intrinsic::amdgcn_global_load_monitor_b128: {
11301	MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Val&: Op);
11302	SDValue Chain = Op ->getOperand(Num: `0`);
11303	SDValue Ptr = Op ->getOperand(Num: `2`);
11304	return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::GLOBAL_LOAD_MONITOR, dl: DL,
11305	VTList: Op ->getVTList(), Ops: {Chain, Ptr},
11306	MemVT: MII->getMemoryVT(), MMO: MII->getMemOperand());
11307	}
11308	default:
11309
11310	if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11311	AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID))
11312	return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true);
11313
11314	return SDValue ();
11315	}
11316	}
11317
11318	// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
11319	// dwordx4 if on SI and handle TFE loads.
11320	SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
11321	SDVTList VTList,
11322	ArrayRef<SDValue> Ops, EVT MemVT,
11323	MachineMemOperand *MMO,
11324	SelectionDAG &DAG) const {
11325	LLVMContext &C = *DAG.getContext();
11326	MachineFunction &MF = DAG.getMachineFunction();
11327	EVT VT = VTList.VTs[`0`];
11328
11329	assert(VTList.NumVTs == `2` \|\| VTList.NumVTs == `3`);
11330	bool IsTFE = VTList.NumVTs == `3`;
11331	if (IsTFE) {
11332	unsigned NumValueDWords = divideCeil(Numerator: VT.getSizeInBits(), Denominator: `32`);
11333	unsigned NumOpDWords = NumValueDWords + `1`;
11334	EVT OpDWordsVT = EVT::getVectorVT(Context&: C, VT: MVT::i32, NumElements: NumOpDWords);
11335	SDVTList OpDWordsVTList = DAG.getVTList(VT1: OpDWordsVT, VT2: VTList.VTs[`2`]);
11336	MachineMemOperand *OpDWordsMMO =
11337	MF.getMachineMemOperand(MMO, Offset: `0`, Size: NumOpDWords * `4`);
11338	SDValue Op = getMemIntrinsicNode(Opcode, DL, VTList: OpDWordsVTList, Ops,
11339	MemVT: OpDWordsVT, MMO: OpDWordsMMO, DAG);
11340	SDValue Status = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
11341	N2: DAG.getVectorIdxConstant(Val: NumValueDWords, DL));
11342	SDValue ZeroIdx = DAG.getVectorIdxConstant(Val: `0`, DL);
11343	SDValue ValueDWords =
11344	NumValueDWords == `1`
11345	? DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op, N2: ZeroIdx)
11346	: DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL,
11347	VT: EVT::getVectorVT(Context&: C, VT: MVT::i32, NumElements: NumValueDWords), N1: Op,
11348	N2: ZeroIdx);
11349	SDValue Value = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: ValueDWords);
11350	return DAG.getMergeValues(Ops: {Value, Status, SDValue (Op.getNode(), `1`)}, dl: DL);
11351	}
11352
11353	if (!Subtarget->hasDwordx3LoadStores() &&
11354	(VT == MVT::v3i32 \|\| VT == MVT::v3f32)) {
11355	EVT WidenedVT = EVT::getVectorVT(Context&: C, VT: VT.getVectorElementType(), NumElements: `4`);
11356	EVT WidenedMemVT = EVT::getVectorVT(Context&: C, VT: MemVT.getVectorElementType(), NumElements: `4`);
11357	MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, Offset: `0`, Size: `16`);
11358	SDVTList WidenedVTList = DAG.getVTList(VT1: WidenedVT, VT2: VTList.VTs[`1`]);
11359	SDValue Op = DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList: WidenedVTList, Ops,
11360	MemVT: WidenedMemVT, MMO: WidenedMMO);
11361	SDValue Value = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Op,
11362	N2: DAG.getVectorIdxConstant(Val: `0`, DL));
11363	return DAG.getMergeValues(Ops: {Value, SDValue (Op.getNode(), `1`)}, dl: DL);
11364	}
11365
11366	return DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList, Ops, MemVT, MMO);
11367	}
11368
11369	SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
11370	bool ImageStore) const {
11371	EVT StoreVT = VData.getValueType();
11372
11373	// No change for f16 and legal vector D16 types.
11374	if (!StoreVT.isVector())
11375	return VData;
11376
11377	SDLoc DL(VData);
11378	unsigned NumElements = StoreVT.getVectorNumElements();
11379
11380	if (Subtarget->hasUnpackedD16VMem()) {
11381	// We need to unpack the packed data to store.
11382	EVT IntStoreVT = StoreVT.changeTypeToInteger();
11383	SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
11384
11385	EVT EquivStoreVT =
11386	EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements);
11387	SDValue ZExt = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: EquivStoreVT, Operand: IntVData);
11388	return DAG.UnrollVectorOp(N: ZExt.getNode());
11389	}
11390
11391	// The sq block of gfx8.1 does not estimate register use correctly for d16
11392	// image store instructions. The data operand is computed as if it were not a
11393	// d16 image instruction.
11394	if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11395	// Bitcast to i16
11396	EVT IntStoreVT = StoreVT.changeTypeToInteger();
11397	SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
11398
11399	// Decompose into scalars
11400	SmallVector<SDValue, `4`> Elts;
11401	DAG.ExtractVectorElements(Op: IntVData, Args&: Elts);
11402
11403	// Group pairs of i16 into v2i16 and bitcast to i32
11404	SmallVector<SDValue, `4`> PackedElts;
11405	for (unsigned I = `0`; I < Elts.size() / `2`; I += `1`) {
11406	SDValue Pair =
11407	DAG.getBuildVector(VT: MVT::v2i16, DL, Ops: {Elts [I * `2`], Elts [I * `2` + `1`]});
11408	SDValue IntPair = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Pair);
11409	PackedElts.push_back(Elt: IntPair);
11410	}
11411	if ((NumElements % `2`) == `1`) {
11412	// Handle v3i16
11413	unsigned I = Elts.size() / `2`;
11414	SDValue Pair = DAG.getBuildVector(VT: MVT::v2i16, DL,
11415	Ops: {Elts [I * `2`], DAG.getPOISON(VT: MVT::i16)});
11416	SDValue IntPair = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Pair);
11417	PackedElts.push_back(Elt: IntPair);
11418	}
11419
11420	// Pad using UNDEF
11421	PackedElts.resize(N: Elts.size(), NV: DAG.getPOISON(VT: MVT::i32));
11422
11423	// Build final vector
11424	EVT VecVT =
11425	EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: PackedElts.size());
11426	return DAG.getBuildVector(VT: VecVT, DL, Ops: PackedElts);
11427	}
11428
11429	if (NumElements == `3`) {
11430	EVT IntStoreVT =
11431	EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: StoreVT.getStoreSizeInBits());
11432	SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
11433
11434	EVT WidenedStoreVT = EVT::getVectorVT(
11435	Context&: *DAG.getContext(), VT: StoreVT.getVectorElementType(), NumElements: NumElements + `1`);
11436	EVT WidenedIntVT = EVT::getIntegerVT(Context&: *DAG.getContext(),
11437	BitWidth: WidenedStoreVT.getStoreSizeInBits());
11438	SDValue ZExt = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: WidenedIntVT, Operand: IntVData);
11439	return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: WidenedStoreVT, Operand: ZExt);
11440	}
11441
11442	assert(isTypeLegal(StoreVT));
11443	return VData;
11444	}
11445
11446	static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
11447	switch (Intr) {
11448	case Intrinsic::amdgcn_raw_buffer_load_async_lds:
11449	case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
11450	case Intrinsic::amdgcn_struct_buffer_load_async_lds:
11451	case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
11452	case Intrinsic::amdgcn_load_async_to_lds:
11453	case Intrinsic::amdgcn_global_load_async_lds:
11454	return true;
11455	}
11456	return false;
11457	}
11458
11459	SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
11460	SelectionDAG &DAG) const {
11461	SDLoc DL(Op);
11462	SDValue Chain = Op.getOperand(i: `0`);
11463	unsigned IntrinsicID = Op.getConstantOperandVal(i: `1`);
11464
11465	switch (IntrinsicID) {
11466	case Intrinsic::amdgcn_exp_compr: {
11467	if (!Subtarget->hasCompressedExport()) {
11468	DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported (
11469	DAG.getMachineFunction().getFunction(),
11470	"intrinsic not supported on subtarget", DL.getDebugLoc()));
11471	}
11472	SDValue Src0 = Op.getOperand(i: `4`);
11473	SDValue Src1 = Op.getOperand(i: `5`);
11474	// Hack around illegal type on SI by directly selecting it.
11475	if (isTypeLegal(VT: Src0.getValueType()))
11476	return SDValue ();
11477
11478	const ConstantSDNode *Done = cast<ConstantSDNode>(Val: Op.getOperand(i: `6`));
11479	SDValue Undef = DAG.getPOISON(VT: MVT::f32);
11480	const SDValue Ops[] = {
11481	Op.getOperand(i: `2`), // tgt
11482	DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: Src0), // src0
11483	DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: Src1), // src1
11484	Undef, // src2
11485	Undef, // src3
11486	Op.getOperand(i: `7`), // vm
11487	DAG.getTargetConstant(Val: `1`, DL, VT: MVT::i1), // compr
11488	Op.getOperand(i: `3`), // en
11489	Op.getOperand(i: `0`) // Chain
11490	};
11491
11492	unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11493	return SDValue (DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op ->getVTList(), Ops), `0`);
11494	}
11495
11496	case Intrinsic::amdgcn_struct_tbuffer_store:
11497	case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11498	SDValue VData = Op.getOperand(i: `2`);
11499	bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11500	if (IsD16)
11501	VData = handleD16VData(VData, DAG);
11502	SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: `3`), DAG);
11503	auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: `5`), DAG);
11504	auto SOffset = selectSOffset(SOffset: Op.getOperand(i: `6`), DAG, Subtarget);
11505	SDValue Ops[] = {
11506	Chain,
11507	VData, // vdata
11508	Rsrc, // rsrc
11509	Op.getOperand(i: `4`), // vindex
11510	VOffset, // voffset
11511	SOffset, // soffset
11512	Offset, // offset
11513	Op.getOperand(i: `7`), // format
11514	Op.getOperand(i: `8`), // cachepolicy, swizzled buffer
11515	DAG.getTargetConstant(Val: `1`, DL, VT: MVT::i1), // idxen
11516	};
11517	unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11518	: AMDGPUISD::TBUFFER_STORE_FORMAT;
11519	MemSDNode *M = cast<MemSDNode>(Val&: Op);
11520	return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op ->getVTList(), Ops,
11521	MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
11522	}
11523
11524	case Intrinsic::amdgcn_raw_tbuffer_store:
11525	case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11526	SDValue VData = Op.getOperand(i: `2`);
11527	bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11528	if (IsD16)
11529	VData = handleD16VData(VData, DAG);
11530	SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: `3`), DAG);
11531	auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: `4`), DAG);
11532	auto SOffset = selectSOffset(SOffset: Op.getOperand(i: `5`), DAG, Subtarget);
11533	SDValue Ops[] = {
11534	Chain,
11535	VData, // vdata
11536	Rsrc, // rsrc
11537	DAG.getConstant(Val: `0`, DL, VT: MVT::i32), // vindex
11538	VOffset, // voffset
11539	SOffset, // soffset
11540	Offset, // offset
11541	Op.getOperand(i: `6`), // format
11542	Op.getOperand(i: `7`), // cachepolicy, swizzled buffer
11543	DAG.getTargetConstant(Val: `0`, DL, VT: MVT::i1), // idxen
11544	};
11545	unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11546	: AMDGPUISD::TBUFFER_STORE_FORMAT;
11547	MemSDNode *M = cast<MemSDNode>(Val&: Op);
11548	return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op ->getVTList(), Ops,
11549	MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
11550	}
11551
11552	case Intrinsic::amdgcn_raw_buffer_store:
11553	case Intrinsic::amdgcn_raw_ptr_buffer_store:
11554	case Intrinsic::amdgcn_raw_buffer_store_format:
11555	case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11556	const bool IsFormat =
11557	IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format \|\|
11558	IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11559
11560	SDValue VData = Op.getOperand(i: `2`);
11561	EVT VDataVT = VData.getValueType();
11562	EVT EltType = VDataVT.getScalarType();
11563	bool IsD16 = IsFormat && (EltType.getSizeInBits() == `16`);
11564	if (IsD16) {
11565	VData = handleD16VData(VData, DAG);
11566	VDataVT = VData.getValueType();
11567	}
11568
11569	if (!isTypeLegal(VT: VDataVT)) {
11570	VData =
11571	DAG.getNode(Opcode: ISD::BITCAST, DL,
11572	VT: getEquivalentMemType(Context&: *DAG.getContext(), VT: VDataVT), Operand: VData);
11573	}
11574
11575	SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: `3`), DAG);
11576	auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: `4`), DAG);
11577	auto SOffset = selectSOffset(SOffset: Op.getOperand(i: `5`), DAG, Subtarget);
11578	SDValue Ops[] = {
11579	Chain,
11580	VData,
11581	Rsrc,
11582	DAG.getConstant(Val: `0`, DL, VT: MVT::i32), // vindex
11583	VOffset, // voffset
11584	SOffset, // soffset
11585	Offset, // offset
11586	Op.getOperand(i: `6`), // cachepolicy, swizzled buffer
11587	DAG.getTargetConstant(Val: `0`, DL, VT: MVT::i1), // idxen
11588	};
11589	unsigned Opc =
11590	IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11591	Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11592	MemSDNode *M = cast<MemSDNode>(Val&: Op);
11593
11594	// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11595	if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < `32`)
11596	return handleByteShortBufferStores(DAG, VDataType: VDataVT, DL, Ops, M);
11597
11598	return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op ->getVTList(), Ops,
11599	MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
11600	}
11601
11602	case Intrinsic::amdgcn_struct_buffer_store:
11603	case Intrinsic::amdgcn_struct_ptr_buffer_store:
11604	case Intrinsic::amdgcn_struct_buffer_store_format:
11605	case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11606	const bool IsFormat =
11607	IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format \|\|
11608	IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11609
11610	SDValue VData = Op.getOperand(i: `2`);
11611	EVT VDataVT = VData.getValueType();
11612	EVT EltType = VDataVT.getScalarType();
11613	bool IsD16 = IsFormat && (EltType.getSizeInBits() == `16`);
11614
11615	if (IsD16) {
11616	VData = handleD16VData(VData, DAG);
11617	VDataVT = VData.getValueType();
11618	}
11619
11620	if (!isTypeLegal(VT: VDataVT)) {
11621	VData =
11622	DAG.getNode(Opcode: ISD::BITCAST, DL,
11623	VT: getEquivalentMemType(Context&: *DAG.getContext(), VT: VDataVT), Operand: VData);
11624	}
11625
11626	auto Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: `3`), DAG);
11627	auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: `5`), DAG);
11628	auto SOffset = selectSOffset(SOffset: Op.getOperand(i: `6`), DAG, Subtarget);
11629	SDValue Ops[] = {
11630	Chain,
11631	VData,
11632	Rsrc,
11633	Op.getOperand(i: `4`), // vindex
11634	VOffset, // voffset
11635	SOffset, // soffset
11636	Offset, // offset
11637	Op.getOperand(i: `7`), // cachepolicy, swizzled buffer
11638	DAG.getTargetConstant(Val: `1`, DL, VT: MVT::i1), // idxen
11639	};
11640	unsigned Opc =
11641	!IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11642	Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11643	MemSDNode *M = cast<MemSDNode>(Val&: Op);
11644
11645	// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11646	EVT VDataType = VData.getValueType().getScalarType();
11647	if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < `32`)
11648	return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11649
11650	return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op ->getVTList(), Ops,
11651	MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
11652	}
11653	case Intrinsic::amdgcn_raw_buffer_load_lds:
11654	case Intrinsic::amdgcn_raw_buffer_load_async_lds:
11655	case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11656	case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
11657	case Intrinsic::amdgcn_struct_buffer_load_lds:
11658	case Intrinsic::amdgcn_struct_buffer_load_async_lds:
11659	case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
11660	case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
11661	if (!Subtarget->hasVMemToLDSLoad())
11662	return SDValue ();
11663	unsigned Opc;
11664	bool HasVIndex =
11665	IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds \|\|
11666	IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds \|\|
11667	IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds \|\|
11668	IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds;
11669	unsigned OpOffset = HasVIndex ? `1` : `0`;
11670	SDValue VOffset = Op.getOperand(i: `5` + OpOffset);
11671	bool HasVOffset = !isNullConstant(V: VOffset);
11672	unsigned Size = Op ->getConstantOperandVal(Num: `4`);
11673
11674	switch (Size) {
11675	default:
11676	return SDValue ();
11677	case `1`:
11678	Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11679	: AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11680	: HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11681	: AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11682	break;
11683	case `2`:
11684	Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11685	: AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11686	: HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11687	: AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11688	break;
11689	case `4`:
11690	Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11691	: AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11692	: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11693	: AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11694	break;
11695	case `12`:
11696	if (!Subtarget->hasLDSLoadB96_B128())
11697	return SDValue ();
11698	Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11699	: AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11700	: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11701	: AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11702	break;
11703	case `16`:
11704	if (!Subtarget->hasLDSLoadB96_B128())
11705	return SDValue ();
11706	Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11707	: AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11708	: HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11709	: AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11710	break;
11711	}
11712
11713	SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: `3`));
11714
11715	SmallVector<SDValue, `8`> Ops;
11716
11717	if (HasVIndex && HasVOffset)
11718	Ops.push_back(Elt: DAG.getBuildVector(VT: MVT::v2i32, DL,
11719	Ops: {Op.getOperand(i: `5`), // VIndex
11720	VOffset}));
11721	else if (HasVIndex)
11722	Ops.push_back(Elt: Op.getOperand(i: `5`));
11723	else if (HasVOffset)
11724	Ops.push_back(Elt: VOffset);
11725
11726	SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: `2`), DAG);
11727	Ops.push_back(Elt: Rsrc);
11728	Ops.push_back(Elt: Op.getOperand(i: `6` + OpOffset)); // soffset
11729	Ops.push_back(Elt: Op.getOperand(i: `7` + OpOffset)); // imm offset
11730	bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
11731	unsigned Aux = Op.getConstantOperandVal(i: `8` + OpOffset);
11732	Ops.push_back(Elt: DAG.getTargetConstant(
11733	Val: Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11734	DL, VT: MVT::i8)); // cpol
11735	Ops.push_back(Elt: DAG.getTargetConstant(
11736	Val: Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11737	? `1`
11738	: `0`,
11739	DL, VT: MVT::i8)); // swz
11740	Ops.push_back(
11741	Elt: DAG.getTargetConstant(Val: isAsyncLDSDMA(Intr: IntrinsicID), DL, VT: MVT::i8));
11742	Ops.push_back(Elt: M0Val.getValue(R: `0`)); // Chain
11743	Ops.push_back(Elt: M0Val.getValue(R: `1`)); // Glue
11744
11745	auto *M = cast<MemSDNode>(Val&: Op);
11746	auto *Load = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: M->getVTList(), Ops);
11747	DAG.setNodeMemRefs(N: Load, NewMemRefs: M->memoperands());
11748
11749	return SDValue (Load, `0`);
11750	}
11751	// Buffers are handled by LowerBufferFatPointers, and we're going to go
11752	// for "trust me" that the remaining cases are global pointers until
11753	// such time as we can put two mem operands on an intrinsic.
11754	case Intrinsic::amdgcn_load_to_lds:
11755	case Intrinsic::amdgcn_load_async_to_lds:
11756	case Intrinsic::amdgcn_global_load_lds:
11757	case Intrinsic::amdgcn_global_load_async_lds: {
11758	if (!Subtarget->hasVMemToLDSLoad())
11759	return SDValue ();
11760
11761	unsigned Opc;
11762	unsigned Size = Op ->getConstantOperandVal(Num: `4`);
11763	switch (Size) {
11764	default:
11765	return SDValue ();
11766	case `1`:
11767	Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11768	break;
11769	case `2`:
11770	Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11771	break;
11772	case `4`:
11773	Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11774	break;
11775	case `12`:
11776	if (!Subtarget->hasLDSLoadB96_B128())
11777	return SDValue ();
11778	Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11779	break;
11780	case `16`:
11781	if (!Subtarget->hasLDSLoadB96_B128())
11782	return SDValue ();
11783	Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11784	break;
11785	}
11786
11787	SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: `3`));
11788
11789	SmallVector<SDValue, `6`> Ops;
11790
11791	SDValue Addr = Op.getOperand(i: `2`); // Global ptr
11792	SDValue VOffset;
11793	// Try to split SAddr and VOffset. Global and LDS pointers share the same
11794	// immediate offset, so we cannot use a regular SelectGlobalSAddr().
11795	if (Addr ->isDivergent() && Addr ->isAnyAdd()) {
11796	SDValue LHS = Addr.getOperand(i: `0`);
11797	SDValue RHS = Addr.getOperand(i: `1`);
11798
11799	if (LHS ->isDivergent())
11800	std::swap(a&: LHS, b&: RHS);
11801
11802	if (!LHS ->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11803	RHS.getOperand(i: `0`).getValueType() == MVT::i32) {
11804	// add (i64 sgpr), (zero_extend (i32 vgpr))
11805	Addr = LHS;
11806	VOffset = RHS.getOperand(i: `0`);
11807	}
11808	}
11809
11810	Ops.push_back(Elt: Addr);
11811	if (!Addr ->isDivergent()) {
11812	Opc = AMDGPU::getGlobalSaddrOp(Opcode: Opc);
11813	if (!VOffset)
11814	VOffset =
11815	SDValue (DAG.getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32, dl: DL, VT: MVT::i32,
11816	Op1: DAG.getTargetConstant(Val: `0`, DL, VT: MVT::i32)),
11817	`0`);
11818	Ops.push_back(Elt: VOffset);
11819	}
11820
11821	Ops.push_back(Elt: Op.getOperand(i: `5`)); // Offset
11822
11823	unsigned Aux = Op.getConstantOperandVal(i: `6`);
11824	Ops.push_back(Elt: DAG.getTargetConstant(Val: Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
11825	VT: MVT::i32)); // CPol
11826	Ops.push_back(
11827	Elt: DAG.getTargetConstant(Val: isAsyncLDSDMA(Intr: IntrinsicID), DL, VT: MVT::i8));
11828
11829	Ops.push_back(Elt: M0Val.getValue(R: `0`)); // Chain
11830	Ops.push_back(Elt: M0Val.getValue(R: `1`)); // Glue
11831
11832	auto *M = cast<MemSDNode>(Val&: Op);
11833	auto *Load = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op ->getVTList(), Ops);
11834	DAG.setNodeMemRefs(N: Load, NewMemRefs: M->memoperands());
11835
11836	return SDValue (Load, `0`);
11837	}
11838	case Intrinsic::amdgcn_end_cf:
11839	return SDValue (DAG.getMachineNode(Opcode: AMDGPU::SI_END_CF, dl: DL, VT: MVT::Other,
11840	Op1: Op ->getOperand(Num: `2`), Op2: Chain),
11841	`0`);
11842	case Intrinsic::amdgcn_s_barrier_init:
11843	case Intrinsic::amdgcn_s_barrier_signal_var: {
11844	// these two intrinsics have two operands: barrier pointer and member count
11845	SDValue Chain = Op ->getOperand(Num: `0`);
11846	SmallVector<SDValue, `2`> Ops;
11847	SDValue BarOp = Op ->getOperand(Num: `2`);
11848	SDValue CntOp = Op ->getOperand(Num: `3`);
11849	SDValue M0Val;
11850	unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11851	? AMDGPU::S_BARRIER_INIT_M0
11852	: AMDGPU::S_BARRIER_SIGNAL_M0;
11853	// extract the BarrierID from bits 4-9 of BarOp
11854	SDValue BarID;
11855	BarID = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: BarOp,
11856	N2: DAG.getShiftAmountConstant(Val: `4`, VT: MVT::i32, DL));
11857	BarID =
11858	SDValue (DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: BarID,
11859	Op2: DAG.getTargetConstant(Val: `0x3F`, DL, VT: MVT::i32)),
11860	`0`);
11861	// Member count should be put into M0[ShAmt:+6]
11862	// Barrier ID should be put into M0[5:0]
11863	M0Val =
11864	SDValue (DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: CntOp,
11865	Op2: DAG.getTargetConstant(Val: `0x3F`, DL, VT: MVT::i32)),
11866	`0`);
11867	constexpr unsigned ShAmt = `16`;
11868	M0Val = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: CntOp,
11869	N2: DAG.getShiftAmountConstant(Val: ShAmt, VT: MVT::i32, DL));
11870
11871	M0Val = SDValue (
11872	DAG.getMachineNode(Opcode: AMDGPU::S_OR_B32, dl: DL, VT: MVT::i32, Op1: M0Val, Op2: BarID), `0`);
11873
11874	Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: `0`));
11875
11876	auto *NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op ->getVTList(), Ops);
11877	return SDValue (NewMI, `0`);
11878	}
11879	case Intrinsic::amdgcn_s_wakeup_barrier: {
11880	if (!Subtarget->hasSWakeupBarrier())
11881	return SDValue ();
11882	[[fallthrough]];
11883	}
11884	case Intrinsic::amdgcn_s_barrier_join: {
11885	// these three intrinsics have one operand: barrier pointer
11886	SDValue Chain = Op ->getOperand(Num: `0`);
11887	SmallVector<SDValue, `2`> Ops;
11888	SDValue BarOp = Op ->getOperand(Num: `2`);
11889	unsigned Opc;
11890
11891	if (isa<ConstantSDNode>(Val: BarOp)) {
11892	uint64_t BarVal = cast<ConstantSDNode>(Val&: BarOp)->getZExtValue();
11893	switch (IntrinsicID) {
11894	default:
11895	return SDValue ();
11896	case Intrinsic::amdgcn_s_barrier_join:
11897	Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11898	break;
11899	case Intrinsic::amdgcn_s_wakeup_barrier:
11900	Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
11901	break;
11902	}
11903	// extract the BarrierID from bits 4-9 of the immediate
11904	unsigned BarID = (BarVal >> `4`) & `0x3F`;
11905	SDValue K = DAG.getTargetConstant(Val: BarID, DL, VT: MVT::i32);
11906	Ops.push_back(Elt: K);
11907	Ops.push_back(Elt: Chain);
11908	} else {
11909	switch (IntrinsicID) {
11910	default:
11911	return SDValue ();
11912	case Intrinsic::amdgcn_s_barrier_join:
11913	Opc = AMDGPU::S_BARRIER_JOIN_M0;
11914	break;
11915	case Intrinsic::amdgcn_s_wakeup_barrier:
11916	Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
11917	break;
11918	}
11919	// extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11920	SDValue M0Val;
11921	M0Val = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: BarOp,
11922	N2: DAG.getShiftAmountConstant(Val: `4`, VT: MVT::i32, DL));
11923	M0Val =
11924	SDValue (DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: M0Val,
11925	Op2: DAG.getTargetConstant(Val: `0x3F`, DL, VT: MVT::i32)),
11926	`0`);
11927	Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: `0`));
11928	}
11929
11930	auto *NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op ->getVTList(), Ops);
11931	return SDValue (NewMI, `0`);
11932	}
11933	case Intrinsic::amdgcn_s_prefetch_data: {
11934	// For non-global address space preserve the chain and remove the call.
11935	if (!AMDGPU::isFlatGlobalAddrSpace(AS: cast<MemSDNode>(Val&: Op)->getAddressSpace()))
11936	return Op.getOperand(i: `0`);
11937	return Op;
11938	}
11939	case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11940	SDValue Ops[] = {
11941	Chain, bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: `2`), DAG),
11942	Op.getOperand(i: `3`), // offset
11943	Op.getOperand(i: `4`), // length
11944	};
11945
11946	MemSDNode *M = cast<MemSDNode>(Val&: Op);
11947	return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_PREFETCH_DATA, dl: DL,
11948	VTList: Op ->getVTList(), Ops, MemVT: M->getMemoryVT(),
11949	MMO: M->getMemOperand());
11950	}
11951	case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11952	case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11953	case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11954	MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Val&: Op);
11955	SDValue Chain = Op ->getOperand(Num: `0`);
11956	SDValue Ptr = Op ->getOperand(Num: `2`);
11957	SDValue Val = Op ->getOperand(Num: `3`);
11958	return DAG.getAtomic(Opcode: ISD::ATOMIC_STORE, dl: DL, MemVT: MII->getMemoryVT(), Chain, Ptr: Val,
11959	Val: Ptr, MMO: MII->getMemOperand());
11960	}
11961	default: {
11962	if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11963	AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID))
11964	return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true);
11965
11966	return Op;
11967	}
11968	}
11969	}
11970
11971	// Return whether the operation has NoUnsignedWrap property.
11972	static bool isNoUnsignedWrap(SDValue Addr) {
11973	return (Addr.getOpcode() == ISD::ADD &&
11974	Addr ->getFlags().hasNoUnsignedWrap()) \|\|
11975	Addr ->getOpcode() == ISD::OR;
11976	}
11977
11978	bool SITargetLowering::shouldPreservePtrArith(const Function &F,
11979	EVT PtrVT) const {
11980	return PtrVT == MVT::i64;
11981	}
11982
11983	bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F,
11984	EVT PtrVT) const {
11985	return true;
11986	}
11987
11988	// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11989	// offset (the offset that is included in bounds checking and swizzling, to be
11990	// split between the instruction's voffset and immoffset fields) and soffset
11991	// (the offset that is excluded from bounds checking and swizzling, to go in
11992	// the instruction's soffset field). This function takes the first kind of
11993	// offset and figures out how to split it between voffset and immoffset.
11994	std::pair<SDValue, SDValue>
11995	SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11996	SDLoc DL(Offset);
11997	const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget);
11998	SDValue N0 = Offset;
11999	ConstantSDNode C1 = nullptr*;
12000
12001	if ((C1 = dyn_cast<ConstantSDNode>(Val&: N0)))
12002	N0 = SDValue ();
12003	else if (DAG.isBaseWithConstantOffset(Op: N0)) {
12004	// On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
12005	// being added, so we can only safely match a 32-bit addition with no
12006	// unsigned overflow.
12007	bool CheckNUW = Subtarget->hasGFX1250Insts();
12008	if (!CheckNUW \|\| isNoUnsignedWrap(Addr: N0)) {
12009	C1 = cast<ConstantSDNode>(Val: N0.getOperand(i: `1`));
12010	N0 = N0.getOperand(i: `0`);
12011	}
12012	}
12013
12014	if (C1) {
12015	unsigned ImmOffset = C1->getZExtValue();
12016	// If the immediate value is too big for the immoffset field, put only bits
12017	// that would normally fit in the immoffset field. The remaining value that
12018	// is copied/added for the voffset field is a large power of 2, and it
12019	// stands more chance of being CSEd with the copy/add for another similar
12020	// load/store.
12021	// However, do not do that rounding down if that is a negative
12022	// number, as it appears to be illegal to have a negative offset in the
12023	// vgpr, even if adding the immediate offset makes it positive.
12024	unsigned Overflow = ImmOffset & ~MaxImm;
12025	ImmOffset -= Overflow;
12026	if ((int32_t)Overflow < `0`) {
12027	Overflow += ImmOffset;
12028	ImmOffset = `0`;
12029	}
12030	C1 = cast<ConstantSDNode>(Val: DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32));
12031	if (Overflow) {
12032	auto OverflowVal = DAG.getConstant(Val: Overflow, DL, VT: MVT::i32);
12033	if (!N0)
12034	N0 = OverflowVal;
12035	else {
12036	SDValue Ops[] = {N0, OverflowVal};
12037	N0 = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, Ops);
12038	}
12039	}
12040	}
12041	if (!N0)
12042	N0 = DAG.getConstant(Val: `0`, DL, VT: MVT::i32);
12043	if (!C1)
12044	C1 = cast<ConstantSDNode>(Val: DAG.getTargetConstant(Val: `0`, DL, VT: MVT::i32));
12045	return {N0, SDValue (C1, `0`)};
12046	}
12047
12048	// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
12049	// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
12050	// pointed to by Offsets.
12051	void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
12052	SelectionDAG &DAG, SDValue *Offsets,
12053	Align Alignment) const {
12054	const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12055	SDLoc DL(CombinedOffset);
12056	if (auto *C = dyn_cast<ConstantSDNode>(Val&: CombinedOffset)) {
12057	uint32_t Imm = C->getZExtValue();
12058	uint32_t SOffset, ImmOffset;
12059	if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
12060	Offsets[`0`] = DAG.getConstant(Val: `0`, DL, VT: MVT::i32);
12061	Offsets[`1`] = DAG.getConstant(Val: SOffset, DL, VT: MVT::i32);
12062	Offsets[`2`] = DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32);
12063	return;
12064	}
12065	}
12066	if (DAG.isBaseWithConstantOffset(Op: CombinedOffset)) {
12067	// On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
12068	// being added, so we can only safely match a 32-bit addition with no
12069	// unsigned overflow.
12070	bool CheckNUW = Subtarget->hasGFX1250Insts();
12071	SDValue N0 = CombinedOffset.getOperand(i: `0`);
12072	SDValue N1 = CombinedOffset.getOperand(i: `1`);
12073	uint32_t SOffset, ImmOffset;
12074	int Offset = cast<ConstantSDNode>(Val&: N1)->getSExtValue();
12075	if (Offset >= `0` && (!CheckNUW \|\| isNoUnsignedWrap(Addr: CombinedOffset)) &&
12076	TII->splitMUBUFOffset(Imm: Offset, SOffset, ImmOffset, Alignment)) {
12077	Offsets[`0`] = N0;
12078	Offsets[`1`] = DAG.getConstant(Val: SOffset, DL, VT: MVT::i32);
12079	Offsets[`2`] = DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32);
12080	return;
12081	}
12082	}
12083
12084	SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
12085	? DAG.getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32)
12086	: DAG.getConstant(Val: `0`, DL, VT: MVT::i32);
12087
12088	Offsets[`0`] = CombinedOffset;
12089	Offsets[`1`] = SOffsetZero;
12090	Offsets[`2`] = DAG.getTargetConstant(Val: `0`, DL, VT: MVT::i32);
12091	}
12092
12093	SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
12094	SelectionDAG &DAG) const {
12095	if (!MaybePointer.getValueType().isScalarInteger())
12096	return MaybePointer;
12097
12098	SDValue Rsrc = DAG.getBitcast(VT: MVT::v4i32, V: MaybePointer);
12099	return Rsrc;
12100	}
12101
12102	// Wrap a global or flat pointer into a buffer intrinsic using the flags
12103	// specified in the intrinsic.
12104	SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
12105	SelectionDAG &DAG) const {
12106	SDLoc Loc(Op);
12107
12108	SDValue Pointer = Op->getOperand(Num: `1`);
12109	SDValue Stride = Op->getOperand(Num: `2`);
12110	SDValue NumRecords = Op->getOperand(Num: `3`);
12111	SDValue Flags = Op->getOperand(Num: `4`);
12112
12113	SDValue ExtStride = DAG.getAnyExtOrTrunc(Op: Stride, DL: Loc, VT: MVT::i32);
12114	SDValue Rsrc;
12115
12116	if (Subtarget->has45BitNumRecordsBufferResource()) {
12117	SDValue Zero = DAG.getConstant(Val: `0`, DL: Loc, VT: MVT::i32);
12118	// Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
12119	// num_records.
12120	SDValue ExtPointer = DAG.getAnyExtOrTrunc(Op: Pointer, DL: Loc, VT: MVT::i64);
12121	SDValue NumRecordsLHS =
12122	DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i64, N1: NumRecords,
12123	N2: DAG.getShiftAmountConstant(Val: `57`, VT: MVT::i32, DL: Loc));
12124	SDValue LowHalf =
12125	DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i64, N1: ExtPointer, N2: NumRecordsLHS);
12126
12127	// Build the higher 64-bit value, which has the higher 38-bit num_records,
12128	// 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
12129	SDValue NumRecordsRHS =
12130	DAG.getNode(Opcode: ISD::SRL, DL: Loc, VT: MVT::i64, N1: NumRecords,
12131	N2: DAG.getShiftAmountConstant(Val: `7`, VT: MVT::i32, DL: Loc));
12132	SDValue ShiftedStride =
12133	DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i32, N1: ExtStride,
12134	N2: DAG.getShiftAmountConstant(Val: `12`, VT: MVT::i32, DL: Loc));
12135	SDValue ExtShiftedStrideVec =
12136	DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v2i32, N1: Zero, N2: ShiftedStride);
12137	SDValue ExtShiftedStride =
12138	DAG.getNode(Opcode: ISD::BITCAST, DL: Loc, VT: MVT::i64, Operand: ExtShiftedStrideVec);
12139	SDValue ShiftedFlags =
12140	DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i32, N1: Flags,
12141	N2: DAG.getShiftAmountConstant(Val: `28`, VT: MVT::i32, DL: Loc));
12142	SDValue ExtShiftedFlagsVec =
12143	DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v2i32, N1: Zero, N2: ShiftedFlags);
12144	SDValue ExtShiftedFlags =
12145	DAG.getNode(Opcode: ISD::BITCAST, DL: Loc, VT: MVT::i64, Operand: ExtShiftedFlagsVec);
12146	SDValue CombinedFields =
12147	DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i64, N1: NumRecordsRHS, N2: ExtShiftedStride);
12148	SDValue HighHalf =
12149	DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i64, N1: CombinedFields, N2: ExtShiftedFlags);
12150
12151	Rsrc = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v2i64, N1: LowHalf, N2: HighHalf);
12152	} else {
12153	NumRecords = DAG.getAnyExtOrTrunc(Op: NumRecords, DL: Loc, VT: MVT::i32);
12154	auto [LowHalf, HighHalf] =
12155	DAG.SplitScalar(N: Pointer, DL: Loc, LoVT: MVT::i32, HiVT: MVT::i32);
12156	SDValue Mask = DAG.getConstant(Val: `0x0000ffff`, DL: Loc, VT: MVT::i32);
12157	SDValue Masked = DAG.getNode(Opcode: ISD::AND, DL: Loc, VT: MVT::i32, N1: HighHalf, N2: Mask);
12158	SDValue ShiftedStride =
12159	DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i32, N1: ExtStride,
12160	N2: DAG.getShiftAmountConstant(Val: `16`, VT: MVT::i32, DL: Loc));
12161	SDValue NewHighHalf =
12162	DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i32, N1: Masked, N2: ShiftedStride);
12163
12164	Rsrc = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v4i32, N1: LowHalf, N2: NewHighHalf,
12165	N3: NumRecords, N4: Flags);
12166	}
12167
12168	SDValue RsrcPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: Loc, VT: MVT::i128, Operand: Rsrc);
12169	return RsrcPtr;
12170	}
12171
12172	// Handle 8 bit and 16 bit buffer loads
12173	SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
12174	EVT LoadVT, SDLoc DL,
12175	ArrayRef<SDValue> Ops,
12176	MachineMemOperand *MMO,
12177	bool IsTFE) const {
12178	EVT IntVT = LoadVT.changeTypeToInteger();
12179
12180	if (IsTFE) {
12181	unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
12182	? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
12183	: AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
12184	MachineFunction &MF = DAG.getMachineFunction();
12185	MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, Offset: `0`, Size: `8`);
12186	SDVTList VTs = DAG.getVTList(VT1: MVT::v2i32, VT2: MVT::Other);
12187	SDValue Op = getMemIntrinsicNode(Opcode: Opc, DL, VTList: VTs, Ops, MemVT: MVT::v2i32, MMO: OpMMO, DAG);
12188	SDValue Status = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
12189	N2: DAG.getConstant(Val: `1`, DL, VT: MVT::i32));
12190	SDValue Data = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
12191	N2: DAG.getConstant(Val: `0`, DL, VT: MVT::i32));
12192	SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: Data);
12193	SDValue Value = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: Trunc);
12194	return DAG.getMergeValues(Ops: {Value, Status, SDValue (Op.getNode(), `1`)}, dl: DL);
12195	}
12196
12197	unsigned Opc = LoadVT.getScalarType() == MVT::i8
12198	? AMDGPUISD::BUFFER_LOAD_UBYTE
12199	: AMDGPUISD::BUFFER_LOAD_USHORT;
12200
12201	SDVTList ResList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other);
12202	SDValue BufferLoad =
12203	DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: IntVT, MMO);
12204	SDValue LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: BufferLoad);
12205	LoadVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: LoadVal);
12206
12207	return DAG.getMergeValues(Ops: {LoadVal, BufferLoad.getValue(R: `1`)}, dl: DL);
12208	}
12209
12210	// Handle 8 bit and 16 bit buffer stores
12211	SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
12212	EVT VDataType, SDLoc DL,
12213	SDValue Ops[],
12214	MemSDNode M) const* {
12215	if (VDataType == MVT::f16 \|\| VDataType == MVT::bf16)
12216	Ops[`1`] = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i16, Operand: Ops[`1`]);
12217
12218	SDValue BufferStoreExt = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Ops[`1`]);
12219	Ops[`1`] = BufferStoreExt;
12220	unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
12221	: AMDGPUISD::BUFFER_STORE_SHORT;
12222	ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[`0`], `9`);
12223	return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: M->getVTList(), Ops: OpsRef, MemVT: VDataType,
12224	MMO: M->getMemOperand());
12225	}
12226
12227	static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
12228	SDValue Op, const SDLoc &SL, EVT VT) {
12229	if (VT.bitsLT(VT: Op.getValueType()))
12230	return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Op);
12231
12232	switch (ExtType) {
12233	case ISD::SEXTLOAD:
12234	return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: SL, VT, Operand: Op);
12235	case ISD::ZEXTLOAD:
12236	return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT, Operand: Op);
12237	case ISD::EXTLOAD:
12238	return DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT, Operand: Op);
12239	case ISD::NON_EXTLOAD:
12240	return Op;
12241	}
12242
12243	llvm_unreachable("invalid ext type");
12244	}
12245
12246	// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
12247	// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
12248	SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
12249	DAGCombinerInfo &DCI) const {
12250	SelectionDAG &DAG = DCI.DAG;
12251	if (Ld->getAlign() < Align (`4`) \|\| Ld->isDivergent())
12252	return SDValue ();
12253
12254	// FIXME: Constant loads should all be marked invariant.
12255	unsigned AS = Ld->getAddressSpace();
12256	if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
12257	AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
12258	(AS != AMDGPUAS::GLOBAL_ADDRESS \|\| !Ld->isInvariant()))
12259	return SDValue ();
12260
12261	// Don't do this early, since it may interfere with adjacent load merging for
12262	// illegal types. We can avoid losing alignment information for exotic types
12263	// pre-legalize.
12264	EVT MemVT = Ld->getMemoryVT();
12265	if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) \|\|
12266	MemVT.getSizeInBits() >= `32`)
12267	return SDValue ();
12268
12269	SDLoc SL(Ld);
12270
12271	assert((!MemVT.isVector() \|\| Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
12272	"unexpected vector extload");
12273
12274	// TODO: Drop only high part of range.
12275	SDValue Ptr = Ld->getBasePtr();
12276	SDValue NewLoad = DAG.getLoad(
12277	AM: ISD::UNINDEXED, ExtType: ISD::NON_EXTLOAD, VT: MVT::i32, dl: SL, Chain: Ld->getChain(), Ptr,
12278	Offset: Ld->getOffset(), PtrInfo: Ld->getPointerInfo(), MemVT: MVT::i32, Alignment: Ld->getAlign(),
12279	MMOFlags: Ld->getMemOperand()->getFlags(), AAInfo: Ld->getAAInfo(),
12280	Ranges: nullptr); // Drop ranges
12281
12282	EVT TruncVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
12283	if (MemVT.isFloatingPoint()) {
12284	assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
12285	"unexpected fp extload");
12286	TruncVT = MemVT.changeTypeToInteger();
12287	}
12288
12289	SDValue Cvt = NewLoad;
12290	if (Ld->getExtensionType() == ISD::SEXTLOAD) {
12291	Cvt = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL: SL, VT: MVT::i32, N1: NewLoad,
12292	N2: DAG.getValueType(TruncVT));
12293	} else if (Ld->getExtensionType() == ISD::ZEXTLOAD \|\|
12294	Ld->getExtensionType() == ISD::NON_EXTLOAD) {
12295	Cvt = DAG.getZeroExtendInReg(Op: NewLoad, DL: SL, VT: TruncVT);
12296	} else {
12297	assert(Ld->getExtensionType() == ISD::EXTLOAD);
12298	}
12299
12300	EVT VT = Ld->getValueType(ResNo: `0`);
12301	EVT IntVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: VT.getSizeInBits());
12302
12303	DCI.AddToWorklist(N: Cvt.getNode());
12304
12305	// We may need to handle exotic cases, such as i16->i64 extloads, so insert
12306	// the appropriate extension from the 32-bit load.
12307	Cvt = getLoadExtOrTrunc(DAG, ExtType: Ld->getExtensionType(), Op: Cvt, SL, VT: IntVT);
12308	DCI.AddToWorklist(N: Cvt.getNode());
12309
12310	// Handle conversion back to floating point if necessary.
12311	Cvt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Cvt);
12312
12313	return DAG.getMergeValues(Ops: {Cvt, NewLoad.getValue(R: `1`)}, dl: SL);
12314	}
12315
12316	static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
12317	const SIMachineFunctionInfo &Info) {
12318	// TODO: Should check if the address can definitely not access stack.
12319	if (Info.isEntryFunction())
12320	return Info.getUserSGPRInfo().hasFlatScratchInit();
12321	return true;
12322	}
12323
12324	SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
12325	SDLoc DL(Op);
12326	LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
12327	ISD::LoadExtType ExtType = Load->getExtensionType();
12328	EVT MemVT = Load->getMemoryVT();
12329	MachineMemOperand *MMO = Load->getMemOperand();
12330
12331	if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < `32`) {
12332	if (MemVT == MVT::i16 && isTypeLegal(VT: MVT::i16))
12333	return SDValue ();
12334
12335	// FIXME: Copied from PPC
12336	// First, load into 32 bits, then truncate to 1 bit.
12337
12338	SDValue Chain = Load->getChain();
12339	SDValue BasePtr = Load->getBasePtr();
12340
12341	EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
12342
12343	SDValue NewLD = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl: DL, VT: MVT::i32, Chain, Ptr: BasePtr,
12344	MemVT: RealMemVT, MMO);
12345
12346	if (!MemVT.isVector()) {
12347	SDValue Ops[] = {DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVT, Operand: NewLD),
12348	NewLD.getValue(R: `1`)};
12349
12350	return DAG.getMergeValues(Ops, dl: DL);
12351	}
12352
12353	SmallVector<SDValue, `3`> Elts;
12354	for (unsigned I = `0`, N = MemVT.getVectorNumElements(); I != N; ++I) {
12355	SDValue Elt = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: NewLD,
12356	N2: DAG.getConstant(Val: I, DL, VT: MVT::i32));
12357
12358	Elts.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Elt));
12359	}
12360
12361	SDValue Ops[] = {DAG.getBuildVector(VT: MemVT, DL, Ops: Elts), NewLD.getValue(R: `1`)};
12362
12363	return DAG.getMergeValues(Ops, dl: DL);
12364	}
12365
12366	if (!MemVT.isVector())
12367	return SDValue ();
12368
12369	assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
12370	"Custom lowering for non-i32 vectors hasn't been implemented.");
12371
12372	Align Alignment = Load->getAlign();
12373	unsigned AS = Load->getAddressSpace();
12374	if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
12375	AS == AMDGPUAS::FLAT_ADDRESS &&
12376	Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > `32`) {
12377	return SplitVectorLoad(Op, DAG);
12378	}
12379
12380	MachineFunction &MF = DAG.getMachineFunction();
12381	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12382	// If there is a possibility that flat instruction access scratch memory
12383	// then we need to use the same legalization rules we use for private.
12384	if (AS == AMDGPUAS::FLAT_ADDRESS &&
12385	!Subtarget->hasMultiDwordFlatScratchAddressing())
12386	AS = addressMayBeAccessedAsPrivate(MMO: Load->getMemOperand(), Info: *MFI)
12387	? AMDGPUAS::PRIVATE_ADDRESS
12388	: AMDGPUAS::GLOBAL_ADDRESS;
12389
12390	unsigned NumElements = MemVT.getVectorNumElements();
12391
12392	if (AS == AMDGPUAS::CONSTANT_ADDRESS \|\|
12393	AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT \|\|
12394	(AS == AMDGPUAS::GLOBAL_ADDRESS &&
12395	Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
12396	(Load->isInvariant() \|\| isMemOpHasNoClobberedMemOperand(N: Load)))) {
12397	if ((!Op ->isDivergent() \|\| AMDGPU::isUniformMMO(MMO)) &&
12398	Alignment >= Align (`4`) && NumElements < `32`) {
12399	if (MemVT.isPow2VectorType() \|\|
12400	(Subtarget->hasScalarDwordx3Loads() && NumElements == `3`))
12401	return SDValue ();
12402	return WidenOrSplitVectorLoad(Op, DAG);
12403	}
12404	// Non-uniform loads will be selected to MUBUF instructions, so they
12405	// have the same legalization requirements as global and private
12406	// loads.
12407	//
12408	}
12409	if (AS == AMDGPUAS::CONSTANT_ADDRESS \|\|
12410	AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT \|\|
12411	AS == AMDGPUAS::GLOBAL_ADDRESS \|\| AS == AMDGPUAS::FLAT_ADDRESS) {
12412	if (NumElements > `4`)
12413	return SplitVectorLoad(Op, DAG);
12414	// v3 loads not supported on SI.
12415	if (NumElements == `3` && !Subtarget->hasDwordx3LoadStores())
12416	return WidenOrSplitVectorLoad(Op, DAG);
12417
12418	// v3 and v4 loads are supported for private and global memory.
12419	return SDValue ();
12420	}
12421	if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12422	// Depending on the setting of the private_element_size field in the
12423	// resource descriptor, we can only make private accesses up to a certain
12424	// size.
12425	switch (Subtarget->getMaxPrivateElementSize()) {
12426	case `4`: {
12427	auto [Op0, Op1] = scalarizeVectorLoad(LD: Load, DAG);
12428	return DAG.getMergeValues(Ops: {Op0, Op1}, dl: DL);
12429	}
12430	case `8`:
12431	if (NumElements > `2`)
12432	return SplitVectorLoad(Op, DAG);
12433	return SDValue ();
12434	case `16`:
12435	// Same as global/flat
12436	if (NumElements > `4`)
12437	return SplitVectorLoad(Op, DAG);
12438	// v3 loads not supported on SI.
12439	if (NumElements == `3` && !Subtarget->hasDwordx3LoadStores())
12440	return WidenOrSplitVectorLoad(Op, DAG);
12441
12442	return SDValue ();
12443	default:
12444	llvm_unreachable("unsupported private_element_size");
12445	}
12446	} else if (AS == AMDGPUAS::LOCAL_ADDRESS \|\| AS == AMDGPUAS::REGION_ADDRESS) {
12447	unsigned Fast = `0`;
12448	auto Flags = Load->getMemOperand()->getFlags();
12449	if (allowsMisalignedMemoryAccessesImpl(Size: MemVT.getSizeInBits(), AddrSpace: AS,
12450	Alignment: Load->getAlign(), Flags, IsFast: &Fast) &&
12451	Fast > `1`)
12452	return SDValue ();
12453
12454	if (MemVT.isVector())
12455	return SplitVectorLoad(Op, DAG);
12456	}
12457
12458	if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
12459	VT: MemVT, MMO: *Load->getMemOperand())) {
12460	auto [Op0, Op1] = expandUnalignedLoad(LD: Load, DAG);
12461	return DAG.getMergeValues(Ops: {Op0, Op1}, dl: DL);
12462	}
12463
12464	return SDValue ();
12465	}
12466
12467	SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
12468	EVT VT = Op.getValueType();
12469	if (VT.getSizeInBits() == `128` \|\| VT.getSizeInBits() == `256` \|\|
12470	VT.getSizeInBits() == `512`)
12471	return splitTernaryVectorOp(Op, DAG);
12472
12473	assert(VT.getSizeInBits() == `64`);
12474
12475	SDLoc DL(Op);
12476	SDValue Cond = DAG.getFreeze(V: Op.getOperand(i: `0`));
12477
12478	SDValue Zero = DAG.getConstant(Val: `0`, DL, VT: MVT::i32);
12479	SDValue One = DAG.getConstant(Val: `1`, DL, VT: MVT::i32);
12480
12481	SDValue LHS = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: `1`));
12482	SDValue RHS = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: `2`));
12483
12484	SDValue Lo0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: LHS, N2: Zero);
12485	SDValue Lo1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: RHS, N2: Zero);
12486
12487	SDValue Lo = DAG.getSelect(DL, VT: MVT::i32, Cond, LHS: Lo0, RHS: Lo1);
12488
12489	SDValue Hi0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: LHS, N2: One);
12490	SDValue Hi1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: RHS, N2: One);
12491
12492	SDValue Hi = DAG.getSelect(DL, VT: MVT::i32, Cond, LHS: Hi0, RHS: Hi1);
12493
12494	SDValue Res = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Lo, Hi});
12495	return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Res);
12496	}
12497
12498	// Catch division cases where we can use shortcuts with rcp and rsq
12499	// instructions.
12500	SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
12501	SelectionDAG &DAG) const {
12502	SDLoc SL(Op);
12503	SDValue LHS = Op.getOperand(i: `0`);
12504	SDValue RHS = Op.getOperand(i: `1`);
12505	EVT VT = Op.getValueType();
12506	const SDNodeFlags Flags = Op ->getFlags();
12507
12508	bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
12509
12510	if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) {
12511	// Without !fpmath accuracy information, we can't do more because we don't
12512	// know exactly whether rcp is accurate enough to meet !fpmath requirement.
12513	// f16 is always accurate enough
12514	if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12515	return SDValue ();
12516
12517	if (CLHS->isExactlyValue(V: `1.0`)) {
12518	// v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
12519	// the CI documentation has a worst case error of 1 ulp.
12520	// OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
12521	// use it as long as we aren't trying to use denormals.
12522	//
12523	// v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
12524
12525	// 1.0 / sqrt(x) -> rsq(x)
12526
12527	// XXX - Is afn sufficient to do this for f64? The maximum ULP
12528	// error seems really high at 2^29 ULP.
12529	// 1.0 / x -> rcp(x)
12530	return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS);
12531	}
12532
12533	// Same as for 1.0, but expand the sign out of the constant.
12534	if (CLHS->isExactlyValue(V: -`1.0`)) {
12535	// -1.0 / x -> rcp (fneg x)
12536	SDValue FNegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
12537	return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: FNegRHS);
12538	}
12539	}
12540
12541	// For f16 and bf16 require afn or arcp.
12542	// For f32 require afn.
12543	if (!AllowInaccurateRcp &&
12544	((VT != MVT::f16 && VT != MVT::bf16) \|\| !Flags.hasAllowReciprocal()))
12545	return SDValue ();
12546
12547	// Turn into multiply by the reciprocal.
12548	// x / y -> x (1.0 / y)*
12549	SDValue Recip = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS);
12550	return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LHS, N2: Recip, Flags);
12551	}
12552
12553	SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12554	SelectionDAG &DAG) const {
12555	SDLoc SL(Op);
12556	SDValue X = Op.getOperand(i: `0`);
12557	SDValue Y = Op.getOperand(i: `1`);
12558	EVT VT = Op.getValueType();
12559	const SDNodeFlags Flags = Op ->getFlags();
12560
12561	bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12562	if (!AllowInaccurateDiv)
12563	return SDValue ();
12564
12565	SDValue NegY = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Y);
12566	SDValue One = DAG.getConstantFP(Val: `1.0`, DL: SL, VT);
12567
12568	SDValue R = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: Y);
12569	SDValue Tmp0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One);
12570
12571	R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp0, N2: R, N3: R);
12572	SDValue Tmp1 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One);
12573	R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp1, N2: R, N3: R);
12574	SDValue Ret = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: R);
12575	SDValue Tmp2 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: Ret, N3: X);
12576	return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp2, N2: R, N3: Ret);
12577	}
12578
12579	static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12580	EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12581	SDNodeFlags Flags) {
12582	if (GlueChain ->getNumValues() <= `1`) {
12583	return DAG.getNode(Opcode, DL: SL, VT, N1: A, N2: B, Flags);
12584	}
12585
12586	assert(GlueChain->getNumValues() == `3`);
12587
12588	SDVTList VTList = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue);
12589	switch (Opcode) {
12590	default:
12591	llvm_unreachable("no chain equivalent for opcode");
12592	case ISD::FMUL:
12593	Opcode = AMDGPUISD::FMUL_W_CHAIN;
12594	break;
12595	}
12596
12597	return DAG.getNode(Opcode, DL: SL, VTList,
12598	Ops: {GlueChain.getValue(R: `1`), A, B, GlueChain.getValue(R: `2`)},
12599	Flags);
12600	}
12601
12602	static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12603	EVT VT, SDValue A, SDValue B, SDValue C,
12604	SDValue GlueChain, SDNodeFlags Flags) {
12605	if (GlueChain ->getNumValues() <= `1`) {
12606	return DAG.getNode(Opcode, DL: SL, VT, Ops: {A, B, C}, Flags);
12607	}
12608
12609	assert(GlueChain->getNumValues() == `3`);
12610
12611	SDVTList VTList = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue);
12612	switch (Opcode) {
12613	default:
12614	llvm_unreachable("no chain equivalent for opcode");
12615	case ISD::FMA:
12616	Opcode = AMDGPUISD::FMA_W_CHAIN;
12617	break;
12618	}
12619
12620	return DAG.getNode(Opcode, DL: SL, VTList,
12621	Ops: {GlueChain.getValue(R: `1`), A, B, C, GlueChain.getValue(R: `2`)},
12622	Flags);
12623	}
12624
12625	SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
12626	if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12627	return FastLowered;
12628
12629	SDLoc SL(Op);
12630	EVT VT = Op.getValueType();
12631	SDValue LHS = Op.getOperand(i: `0`);
12632	SDValue RHS = Op.getOperand(i: `1`);
12633
12634	SDValue LHSExt = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: LHS);
12635	SDValue RHSExt = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: RHS);
12636
12637	if (VT == MVT::bf16) {
12638	SDValue ExtDiv =
12639	DAG.getNode(Opcode: ISD::FDIV, DL: SL, VT: MVT::f32, N1: LHSExt, N2: RHSExt, Flags: Op ->getFlags());
12640	return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::bf16, N1: ExtDiv,
12641	N2: DAG.getTargetConstant(Val: `0`, DL: SL, VT: MVT::i32));
12642	}
12643
12644	assert(VT == MVT::f16);
12645
12646	// a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
12647	// b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
12648	// r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
12649	// q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n rcp*
12650	// e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d q + n*
12651	// q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n rcp*
12652	// e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d q + n*
12653	// tmp.u = opx(V_MUL_F32, e32.u, r32.u);
12654	// tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
12655	// q32.u = opx(V_ADD_F32, tmp.u, q32.u);
12656	// q16.u = opx(V_CVT_F16_F32, q32.u);
12657	// q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
12658
12659	// We will use ISD::FMA on targets that don't support ISD::FMAD.
12660	unsigned FMADOpCode =
12661	isOperationLegal(Op: ISD::FMAD, VT: MVT::f32) ? ISD::FMAD : ISD::FMA;
12662	SDValue NegRHSExt = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: RHSExt);
12663	SDValue Rcp =
12664	DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: RHSExt, Flags: Op ->getFlags());
12665	SDValue Quot =
12666	DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: LHSExt, N2: Rcp, Flags: Op ->getFlags());
12667	SDValue Err = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: NegRHSExt, N2: Quot, N3: LHSExt,
12668	Flags: Op ->getFlags());
12669	Quot = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: Err, N2: Rcp, N3: Quot, Flags: Op ->getFlags());
12670	Err = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: NegRHSExt, N2: Quot, N3: LHSExt,
12671	Flags: Op ->getFlags());
12672	SDValue Tmp = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: Err, N2: Rcp, Flags: Op ->getFlags());
12673	SDValue TmpCast = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Tmp);
12674	TmpCast = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: TmpCast,
12675	N2: DAG.getConstant(Val: `0xff800000`, DL: SL, VT: MVT::i32));
12676	Tmp = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: TmpCast);
12677	Quot = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f32, N1: Tmp, N2: Quot, Flags: Op ->getFlags());
12678	SDValue RDst = DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::f16, N1: Quot,
12679	N2: DAG.getTargetConstant(Val: `0`, DL: SL, VT: MVT::i32));
12680	return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f16, N1: RDst, N2: RHS, N3: LHS,
12681	Flags: Op ->getFlags());
12682	}
12683
12684	// Faster 2.5 ULP division that does not support denormals.
12685	SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
12686	SDNodeFlags Flags = Op ->getFlags();
12687	SDLoc SL(Op);
12688	SDValue LHS = Op.getOperand(i: `1`);
12689	SDValue RHS = Op.getOperand(i: `2`);
12690
12691	// TODO: The combiner should probably handle elimination of redundant fabs.
12692	SDValue r1 = DAG.SignBitIsZeroFP(Op: RHS)
12693	? RHS
12694	: DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: MVT::f32, Operand: RHS, Flags);
12695
12696	const APFloat K0Val(`0x1p+96f`);
12697	const SDValue K0 = DAG.getConstantFP(Val: K0Val, DL: SL, VT: MVT::f32);
12698
12699	const APFloat K1Val(`0x1p-32f`);
12700	const SDValue K1 = DAG.getConstantFP(Val: K1Val, DL: SL, VT: MVT::f32);
12701
12702	const SDValue One = DAG.getConstantFP(Val: `1.0`, DL: SL, VT: MVT::f32);
12703
12704	EVT SetCCVT =
12705	getSetCCResultType(DL: DAG.getDataLayout(), Ctx&: *DAG.getContext(), VT: MVT::f32);
12706
12707	SDValue r2 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: r1, RHS: K0, Cond: ISD::SETOGT);
12708
12709	SDValue r3 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f32, N1: r2, N2: K1, N3: One, Flags);
12710
12711	r1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: RHS, N2: r3, Flags);
12712
12713	// rcp does not support denormals.
12714	SDValue r0 = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: r1, Flags);
12715
12716	SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: LHS, N2: r0, Flags);
12717
12718	return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: r3, N2: Mul, Flags);
12719	}
12720
12721	// Returns immediate value for setting the F32 denorm mode when using the
12722	// S_DENORM_MODE instruction.
12723	static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
12724	const SIMachineFunctionInfo *Info,
12725	const GCNSubtarget *ST) {
12726	assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12727	uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12728	uint32_t Mode = SPDenormMode \| (DPDenormModeDefault << `2`);
12729	return DAG.getTargetConstant(Val: Mode, DL: SDLoc (), VT: MVT::i32);
12730	}
12731
12732	SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12733	if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12734	return FastLowered;
12735
12736	// The selection matcher assumes anything with a chain selecting to a
12737	// mayRaiseFPException machine instruction. Since we're introducing a chain
12738	// here, we need to explicitly report nofpexcept for the regular fdiv
12739	// lowering.
12740	SDNodeFlags Flags = Op ->getFlags();
12741	Flags.setNoFPExcept(true);
12742
12743	SDLoc SL(Op);
12744	SDValue LHS = Op.getOperand(i: `0`);
12745	SDValue RHS = Op.getOperand(i: `1`);
12746
12747	const SDValue One = DAG.getConstantFP(Val: `1.0`, DL: SL, VT: MVT::f32);
12748
12749	SDVTList ScaleVT = DAG.getVTList(VT1: MVT::f32, VT2: MVT::i1);
12750
12751	SDValue DenominatorScaled =
12752	DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, Ops: {RHS, RHS, LHS}, Flags);
12753	SDValue NumeratorScaled =
12754	DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, Ops: {LHS, RHS, LHS}, Flags);
12755
12756	// Denominator is scaled to not be denormal, so using rcp is ok.
12757	SDValue ApproxRcp =
12758	DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: DenominatorScaled, Flags);
12759	SDValue NegDivScale0 =
12760	DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: DenominatorScaled, Flags);
12761
12762	using namespace AMDGPU::Hwreg;
12763	const unsigned Denorm32Reg = HwregEncoding::encode(Values: ID_MODE, Values: `4`, Values: `2`);
12764	const SDValue BitField = DAG.getTargetConstant(Val: Denorm32Reg, DL: SL, VT: MVT::i32);
12765
12766	const MachineFunction &MF = DAG.getMachineFunction();
12767	const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12768	const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12769
12770	const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12771	const bool HasDynamicDenormals =
12772	(DenormMode.Input == DenormalMode::Dynamic) \|\|
12773	(DenormMode.Output == DenormalMode::Dynamic);
12774
12775	SDValue SavedDenormMode;
12776
12777	if (!PreservesDenormals) {
12778	// Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12779	// lowering. The chain dependence is insufficient, and we need glue. We do
12780	// not need the glue variants in a strictfp function.
12781
12782	SDVTList BindParamVTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
12783
12784	SDValue Glue = DAG.getEntryNode();
12785	if (HasDynamicDenormals) {
12786	SDNode *GetReg = DAG.getMachineNode(Opcode: AMDGPU::S_GETREG_B32, dl: SL,
12787	VTs: DAG.getVTList(VT1: MVT::i32, VT2: MVT::Glue),
12788	Ops: {BitField, Glue});
12789	SavedDenormMode = SDValue (GetReg, `0`);
12790
12791	Glue = DAG.getMergeValues(
12792	Ops: {DAG.getEntryNode(), SDValue (GetReg, `0`), SDValue (GetReg, `1`)}, dl: SL);
12793	}
12794
12795	SDNode *EnableDenorm;
12796	if (Subtarget->hasDenormModeInst()) {
12797	const SDValue EnableDenormValue =
12798	getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, ST: Subtarget);
12799
12800	EnableDenorm = DAG.getNode(Opcode: AMDGPUISD::DENORM_MODE, DL: SL, VTList: BindParamVTs, N1: Glue,
12801	N2: EnableDenormValue)
12802	.getNode();
12803	} else {
12804	const SDValue EnableDenormValue =
12805	DAG.getConstant(FP_DENORM_FLUSH_NONE, DL: SL, VT: MVT::i32);
12806	EnableDenorm = DAG.getMachineNode(Opcode: AMDGPU::S_SETREG_B32, dl: SL, VTs: BindParamVTs,
12807	Ops: {EnableDenormValue, BitField, Glue});
12808	}
12809
12810	SDValue Ops[`3`] = {NegDivScale0, SDValue (EnableDenorm, `0`),
12811	SDValue (EnableDenorm, `1`)};
12812
12813	NegDivScale0 = DAG.getMergeValues(Ops, dl: SL);
12814	}
12815
12816	SDValue Fma0 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0,
12817	B: ApproxRcp, C: One, GlueChain: NegDivScale0, Flags);
12818
12819	SDValue Fma1 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: Fma0, B: ApproxRcp,
12820	C: ApproxRcp, GlueChain: Fma0, Flags);
12821
12822	SDValue Mul = getFPBinOp(DAG, Opcode: ISD::FMUL, SL, VT: MVT::f32, A: NumeratorScaled, B: Fma1,
12823	GlueChain: Fma1, Flags);
12824
12825	SDValue Fma2 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0, B: Mul,
12826	C: NumeratorScaled, GlueChain: Mul, Flags);
12827
12828	SDValue Fma3 =
12829	getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: Fma2, B: Fma1, C: Mul, GlueChain: Fma2, Flags);
12830
12831	SDValue Fma4 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0, B: Fma3,
12832	C: NumeratorScaled, GlueChain: Fma3, Flags);
12833
12834	if (!PreservesDenormals) {
12835	SDNode *DisableDenorm;
12836	if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12837	const SDValue DisableDenormValue = getSPDenormModeValue(
12838	FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, ST: Subtarget);
12839
12840	SDVTList BindParamVTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
12841	DisableDenorm =
12842	DAG.getNode(Opcode: AMDGPUISD::DENORM_MODE, DL: SL, VTList: BindParamVTs,
12843	N1: Fma4.getValue(R: `1`), N2: DisableDenormValue, N3: Fma4.getValue(R: `2`))
12844	.getNode();
12845	} else {
12846	assert(HasDynamicDenormals == (bool)SavedDenormMode);
12847	const SDValue DisableDenormValue =
12848	HasDynamicDenormals
12849	? SavedDenormMode
12850	: DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, DL: SL, VT: MVT::i32);
12851
12852	DisableDenorm = DAG.getMachineNode(
12853	Opcode: AMDGPU::S_SETREG_B32, dl: SL, VT: MVT::Other,
12854	Ops: {DisableDenormValue, BitField, Fma4.getValue(R: `1`), Fma4.getValue(R: `2`)});
12855	}
12856
12857	SDValue OutputChain = DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other,
12858	N1: SDValue (DisableDenorm, `0`), N2: DAG.getRoot());
12859	DAG.setRoot(OutputChain);
12860	}
12861
12862	SDValue Scale = NumeratorScaled.getValue(R: `1`);
12863	SDValue Fmas = DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL: SL, VT: MVT::f32,
12864	Ops: {Fma4, Fma1, Fma3, Scale}, Flags);
12865
12866	return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f32, N1: Fmas, N2: RHS, N3: LHS, Flags);
12867	}
12868
12869	SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12870	if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12871	return FastLowered;
12872
12873	SDLoc SL(Op);
12874	SDValue X = Op.getOperand(i: `0`);
12875	SDValue Y = Op.getOperand(i: `1`);
12876
12877	const SDValue One = DAG.getConstantFP(Val: `1.0`, DL: SL, VT: MVT::f64);
12878
12879	SDVTList ScaleVT = DAG.getVTList(VT1: MVT::f64, VT2: MVT::i1);
12880
12881	SDValue DivScale0 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: Y, N2: Y, N3: X);
12882
12883	SDValue NegDivScale0 = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f64, Operand: DivScale0);
12884
12885	SDValue Rcp = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f64, Operand: DivScale0);
12886
12887	SDValue Fma0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Rcp, N3: One);
12888
12889	SDValue Fma1 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: Rcp, N2: Fma0, N3: Rcp);
12890
12891	SDValue Fma2 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Fma1, N3: One);
12892
12893	SDValue DivScale1 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: X, N2: Y, N3: X);
12894
12895	SDValue Fma3 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: Fma1, N2: Fma2, N3: Fma1);
12896	SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f64, N1: DivScale1, N2: Fma3);
12897
12898	SDValue Fma4 =
12899	DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Mul, N3: DivScale1);
12900
12901	SDValue Scale;
12902
12903	if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12904	// Workaround a hardware bug on SI where the condition output from div_scale
12905	// is not usable.
12906
12907	const SDValue Hi = DAG.getConstant(Val: `1`, DL: SL, VT: MVT::i32);
12908
12909	// Figure out if the scale to use for div_fmas.
12910	SDValue NumBC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: X);
12911	SDValue DenBC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Y);
12912	SDValue Scale0BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: DivScale0);
12913	SDValue Scale1BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: DivScale1);
12914
12915	SDValue NumHi =
12916	DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: NumBC, N2: Hi);
12917	SDValue DenHi =
12918	DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: DenBC, N2: Hi);
12919
12920	SDValue Scale0Hi =
12921	DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Scale0BC, N2: Hi);
12922	SDValue Scale1Hi =
12923	DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Scale1BC, N2: Hi);
12924
12925	SDValue CmpDen = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: DenHi, RHS: Scale0Hi, Cond: ISD::SETEQ);
12926	SDValue CmpNum = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: NumHi, RHS: Scale1Hi, Cond: ISD::SETEQ);
12927	Scale = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: CmpNum, N2: CmpDen);
12928	} else {
12929	Scale = DivScale1.getValue(R: `1`);
12930	}
12931
12932	SDValue Fmas =
12933	DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL: SL, VT: MVT::f64, N1: Fma4, N2: Fma3, N3: Mul, N4: Scale);
12934
12935	return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f64, N1: Fmas, N2: Y, N3: X);
12936	}
12937
12938	SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12939	EVT VT = Op.getValueType();
12940
12941	if (VT == MVT::f32)
12942	return LowerFDIV32(Op, DAG);
12943
12944	if (VT == MVT::f64)
12945	return LowerFDIV64(Op, DAG);
12946
12947	if (VT == MVT::f16 \|\| VT == MVT::bf16)
12948	return LowerFDIV16(Op, DAG);
12949
12950	llvm_unreachable("Unexpected type for fdiv");
12951	}
12952
12953	SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12954	SDLoc dl(Op);
12955	SDValue Val = Op.getOperand(i: `0`);
12956	EVT VT = Val.getValueType();
12957	EVT ResultExpVT = Op ->getValueType(ResNo: `1`);
12958	EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12959
12960	SDValue Mant = DAG.getNode(
12961	Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT,
12962	N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_frexp_mant, DL: dl, VT: MVT::i32), N2: Val);
12963
12964	SDValue Exp = DAG.getNode(
12965	Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: InstrExpVT,
12966	N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_frexp_exp, DL: dl, VT: MVT::i32), N2: Val);
12967
12968	if (Subtarget->hasFractBug()) {
12969	SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: dl, VT, Operand: Val);
12970	SDValue Inf =
12971	DAG.getConstantFP(Val: APFloat::getInf(Sem: VT.getFltSemantics()), DL: dl, VT);
12972
12973	SDValue IsFinite = DAG.getSetCC(DL: dl, VT: MVT::i1, LHS: Fabs, RHS: Inf, Cond: ISD::SETOLT);
12974	SDValue Zero = DAG.getConstant(Val: `0`, DL: dl, VT: InstrExpVT);
12975	Exp = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: InstrExpVT, N1: IsFinite, N2: Exp, N3: Zero);
12976	Mant = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT, N1: IsFinite, N2: Mant, N3: Val);
12977	}
12978
12979	SDValue CastExp = DAG.getSExtOrTrunc(Op: Exp, DL: dl, VT: ResultExpVT);
12980	return DAG.getMergeValues(Ops: {Mant, CastExp}, dl);
12981	}
12982
12983	SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
12984	SDLoc DL(Op);
12985	StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
12986	EVT VT = Store->getMemoryVT();
12987
12988	if (VT == MVT::i1) {
12989	return DAG.getTruncStore(
12990	Chain: Store->getChain(), dl: DL,
12991	Val: DAG.getSExtOrTrunc(Op: Store->getValue(), DL, VT: MVT::i32),
12992	Ptr: Store->getBasePtr(), SVT: MVT::i1, MMO: Store->getMemOperand());
12993	}
12994
12995	assert(VT.isVector() &&
12996	Store->getValue().getValueType().getScalarType() == MVT::i32);
12997
12998	unsigned AS = Store->getAddressSpace();
12999	if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
13000	AS == AMDGPUAS::FLAT_ADDRESS &&
13001	Store->getAlign().value() < VT.getStoreSize() &&
13002	VT.getSizeInBits() > `32`) {
13003	return SplitVectorStore(Op, DAG);
13004	}
13005
13006	MachineFunction &MF = DAG.getMachineFunction();
13007	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
13008	// If there is a possibility that flat instruction access scratch memory
13009	// then we need to use the same legalization rules we use for private.
13010	if (AS == AMDGPUAS::FLAT_ADDRESS &&
13011	!Subtarget->hasMultiDwordFlatScratchAddressing())
13012	AS = addressMayBeAccessedAsPrivate(MMO: Store->getMemOperand(), Info: *MFI)
13013	? AMDGPUAS::PRIVATE_ADDRESS
13014	: AMDGPUAS::GLOBAL_ADDRESS;
13015
13016	unsigned NumElements = VT.getVectorNumElements();
13017	if (AS == AMDGPUAS::GLOBAL_ADDRESS \|\| AS == AMDGPUAS::FLAT_ADDRESS) {
13018	if (NumElements > `4`)
13019	return SplitVectorStore(Op, DAG);
13020	// v3 stores not supported on SI.
13021	if (NumElements == `3` && !Subtarget->hasDwordx3LoadStores())
13022	return SplitVectorStore(Op, DAG);
13023
13024	if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
13025	VT, MMO: *Store->getMemOperand()))
13026	return expandUnalignedStore(ST: Store, DAG);
13027
13028	return SDValue ();
13029	}
13030	if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
13031	switch (Subtarget->getMaxPrivateElementSize()) {
13032	case `4`:
13033	return scalarizeVectorStore(ST: Store, DAG);
13034	case `8`:
13035	if (NumElements > `2`)
13036	return SplitVectorStore(Op, DAG);
13037	return SDValue ();
13038	case `16`:
13039	if (NumElements > `4` \|\|
13040	(NumElements == `3` && !Subtarget->hasFlatScratchEnabled()))
13041	return SplitVectorStore(Op, DAG);
13042	return SDValue ();
13043	default:
13044	llvm_unreachable("unsupported private_element_size");
13045	}
13046	} else if (AS == AMDGPUAS::LOCAL_ADDRESS \|\| AS == AMDGPUAS::REGION_ADDRESS) {
13047	unsigned Fast = `0`;
13048	auto Flags = Store->getMemOperand()->getFlags();
13049	if (allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace: AS,
13050	Alignment: Store->getAlign(), Flags, IsFast: &Fast) &&
13051	Fast > `1`)
13052	return SDValue ();
13053
13054	if (VT.isVector())
13055	return SplitVectorStore(Op, DAG);
13056
13057	return expandUnalignedStore(ST: Store, DAG);
13058	}
13059
13060	// Probably an invalid store. If so we'll end up emitting a selection error.
13061	return SDValue ();
13062	}
13063
13064	// Avoid the full correct expansion for f32 sqrt when promoting from f16.
13065	SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
13066	SDLoc SL(Op);
13067	assert(!Subtarget->has16BitInsts());
13068	SDNodeFlags Flags = Op ->getFlags();
13069	SDValue Ext =
13070	DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Op.getOperand(i: `0`), Flags);
13071
13072	SDValue SqrtID = DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL: SL, VT: MVT::i32);
13073	SDValue Sqrt =
13074	DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::f32, N1: SqrtID, N2: Ext, Flags);
13075
13076	return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::f16, N1: Sqrt,
13077	N2: DAG.getTargetConstant(Val: `0`, DL: SL, VT: MVT::i32), Flags);
13078	}
13079
13080	SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
13081	SDLoc DL(Op);
13082	SDNodeFlags Flags = Op ->getFlags();
13083	MVT VT = Op.getValueType().getSimpleVT();
13084	const SDValue X = Op.getOperand(i: `0`);
13085
13086	if (allowApproxFunc(DAG, Flags)) {
13087	// Instruction is 1ulp but ignores denormals.
13088	return DAG.getNode(
13089	Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT,
13090	N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL, VT: MVT::i32), N2: X, Flags);
13091	}
13092
13093	SDValue ScaleThreshold = DAG.getConstantFP(Val: `0x1.0p-96f`, DL, VT);
13094	SDValue NeedScale = DAG.getSetCC(DL, VT: MVT::i1, LHS: X, RHS: ScaleThreshold, Cond: ISD::SETOLT);
13095
13096	SDValue ScaleUpFactor = DAG.getConstantFP(Val: `0x1.0p+32f`, DL, VT);
13097
13098	SDValue ScaledX = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: X, N2: ScaleUpFactor, Flags);
13099
13100	SDValue SqrtX =
13101	DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledX, N3: X, Flags);
13102
13103	SDValue SqrtS;
13104	if (needsDenormHandlingF32(DAG, Src: X, Flags)) {
13105	SDValue SqrtID =
13106	DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL, VT: MVT::i32);
13107	SqrtS = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: SqrtID, N2: SqrtX, Flags);
13108
13109	SDValue SqrtSAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: SqrtS);
13110	SDValue SqrtSNextDownInt =
13111	DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SqrtSAsInt,
13112	N2: DAG.getAllOnesConstant(DL, VT: MVT::i32));
13113	SDValue SqrtSNextDown = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextDownInt);
13114
13115	SDValue NegSqrtSNextDown =
13116	DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextDown, Flags);
13117
13118	SDValue SqrtVP =
13119	DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextDown, N2: SqrtS, N3: SqrtX, Flags);
13120
13121	SDValue SqrtSNextUpInt = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SqrtSAsInt,
13122	N2: DAG.getConstant(Val: `1`, DL, VT: MVT::i32));
13123	SDValue SqrtSNextUp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextUpInt);
13124
13125	SDValue NegSqrtSNextUp = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextUp, Flags);
13126	SDValue SqrtVS =
13127	DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextUp, N2: SqrtS, N3: SqrtX, Flags);
13128
13129	SDValue Zero = DAG.getConstantFP(Val: `0.0f`, DL, VT);
13130	SDValue SqrtVPLE0 = DAG.getSetCC(DL, VT: MVT::i1, LHS: SqrtVP, RHS: Zero, Cond: ISD::SETOLE);
13131
13132	SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPLE0, N2: SqrtSNextDown, N3: SqrtS,
13133	Flags);
13134
13135	SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, VT: MVT::i1, LHS: SqrtVS, RHS: Zero, Cond: ISD::SETOGT);
13136	SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPVSGT0, N2: SqrtSNextUp, N3: SqrtS,
13137	Flags);
13138	} else {
13139	SDValue SqrtR = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: SqrtX, Flags);
13140
13141	SqrtS = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtX, N2: SqrtR, Flags);
13142
13143	SDValue Half = DAG.getConstantFP(Val: `0.5f`, DL, VT);
13144	SDValue SqrtH = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtR, N2: Half, Flags);
13145	SDValue NegSqrtH = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtH, Flags);
13146
13147	SDValue SqrtE = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtH, N2: SqrtS, N3: Half, Flags);
13148	SqrtH = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtH, N2: SqrtE, N3: SqrtH, Flags);
13149	SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtS, N2: SqrtE, N3: SqrtS, Flags);
13150
13151	SDValue NegSqrtS = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtS, Flags);
13152	SDValue SqrtD =
13153	DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtS, N2: SqrtS, N3: SqrtX, Flags);
13154	SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtD, N2: SqrtH, N3: SqrtS, Flags);
13155	}
13156
13157	SDValue ScaleDownFactor = DAG.getConstantFP(Val: `0x1.0p-16f`, DL, VT);
13158
13159	SDValue ScaledDown =
13160	DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtS, N2: ScaleDownFactor, Flags);
13161
13162	SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledDown, N3: SqrtS, Flags);
13163	SDValue IsZeroOrInf =
13164	DAG.getNode(Opcode: ISD::IS_FPCLASS, DL, VT: MVT::i1, N1: SqrtX,
13165	N2: DAG.getTargetConstant(Val: fcZero \| fcPosInf, DL, VT: MVT::i32));
13166
13167	return DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsZeroOrInf, N2: SqrtX, N3: SqrtS, Flags);
13168	}
13169
13170	SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
13171	// For double type, the SQRT and RSQ instructions don't have required
13172	// precision, we apply Goldschmidt's algorithm to improve the result:
13173	//
13174	// y0 = rsq(x)
13175	// g0 = x y0*
13176	// h0 = 0.5 y0*
13177	//
13178	// r0 = 0.5 - h0 g0*
13179	// g1 = g0 r0 + g0*
13180	// h1 = h0 r0 + h0*
13181	//
13182	// r1 = 0.5 - h1 g1 => d0 = x - g1 * g1*
13183	// g2 = g1 r1 + g1 g2 = d0 * h1 + g1*
13184	// h2 = h1 r1 + h1*
13185	//
13186	// r2 = 0.5 - h2 g2 => d1 = x - g2 * g2*
13187	// g3 = g2 r2 + g2 g3 = d1 * h1 + g2*
13188	//
13189	// sqrt(x) = g3
13190
13191	SDNodeFlags Flags = Op ->getFlags();
13192
13193	SDLoc DL(Op);
13194
13195	SDValue X = Op.getOperand(i: `0`);
13196	SDValue ScaleConstant = DAG.getConstantFP(Val: `0x1.0p-767`, DL, VT: MVT::f64);
13197
13198	SDValue Scaling = DAG.getSetCC(DL, VT: MVT::i1, LHS: X, RHS: ScaleConstant, Cond: ISD::SETOLT);
13199
13200	SDValue ZeroInt = DAG.getConstant(Val: `0`, DL, VT: MVT::i32);
13201
13202	// Scale up input if it is too small.
13203	SDValue ScaleUpFactor = DAG.getConstant(Val: `256`, DL, VT: MVT::i32);
13204	SDValue ScaleUp =
13205	DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::i32, N1: Scaling, N2: ScaleUpFactor, N3: ZeroInt);
13206	SDValue SqrtX = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: X, N2: ScaleUp, Flags);
13207
13208	SDValue SqrtY = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT: MVT::f64, Operand: SqrtX);
13209
13210	SDValue SqrtS0 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: SqrtX, N2: SqrtY);
13211
13212	SDValue Half = DAG.getConstantFP(Val: `0.5`, DL, VT: MVT::f64);
13213	SDValue SqrtH0 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: SqrtY, N2: Half);
13214
13215	SDValue NegSqrtH0 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtH0);
13216	SDValue SqrtR0 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtH0, N2: SqrtS0, N3: Half);
13217
13218	SDValue SqrtH1 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtH0, N2: SqrtR0, N3: SqrtH0);
13219
13220	SDValue SqrtS1 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtS0, N2: SqrtR0, N3: SqrtS0);
13221
13222	SDValue NegSqrtS1 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtS1);
13223	SDValue SqrtD0 =
13224	DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtS1, N2: SqrtS1, N3: SqrtX);
13225
13226	SDValue SqrtS2 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtD0, N2: SqrtH1, N3: SqrtS1);
13227
13228	SDValue NegSqrtS2 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtS2);
13229	SDValue SqrtD1 =
13230	DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtS2, N2: SqrtS2, N3: SqrtX);
13231
13232	SDValue SqrtRet = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtD1, N2: SqrtH1, N3: SqrtS2);
13233
13234	SDValue ScaleDownFactor = DAG.getSignedConstant(Val: -`128`, DL, VT: MVT::i32);
13235	SDValue ScaleDown =
13236	DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::i32, N1: Scaling, N2: ScaleDownFactor, N3: ZeroInt);
13237	SqrtRet = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: SqrtRet, N2: ScaleDown, Flags);
13238
13239	// TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
13240	// with finite only or nsz because rsq(+/-0) = +/-inf
13241
13242	// TODO: Check for DAZ and expand to subnormals
13243	SDValue IsZeroOrInf =
13244	DAG.getNode(Opcode: ISD::IS_FPCLASS, DL, VT: MVT::i1, N1: SqrtX,
13245	N2: DAG.getTargetConstant(Val: fcZero \| fcPosInf, DL, VT: MVT::i32));
13246
13247	// If x is +INF, +0, or -0, use its original value
13248	return DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::f64, N1: IsZeroOrInf, N2: SqrtX, N3: SqrtRet,
13249	Flags);
13250	}
13251
13252	SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
13253	SDLoc DL(Op);
13254	EVT VT = Op.getValueType();
13255	SDValue Arg = Op.getOperand(i: `0`);
13256	SDValue TrigVal;
13257
13258	// Propagate fast-math flags so that the multiply we introduce can be folded
13259	// if Arg is already the result of a multiply by constant.
13260	auto Flags = Op ->getFlags();
13261
13262	// AMDGPUISD nodes of vector type must be unrolled here since
13263	// they will not be expanded elsewhere.
13264	auto UnrollIfVec = [&DAG](SDValue V) -> SDValue {
13265	if (!V.getValueType().isVector())
13266	return V;
13267
13268	return DAG.UnrollVectorOp(N: cast<SDNode>(Val&: V));
13269	};
13270
13271	SDValue OneOver2Pi = DAG.getConstantFP(Val: `0.5` * numbers::inv_pi, DL, VT);
13272
13273	if (Subtarget->hasTrigReducedRange()) {
13274	SDValue MulVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags);
13275	TrigVal = UnrollIfVec (DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: MulVal, Flags));
13276	} else {
13277	TrigVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags);
13278	}
13279
13280	switch (Op.getOpcode()) {
13281	case ISD::FCOS:
13282	TrigVal = DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL: SDLoc (Op), VT, Operand: TrigVal, Flags);
13283	break;
13284	case ISD::FSIN:
13285	TrigVal = DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL: SDLoc (Op), VT, Operand: TrigVal, Flags);
13286	break;
13287	default:
13288	llvm_unreachable("Wrong trig opcode");
13289	}
13290
13291	return UnrollIfVec (TrigVal);
13292	}
13293
13294	SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
13295	SelectionDAG &DAG) const {
13296	AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Val&: Op);
13297	assert(AtomicNode->isCompareAndSwap());
13298	unsigned AS = AtomicNode->getAddressSpace();
13299
13300	// No custom lowering required for local address space
13301	if (!AMDGPU::isFlatGlobalAddrSpace(AS))
13302	return Op;
13303
13304	// Non-local address space requires custom lowering for atomic compare
13305	// and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
13306	SDLoc DL(Op);
13307	SDValue ChainIn = Op.getOperand(i: `0`);
13308	SDValue Addr = Op.getOperand(i: `1`);
13309	SDValue Old = Op.getOperand(i: `2`);
13310	SDValue New = Op.getOperand(i: `3`);
13311	EVT VT = Op.getValueType();
13312	MVT SimpleVT = VT.getSimpleVT();
13313	MVT VecType = MVT::getVectorVT(VT: SimpleVT, NumElements: `2`);
13314
13315	SDValue NewOld = DAG.getBuildVector(VT: VecType, DL, Ops: {New, Old});
13316	SDValue Ops[] = {ChainIn, Addr, NewOld};
13317
13318	return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::ATOMIC_CMP_SWAP, dl: DL,
13319	VTList: Op ->getVTList(), Ops, MemVT: VT,
13320	MMO: AtomicNode->getMemOperand());
13321	}
13322
13323	//===----------------------------------------------------------------------===//
13324	// Custom DAG optimizations
13325	//===----------------------------------------------------------------------===//
13326
13327	SDValue
13328	SITargetLowering::performUCharToFloatCombine(SDNode *N,
13329	DAGCombinerInfo &DCI) const {
13330	EVT VT = N->getValueType(ResNo: `0`);
13331	EVT ScalarVT = VT.getScalarType();
13332	if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
13333	return SDValue ();
13334
13335	SelectionDAG &DAG = DCI.DAG;
13336	SDLoc DL(N);
13337
13338	SDValue Src = N->getOperand(Num: `0`);
13339	EVT SrcVT = Src.getValueType();
13340
13341	// TODO: We could try to match extracting the higher bytes, which would be
13342	// easier if i8 vectors weren't promoted to i32 vectors, particularly after
13343	// types are legalized. v4i8 -> v4f32 is probably the only case to worry
13344	// about in practice.
13345	if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
13346	if (DAG.MaskedValueIsZero(Op: Src, Mask: APInt::getHighBitsSet(numBits: `32`, hiBitsSet: `24`))) {
13347	SDValue Cvt = DAG.getNode(Opcode: AMDGPUISD::CVT_F32_UBYTE0, DL, VT: MVT::f32, Operand: Src);
13348	DCI.AddToWorklist(N: Cvt.getNode());
13349
13350	// For the f16 case, fold to a cast to f32 and then cast back to f16.
13351	if (ScalarVT != MVT::f32) {
13352	Cvt = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Cvt,
13353	N2: DAG.getTargetConstant(Val: `0`, DL, VT: MVT::i32));
13354	}
13355	return Cvt;
13356	}
13357	}
13358
13359	return SDValue ();
13360	}
13361
13362	SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
13363	DAGCombinerInfo &DCI) const {
13364	SDValue MagnitudeOp = N->getOperand(Num: `0`);
13365	SDValue SignOp = N->getOperand(Num: `1`);
13366
13367	// The generic combine for fcopysign + fp cast is too conservative with
13368	// vectors, and also gets confused by the splitting we will perform here, so
13369	// peek through FP casts.
13370	if (SignOp.getOpcode() == ISD::FP_EXTEND \|\|
13371	SignOp.getOpcode() == ISD::FP_ROUND)
13372	SignOp = SignOp.getOperand(i: `0`);
13373
13374	SelectionDAG &DAG = DCI.DAG;
13375	SDLoc DL(N);
13376	EVT SignVT = SignOp.getValueType();
13377
13378	// f64 fcopysign is really an f32 copysign on the high bits, so replace the
13379	// lower half with a copy.
13380	// fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
13381	EVT MagVT = MagnitudeOp.getValueType();
13382
13383	unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : `1`;
13384
13385	if (MagVT.getScalarType() == MVT::f64) {
13386	EVT F32VT = MagVT.isVector()
13387	? EVT::getVectorVT(Context&: DAG.getContext(), VT: MVT::f32, NumElements: `2` NumElts)
13388	: MVT::v2f32;
13389
13390	SDValue MagAsVector = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: F32VT, Operand: MagnitudeOp);
13391
13392	SmallVector<SDValue, `8`> NewElts;
13393	for (unsigned I = `0`; I != NumElts; ++I) {
13394	SDValue MagLo =
13395	DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: MagAsVector,
13396	N2: DAG.getConstant(Val: `2` * I, DL, VT: MVT::i32));
13397	SDValue MagHi =
13398	DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: MagAsVector,
13399	N2: DAG.getConstant(Val: `2` * I + `1`, DL, VT: MVT::i32));
13400
13401	SDValue SignOpElt =
13402	MagVT.isVector()
13403	? DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: SignVT.getScalarType(),
13404	N1: SignOp, N2: DAG.getConstant(Val: I, DL, VT: MVT::i32))
13405	: SignOp;
13406
13407	SDValue HiOp =
13408	DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: MVT::f32, N1: MagHi, N2: SignOpElt);
13409
13410	SDValue Vector =
13411	DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: MVT::v2f32, N1: MagLo, N2: HiOp);
13412
13413	SDValue NewElt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f64, Operand: Vector);
13414	NewElts.push_back(Elt: NewElt);
13415	}
13416
13417	if (NewElts.size() == `1`)
13418	return NewElts [`0`];
13419
13420	return DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: MagVT, Ops: NewElts);
13421	}
13422
13423	if (SignVT.getScalarType() != MVT::f64)
13424	return SDValue ();
13425
13426	// Reduce width of sign operand, we only need the highest bit.
13427	//
13428	// fcopysign f64:x, f64:y ->
13429	// fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
13430	// TODO: In some cases it might make sense to go all the way to f16.
13431
13432	EVT F32VT = MagVT.isVector()
13433	? EVT::getVectorVT(Context&: DAG.getContext(), VT: MVT::f32, NumElements: `2` NumElts)
13434	: MVT::v2f32;
13435
13436	SDValue SignAsVector = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: F32VT, Operand: SignOp);
13437
13438	SmallVector<SDValue, `8`> F32Signs;
13439	for (unsigned I = `0`; I != NumElts; ++I) {
13440	// Take sign from odd elements of cast vector
13441	SDValue SignAsF32 =
13442	DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: SignAsVector,
13443	N2: DAG.getConstant(Val: `2` * I + `1`, DL, VT: MVT::i32));
13444	F32Signs.push_back(Elt: SignAsF32);
13445	}
13446
13447	SDValue NewSign =
13448	NumElts == `1`
13449	? F32Signs.back()
13450	: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL,
13451	VT: EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::f32, NumElements: NumElts),
13452	Ops: F32Signs);
13453
13454	return DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: N->getValueType(ResNo: `0`), N1: N->getOperand(Num: `0`),
13455	N2: NewSign);
13456	}
13457
13458	// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
13459	// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
13460	// bits
13461
13462	// This is a variant of
13463	// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
13464	//
13465	// The normal DAG combiner will do this, but only if the add has one use since
13466	// that would increase the number of instructions.
13467	//
13468	// This prevents us from seeing a constant offset that can be folded into a
13469	// memory instruction's addressing mode. If we know the resulting add offset of
13470	// a pointer can be folded into an addressing offset, we can replace the pointer
13471	// operand with the add of new constant offset. This eliminates one of the uses,
13472	// and may allow the remaining use to also be simplified.
13473	//
13474	SDValue SITargetLowering::performSHLPtrCombine(SDNode N, unsigned* AddrSpace,
13475	EVT MemVT,
13476	DAGCombinerInfo &DCI) const {
13477	SDValue N0 = N->getOperand(Num: `0`);
13478	SDValue N1 = N->getOperand(Num: `1`);
13479
13480	// We only do this to handle cases where it's profitable when there are
13481	// multiple uses of the add, so defer to the standard combine.
13482	if ((!N0 ->isAnyAdd() && N0.getOpcode() != ISD::OR) \|\| N0 ->hasOneUse())
13483	return SDValue ();
13484
13485	const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Val&: N1);
13486	if (!CN1)
13487	return SDValue ();
13488
13489	const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: `1`));
13490	if (!CAdd)
13491	return SDValue ();
13492
13493	SelectionDAG &DAG = DCI.DAG;
13494
13495	if (N0 ->getOpcode() == ISD::OR &&
13496	!DAG.haveNoCommonBitsSet(A: N0.getOperand(i: `0`), B: N0.getOperand(i: `1`)))
13497	return SDValue ();
13498
13499	// If the resulting offset is too large, we can't fold it into the
13500	// addressing mode offset.
13501	APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
13502	Type Ty = MemVT.getTypeForEVT(Context&: DCI.DAG.getContext());
13503
13504	AddrMode AM;
13505	AM.HasBaseReg = true;
13506	AM.BaseOffs = Offset.getSExtValue();
13507	if (!isLegalAddressingMode(DL: DCI.DAG.getDataLayout(), AM, Ty, AS: AddrSpace))
13508	return SDValue ();
13509
13510	SDLoc SL(N);
13511	EVT VT = N->getValueType(ResNo: `0`);
13512
13513	SDValue ShlX = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: N0.getOperand(i: `0`), N2: N1);
13514	SDValue COffset = DAG.getConstant(Val: Offset, DL: SL, VT);
13515
13516	SDNodeFlags Flags;
13517	Flags.setNoUnsignedWrap(
13518	N->getFlags().hasNoUnsignedWrap() &&
13519	(N0.getOpcode() == ISD::OR \|\| N0 ->getFlags().hasNoUnsignedWrap()));
13520
13521	// Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
13522	// be sure that the new left operand is a proper base pointer.
13523	return DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ShlX, N2: COffset, Flags);
13524	}
13525
13526	/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
13527	/// by the chain and intrinsic ID. Theoretically we would also need to check the
13528	/// specific intrinsic, but they all place the pointer operand first.
13529	static unsigned getBasePtrIndex(const MemSDNode *N) {
13530	switch (N->getOpcode()) {
13531	case ISD::STORE:
13532	case ISD::INTRINSIC_W_CHAIN:
13533	case ISD::INTRINSIC_VOID:
13534	return `2`;
13535	default:
13536	return `1`;
13537	}
13538	}
13539
13540	SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
13541	DAGCombinerInfo &DCI) const {
13542	SelectionDAG &DAG = DCI.DAG;
13543
13544	unsigned PtrIdx = getBasePtrIndex(N);
13545	SDValue Ptr = N->getOperand(Num: PtrIdx);
13546
13547	// TODO: We could also do this for multiplies.
13548	if (Ptr.getOpcode() == ISD::SHL) {
13549	SDValue NewPtr = performSHLPtrCombine(N: Ptr.getNode(), AddrSpace: N->getAddressSpace(),
13550	MemVT: N->getMemoryVT(), DCI);
13551	if (NewPtr) {
13552	SmallVector<SDValue, `8`> NewOps(N->ops());
13553
13554	NewOps [PtrIdx] = NewPtr;
13555	return SDValue (DAG.UpdateNodeOperands(N, Ops: NewOps), `0`);
13556	}
13557	}
13558
13559	return SDValue ();
13560	}
13561
13562	static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13563	return (Opc == ISD::AND && (Val == `0` \|\| Val == `0xffffffff`)) \|\|
13564	(Opc == ISD::OR && (Val == `0xffffffff` \|\| Val == `0`)) \|\|
13565	(Opc == ISD::XOR && Val == `0`);
13566	}
13567
13568	// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
13569	// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
13570	// integer combine opportunities since most 64-bit operations are decomposed
13571	// this way. TODO: We won't want this for SALU especially if it is an inline
13572	// immediate.
13573	SDValue SITargetLowering::splitBinaryBitConstantOp(
13574	DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13575	const ConstantSDNode CRHS) const* {
13576	uint64_t Val = CRHS->getZExtValue();
13577	uint32_t ValLo = Lo_32(Value: Val);
13578	uint32_t ValHi = Hi_32(Value: Val);
13579	const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13580
13581	if ((bitOpWithConstantIsReducible(Opc, Val: ValLo) \|\|
13582	bitOpWithConstantIsReducible(Opc, Val: ValHi)) \|\|
13583	(CRHS->hasOneUse() && !TII->isInlineConstant(Imm: CRHS->getAPIntValue()))) {
13584	// We have 64-bit scalar and/or/xor, but do not have vector forms.
13585	if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13586	!CRHS->user_begin()->isDivergent())
13587	return SDValue ();
13588
13589	// If we need to materialize a 64-bit immediate, it will be split up later
13590	// anyway. Avoid creating the harder to understand 64-bit immediate
13591	// materialization.
13592	return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13593	}
13594
13595	return SDValue ();
13596	}
13597
13598	bool llvm::isBoolSGPR(SDValue V) {
13599	if (V.getValueType() != MVT::i1)
13600	return false;
13601	switch (V.getOpcode()) {
13602	default:
13603	break;
13604	case ISD::SETCC:
13605	case ISD::IS_FPCLASS:
13606	case AMDGPUISD::FP_CLASS:
13607	return true;
13608	case ISD::AND:
13609	case ISD::OR:
13610	case ISD::XOR:
13611	return isBoolSGPR(V: V.getOperand(i: `0`)) && isBoolSGPR(V: V.getOperand(i: `1`));
13612	case ISD::SADDO:
13613	case ISD::UADDO:
13614	case ISD::SSUBO:
13615	case ISD::USUBO:
13616	case ISD::SMULO:
13617	case ISD::UMULO:
13618	return V.getResNo() == `1`;
13619	case ISD::INTRINSIC_WO_CHAIN: {
13620	unsigned IntrinsicID = V.getConstantOperandVal(i: `0`);
13621	switch (IntrinsicID) {
13622	case Intrinsic::amdgcn_is_shared:
13623	case Intrinsic::amdgcn_is_private:
13624	return true;
13625	default:
13626	return false;
13627	}
13628
13629	return false;
13630	}
13631	}
13632	return false;
13633	}
13634
// Return \p C unchanged when each of its four bytes is either 0x00 or 0xff.
// Return 0 as soon as a byte has only some of its bits set (a "partial" byte).
// Note that C == 0 also yields 0.
static uint32_t getConstantPermuteMask(uint32_t C) {
  for (unsigned Shift = 0; Shift != 32; Shift += 8) {
    const uint32_t Byte = (C >> Shift) & 0xff;
    if (Byte != 0x00 && Byte != 0xff)
      return 0; // Partial bytes selected.
  }
  return C;
}
13653
13654	// Check if a node selects whole bytes from its operand 0 starting at a byte
13655	// boundary while masking the rest. Returns select mask as in the v_perm_b32
13656	// or -1 if not succeeded.
13657	// Note byte select encoding:
13658	// value 0-3 selects corresponding source byte;
13659	// value 0xc selects zero;
13660	// value 0xff selects 0xff.
13661	static uint32_t getPermuteMask(SDValue V) {
13662	assert(V.getValueSizeInBits() == `32`);
13663
13664	if (V.getNumOperands() != `2`)
13665	return ~`0`;
13666
13667	ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(Val: V.getOperand(i: `1`));
13668	if (!N1)
13669	return ~`0`;
13670
13671	uint32_t C = N1->getZExtValue();
13672
13673	switch (V.getOpcode()) {
13674	default:
13675	break;
13676	case ISD::AND:
13677	if (uint32_t ConstMask = getConstantPermuteMask(C))
13678	return (`0x03020100` & ConstMask) \| (`0x0c0c0c0c` & ~ConstMask);
13679	break;
13680
13681	case ISD::OR:
13682	if (uint32_t ConstMask = getConstantPermuteMask(C))
13683	return (`0x03020100` & ~ConstMask) \| ConstMask;
13684	break;
13685
13686	case ISD::SHL:
13687	if (C % `8`)
13688	return ~`0`;
13689
13690	return uint32_t((`0x030201000c0c0c0cull` << C) >> `32`);
13691
13692	case ISD::SRL:
13693	if (C % `8`)
13694	return ~`0`;
13695
13696	return uint32_t(`0x0c0c0c0c03020100ull` >> C);
13697	}
13698
13699	return ~`0`;
13700	}
13701
13702	SDValue SITargetLowering::performAndCombine(SDNode *N,
13703	DAGCombinerInfo &DCI) const {
13704	if (DCI.isBeforeLegalize())
13705	return SDValue ();
13706
13707	SelectionDAG &DAG = DCI.DAG;
13708	EVT VT = N->getValueType(ResNo: `0`);
13709	SDValue LHS = N->getOperand(Num: `0`);
13710	SDValue RHS = N->getOperand(Num: `1`);
13711
13712	const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
13713	if (VT == MVT::i64 && CRHS) {
13714	if (SDValue Split =
13715	splitBinaryBitConstantOp(DCI, SL: SDLoc (N), Opc: ISD::AND, LHS, CRHS))
13716	return Split;
13717	}
13718
13719	if (CRHS && VT == MVT::i32) {
13720	// and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
13721	// nb = number of trailing zeroes in mask
13722	// It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
13723	// given that we are selecting 8 or 16 bit fields starting at byte boundary.
13724	uint64_t Mask = CRHS->getZExtValue();
13725	unsigned Bits = llvm::popcount(Value: Mask);
13726	if (getSubtarget()->hasSDWA() && LHS ->getOpcode() == ISD::SRL &&
13727	(Bits == `8` \|\| Bits == `16`) && isShiftedMask_64(Value: Mask) && !(Mask & `1`)) {
13728	if (auto *CShift = dyn_cast<ConstantSDNode>(Val: LHS ->getOperand(Num: `1`))) {
13729	unsigned Shift = CShift->getZExtValue();
13730	unsigned NB = CRHS->getAPIntValue().countr_zero();
13731	unsigned Offset = NB + Shift;
13732	if ((Offset & (Bits - `1`)) == `0`) { // Starts at a byte or word boundary.
13733	SDLoc SL(N);
13734	SDValue BFE =
13735	DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT: MVT::i32, N1: LHS ->getOperand(Num: `0`),
13736	N2: DAG.getConstant(Val: Offset, DL: SL, VT: MVT::i32),
13737	N3: DAG.getConstant(Val: Bits, DL: SL, VT: MVT::i32));
13738	EVT NarrowVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: Bits);
13739	SDValue Ext = DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT, N1: BFE,
13740	N2: DAG.getValueType(NarrowVT));
13741	SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL: SDLoc (LHS), VT, N1: Ext,
13742	N2: DAG.getConstant(Val: NB, DL: SDLoc (CRHS), VT: MVT::i32));
13743	return Shl;
13744	}
13745	}
13746	}
13747
13748	// and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13749	if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
13750	isa<ConstantSDNode>(Val: LHS.getOperand(i: `2`))) {
13751	uint32_t Sel = getConstantPermuteMask(C: Mask);
13752	if (!Sel)
13753	return SDValue ();
13754
13755	// Select 0xc for all zero bytes
13756	Sel = (LHS.getConstantOperandVal(i: `2`) & Sel) \| (~Sel & `0x0c0c0c0c`);
13757	SDLoc DL(N);
13758	return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: `0`),
13759	N2: LHS.getOperand(i: `1`), N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
13760	}
13761	}
13762
13763	// (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
13764	// fp_class x, ~(s_nan \| q_nan \| n_infinity \| p_infinity)
13765	if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
13766	ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: `2`))->get();
13767	ISD::CondCode RCC = cast<CondCodeSDNode>(Val: RHS.getOperand(i: `2`))->get();
13768
13769	SDValue X = LHS.getOperand(i: `0`);
13770	SDValue Y = RHS.getOperand(i: `0`);
13771	if (Y.getOpcode() != ISD::FABS \|\| Y.getOperand(i: `0`) != X \|\|
13772	!isTypeLegal(VT: X.getValueType()))
13773	return SDValue ();
13774
13775	if (LCC == ISD::SETO) {
13776	if (X != LHS.getOperand(i: `1`))
13777	return SDValue ();
13778
13779	if (RCC == ISD::SETUNE) {
13780	const ConstantFPSDNode *C1 =
13781	dyn_cast<ConstantFPSDNode>(Val: RHS.getOperand(i: `1`));
13782	if (!C1 \|\| !C1->isInfinity() \|\| C1->isNegative())
13783	return SDValue ();
13784
13785	const uint32_t Mask = SIInstrFlags::N_NORMAL \|
13786	SIInstrFlags::N_SUBNORMAL \| SIInstrFlags::N_ZERO \|
13787	SIInstrFlags::P_ZERO \| SIInstrFlags::P_SUBNORMAL \|
13788	SIInstrFlags::P_NORMAL;
13789
13790	static_assert(
13791	((~(SIInstrFlags::S_NAN \| SIInstrFlags::Q_NAN \|
13792	SIInstrFlags::N_INFINITY \| SIInstrFlags::P_INFINITY)) &
13793	`0x3ff`) == Mask,
13794	"mask not equal");
13795
13796	SDLoc DL(N);
13797	return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: X,
13798	N2: DAG.getConstant(Val: Mask, DL, VT: MVT::i32));
13799	}
13800	}
13801	}
13802
13803	if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13804	std::swap(a&: LHS, b&: RHS);
13805
13806	if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13807	RHS.hasOneUse()) {
13808	ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: `2`))->get();
13809	// and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan \|
13810	// n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
13811	// \| n_nan)
13812	const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: `1`));
13813	if ((LCC == ISD::SETO \|\| LCC == ISD::SETUO) && Mask &&
13814	(RHS.getOperand(i: `0`) == LHS.getOperand(i: `0`) &&
13815	LHS.getOperand(i: `0`) == LHS.getOperand(i: `1`))) {
13816	const unsigned OrdMask = SIInstrFlags::S_NAN \| SIInstrFlags::Q_NAN;
13817	unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13818	: Mask->getZExtValue() & OrdMask;
13819
13820	SDLoc DL(N);
13821	return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: RHS.getOperand(i: `0`),
13822	N2: DAG.getConstant(Val: NewMask, DL, VT: MVT::i32));
13823	}
13824	}
13825
13826	if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND \|\|
13827	LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13828	// and x, (sext cc from i1) => select cc, x, 0
13829	if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13830	std::swap(a&: LHS, b&: RHS);
13831	if (isBoolSGPR(V: RHS.getOperand(i: `0`)))
13832	return DAG.getSelect(DL: SDLoc (N), VT: MVT::i32, Cond: RHS.getOperand(i: `0`), LHS,
13833	RHS: DAG.getConstant(Val: `0`, DL: SDLoc (N), VT: MVT::i32));
13834	}
13835
13836	// and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13837	const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13838	if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13839	N->isDivergent() && TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -`1`) {
13840	uint32_t LHSMask = getPermuteMask(V: LHS);
13841	uint32_t RHSMask = getPermuteMask(V: RHS);
13842	if (LHSMask != ~`0u` && RHSMask != ~`0u`) {
13843	// Canonicalize the expression in an attempt to have fewer unique masks
13844	// and therefore fewer registers used to hold the masks.
13845	if (LHSMask > RHSMask) {
13846	std::swap(a&: LHSMask, b&: RHSMask);
13847	std::swap(a&: LHS, b&: RHS);
13848	}
13849
13850	// Select 0xc for each lane used from source operand. Zero has 0xc mask
13851	// set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
13852	uint32_t LHSUsedLanes = ~(LHSMask & `0x0c0c0c0c`) & `0x0c0c0c0c`;
13853	uint32_t RHSUsedLanes = ~(RHSMask & `0x0c0c0c0c`) & `0x0c0c0c0c`;
13854
13855	// Check of we need to combine values from two sources within a byte.
13856	if (!(LHSUsedLanes & RHSUsedLanes) &&
13857	// If we select high and lower word keep it for SDWA.
13858	// TODO: teach SDWA to work with v_perm_b32 and remove the check.
13859	!(LHSUsedLanes == `0x0c0c0000` && RHSUsedLanes == `0x00000c0c`)) {
13860	// Each byte in each mask is either selector mask 0-3, or has higher
13861	// bits set in either of masks, which can be 0xff for 0xff or 0x0c for
13862	// zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
13863	// mask which is not 0xff wins. By anding both masks we have a correct
13864	// result except that 0x0c shall be corrected to give 0x0c only.
13865	uint32_t Mask = LHSMask & RHSMask;
13866	for (unsigned I = `0`; I < `32`; I += `8`) {
13867	uint32_t ByteSel = `0xff` << I;
13868	if ((LHSMask & ByteSel) == `0x0c` \|\| (RHSMask & ByteSel) == `0x0c`)
13869	Mask &= (`0x0c` << I) & `0xffffffff`;
13870	}
13871
13872	// Add 4 to each active LHS lane. It will not affect any existing 0xff
13873	// or 0x0c.
13874	uint32_t Sel = Mask \| (LHSUsedLanes & `0x04040404`);
13875	SDLoc DL(N);
13876
13877	return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: `0`),
13878	N2: RHS.getOperand(i: `0`),
13879	N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
13880	}
13881	}
13882	}
13883
13884	return SDValue ();
13885	}
13886
13887	// A key component of v_perm is a mapping between byte position of the src
13888	// operands, and the byte position of the dest. To provide such, we need: 1. the
13889	// node that provides x byte of the dest of the OR, and 2. the byte of the node
13890	// used to provide that x byte. calculateByteProvider finds which node provides
13891	// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13892	// and finds an ultimate src and byte position For example: The supported
13893	// LoadCombine pattern for vector loads is as follows
//                                t1
//                                or
//                      /                  \
//                      t2                 t3
//                     zext                shl
//                      |                   |     \
//                     t4                  t5     16
//                     or                 anyext
//                 /        \               |
//                t6        t7             t8
//               srl        shl             or
//            /    |      /     \         /     \
//           t9   t10    t11   t12      t13    t14
//         trunc*  8    trunc*  8       and     and
//           |            |          /    |     |    \
//          t15          t16        t17  t18   t19   t20
//                                trunc*  255   srl   -256
//                                   |         /   \
//                                  t15       t15  16
//
// *In this example, the truncs are from i32->i16
13915	//
13916	// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13917	// respectively. calculateSrcByte would find (given node) -> ultimate src &
13918	// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13919	// After finding the mapping, we can combine the tree into vperm t15, t16,
13920	// 0x05000407
13921
13922	// Find the source and byte position from a node.
13923	// \p DestByte is the byte position of the dest of the or that the src
13924	// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13925	// dest of the or byte. \p Depth tracks how many recursive iterations we have
13926	// performed.
13927	static const std::optional<ByteProvider<SDValue>>
13928	calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = `0`,
13929	unsigned Depth = `0`) {
13930	// We may need to recursively traverse a series of SRLs
13931	if (Depth >= `6`)
13932	return std::nullopt;
13933
13934	if (Op.getValueSizeInBits() < `8`)
13935	return std::nullopt;
13936
13937	if (Op.getValueType().isVector())
13938	return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex);
13939
13940	switch (Op ->getOpcode()) {
13941	case ISD::TRUNCATE: {
13942	return calculateSrcByte(Op: Op ->getOperand(Num: `0`), DestByte, SrcIndex, Depth: Depth + `1`);
13943	}
13944
13945	case ISD::ANY_EXTEND:
13946	case ISD::SIGN_EXTEND:
13947	case ISD::ZERO_EXTEND:
13948	case ISD::SIGN_EXTEND_INREG: {
13949	SDValue NarrowOp = Op ->getOperand(Num: `0`);
13950	auto NarrowVT = NarrowOp.getValueType();
13951	if (Op ->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13952	auto *VTSign = cast<VTSDNode>(Val: Op ->getOperand(Num: `1`));
13953	NarrowVT = VTSign->getVT();
13954	}
13955	if (!NarrowVT.isByteSized())
13956	return std::nullopt;
13957	uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13958
13959	if (SrcIndex >= NarrowByteWidth)
13960	return std::nullopt;
13961	return calculateSrcByte(Op: Op ->getOperand(Num: `0`), DestByte, SrcIndex, Depth: Depth + `1`);
13962	}
13963
13964	case ISD::SRA:
13965	case ISD::SRL: {
13966	auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op ->getOperand(Num: `1`));
13967	if (!ShiftOp)
13968	return std::nullopt;
13969
13970	uint64_t BitShift = ShiftOp->getZExtValue();
13971
13972	if (BitShift % `8` != `0`)
13973	return std::nullopt;
13974
13975	SrcIndex += BitShift / `8`;
13976
13977	return calculateSrcByte(Op: Op ->getOperand(Num: `0`), DestByte, SrcIndex, Depth: Depth + `1`);
13978	}
13979
13980	default: {
13981	return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex);
13982	}
13983	}
13984	llvm_unreachable("fully handled switch");
13985	}
13986
13987	// For a byte position in the result of an Or, traverse the tree and find the
13988	// node (and the byte of the node) which ultimately provides this {Or,
13989	// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13990	// the byte position of the Op that corresponds with the originally requested
13991	// byte of the Or \p Depth tracks how many recursive iterations we have
13992	// performed. \p StartingIndex is the originally requested byte of the Or
13993	static const std::optional<ByteProvider<SDValue>>
13994	calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13995	unsigned StartingIndex = `0`) {
13996	// Finding Src tree of RHS of or typically requires at least 1 additional
13997	// depth
13998	if (Depth > `6`)
13999	return std::nullopt;
14000
14001	unsigned BitWidth = Op.getScalarValueSizeInBits();
14002	if (BitWidth % `8` != `0`)
14003	return std::nullopt;
14004	if (Index > BitWidth / `8` - `1`)
14005	return std::nullopt;
14006
14007	bool IsVec = Op.getValueType().isVector();
14008	switch (Op.getOpcode()) {
14009	case ISD::OR: {
14010	if (IsVec)
14011	return std::nullopt;
14012
14013	auto RHS = calculateByteProvider(Op: Op.getOperand(i: `1`), Index, Depth: Depth + `1`,
14014	StartingIndex);
14015	if (!RHS)
14016	return std::nullopt;
14017	auto LHS = calculateByteProvider(Op: Op.getOperand(i: `0`), Index, Depth: Depth + `1`,
14018	StartingIndex);
14019	if (!LHS)
14020	return std::nullopt;
14021	// A well formed Or will have two ByteProviders for each byte, one of which
14022	// is constant zero
14023	if (!LHS ->isConstantZero() && !RHS ->isConstantZero())
14024	return std::nullopt;
14025	if (!LHS \|\| LHS ->isConstantZero())
14026	return RHS;
14027	if (!RHS \|\| RHS ->isConstantZero())
14028	return LHS;
14029	return std::nullopt;
14030	}
14031
14032	case ISD::AND: {
14033	if (IsVec)
14034	return std::nullopt;
14035
14036	auto *BitMaskOp = dyn_cast<ConstantSDNode>(Val: Op ->getOperand(Num: `1`));
14037	if (!BitMaskOp)
14038	return std::nullopt;
14039
14040	uint32_t BitMask = BitMaskOp->getZExtValue();
14041	// Bits we expect for our StartingIndex
14042	uint32_t IndexMask = `0xFF` << (Index * `8`);
14043
14044	if ((IndexMask & BitMask) != IndexMask) {
14045	// If the result of the and partially provides the byte, then it
14046	// is not well formatted
14047	if (IndexMask & BitMask)
14048	return std::nullopt;
14049	return ByteProvider<SDValue>::getConstantZero();
14050	}
14051
14052	return calculateSrcByte(Op: Op ->getOperand(Num: `0`), DestByte: StartingIndex, SrcIndex: Index);
14053	}
14054
14055	case ISD::FSHR: {
14056	if (IsVec)
14057	return std::nullopt;
14058
14059	// fshr(X,Y,Z): (X << (BW - (Z % BW))) \| (Y >> (Z % BW))
14060	auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op ->getOperand(Num: `2`));
14061	if (!ShiftOp \|\| Op.getValueType().isVector())
14062	return std::nullopt;
14063
14064	uint64_t BitsProvided = Op.getValueSizeInBits();
14065	if (BitsProvided % `8` != `0`)
14066	return std::nullopt;
14067
14068	uint64_t BitShift = ShiftOp->getAPIntValue().urem(RHS: BitsProvided);
14069	if (BitShift % `8`)
14070	return std::nullopt;
14071
14072	uint64_t ConcatSizeInBytes = BitsProvided / `4`;
14073	uint64_t ByteShift = BitShift / `8`;
14074
14075	uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
14076	uint64_t BytesProvided = BitsProvided / `8`;
14077	SDValue NextOp = Op.getOperand(i: NewIndex >= BytesProvided ? `0` : `1`);
14078	NewIndex %= BytesProvided;
14079	return calculateByteProvider(Op: NextOp, Index: NewIndex, Depth: Depth + `1`, StartingIndex);
14080	}
14081
14082	case ISD::SRA:
14083	case ISD::SRL: {
14084	if (IsVec)
14085	return std::nullopt;
14086
14087	auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op ->getOperand(Num: `1`));
14088	if (!ShiftOp)
14089	return std::nullopt;
14090
14091	uint64_t BitShift = ShiftOp->getZExtValue();
14092	if (BitShift % `8`)
14093	return std::nullopt;
14094
14095	auto BitsProvided = Op.getScalarValueSizeInBits();
14096	if (BitsProvided % `8` != `0`)
14097	return std::nullopt;
14098
14099	uint64_t BytesProvided = BitsProvided / `8`;
14100	uint64_t ByteShift = BitShift / `8`;
14101	// The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
14102	// If the byte we are trying to provide (as tracked by index) falls in this
14103	// range, then the SRL provides the byte. The byte of interest of the src of
14104	// the SRL is Index + ByteShift
14105	return BytesProvided - ByteShift > Index
14106	? calculateSrcByte(Op: Op ->getOperand(Num: `0`), DestByte: StartingIndex,
14107	SrcIndex: Index + ByteShift)
14108	: ByteProvider<SDValue>::getConstantZero();
14109	}
14110
14111	case ISD::SHL: {
14112	if (IsVec)
14113	return std::nullopt;
14114
14115	auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op ->getOperand(Num: `1`));
14116	if (!ShiftOp)
14117	return std::nullopt;
14118
14119	uint64_t BitShift = ShiftOp->getZExtValue();
14120	if (BitShift % `8` != `0`)
14121	return std::nullopt;
14122	uint64_t ByteShift = BitShift / `8`;
14123
14124	// If we are shifting by an amount greater than (or equal to)
14125	// the index we are trying to provide, then it provides 0s. If not,
14126	// then this bytes are not definitively 0s, and the corresponding byte
14127	// of interest is Index - ByteShift of the src
14128	return Index < ByteShift
14129	? ByteProvider<SDValue>::getConstantZero()
14130	: calculateByteProvider(Op: Op.getOperand(i: `0`), Index: Index - ByteShift,
14131	Depth: Depth + `1`, StartingIndex);
14132	}
14133	case ISD::ANY_EXTEND:
14134	case ISD::SIGN_EXTEND:
14135	case ISD::ZERO_EXTEND:
14136	case ISD::SIGN_EXTEND_INREG:
14137	case ISD::AssertZext:
14138	case ISD::AssertSext: {
14139	if (IsVec)
14140	return std::nullopt;
14141
14142	SDValue NarrowOp = Op ->getOperand(Num: `0`);
14143	unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
14144	if (Op ->getOpcode() == ISD::SIGN_EXTEND_INREG \|\|
14145	Op ->getOpcode() == ISD::AssertZext \|\|
14146	Op ->getOpcode() == ISD::AssertSext) {
14147	auto *VTSign = cast<VTSDNode>(Val: Op ->getOperand(Num: `1`));
14148	NarrowBitWidth = VTSign->getVT().getSizeInBits();
14149	}
14150	if (NarrowBitWidth % `8` != `0`)
14151	return std::nullopt;
14152	uint64_t NarrowByteWidth = NarrowBitWidth / `8`;
14153
14154	if (Index >= NarrowByteWidth)
14155	return Op.getOpcode() == ISD::ZERO_EXTEND
14156	? std::optional<ByteProvider<SDValue>>(
14157	ByteProvider<SDValue>::getConstantZero())
14158	: std::nullopt;
14159	return calculateByteProvider(Op: NarrowOp, Index, Depth: Depth + `1`, StartingIndex);
14160	}
14161
14162	case ISD::TRUNCATE: {
14163	if (IsVec)
14164	return std::nullopt;
14165
14166	uint64_t NarrowByteWidth = BitWidth / `8`;
14167
14168	if (NarrowByteWidth >= Index) {
14169	return calculateByteProvider(Op: Op.getOperand(i: `0`), Index, Depth: Depth + `1`,
14170	StartingIndex);
14171	}
14172
14173	return std::nullopt;
14174	}
14175
14176	case ISD::CopyFromReg: {
14177	if (BitWidth / `8` > Index)
14178	return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index);
14179
14180	return std::nullopt;
14181	}
14182
14183	case ISD::LOAD: {
14184	auto *L = cast<LoadSDNode>(Val: Op.getNode());
14185
14186	unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
14187	if (NarrowBitWidth % `8` != `0`)
14188	return std::nullopt;
14189	uint64_t NarrowByteWidth = NarrowBitWidth / `8`;
14190
14191	// If the width of the load does not reach byte we are trying to provide for
14192	// and it is not a ZEXTLOAD, then the load does not provide for the byte in
14193	// question
14194	if (Index >= NarrowByteWidth) {
14195	return L->getExtensionType() == ISD::ZEXTLOAD
14196	? std::optional<ByteProvider<SDValue>>(
14197	ByteProvider<SDValue>::getConstantZero())
14198	: std::nullopt;
14199	}
14200
14201	if (NarrowByteWidth > Index) {
14202	return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index);
14203	}
14204
14205	return std::nullopt;
14206	}
14207
14208	case ISD::BSWAP: {
14209	if (IsVec)
14210	return std::nullopt;
14211
14212	return calculateByteProvider(Op: Op ->getOperand(Num: `0`), Index: BitWidth / `8` - Index - `1`,
14213	Depth: Depth + `1`, StartingIndex);
14214	}
14215
14216	case ISD::EXTRACT_VECTOR_ELT: {
14217	auto *IdxOp = dyn_cast<ConstantSDNode>(Val: Op ->getOperand(Num: `1`));
14218	if (!IdxOp)
14219	return std::nullopt;
14220	auto VecIdx = IdxOp->getZExtValue();
14221	auto ScalarSize = Op.getScalarValueSizeInBits();
14222	if (ScalarSize < `32`)
14223	Index = ScalarSize == `8` ? VecIdx : VecIdx * `2` + Index;
14224	return calculateSrcByte(Op: ScalarSize >= `32` ? Op : Op.getOperand(i: `0`),
14225	DestByte: StartingIndex, SrcIndex: Index);
14226	}
14227
14228	case AMDGPUISD::PERM: {
14229	if (IsVec)
14230	return std::nullopt;
14231
14232	auto *PermMask = dyn_cast<ConstantSDNode>(Val: Op ->getOperand(Num: `2`));
14233	if (!PermMask)
14234	return std::nullopt;
14235
14236	auto IdxMask =
14237	(PermMask->getZExtValue() & (`0xFF` << (Index * `8`))) >> (Index * `8`);
14238	if (IdxMask > `0x07` && IdxMask != `0x0c`)
14239	return std::nullopt;
14240
14241	auto NextOp = Op.getOperand(i: IdxMask > `0x03` ? `0` : `1`);
14242	auto NextIndex = IdxMask > `0x03` ? IdxMask % `4` : IdxMask;
14243
14244	return IdxMask != `0x0c` ? calculateSrcByte(Op: NextOp, DestByte: StartingIndex, SrcIndex: NextIndex)
14245	: ByteProvider<SDValue>(
14246	ByteProvider<SDValue>::getConstantZero());
14247	}
14248
14249	default: {
14250	return std::nullopt;
14251	}
14252	}
14253
14254	llvm_unreachable("fully handled switch");
14255	}
14256
14257	// Returns true if the Operand is a scalar and is 16 bits
14258	static bool isExtendedFrom16Bits(SDValue &Operand) {
14259
14260	switch (Operand.getOpcode()) {
14261	case ISD::ANY_EXTEND:
14262	case ISD::SIGN_EXTEND:
14263	case ISD::ZERO_EXTEND: {
14264	auto OpVT = Operand.getOperand(i: `0`).getValueType();
14265	return !OpVT.isVector() && OpVT.getSizeInBits() == `16`;
14266	}
14267	case ISD::LOAD: {
14268	LoadSDNode *L = cast<LoadSDNode>(Val: Operand.getNode());
14269	auto ExtType = cast<LoadSDNode>(Val: L)->getExtensionType();
14270	if (ExtType == ISD::ZEXTLOAD \|\| ExtType == ISD::SEXTLOAD \|\|
14271	ExtType == ISD::EXTLOAD) {
14272	auto MemVT = L->getMemoryVT();
14273	return !MemVT.isVector() && MemVT.getSizeInBits() == `16`;
14274	}
14275	return L->getMemoryVT().getSizeInBits() == `16`;
14276	}
14277	default:
14278	return false;
14279	}
14280	}
14281
/// Returns true if the two byte selectors packed in the low 16 bits of
/// \p Mask pick two consecutive bytes (in increasing order) and the first of
/// them sits at an even byte offset.
///
/// Both selectors must be smaller than 8 (asserted).
static bool addresses16Bits(int Mask) {
  const int FirstSel = Mask & 0xff;
  const int SecondSel = (Mask & 0xff00) >> 8;

  assert(FirstSel < 8 && SecondSel < 8);

  // The bytes have to be contiguous in the order of increasing addresses.
  if (SecondSel - FirstSel != 1)
    return false;

  // The first byte must be at a location that is aligned for 16 bit
  // instructions. A counter example is taking 2 consecutive bytes starting at
  // the 8th bit: we would still need code to extract the 16 bit operand, so
  // in that case it is better to use i8 v_perm.
  return FirstSel % 2 == 0;
}
14299
14300	// Do not lower into v_perm if the operands are actually 16 bit
14301	// and the selected bits (based on PermMask) correspond with two
14302	// easily addressable 16 bit operands.
14303	static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
14304	SDValue &OtherOp) {
14305	int Low16 = PermMask & `0xffff`;
14306	int Hi16 = (PermMask & `0xffff0000`) >> `16`;
14307
14308	auto TempOp = peekThroughBitcasts(V: Op);
14309	auto TempOtherOp = peekThroughBitcasts(V: OtherOp);
14310
14311	auto OpIs16Bit =
14312	TempOtherOp.getValueSizeInBits() == `16` \|\| isExtendedFrom16Bits(Operand&: TempOp);
14313	if (!OpIs16Bit)
14314	return true;
14315
14316	auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == `16` \|\|
14317	isExtendedFrom16Bits(Operand&: TempOtherOp);
14318	if (!OtherOpIs16Bit)
14319	return true;
14320
14321	// Do we cleanly address both
14322	return !addresses16Bits(Mask: Low16) \|\| !addresses16Bits(Mask: Hi16);
14323	}
14324
14325	static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
14326	unsigned DWordOffset) {
14327	SDValue Ret;
14328
14329	auto TypeSize = Src.getValueSizeInBits().getFixedValue();
14330	// ByteProvider must be at least 8 bits
14331	assert(Src.getValueSizeInBits().isKnownMultipleOf(`8`));
14332
14333	if (TypeSize <= `32`)
14334	return DAG.getBitcastedAnyExtOrTrunc(Op: Src, DL: SL, VT: MVT::i32);
14335
14336	if (Src.getValueType().isVector()) {
14337	auto ScalarTySize = Src.getScalarValueSizeInBits();
14338	auto ScalarTy = Src.getValueType().getScalarType();
14339	if (ScalarTySize == `32`) {
14340	return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Src,
14341	N2: DAG.getConstant(Val: DWordOffset, DL: SL, VT: MVT::i32));
14342	}
14343	if (ScalarTySize > `32`) {
14344	Ret = DAG.getNode(
14345	Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ScalarTy, N1: Src,
14346	N2: DAG.getConstant(Val: DWordOffset / (ScalarTySize / `32`), DL: SL, VT: MVT::i32));
14347	auto ShiftVal = `32` * (DWordOffset % (ScalarTySize / `32`));
14348	if (ShiftVal)
14349	Ret = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: Ret.getValueType(), N1: Ret,
14350	N2: DAG.getConstant(Val: ShiftVal, DL: SL, VT: MVT::i32));
14351	return DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
14352	}
14353
14354	assert(ScalarTySize < `32`);
14355	auto NumElements = TypeSize / ScalarTySize;
14356	auto Trunc32Elements = (ScalarTySize * NumElements) / `32`;
14357	auto NormalizedTrunc = Trunc32Elements * `32` / ScalarTySize;
14358	auto NumElementsIn32 = `32` / ScalarTySize;
14359	auto NumAvailElements = DWordOffset < Trunc32Elements
14360	? NumElementsIn32
14361	: NumElements - NormalizedTrunc;
14362
14363	SmallVector<SDValue, `4`> VecSrcs;
14364	DAG.ExtractVectorElements(Op: Src, Args&: VecSrcs, Start: DWordOffset * NumElementsIn32,
14365	Count: NumAvailElements);
14366
14367	Ret = DAG.getBuildVector(
14368	VT: MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: ScalarTySize), NumElements: NumAvailElements), DL: SL,
14369	Ops: VecSrcs);
14370	return Ret = DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
14371	}
14372
14373	/// Scalar Type
14374	auto ShiftVal = `32` * DWordOffset;
14375	Ret = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: Src.getValueType(), N1: Src,
14376	N2: DAG.getConstant(Val: ShiftVal, DL: SL, VT: MVT::i32));
14377	return DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
14378	}
14379
14380	static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
14381	SelectionDAG &DAG = DCI.DAG;
14382	[[maybe_unused]] EVT VT = N->getValueType(ResNo: `0`);
14383	SmallVector<ByteProvider<SDValue>, `8`> PermNodes;
14384
14385	// VT is known to be MVT::i32, so we need to provide 4 bytes.
14386	assert(VT == MVT::i32);
14387	for (int i = `0`; i < `4`; i++) {
14388	// Find the ByteProvider that provides the ith byte of the result of OR
14389	std::optional<ByteProvider<SDValue>> P =
14390	calculateByteProvider(Op: SDValue (N, `0`), Index: i, Depth: `0`, /StartingIndex = / i);
14391	// TODO support constantZero
14392	if (!P \|\| P ->isConstantZero())
14393	return SDValue ();
14394
14395	PermNodes.push_back(Elt: *P);
14396	}
14397	if (PermNodes.size() != `4`)
14398	return SDValue ();
14399
14400	std::pair<unsigned, unsigned> FirstSrc(`0`, PermNodes [`0`].SrcOffset / `4`);
14401	std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14402	uint64_t PermMask = `0x00000000`;
14403	for (size_t i = `0`; i < PermNodes.size(); i++) {
14404	auto PermOp = PermNodes [i];
14405	// Since the mask is applied to Src1:Src2, Src1 bytes must be offset
14406	// by sizeof(Src2) = 4
14407	int SrcByteAdjust = `4`;
14408
14409	// If the Src uses a byte from a different DWORD, then it corresponds
14410	// with a difference source
14411	if (!PermOp.hasSameSrc(Other: PermNodes [FirstSrc.first]) \|\|
14412	((PermOp.SrcOffset / `4`) != FirstSrc.second)) {
14413	if (SecondSrc)
14414	if (!PermOp.hasSameSrc(Other: PermNodes [SecondSrc ->first]) \|\|
14415	((PermOp.SrcOffset / `4`) != SecondSrc ->second))
14416	return SDValue ();
14417
14418	// Set the index of the second distinct Src node
14419	SecondSrc = {i, PermNodes [i].SrcOffset / `4`};
14420	assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % `8`));
14421	SrcByteAdjust = `0`;
14422	}
14423	assert((PermOp.SrcOffset % `4`) + SrcByteAdjust < `8`);
14424	assert(!DAG.getDataLayout().isBigEndian());
14425	PermMask \|= ((PermOp.SrcOffset % `4`) + SrcByteAdjust) << (i * `8`);
14426	}
14427	SDLoc DL(N);
14428	SDValue Op = *PermNodes [FirstSrc.first].Src;
14429	Op = getDWordFromOffset(DAG, SL: DL, Src: Op, DWordOffset: FirstSrc.second);
14430	assert(Op.getValueSizeInBits() == `32`);
14431
14432	// Check that we are not just extracting the bytes in order from an op
14433	if (!SecondSrc) {
14434	int Low16 = PermMask & `0xffff`;
14435	int Hi16 = (PermMask & `0xffff0000`) >> `16`;
14436
14437	bool WellFormedLow = (Low16 == `0x0504`) \|\| (Low16 == `0x0100`);
14438	bool WellFormedHi = (Hi16 == `0x0706`) \|\| (Hi16 == `0x0302`);
14439
14440	// The perm op would really just produce Op. So combine into Op
14441	if (WellFormedLow && WellFormedHi)
14442	return DAG.getBitcast(VT: MVT::getIntegerVT(BitWidth: `32`), V: Op);
14443	}
14444
14445	SDValue OtherOp = SecondSrc ? *PermNodes [SecondSrc ->first].Src : Op;
14446
14447	if (SecondSrc) {
14448	OtherOp = getDWordFromOffset(DAG, SL: DL, Src: OtherOp, DWordOffset: SecondSrc ->second);
14449	assert(OtherOp.getValueSizeInBits() == `32`);
14450	}
14451
14452	// Check that we haven't just recreated the same FSHR node.
14453	if (N->getOpcode() == ISD::FSHR &&
14454	(N->getOperand(Num: `0`) == Op \|\| N->getOperand(Num: `0`) == OtherOp) &&
14455	(N->getOperand(Num: `1`) == Op \|\| N->getOperand(Num: `1`) == OtherOp))
14456	return SDValue ();
14457
14458	if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
14459
14460	assert(Op.getValueType().isByteSized() &&
14461	OtherOp.getValueType().isByteSized());
14462
14463	// If the ultimate src is less than 32 bits, then we will only be
14464	// using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
14465	// CalculateByteProvider would not have returned Op as source if we
14466	// used a byte that is outside its ValueType. Thus, we are free to
14467	// ANY_EXTEND as the extended bits are dont-cares.
14468	Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, VT: MVT::i32);
14469	OtherOp = DAG.getBitcastedAnyExtOrTrunc(Op: OtherOp, DL, VT: MVT::i32);
14470
14471	return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: Op, N2: OtherOp,
14472	N3: DAG.getConstant(Val: PermMask, DL, VT: MVT::i32));
14473	}
14474	return SDValue ();
14475	}
14476
14477	SDValue SITargetLowering::performOrCombine(SDNode *N,
14478	DAGCombinerInfo &DCI) const {
14479	SelectionDAG &DAG = DCI.DAG;
14480	SDValue LHS = N->getOperand(Num: `0`);
14481	SDValue RHS = N->getOperand(Num: `1`);
14482
14483	EVT VT = N->getValueType(ResNo: `0`);
14484	if (VT == MVT::i1) {
14485	// or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 \| c2)
14486	if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14487	RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14488	SDValue Src = LHS.getOperand(i: `0`);
14489	if (Src != RHS.getOperand(i: `0`))
14490	return SDValue ();
14491
14492	const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: `1`));
14493	const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: `1`));
14494	if (!CLHS \|\| !CRHS)
14495	return SDValue ();
14496
14497	// Only 10 bits are used.
14498	static const uint32_t MaxMask = `0x3ff`;
14499
14500	uint32_t NewMask =
14501	(CLHS->getZExtValue() \| CRHS->getZExtValue()) & MaxMask;
14502	SDLoc DL(N);
14503	return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: Src,
14504	N2: DAG.getConstant(Val: NewMask, DL, VT: MVT::i32));
14505	}
14506
14507	return SDValue ();
14508	}
14509
14510	// or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
14511	if (isa<ConstantSDNode>(Val: RHS) && LHS.hasOneUse() &&
14512	LHS.getOpcode() == AMDGPUISD::PERM &&
14513	isa<ConstantSDNode>(Val: LHS.getOperand(i: `2`))) {
14514	uint32_t Sel = getConstantPermuteMask(C: N->getConstantOperandVal(Num: `1`));
14515	if (!Sel)
14516	return SDValue ();
14517
14518	Sel \|= LHS.getConstantOperandVal(i: `2`);
14519	SDLoc DL(N);
14520	return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: `0`),
14521	N2: LHS.getOperand(i: `1`), N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
14522	}
14523
14524	// or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
14525	const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14526	if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
14527	N->isDivergent() && TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -`1`) {
14528
14529	// If all the uses of an or need to extract the individual elements, do not
14530	// attempt to lower into v_perm
14531	auto usesCombinedOperand = [](SDNode *OrUse) {
14532	// If we have any non-vectorized use, then it is a candidate for v_perm
14533	if (OrUse->getOpcode() != ISD::BITCAST \|\|
14534	!OrUse->getValueType(ResNo: `0`).isVector())
14535	return true;
14536
14537	// If we have any non-vectorized use, then it is a candidate for v_perm
14538	for (auto *VUser : OrUse->users()) {
14539	if (!VUser->getValueType(ResNo: `0`).isVector())
14540	return true;
14541
14542	// If the use of a vector is a store, then combining via a v_perm
14543	// is beneficial.
14544	// TODO -- whitelist more uses
14545	for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
14546	if (VUser->getOpcode() == VectorwiseOp)
14547	return true;
14548	}
14549	return false;
14550	};
14551
14552	if (!any_of(Range: N->users(), P: usesCombinedOperand))
14553	return SDValue ();
14554
14555	uint32_t LHSMask = getPermuteMask(V: LHS);
14556	uint32_t RHSMask = getPermuteMask(V: RHS);
14557
14558	if (LHSMask != ~`0u` && RHSMask != ~`0u`) {
14559	// Canonicalize the expression in an attempt to have fewer unique masks
14560	// and therefore fewer registers used to hold the masks.
14561	if (LHSMask > RHSMask) {
14562	std::swap(a&: LHSMask, b&: RHSMask);
14563	std::swap(a&: LHS, b&: RHS);
14564	}
14565
14566	// Select 0xc for each lane used from source operand. Zero has 0xc mask
14567	// set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
14568	uint32_t LHSUsedLanes = ~(LHSMask & `0x0c0c0c0c`) & `0x0c0c0c0c`;
14569	uint32_t RHSUsedLanes = ~(RHSMask & `0x0c0c0c0c`) & `0x0c0c0c0c`;
14570
14571	// Check of we need to combine values from two sources within a byte.
14572	if (!(LHSUsedLanes & RHSUsedLanes) &&
14573	// If we select high and lower word keep it for SDWA.
14574	// TODO: teach SDWA to work with v_perm_b32 and remove the check.
14575	!(LHSUsedLanes == `0x0c0c0000` && RHSUsedLanes == `0x00000c0c`)) {
14576	// Kill zero bytes selected by other mask. Zero value is 0xc.
14577	LHSMask &= ~RHSUsedLanes;
14578	RHSMask &= ~LHSUsedLanes;
14579	// Add 4 to each active LHS lane
14580	LHSMask \|= LHSUsedLanes & `0x04040404`;
14581	// Combine masks
14582	uint32_t Sel = LHSMask \| RHSMask;
14583	SDLoc DL(N);
14584
14585	return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: `0`),
14586	N2: RHS.getOperand(i: `0`),
14587	N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
14588	}
14589	}
14590	if (LHSMask == ~`0u` \|\| RHSMask == ~`0u`) {
14591	if (SDValue Perm = matchPERM(N, DCI))
14592	return Perm;
14593	}
14594	}
14595
14596	// Detect identity v2i32 OR and replace with identity source node.
14597	// Specifically an Or that has operands constructed from the same source node
14598	// via extract_vector_elt and build_vector. I.E.
14599	// v2i32 or(
14600	// v2i32 build_vector(
14601	// i32 extract_elt(%IdentitySrc, 0),
14602	// i32 0
14603	// ),
14604	// v2i32 build_vector(
14605	// i32 0,
14606	// i32 extract_elt(%IdentitySrc, 1)
14607	// ) )
14608	// =>
14609	// v2i32 %IdentitySrc
14610
14611	if (VT == MVT::v2i32 && LHS ->getOpcode() == ISD::BUILD_VECTOR &&
14612	RHS ->getOpcode() == ISD::BUILD_VECTOR) {
14613
14614	ConstantSDNode *LC = dyn_cast<ConstantSDNode>(Val: LHS ->getOperand(Num: `1`));
14615	ConstantSDNode *RC = dyn_cast<ConstantSDNode>(Val: RHS ->getOperand(Num: `0`));
14616
14617	// Test for and normalise build vectors.
14618	if (LC && RC && LC->getZExtValue() == `0` && RC->getZExtValue() == `0`) {
14619
14620	// Get the extract_vector_element operands.
14621	SDValue LEVE = LHS ->getOperand(Num: `0`);
14622	SDValue REVE = RHS ->getOperand(Num: `1`);
14623
14624	if (LEVE ->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14625	REVE ->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
14626	// Check that different elements from the same vector are
14627	// extracted.
14628	if (LEVE ->getOperand(Num: `0`) == REVE ->getOperand(Num: `0`) &&
14629	LEVE ->getOperand(Num: `1`) != REVE ->getOperand(Num: `1`)) {
14630	SDValue IdentitySrc = LEVE.getOperand(i: `0`);
14631	return IdentitySrc;
14632	}
14633	}
14634	}
14635	}
14636
14637	if (VT != MVT::i64 \|\| DCI.isBeforeLegalizeOps())
14638	return SDValue ();
14639
14640	// TODO: This could be a generic combine with a predicate for extracting the
14641	// high half of an integer being free.
14642
14643	// (or i64:x, (zero_extend i32:y)) ->
14644	// i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14645	if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14646	RHS.getOpcode() != ISD::ZERO_EXTEND)
14647	std::swap(a&: LHS, b&: RHS);
14648
14649	if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14650	SDValue ExtSrc = RHS.getOperand(i: `0`);
14651	EVT SrcVT = ExtSrc.getValueType();
14652	if (SrcVT == MVT::i32) {
14653	SDLoc SL(N);
14654	auto [LowLHS, HiBits] = split64BitValue(Op: LHS, DAG);
14655	SDValue LowOr = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: LowLHS, N2: ExtSrc);
14656
14657	DCI.AddToWorklist(N: LowOr.getNode());
14658	DCI.AddToWorklist(N: HiBits.getNode());
14659
14660	SDValue Vec =
14661	DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: LowOr, N2: HiBits);
14662	return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
14663	}
14664	}
14665
14666	const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: `1`));
14667	if (CRHS) {
14668	if (SDValue Split = splitBinaryBitConstantOp(DCI, SL: SDLoc (N), Opc: ISD::OR,
14669	LHS: N->getOperand(Num: `0`), CRHS))
14670	return Split;
14671	}
14672
14673	return SDValue ();
14674	}
14675
14676	SDValue SITargetLowering::performXorCombine(SDNode *N,
14677	DAGCombinerInfo &DCI) const {
14678	if (SDValue RV = reassociateScalarOps(N, DAG&: DCI.DAG))
14679	return RV;
14680
14681	SDValue LHS = N->getOperand(Num: `0`);
14682	SDValue RHS = N->getOperand(Num: `1`);
14683
14684	const ConstantSDNode *CRHS = isConstOrConstSplat(N: RHS);
14685	SelectionDAG &DAG = DCI.DAG;
14686
14687	EVT VT = N->getValueType(ResNo: `0`);
14688	if (CRHS && VT == MVT::i64) {
14689	if (SDValue Split =
14690	splitBinaryBitConstantOp(DCI, SL: SDLoc (N), Opc: ISD::XOR, LHS, CRHS))
14691	return Split;
14692	}
14693
14694	// v2i32 (xor (vselect cc, x, y), K) ->
14695	// (v2i32 svelect cc, (xor x, K), (xor y, K)) This enables the xor to be
14696	// replaced with source modifiers when the select is lowered to CNDMASK.
14697	unsigned Opc = LHS.getOpcode();
14698	if (((Opc == ISD::VSELECT && VT == MVT::v2i32) \|\|
14699	(Opc == ISD::SELECT && VT == MVT::i64)) &&
14700	CRHS && CRHS->getAPIntValue().isSignMask()) {
14701	SDValue CC = LHS ->getOperand(Num: `0`);
14702	SDValue TRUE = LHS ->getOperand(Num: `1`);
14703	SDValue FALSE = LHS ->getOperand(Num: `2`);
14704	SDValue XTrue = DAG.getNode(Opcode: ISD::XOR, DL: SDLoc (N), VT, N1: TRUE, N2: RHS);
14705	SDValue XFalse = DAG.getNode(Opcode: ISD::XOR, DL: SDLoc (N), VT, N1: FALSE, N2: RHS);
14706	SDValue XSelect =
14707	DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc (N), VT, N1: CC, N2: XTrue, N3: XFalse);
14708	return XSelect;
14709	}
14710
14711	// Make sure to apply the 64-bit constant splitting fold before trying to fold
14712	// fneg-like xors into 64-bit select.
14713	if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14714	// This looks like an fneg, try to fold as a source modifier.
14715	if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14716	shouldFoldFNegIntoSrc(FNeg: N, FNegSrc: LHS)) {
14717	// xor (select c, a, b), 0x80000000 ->
14718	// bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14719	SDLoc DL(N);
14720	SDValue CastLHS =
14721	DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: LHS ->getOperand(Num: `1`));
14722	SDValue CastRHS =
14723	DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: LHS ->getOperand(Num: `2`));
14724	SDValue FNegLHS = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f32, Operand: CastLHS);
14725	SDValue FNegRHS = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f32, Operand: CastRHS);
14726	SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::f32,
14727	N1: LHS ->getOperand(Num: `0`), N2: FNegLHS, N3: FNegRHS);
14728	return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewSelect);
14729	}
14730	}
14731
14732	return SDValue ();
14733	}
14734
14735	SDValue
14736	SITargetLowering::performZeroOrAnyExtendCombine(SDNode *N,
14737	DAGCombinerInfo &DCI) const {
14738	if (!Subtarget->has16BitInsts() \|\|
14739	DCI.getDAGCombineLevel() < AfterLegalizeTypes)
14740	return SDValue ();
14741
14742	EVT VT = N->getValueType(ResNo: `0`);
14743	if (VT != MVT::i32)
14744	return SDValue ();
14745
14746	SDValue Src = N->getOperand(Num: `0`);
14747	if (Src.getValueType() != MVT::i16)
14748	return SDValue ();
14749
14750	if (!Src ->hasOneUse())
14751	return SDValue ();
14752
14753	// TODO: We bail out below if SrcOffset is not in the first dword (>= 4). It's
14754	// possible we're missing out on some combine opportunities, but we'd need to
14755	// weigh the cost of extracting the byte from the upper dwords.
14756
14757	std::optional<ByteProvider<SDValue>> BP0 =
14758	calculateByteProvider(Op: SDValue (N, `0`), Index: `0`, Depth: `0`, StartingIndex: `0`);
14759	if (!BP0 \|\| BP0 ->SrcOffset >= `4` \|\| !BP0 ->Src)
14760	return SDValue ();
14761	SDValue V0 = *BP0 ->Src;
14762
14763	std::optional<ByteProvider<SDValue>> BP1 =
14764	calculateByteProvider(Op: SDValue (N, `0`), Index: `1`, Depth: `0`, StartingIndex: `1`);
14765	if (!BP1 \|\| BP1 ->SrcOffset >= `4` \|\| !BP1 ->Src)
14766	return SDValue ();
14767
14768	SDValue V1 = *BP1 ->Src;
14769
14770	if (V0 == V1)
14771	return SDValue ();
14772
14773	SelectionDAG &DAG = DCI.DAG;
14774	SDLoc DL(N);
14775	uint32_t PermMask = `0x0c0c0c0c`;
14776	if (V0) {
14777	V0 = DAG.getBitcastedAnyExtOrTrunc(Op: V0, DL, VT: MVT::i32);
14778	PermMask = (PermMask & ~`0xFF`) \| (BP0 ->SrcOffset + `4`);
14779	}
14780
14781	if (V1) {
14782	V1 = DAG.getBitcastedAnyExtOrTrunc(Op: V1, DL, VT: MVT::i32);
14783	PermMask = (PermMask & ~(`0xFF` << `8`)) \| (BP1 ->SrcOffset << `8`);
14784	}
14785
14786	return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: V0, N2: V1,
14787	N3: DAG.getConstant(Val: PermMask, DL, VT: MVT::i32));
14788	}
14789
14790	SDValue
14791	SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14792	DAGCombinerInfo &DCI) const {
14793	SDValue Src = N->getOperand(Num: `0`);
14794	auto *VTSign = cast<VTSDNode>(Val: N->getOperand(Num: `1`));
14795
14796	// Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
14797	// with s_buffer_load_i8 and s_buffer_load_i16 respectively.
14798	if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14799	VTSign->getVT() == MVT::i8) \|\|
14800	(Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14801	VTSign->getVT() == MVT::i16))) {
14802	assert(Subtarget->hasScalarSubwordLoads() &&
14803	"s_buffer_load_{u8, i8} are supported "
14804	"in GFX12 (or newer) architectures.");
14805	EVT VT = Src.getValueType();
14806	unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14807	? AMDGPUISD::SBUFFER_LOAD_BYTE
14808	: AMDGPUISD::SBUFFER_LOAD_SHORT;
14809	SDLoc DL(N);
14810	SDVTList ResList = DCI.DAG.getVTList(VT: MVT::i32);
14811	SDValue Ops[] = {
14812	Src.getOperand(i: `0`), // source register
14813	Src.getOperand(i: `1`), // offset
14814	Src.getOperand(i: `2`) // cachePolicy
14815	};
14816	auto *M = cast<MemSDNode>(Val&: Src);
14817	SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14818	Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
14819	SDValue LoadVal = DCI.DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
14820	return LoadVal;
14821	}
14822	if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14823	VTSign->getVT() == MVT::i8) \|\|
14824	(Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14825	VTSign->getVT() == MVT::i16)) &&
14826	Src.hasOneUse()) {
14827	auto *M = cast<MemSDNode>(Val&: Src);
14828	SDValue Ops[] = {Src.getOperand(i: `0`), // Chain
14829	Src.getOperand(i: `1`), // rsrc
14830	Src.getOperand(i: `2`), // vindex
14831	Src.getOperand(i: `3`), // voffset
14832	Src.getOperand(i: `4`), // soffset
14833	Src.getOperand(i: `5`), // offset
14834	Src.getOperand(i: `6`), Src.getOperand(i: `7`)};
14835	// replace with BUFFER_LOAD_BYTE/SHORT
14836	SDVTList ResList =
14837	DCI.DAG.getVTList(VT1: MVT::i32, VT2: Src.getOperand(i: `0`).getValueType());
14838	unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14839	? AMDGPUISD::BUFFER_LOAD_BYTE
14840	: AMDGPUISD::BUFFER_LOAD_SHORT;
14841	SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14842	Opcode: Opc, dl: SDLoc (N), VTList: ResList, Ops, MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
14843	return DCI.DAG.getMergeValues(
14844	Ops: {BufferLoadSignExt, BufferLoadSignExt.getValue(R: `1`)}, dl: SDLoc (N));
14845	}
14846	return SDValue ();
14847	}
14848
14849	SDValue SITargetLowering::performClassCombine(SDNode *N,
14850	DAGCombinerInfo &DCI) const {
14851	SelectionDAG &DAG = DCI.DAG;
14852	SDValue Mask = N->getOperand(Num: `1`);
14853
14854	// fp_class x, 0 -> false
14855	if (isNullConstant(V: Mask))
14856	return DAG.getConstant(Val: `0`, DL: SDLoc (N), VT: MVT::i1);
14857
14858	if (N->getOperand(Num: `0`).isUndef())
14859	return DAG.getUNDEF(VT: MVT::i1);
14860
14861	return SDValue ();
14862	}
14863
14864	SDValue SITargetLowering::performRcpCombine(SDNode *N,
14865	DAGCombinerInfo &DCI) const {
14866	EVT VT = N->getValueType(ResNo: `0`);
14867	SDValue N0 = N->getOperand(Num: `0`);
14868
14869	if (N0.isUndef()) {
14870	return DCI.DAG.getConstantFP(Val: APFloat::getQNaN(Sem: VT.getFltSemantics()),
14871	DL: SDLoc (N), VT);
14872	}
14873
14874	if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP \|\|
14875	N0.getOpcode() == ISD::SINT_TO_FP)) {
14876	return DCI.DAG.getNode(Opcode: AMDGPUISD::RCP_IFLAG, DL: SDLoc (N), VT, Operand: N0,
14877	Flags: N->getFlags());
14878	}
14879
14880	// TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14881	if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14882	N->getFlags().hasAllowContract() && N0 ->getFlags().hasAllowContract()) {
14883	return DCI.DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc (N), VT, Operand: N0.getOperand(i: `0`),
14884	Flags: N->getFlags());
14885	}
14886
14887	return AMDGPUTargetLowering::performRcpCombine(N, DCI);
14888	}
14889
14890	bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14891	SDNodeFlags UserFlags,
14892	unsigned MaxDepth) const {
14893	unsigned Opcode = Op.getOpcode();
14894	if (Opcode == ISD::FCANONICALIZE)
14895	return true;
14896
14897	if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
14898	const auto &F = CFP->getValueAPF();
14899	if (F.isNaN() && F.isSignaling())
14900	return false;
14901	if (!F.isDenormal())
14902	return true;
14903
14904	DenormalMode Mode =
14905	DAG.getMachineFunction().getDenormalMode(FPType: F.getSemantics());
14906	return Mode == DenormalMode::getIEEE();
14907	}
14908
14909	// If source is a result of another standard FP operation it is already in
14910	// canonical form.
14911	if (MaxDepth == `0`)
14912	return false;
14913
14914	switch (Opcode) {
14915	// These will flush denorms if required.
14916	case ISD::FADD:
14917	case ISD::FSUB:
14918	case ISD::FMUL:
14919	case ISD::FCEIL:
14920	case ISD::FFLOOR:
14921	case ISD::FMA:
14922	case ISD::FMAD:
14923	case ISD::FSQRT:
14924	case ISD::FDIV:
14925	case ISD::FREM:
14926	case ISD::FP_ROUND:
14927	case ISD::FP_EXTEND:
14928	case ISD::FP16_TO_FP:
14929	case ISD::FP_TO_FP16:
14930	case ISD::BF16_TO_FP:
14931	case ISD::FP_TO_BF16:
14932	case ISD::FLDEXP:
14933	case AMDGPUISD::FMUL_LEGACY:
14934	case AMDGPUISD::FMAD_FTZ:
14935	case AMDGPUISD::RCP:
14936	case AMDGPUISD::RSQ:
14937	case AMDGPUISD::RSQ_CLAMP:
14938	case AMDGPUISD::RCP_LEGACY:
14939	case AMDGPUISD::RCP_IFLAG:
14940	case AMDGPUISD::LOG:
14941	case AMDGPUISD::EXP:
14942	case AMDGPUISD::DIV_SCALE:
14943	case AMDGPUISD::DIV_FMAS:
14944	case AMDGPUISD::DIV_FIXUP:
14945	case AMDGPUISD::FRACT:
14946	case AMDGPUISD::CVT_PKRTZ_F16_F32:
14947	case AMDGPUISD::CVT_F32_UBYTE0:
14948	case AMDGPUISD::CVT_F32_UBYTE1:
14949	case AMDGPUISD::CVT_F32_UBYTE2:
14950	case AMDGPUISD::CVT_F32_UBYTE3:
14951	case AMDGPUISD::FP_TO_FP16:
14952	case AMDGPUISD::SIN_HW:
14953	case AMDGPUISD::COS_HW:
14954	return true;
14955
14956	// It can/will be lowered or combined as a bit operation.
14957	// Need to check their input recursively to handle.
14958	case ISD::FNEG:
14959	case ISD::FABS:
14960	case ISD::FCOPYSIGN:
14961	return isCanonicalized(DAG, Op: Op.getOperand(i: `0`), UserFlags: MaxDepth - `1`);
14962
14963	case ISD::AND:
14964	if (Op.getValueType() == MVT::i32) {
14965	// Be careful as we only know it is a bitcast floating point type. It
14966	// could be f32, v2f16, we have no way of knowing. Luckily the constant
14967	// value that we optimize for, which comes up in fp32 to bf16 conversions,
14968	// is valid to optimize for all types.
14969	if (auto *RHS = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: `1`))) {
14970	if (RHS->getZExtValue() == `0xffff0000`) {
14971	return isCanonicalized(DAG, Op: Op.getOperand(i: `0`), UserFlags: MaxDepth - `1`);
14972	}
14973	}
14974	}
14975	break;
14976
14977	case ISD::FSIN:
14978	case ISD::FCOS:
14979	case ISD::FSINCOS:
14980	return Op.getValueType().getScalarType() != MVT::f16;
14981
14982	case ISD::FMINNUM:
14983	case ISD::FMAXNUM:
14984	case ISD::FMINNUM_IEEE:
14985	case ISD::FMAXNUM_IEEE:
14986	case ISD::FMINIMUM:
14987	case ISD::FMAXIMUM:
14988	case ISD::FMINIMUMNUM:
14989	case ISD::FMAXIMUMNUM:
14990	case AMDGPUISD::CLAMP:
14991	case AMDGPUISD::FMED3:
14992	case AMDGPUISD::FMAX3:
14993	case AMDGPUISD::FMIN3:
14994	case AMDGPUISD::FMAXIMUM3:
14995	case AMDGPUISD::FMINIMUM3: {
14996	// FIXME: Shouldn't treat the generic operations different based these.
14997	// However, we aren't really required to flush the result from
14998	// minnum/maxnum..
14999
15000	// snans will be quieted, so we only need to worry about denormals.
15001	if (Subtarget->supportsMinMaxDenormModes() \|\|
15002	// FIXME: denormalsEnabledForType is broken for dynamic
15003	denormalsEnabledForType(DAG, VT: Op.getValueType()))
15004	return true;
15005
15006	// Flushing may be required.
15007	// In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
15008	// targets need to check their input recursively.
15009
15010	// FIXME: Does this apply with clamp? It's implemented with max.
15011	for (unsigned I = `0`, E = Op.getNumOperands(); I != E; ++I) {
15012	if (!isCanonicalized(DAG, Op: Op.getOperand(i: I), UserFlags: MaxDepth - `1`))
15013	return false;
15014	}
15015
15016	return true;
15017	}
15018	case ISD::SELECT: {
15019	return isCanonicalized(DAG, Op: Op.getOperand(i: `1`), UserFlags: MaxDepth - `1`) &&
15020	isCanonicalized(DAG, Op: Op.getOperand(i: `2`), UserFlags: MaxDepth - `1`);
15021	}
15022	case ISD::BUILD_VECTOR: {
15023	for (unsigned i = `0`, e = Op.getNumOperands(); i != e; ++i) {
15024	SDValue SrcOp = Op.getOperand(i);
15025	if (!isCanonicalized(DAG, Op: SrcOp, UserFlags: MaxDepth - `1`))
15026	return false;
15027	}
15028
15029	return true;
15030	}
15031	case ISD::EXTRACT_VECTOR_ELT:
15032	case ISD::EXTRACT_SUBVECTOR: {
15033	return isCanonicalized(DAG, Op: Op.getOperand(i: `0`), UserFlags: MaxDepth - `1`);
15034	}
15035	case ISD::INSERT_VECTOR_ELT: {
15036	return isCanonicalized(DAG, Op: Op.getOperand(i: `0`), UserFlags: MaxDepth - `1`) &&
15037	isCanonicalized(DAG, Op: Op.getOperand(i: `1`), UserFlags: MaxDepth - `1`);
15038	}
15039	case ISD::UNDEF:
15040	// Could be anything.
15041	return false;
15042
15043	case ISD::BITCAST:
15044	// TODO: This is incorrect as it loses track of the operand's type. We may
15045	// end up effectively bitcasting from f32 to v2f16 or vice versa, and the
15046	// same bits that are canonicalized in one type need not be in the other.
15047	return isCanonicalized(DAG, Op: Op.getOperand(i: `0`), UserFlags: MaxDepth - `1`);
15048	case ISD::TRUNCATE: {
15049	// Hack round the mess we make when legalizing extract_vector_elt
15050	if (Op.getValueType() == MVT::i16) {
15051	SDValue TruncSrc = Op.getOperand(i: `0`);
15052	if (TruncSrc.getValueType() == MVT::i32 &&
15053	TruncSrc.getOpcode() == ISD::BITCAST &&
15054	TruncSrc.getOperand(i: `0`).getValueType() == MVT::v2f16) {
15055	return isCanonicalized(DAG, Op: TruncSrc.getOperand(i: `0`), UserFlags: MaxDepth - `1`);
15056	}
15057	}
15058	return false;
15059	}
15060	case ISD::INTRINSIC_WO_CHAIN: {
15061	unsigned IntrinsicID = Op.getConstantOperandVal(i: `0`);
15062	// TODO: Handle more intrinsics
15063	switch (IntrinsicID) {
15064	case Intrinsic::amdgcn_cvt_pkrtz:
15065	case Intrinsic::amdgcn_cubeid:
15066	case Intrinsic::amdgcn_frexp_mant:
15067	case Intrinsic::amdgcn_fdot2:
15068	case Intrinsic::amdgcn_rcp:
15069	case Intrinsic::amdgcn_rsq:
15070	case Intrinsic::amdgcn_rsq_clamp:
15071	case Intrinsic::amdgcn_rcp_legacy:
15072	case Intrinsic::amdgcn_rsq_legacy:
15073	case Intrinsic::amdgcn_trig_preop:
15074	case Intrinsic::amdgcn_tanh:
15075	case Intrinsic::amdgcn_log:
15076	case Intrinsic::amdgcn_exp2:
15077	case Intrinsic::amdgcn_sqrt:
15078	return true;
15079	default:
15080	break;
15081	}
15082
15083	break;
15084	}
15085	default:
15086	break;
15087	}
15088
15089	// FIXME: denormalsEnabledForType is broken for dynamic
15090	return denormalsEnabledForType(DAG, VT: Op.getValueType()) &&
15091	(UserFlags.hasNoNaNs() \|\| DAG.isKnownNeverSNaN(Op));
15092	}
15093
15094	bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
15095	unsigned MaxDepth) const {
15096	const MachineRegisterInfo &MRI = MF.getRegInfo();
15097	MachineInstr *MI = MRI.getVRegDef(Reg);
15098	unsigned Opcode = MI->getOpcode();
15099
15100	if (Opcode == AMDGPU::G_FCANONICALIZE)
15101	return true;
15102
15103	std::optional<FPValueAndVReg> FCR;
15104	// Constant splat (can be padded with undef) or scalar constant.
15105	if (mi_match(R: Reg, MRI, P: MIPatternMatch::m_GFCstOrSplat(FPValReg&: FCR))) {
15106	if (FCR ->Value.isSignaling())
15107	return false;
15108	if (!FCR ->Value.isDenormal())
15109	return true;
15110
15111	DenormalMode Mode = MF.getDenormalMode(FPType: FCR ->Value.getSemantics());
15112	return Mode == DenormalMode::getIEEE();
15113	}
15114
15115	if (MaxDepth == `0`)
15116	return false;
15117
15118	switch (Opcode) {
15119	case AMDGPU::G_FADD:
15120	case AMDGPU::G_FSUB:
15121	case AMDGPU::G_FMUL:
15122	case AMDGPU::G_FCEIL:
15123	case AMDGPU::G_FFLOOR:
15124	case AMDGPU::G_FRINT:
15125	case AMDGPU::G_FNEARBYINT:
15126	case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
15127	case AMDGPU::G_INTRINSIC_TRUNC:
15128	case AMDGPU::G_INTRINSIC_ROUNDEVEN:
15129	case AMDGPU::G_FMA:
15130	case AMDGPU::G_FMAD:
15131	case AMDGPU::G_FSQRT:
15132	case AMDGPU::G_FDIV:
15133	case AMDGPU::G_FREM:
15134	case AMDGPU::G_FPOW:
15135	case AMDGPU::G_FPEXT:
15136	case AMDGPU::G_FLOG:
15137	case AMDGPU::G_FLOG2:
15138	case AMDGPU::G_FLOG10:
15139	case AMDGPU::G_FPTRUNC:
15140	case AMDGPU::G_AMDGPU_RCP_IFLAG:
15141	case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
15142	case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
15143	case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
15144	case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
15145	return true;
15146	case AMDGPU::G_FNEG:
15147	case AMDGPU::G_FABS:
15148	case AMDGPU::G_FCOPYSIGN:
15149	return isCanonicalized(Reg: MI->getOperand(i: `1`).getReg(), MF, MaxDepth: MaxDepth - `1`);
15150	case AMDGPU::G_FMINNUM:
15151	case AMDGPU::G_FMAXNUM:
15152	case AMDGPU::G_FMINNUM_IEEE:
15153	case AMDGPU::G_FMAXNUM_IEEE:
15154	case AMDGPU::G_FMINIMUM:
15155	case AMDGPU::G_FMAXIMUM:
15156	case AMDGPU::G_FMINIMUMNUM:
15157	case AMDGPU::G_FMAXIMUMNUM: {
15158	if (Subtarget->supportsMinMaxDenormModes() \|\|
15159	// FIXME: denormalsEnabledForType is broken for dynamic
15160	denormalsEnabledForType(Ty: MRI.getType(Reg), MF))
15161	return true;
15162
15163	[[fallthrough]];
15164	}
15165	case AMDGPU::G_BUILD_VECTOR:
15166	for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI->operands()))
15167	if (!isCanonicalized(Reg: MO.getReg(), MF, MaxDepth: MaxDepth - `1`))
15168	return false;
15169	return true;
15170	case AMDGPU::G_INTRINSIC:
15171	case AMDGPU::G_INTRINSIC_CONVERGENT:
15172	switch (cast<GIntrinsic>(Val: MI)->getIntrinsicID()) {
15173	case Intrinsic::amdgcn_fmul_legacy:
15174	case Intrinsic::amdgcn_fmad_ftz:
15175	case Intrinsic::amdgcn_sqrt:
15176	case Intrinsic::amdgcn_fmed3:
15177	case Intrinsic::amdgcn_sin:
15178	case Intrinsic::amdgcn_cos:
15179	case Intrinsic::amdgcn_log:
15180	case Intrinsic::amdgcn_exp2:
15181	case Intrinsic::amdgcn_log_clamp:
15182	case Intrinsic::amdgcn_rcp:
15183	case Intrinsic::amdgcn_rcp_legacy:
15184	case Intrinsic::amdgcn_rsq:
15185	case Intrinsic::amdgcn_rsq_clamp:
15186	case Intrinsic::amdgcn_rsq_legacy:
15187	case Intrinsic::amdgcn_div_scale:
15188	case Intrinsic::amdgcn_div_fmas:
15189	case Intrinsic::amdgcn_div_fixup:
15190	case Intrinsic::amdgcn_fract:
15191	case Intrinsic::amdgcn_cvt_pkrtz:
15192	case Intrinsic::amdgcn_cubeid:
15193	case Intrinsic::amdgcn_cubema:
15194	case Intrinsic::amdgcn_cubesc:
15195	case Intrinsic::amdgcn_cubetc:
15196	case Intrinsic::amdgcn_frexp_mant:
15197	case Intrinsic::amdgcn_fdot2:
15198	case Intrinsic::amdgcn_trig_preop:
15199	case Intrinsic::amdgcn_tanh:
15200	return true;
15201	default:
15202	break;
15203	}
15204
15205	[[fallthrough]];
15206	default:
15207	return false;
15208	}
15209
15210	llvm_unreachable("invalid operation");
15211	}
15212
15213	// Constant fold canonicalize.
15214	SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
15215	const SDLoc &SL, EVT VT,
15216	const APFloat &C) const {
15217	// Flush denormals to 0 if not enabled.
15218	if (C.isDenormal()) {
15219	DenormalMode Mode =
15220	DAG.getMachineFunction().getDenormalMode(FPType: C.getSemantics());
15221	if (Mode == DenormalMode::getPreserveSign()) {
15222	return DAG.getConstantFP(
15223	Val: APFloat::getZero(Sem: C.getSemantics(), Negative: C.isNegative()), DL: SL, VT);
15224	}
15225
15226	if (Mode != DenormalMode::getIEEE())
15227	return SDValue ();
15228	}
15229
15230	if (C.isNaN()) {
15231	APFloat CanonicalQNaN = APFloat::getQNaN(Sem: C.getSemantics());
15232	if (C.isSignaling()) {
15233	// Quiet a signaling NaN.
15234	// FIXME: Is this supposed to preserve payload bits?
15235	return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT);
15236	}
15237
15238	// Make sure it is the canonical NaN bitpattern.
15239	//
15240	// TODO: Can we use -1 as the canonical NaN value since it's an inline
15241	// immediate?
15242	if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
15243	return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT);
15244	}
15245
15246	// Already canonical.
15247	return DAG.getConstantFP(Val: C, DL: SL, VT);
15248	}
15249
15250	static bool vectorEltWillFoldAway(SDValue Op) {
15251	return Op.isUndef() \|\| isa<ConstantFPSDNode>(Val: Op);
15252	}
15253
15254	SDValue
15255	SITargetLowering::performFCanonicalizeCombine(SDNode *N,
15256	DAGCombinerInfo &DCI) const {
15257	SelectionDAG &DAG = DCI.DAG;
15258	SDValue N0 = N->getOperand(Num: `0`);
15259	EVT VT = N->getValueType(ResNo: `0`);
15260
15261	// fcanonicalize undef -> qnan
15262	if (N0.isUndef()) {
15263	APFloat QNaN = APFloat::getQNaN(Sem: VT.getFltSemantics());
15264	return DAG.getConstantFP(Val: QNaN, DL: SDLoc (N), VT);
15265	}
15266
15267	if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N: N0)) {
15268	EVT VT = N->getValueType(ResNo: `0`);
15269	return getCanonicalConstantFP(DAG, SL: SDLoc (N), VT, C: CFP->getValueAPF());
15270	}
15271
15272	// fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
15273	// (fcanonicalize k)
15274	//
15275	// fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
15276
15277	// TODO: This could be better with wider vectors that will be split to v2f16,
15278	// and to consider uses since there aren't that many packed operations.
15279	if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
15280	isTypeLegal(VT: MVT::v2f16)) {
15281	SDLoc SL(N);
15282	SDValue NewElts[`2`];
15283	SDValue Lo = N0.getOperand(i: `0`);
15284	SDValue Hi = N0.getOperand(i: `1`);
15285	EVT EltVT = Lo.getValueType();
15286
15287	if (vectorEltWillFoldAway(Op: Lo) \|\| vectorEltWillFoldAway(Op: Hi)) {
15288	for (unsigned I = `0`; I != `2`; ++I) {
15289	SDValue Op = N0.getOperand(i: I);
15290	if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
15291	NewElts[I] =
15292	getCanonicalConstantFP(DAG, SL, VT: EltVT, C: CFP->getValueAPF());
15293	} else if (Op.isUndef()) {
15294	// Handled below based on what the other operand is.
15295	NewElts[I] = Op;
15296	} else {
15297	NewElts[I] = DAG.getNode(Opcode: ISD::FCANONICALIZE, DL: SL, VT: EltVT, Operand: Op);
15298	}
15299	}
15300
15301	// If one half is undef, and one is constant, prefer a splat vector rather
15302	// than the normal qNaN. If it's a register, prefer 0.0 since that's
15303	// cheaper to use and may be free with a packed operation.
15304	if (NewElts[`0`].isUndef()) {
15305	if (isa<ConstantFPSDNode>(Val: NewElts[`1`]))
15306	NewElts[`0`] = isa<ConstantFPSDNode>(Val: NewElts[`1`])
15307	? NewElts[`1`]
15308	: DAG.getConstantFP(Val: `0.0f`, DL: SL, VT: EltVT);
15309	}
15310
15311	if (NewElts[`1`].isUndef()) {
15312	NewElts[`1`] = isa<ConstantFPSDNode>(Val: NewElts[`0`])
15313	? NewElts[`0`]
15314	: DAG.getConstantFP(Val: `0.0f`, DL: SL, VT: EltVT);
15315	}
15316
15317	return DAG.getBuildVector(VT, DL: SL, Ops: NewElts);
15318	}
15319	}
15320
15321	return SDValue ();
15322	}
15323
15324	static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
15325	switch (Opc) {
15326	case ISD::FMAXNUM:
15327	case ISD::FMAXNUM_IEEE:
15328	case ISD::FMAXIMUMNUM:
15329	return AMDGPUISD::FMAX3;
15330	case ISD::FMAXIMUM:
15331	return AMDGPUISD::FMAXIMUM3;
15332	case ISD::SMAX:
15333	return AMDGPUISD::SMAX3;
15334	case ISD::UMAX:
15335	return AMDGPUISD::UMAX3;
15336	case ISD::FMINNUM:
15337	case ISD::FMINNUM_IEEE:
15338	case ISD::FMINIMUMNUM:
15339	return AMDGPUISD::FMIN3;
15340	case ISD::FMINIMUM:
15341	return AMDGPUISD::FMINIMUM3;
15342	case ISD::SMIN:
15343	return AMDGPUISD::SMIN3;
15344	case ISD::UMIN:
15345	return AMDGPUISD::UMIN3;
15346	default:
15347	llvm_unreachable("Not a min/max opcode");
15348	}
15349	}
15350
15351	SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
15352	const SDLoc &SL, SDValue Src,
15353	SDValue MinVal,
15354	SDValue MaxVal,
15355	bool Signed) const {
15356
15357	// med3 comes from
15358	// min(max(x, K0), K1), K0 < K1
15359	// max(min(x, K0), K1), K1 < K0
15360	//
15361	// "MinVal" and "MaxVal" respectively refer to the rhs of the
15362	// min/max op.
15363	ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(Val&: MinVal);
15364	ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(Val&: MaxVal);
15365
15366	if (!MinK \|\| !MaxK)
15367	return SDValue ();
15368
15369	if (Signed) {
15370	if (MaxK->getAPIntValue().sge(RHS: MinK->getAPIntValue()))
15371	return SDValue ();
15372	} else {
15373	if (MaxK->getAPIntValue().uge(RHS: MinK->getAPIntValue()))
15374	return SDValue ();
15375	}
15376
15377	EVT VT = MinK->getValueType(ResNo: `0`);
15378	unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15379	if (VT == MVT::i32 \|\| (VT == MVT::i16 && Subtarget->hasMed3_16()))
15380	return DAG.getNode(Opcode: Med3Opc, DL: SL, VT, N1: Src, N2: MaxVal, N3: MinVal);
15381
15382	// Note: we could also extend to i32 and use i32 med3 if i16 med3 is
15383	// not available, but this is unlikely to be profitable as constants
15384	// will often need to be materialized & extended, especially on
15385	// pre-GFX10 where VOP3 instructions couldn't take literal operands.
15386	return SDValue ();
15387	}
15388
15389	static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
15390	if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op))
15391	return C;
15392
15393	if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Val&: Op)) {
15394	if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
15395	return C;
15396	}
15397
15398	return nullptr;
15399	}
15400
15401	SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
15402	const SDLoc &SL, SDValue Op0,
15403	SDValue Op1,
15404	bool IsKnownNoNaNs) const {
15405	ConstantFPSDNode *K1 = getSplatConstantFP(Op: Op1);
15406	if (!K1)
15407	return SDValue ();
15408
15409	ConstantFPSDNode *K0 = getSplatConstantFP(Op: Op0.getOperand(i: `1`));
15410	if (!K0)
15411	return SDValue ();
15412
15413	// Ordered >= (although NaN inputs should have folded away by now).
15414	if (K0->getValueAPF() > K1->getValueAPF())
15415	return SDValue ();
15416
15417	// med3 with a nan input acts like
15418	// v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
15419	//
15420	// So the result depends on whether the IEEE mode bit is enabled or not with a
15421	// signaling nan input.
15422	// ieee=1
15423	// s0 snan: yields s2
15424	// s1 snan: yields s2
15425	// s2 snan: qnan
15426
15427	// s0 qnan: min(s1, s2)
15428	// s1 qnan: min(s0, s2)
15429	// s2 qnan: min(s0, s1)
15430
15431	// ieee=0
15432	// s0 snan: min(s1, s2)
15433	// s1 snan: min(s0, s2)
15434	// s2 snan: qnan
15435
15436	// s0 qnan: min(s1, s2)
15437	// s1 qnan: min(s0, s2)
15438	// s2 qnan: min(s0, s1)
15439	const MachineFunction &MF = DAG.getMachineFunction();
15440	const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15441
15442	// TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
15443	// whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
15444	// can only form if op0 is fmaxnum_ieee if IEEE=1.
15445	EVT VT = Op0.getValueType();
15446	if (Info->getMode().DX10Clamp) {
15447	// If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
15448	// hardware fmed3 behavior converting to a min.
15449	// FIXME: Should this be allowing -0.0?
15450	if (K1->isExactlyValue(V: `1.0`) && K0->isExactlyValue(V: `0.0`))
15451	return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Op0.getOperand(i: `0`));
15452	}
15453
15454	// med3 for f16 is only available on gfx9+, and not available for v2f16.
15455	if (VT == MVT::f32 \|\| (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15456	// This isn't safe with signaling NaNs because in IEEE mode, min/max on a
15457	// signaling NaN gives a quiet NaN. The quiet NaN input to the min would
15458	// then give the other result, which is different from med3 with a NaN
15459	// input.
15460	SDValue Var = Op0.getOperand(i: `0`);
15461	if (!IsKnownNoNaNs && !DAG.isKnownNeverSNaN(Op: Var))
15462	return SDValue ();
15463
15464	const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15465
15466	if ((!K0->hasOneUse() \|\| TII->isInlineConstant(Imm: K0->getValueAPF())) &&
15467	(!K1->hasOneUse() \|\| TII->isInlineConstant(Imm: K1->getValueAPF()))) {
15468	return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT: K0->getValueType(ResNo: `0`), N1: Var,
15469	N2: SDValue (K0, `0`), N3: SDValue (K1, `0`));
15470	}
15471	}
15472
15473	return SDValue ();
15474	}
15475
15476	/// \return true if the subtarget supports minimum3 and maximum3 with the given
15477	/// base min/max opcode \p Opc for type \p VT.
15478	static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
15479	EVT VT) {
15480	switch (Opc) {
15481	case ISD::FMINNUM:
15482	case ISD::FMAXNUM:
15483	case ISD::FMINNUM_IEEE:
15484	case ISD::FMAXNUM_IEEE:
15485	case ISD::FMINIMUMNUM:
15486	case ISD::FMAXIMUMNUM:
15487	case AMDGPUISD::FMIN_LEGACY:
15488	case AMDGPUISD::FMAX_LEGACY:
15489	return (VT == MVT::f32) \|\| (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) \|\|
15490	(VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15491	case ISD::FMINIMUM:
15492	case ISD::FMAXIMUM:
15493	return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) \|\|
15494	(VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) \|\|
15495	(VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15496	case ISD::SMAX:
15497	case ISD::SMIN:
15498	case ISD::UMAX:
15499	case ISD::UMIN:
15500	return (VT == MVT::i32) \|\| (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
15501	default:
15502	return false;
15503	}
15504
15505	llvm_unreachable("not a min/max opcode");
15506	}
15507
15508	SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
15509	DAGCombinerInfo &DCI) const {
15510	SelectionDAG &DAG = DCI.DAG;
15511
15512	EVT VT = N->getValueType(ResNo: `0`);
15513	unsigned Opc = N->getOpcode();
15514	SDValue Op0 = N->getOperand(Num: `0`);
15515	SDValue Op1 = N->getOperand(Num: `1`);
15516
15517	// Only do this if the inner op has one use since this will just increases
15518	// register pressure for no benefit.
15519
15520	if (supportsMin3Max3(Subtarget: *Subtarget, Opc, VT)) {
15521	// max(max(a, b), c) -> max3(a, b, c)
15522	// min(min(a, b), c) -> min3(a, b, c)
15523	if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
15524	SDLoc DL(N);
15525	return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc), DL, VT: N->getValueType(ResNo: `0`),
15526	N1: Op0.getOperand(i: `0`), N2: Op0.getOperand(i: `1`), N3: Op1);
15527	}
15528
15529	// Try commuted.
15530	// max(a, max(b, c)) -> max3(a, b, c)
15531	// min(a, min(b, c)) -> min3(a, b, c)
15532	if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
15533	SDLoc DL(N);
15534	return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc), DL, VT: N->getValueType(ResNo: `0`),
15535	N1: Op0, N2: Op1.getOperand(i: `0`), N3: Op1.getOperand(i: `1`));
15536	}
15537	}
15538
15539	// min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
15540	// max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
15541	if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
15542	if (SDValue Med3 = performIntMed3ImmCombine(
15543	DAG, SL: SDLoc (N), Src: Op0 ->getOperand(Num: `0`), MinVal: Op1, MaxVal: Op0 ->getOperand(Num: `1`), Signed: true))
15544	return Med3;
15545	}
15546	if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
15547	if (SDValue Med3 = performIntMed3ImmCombine(
15548	DAG, SL: SDLoc (N), Src: Op0 ->getOperand(Num: `0`), MinVal: Op0 ->getOperand(Num: `1`), MaxVal: Op1, Signed: true))
15549	return Med3;
15550	}
15551
15552	if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
15553	if (SDValue Med3 = performIntMed3ImmCombine(
15554	DAG, SL: SDLoc (N), Src: Op0 ->getOperand(Num: `0`), MinVal: Op1, MaxVal: Op0 ->getOperand(Num: `1`), Signed: false))
15555	return Med3;
15556	}
15557	if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
15558	if (SDValue Med3 = performIntMed3ImmCombine(
15559	DAG, SL: SDLoc (N), Src: Op0 ->getOperand(Num: `0`), MinVal: Op0 ->getOperand(Num: `1`), MaxVal: Op1, Signed: false))
15560	return Med3;
15561	}
15562
15563	// if !is_snan(x):
15564	// fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15565	// fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15566	// fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15567	// fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
15568	if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) \|\|
15569	(Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) \|\|
15570	(Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) \|\|
15571	(Opc == AMDGPUISD::FMIN_LEGACY &&
15572	Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
15573	(VT == MVT::f32 \|\| VT == MVT::f64 \|\|
15574	(VT == MVT::f16 && Subtarget->has16BitInsts()) \|\|
15575	(VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) \|\|
15576	(VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) \|\|
15577	(VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
15578	Op0.hasOneUse()) {
15579	if (SDValue Res = performFPMed3ImmCombine(DAG, SL: SDLoc (N), Op0, Op1,
15580	IsKnownNoNaNs: N->getFlags().hasNoNaNs()))
15581	return Res;
15582	}
15583
15584	// Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
15585	// for some types, but at a higher cost since it's implemented with a 3
15586	// operand form.
15587	const SDNodeFlags Flags = N->getFlags();
15588	if ((Opc == ISD::FMINIMUM \|\| Opc == ISD::FMAXIMUM) && Flags.hasNoNaNs() &&
15589	!Subtarget->hasIEEEMinimumMaximumInsts() &&
15590	isOperationLegal(Op: ISD::FMINNUM_IEEE, VT: VT.getScalarType())) {
15591	unsigned NewOpc =
15592	Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
15593	return DAG.getNode(Opcode: NewOpc, DL: SDLoc (N), VT, N1: Op0, N2: Op1, Flags);
15594	}
15595
15596	return SDValue ();
15597	}
15598
15599	static bool isClampZeroToOne(SDValue A, SDValue B) {
15600	if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(Val&: A)) {
15601	if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(Val&: B)) {
15602	// FIXME: Should this be allowing -0.0?
15603	return (CA->isExactlyValue(V: `0.0`) && CB->isExactlyValue(V: `1.0`)) \|\|
15604	(CA->isExactlyValue(V: `1.0`) && CB->isExactlyValue(V: `0.0`));
15605	}
15606	}
15607
15608	return false;
15609	}
15610
15611	// FIXME: Should only worry about snans for version with chain.
15612	SDValue SITargetLowering::performFMed3Combine(SDNode *N,
15613	DAGCombinerInfo &DCI) const {
15614	EVT VT = N->getValueType(ResNo: `0`);
15615	// v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15616	// NaNs. With a NaN input, the order of the operands may change the result.
15617
15618	SelectionDAG &DAG = DCI.DAG;
15619	SDLoc SL(N);
15620
15621	SDValue Src0 = N->getOperand(Num: `0`);
15622	SDValue Src1 = N->getOperand(Num: `1`);
15623	SDValue Src2 = N->getOperand(Num: `2`);
15624
15625	if (isClampZeroToOne(A: Src0, B: Src1)) {
15626	// const_a, const_b, x -> clamp is safe in all cases including signaling
15627	// nans.
15628	// FIXME: Should this be allowing -0.0?
15629	return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src2);
15630	}
15631
15632	const MachineFunction &MF = DAG.getMachineFunction();
15633	const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15634
15635	// FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15636	// handling no dx10-clamp?
15637	if (Info->getMode().DX10Clamp) {
15638	// If NaNs is clamped to 0, we are free to reorder the inputs.
15639
15640	if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1))
15641	std::swap(a&: Src0, b&: Src1);
15642
15643	if (isa<ConstantFPSDNode>(Val: Src1) && !isa<ConstantFPSDNode>(Val: Src2))
15644	std::swap(a&: Src1, b&: Src2);
15645
15646	if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1))
15647	std::swap(a&: Src0, b&: Src1);
15648
15649	if (isClampZeroToOne(A: Src1, B: Src2))
15650	return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src0);
15651	}
15652
15653	return SDValue ();
15654	}
15655
15656	SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15657	DAGCombinerInfo &DCI) const {
15658	SDValue Src0 = N->getOperand(Num: `0`);
15659	SDValue Src1 = N->getOperand(Num: `1`);
15660	if (Src0.isUndef() && Src1.isUndef())
15661	return DCI.DAG.getUNDEF(VT: N->getValueType(ResNo: `0`));
15662	return SDValue ();
15663	}
15664
15665	// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15666	// expanded into a set of cmp/select instructions.
15667	bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
15668	unsigned NumElem,
15669	bool IsDivergentIdx,
15670	const GCNSubtarget *Subtarget) {
15671	if (UseDivergentRegisterIndexing)
15672	return false;
15673
15674	unsigned VecSize = EltSize * NumElem;
15675
15676	// Sub-dword vectors of size 2 dword or less have better implementation.
15677	if (VecSize <= `64` && EltSize < `32`)
15678	return false;
15679
15680	// Always expand the rest of sub-dword instructions, otherwise it will be
15681	// lowered via memory.
15682	if (EltSize < `32`)
15683	return true;
15684
15685	// Always do this if var-idx is divergent, otherwise it will become a loop.
15686	if (IsDivergentIdx)
15687	return true;
15688
15689	// Large vectors would yield too many compares and v_cndmask_b32 instructions.
15690	unsigned NumInsts = NumElem / Number of compares / +
15691	((EltSize + `31`) / `32`) * NumElem / Number of cndmasks /;
15692
15693	// On some architectures (GFX9) movrel is not available and it's better
15694	// to expand.
15695	if (Subtarget->useVGPRIndexMode())
15696	return NumInsts <= `16`;
15697
15698	// If movrel is available, use it instead of expanding for vector of 8
15699	// elements.
15700	if (Subtarget->hasMovrel())
15701	return NumInsts <= `15`;
15702
15703	return true;
15704	}
15705
15706	bool SITargetLowering::shouldExpandVectorDynExt(SDNode N) const* {
15707	SDValue Idx = N->getOperand(Num: N->getNumOperands() - `1`);
15708	if (isa<ConstantSDNode>(Val: Idx))
15709	return false;
15710
15711	SDValue Vec = N->getOperand(Num: `0`);
15712	EVT VecVT = Vec.getValueType();
15713	EVT EltVT = VecVT.getVectorElementType();
15714	unsigned EltSize = EltVT.getSizeInBits();
15715	unsigned NumElem = VecVT.getVectorNumElements();
15716
15717	return SITargetLowering::shouldExpandVectorDynExt(
15718	EltSize, NumElem, IsDivergentIdx: Idx ->isDivergent(), Subtarget: getSubtarget());
15719	}
15720
15721	SDValue
15722	SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15723	DAGCombinerInfo &DCI) const {
15724	SDValue Vec = N->getOperand(Num: `0`);
15725	SelectionDAG &DAG = DCI.DAG;
15726
15727	EVT VecVT = Vec.getValueType();
15728	EVT VecEltVT = VecVT.getVectorElementType();
15729	EVT ResVT = N->getValueType(ResNo: `0`);
15730
15731	unsigned VecSize = VecVT.getSizeInBits();
15732	unsigned VecEltSize = VecEltVT.getSizeInBits();
15733
15734	if ((Vec.getOpcode() == ISD::FNEG \|\| Vec.getOpcode() == ISD::FABS) &&
15735	allUsesHaveSourceMods(N)) {
15736	SDLoc SL(N);
15737	SDValue Idx = N->getOperand(Num: `1`);
15738	SDValue Elt =
15739	DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec.getOperand(i: `0`), N2: Idx);
15740	return DAG.getNode(Opcode: Vec.getOpcode(), DL: SL, VT: ResVT, Operand: Elt);
15741	}
15742
15743	// (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15744	// -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15745	// There are optimisations to transform 64-bit shifts into 32-bit shifts
15746	// depending on the shift operand. See e.g. performSraCombine().
15747	// This combine ensures that the optimisation is compatible with v2i32
15748	// legalised AND.
15749	if (VecVT == MVT::v2i32 && Vec ->getOpcode() == ISD::AND &&
15750	Vec ->getOperand(Num: `1`)->getOpcode() == ISD::BUILD_VECTOR) {
15751
15752	const ConstantSDNode *C = isConstOrConstSplat(N: Vec.getOperand(i: `1`));
15753	if (!C \|\| C->getZExtValue() != `0x1f`)
15754	return SDValue ();
15755
15756	SDLoc SL(N);
15757	SDValue AndMask = DAG.getConstant(Val: `0x1f`, DL: SL, VT: MVT::i32);
15758	SDValue EVE = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32,
15759	N1: Vec ->getOperand(Num: `0`), N2: N->getOperand(Num: `1`));
15760	SDValue A = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: EVE, N2: AndMask);
15761	DAG.ReplaceAllUsesWith(From: N, To: A.getNode());
15762	}
15763
15764	// ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15765	// =>
15766	// Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15767	// Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15768	// ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15769	if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15770	SDLoc SL(N);
15771	SDValue Idx = N->getOperand(Num: `1`);
15772	unsigned Opc = Vec.getOpcode();
15773
15774	switch (Opc) {
15775	default:
15776	break;
15777	// TODO: Support other binary operations.
15778	case ISD::FADD:
15779	case ISD::FSUB:
15780	case ISD::FMUL:
15781	case ISD::ADD:
15782	case ISD::UMIN:
15783	case ISD::UMAX:
15784	case ISD::SMIN:
15785	case ISD::SMAX:
15786	case ISD::FMAXNUM:
15787	case ISD::FMINNUM:
15788	case ISD::FMAXNUM_IEEE:
15789	case ISD::FMINNUM_IEEE:
15790	case ISD::FMAXIMUM:
15791	case ISD::FMINIMUM: {
15792	SDValue Elt0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT,
15793	N1: Vec.getOperand(i: `0`), N2: Idx);
15794	SDValue Elt1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT,
15795	N1: Vec.getOperand(i: `1`), N2: Idx);
15796
15797	DCI.AddToWorklist(N: Elt0.getNode());
15798	DCI.AddToWorklist(N: Elt1.getNode());
15799	return DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT, N1: Elt0, N2: Elt1, Flags: Vec ->getFlags());
15800	}
15801	}
15802	}
15803
15804	// EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
15805	if (shouldExpandVectorDynExt(N)) {
15806	SDLoc SL(N);
15807	SDValue Idx = N->getOperand(Num: `1`);
15808	SDValue V;
15809	for (unsigned I = `0`, E = VecVT.getVectorNumElements(); I < E; ++I) {
15810	SDValue IC = DAG.getVectorIdxConstant(Val: I, DL: SL);
15811	SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec, N2: IC);
15812	if (I == `0`)
15813	V = Elt;
15814	else
15815	V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Elt, False: V, Cond: ISD::SETEQ);
15816	}
15817	return V;
15818	}
15819
15820	// EXTRACT_VECTOR_ELT (v2i32 bitcast (i64/f64:k), Idx)
15821	// =>
15822	// i32:Lo(k) if Idx == 0, or
15823	// i32:Hi(k) if Idx == 1
15824	auto *Idx = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: `1`));
15825	if (Vec.getOpcode() == ISD::BITCAST && VecVT == MVT::v2i32 && Idx) {
15826	SDLoc SL(N);
15827	SDValue PeekThrough = Vec.getOperand(i: `0`);
15828	auto *KImm = dyn_cast<ConstantSDNode>(Val&: PeekThrough);
15829	if (KImm && KImm->getValueType(ResNo: `0`).getSizeInBits() == `64`) {
15830	uint64_t KImmValue = KImm->getZExtValue();
15831	return DAG.getConstant(
15832	Val: (KImmValue >> (`32` * Idx->getZExtValue())) & `0xffffffff`, DL: SL, VT: MVT::i32);
15833	}
15834	auto *KFPImm = dyn_cast<ConstantFPSDNode>(Val&: PeekThrough);
15835	if (KFPImm && KFPImm->getValueType(ResNo: `0`).getSizeInBits() == `64`) {
15836	uint64_t KFPImmValue =
15837	KFPImm->getValueAPF().bitcastToAPInt().getZExtValue();
15838	return DAG.getConstant(Val: (KFPImmValue >> (`32` * Idx->getZExtValue())) &
15839	`0xffffffff`,
15840	DL: SL, VT: MVT::i32);
15841	}
15842	}
15843
15844	if (!DCI.isBeforeLegalize())
15845	return SDValue ();
15846
15847	// Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
15848	// elements. This exposes more load reduction opportunities by replacing
15849	// multiple small extract_vector_elements with a single 32-bit extract.
15850	if (isa<MemSDNode>(Val: Vec) && VecEltSize <= `16` && VecEltVT.isByteSized() &&
15851	VecSize > `32` && VecSize % `32` == `0` && Idx) {
15852	EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: VecVT);
15853
15854	unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15855	unsigned EltIdx = BitIndex / `32`;
15856	unsigned LeftoverBitIdx = BitIndex % `32`;
15857	SDLoc SL(N);
15858
15859	SDValue Cast = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: Vec);
15860	DCI.AddToWorklist(N: Cast.getNode());
15861
15862	SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Cast,
15863	N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
15864	DCI.AddToWorklist(N: Elt.getNode());
15865	SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: Elt,
15866	N2: DAG.getConstant(Val: LeftoverBitIdx, DL: SL, VT: MVT::i32));
15867	DCI.AddToWorklist(N: Srl.getNode());
15868
15869	EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
15870	SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: VecEltAsIntVT, Operand: Srl);
15871	DCI.AddToWorklist(N: Trunc.getNode());
15872
15873	if (VecEltVT == ResVT) {
15874	return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecEltVT, Operand: Trunc);
15875	}
15876
15877	assert(ResVT.isScalarInteger());
15878	return DAG.getAnyExtOrTrunc(Op: Trunc, DL: SL, VT: ResVT);
15879	}
15880
15881	return SDValue ();
15882	}
15883
15884	SDValue
15885	SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15886	DAGCombinerInfo &DCI) const {
15887	SDValue Vec = N->getOperand(Num: `0`);
15888	SDValue Idx = N->getOperand(Num: `2`);
15889	EVT VecVT = Vec.getValueType();
15890	EVT EltVT = VecVT.getVectorElementType();
15891
15892	// INSERT_VECTOR_ELT (<n x e>, var-idx)
15893	// => BUILD_VECTOR n x select (e, const-idx)
15894	if (!shouldExpandVectorDynExt(N))
15895	return SDValue ();
15896
15897	SelectionDAG &DAG = DCI.DAG;
15898	SDLoc SL(N);
15899	SDValue Ins = N->getOperand(Num: `1`);
15900	EVT IdxVT = Idx.getValueType();
15901
15902	SmallVector<SDValue, `16`> Ops;
15903	for (unsigned I = `0`, E = VecVT.getVectorNumElements(); I < E; ++I) {
15904	SDValue IC = DAG.getConstant(Val: I, DL: SL, VT: IdxVT);
15905	SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec, N2: IC);
15906	SDValue V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Ins, False: Elt, Cond: ISD::SETEQ);
15907	Ops.push_back(Elt: V);
15908	}
15909
15910	return DAG.getBuildVector(VT: VecVT, DL: SL, Ops);
15911	}
15912
15913	/// Return the source of an fp_extend from f16 to f32, or a converted FP
15914	/// constant.
15915	static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
15916	if (Src.getOpcode() == ISD::FP_EXTEND &&
15917	Src.getOperand(i: `0`).getValueType() == MVT::f16) {
15918	return Src.getOperand(i: `0`);
15919	}
15920
15921	if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Src)) {
15922	APFloat Val = CFP->getValueAPF();
15923	bool LosesInfo = true;
15924	Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
15925	if (!LosesInfo)
15926	return DAG.getConstantFP(Val, DL: SDLoc (Src), VT: MVT::f16);
15927	}
15928
15929	return SDValue ();
15930	}
15931
15932	SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15933	DAGCombinerInfo &DCI) const {
15934	assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15935	"combine only useful on gfx8");
15936
15937	SDValue TruncSrc = N->getOperand(Num: `0`);
15938	EVT VT = N->getValueType(ResNo: `0`);
15939	if (VT != MVT::f16)
15940	return SDValue ();
15941
15942	if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 \|\|
15943	TruncSrc.getValueType() != MVT::f32 \|\| !TruncSrc.hasOneUse())
15944	return SDValue ();
15945
15946	SelectionDAG &DAG = DCI.DAG;
15947	SDLoc SL(N);
15948
15949	// Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15950	// and expanding it with min/max saves 1 instruction vs. casting to f32 and
15951	// casting back.
15952
15953	// fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15954	// fmin(fmax(a, b), fmax(fmin(a, b), c))
15955	SDValue A = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: `0`));
15956	if (!A)
15957	return SDValue ();
15958
15959	SDValue B = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: `1`));
15960	if (!B)
15961	return SDValue ();
15962
15963	SDValue C = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: `2`));
15964	if (!C)
15965	return SDValue ();
15966
15967	// This changes signaling nan behavior. If an input is a signaling nan, it
15968	// would have been quieted by the fpext originally. We don't care because
15969	// these are unconstrained ops. If we needed to insert quieting canonicalizes
15970	// we would be worse off than just doing the promotion.
15971	SDValue A1 = DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: A, N2: B);
15972	SDValue B1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A, N2: B);
15973	SDValue C1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A1, N2: C);
15974	return DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: B1, N2: C1);
15975	}
15976
15977	unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15978	const SDNode *N0,
15979	const SDNode N1) const* {
15980	EVT VT = N0->getValueType(ResNo: `0`);
15981
15982	// Only do this if we are not trying to support denormals. v_mad_f32 does not
15983	// support denormals ever.
15984	if (((VT == MVT::f32 &&
15985	denormalModeIsFlushAllF32(MF: DAG.getMachineFunction())) \|\|
15986	(VT == MVT::f16 && Subtarget->hasMadF16() &&
15987	denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction()))) &&
15988	isOperationLegal(Op: ISD::FMAD, VT))
15989	return ISD::FMAD;
15990
15991	const TargetOptions &Options = DAG.getTarget().Options;
15992	if ((Options.AllowFPOpFusion == FPOpFusion::Fast \|\|
15993	(N0->getFlags().hasAllowContract() &&
15994	N1->getFlags().hasAllowContract())) &&
15995	isFMAFasterThanFMulAndFAdd(MF: DAG.getMachineFunction(), VT)) {
15996	return ISD::FMA;
15997	}
15998
15999	return `0`;
16000	}
16001
16002	// For a reassociatable opcode perform:
16003	// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
16004	SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
16005	SelectionDAG &DAG) const {
16006	EVT VT = N->getValueType(ResNo: `0`);
16007	if (VT != MVT::i32 && VT != MVT::i64)
16008	return SDValue ();
16009
16010	if (DAG.isBaseWithConstantOffset(Op: SDValue (N, `0`)))
16011	return SDValue ();
16012
16013	unsigned Opc = N->getOpcode();
16014	SDValue Op0 = N->getOperand(Num: `0`);
16015	SDValue Op1 = N->getOperand(Num: `1`);
16016
16017	if (!(Op0 ->isDivergent() ^ Op1 ->isDivergent()))
16018	return SDValue ();
16019
16020	if (Op0 ->isDivergent())
16021	std::swap(a&: Op0, b&: Op1);
16022
16023	if (Op1.getOpcode() != Opc \|\| !Op1.hasOneUse())
16024	return SDValue ();
16025
16026	SDValue Op2 = Op1.getOperand(i: `1`);
16027	Op1 = Op1.getOperand(i: `0`);
16028	if (!(Op1 ->isDivergent() ^ Op2 ->isDivergent()))
16029	return SDValue ();
16030
16031	if (Op1 ->isDivergent())
16032	std::swap(a&: Op1, b&: Op2);
16033
16034	SDLoc SL(N);
16035	SDValue Add1 = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Op0, N2: Op1);
16036	return DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Add1, N2: Op2);
16037	}
16038
16039	static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
16040	SDValue N0, SDValue N1, SDValue N2, bool Signed) {
16041	unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
16042	SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT::i1);
16043	SDValue Mad = DAG.getNode(Opcode: MadOpc, DL: SL, VTList: VTs, N1: N0, N2: N1, N3: N2);
16044	return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Mad);
16045	}
16046
16047	// Fold
16048	// y = lshr i64 x, 32
16049	// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
16050	// with Const.hi == -1
16051	// To
16052	// res = mad_u64_u32 y.lo ,Const.lo, x.lo
16053	static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
16054	SDValue MulLHS, SDValue MulRHS,
16055	SDValue AddRHS) {
16056	if (MulRHS.getOpcode() == ISD::SRL)
16057	std::swap(a&: MulLHS, b&: MulRHS);
16058
16059	if (MulLHS.getValueType() != MVT::i64 \|\| MulLHS.getOpcode() != ISD::SRL)
16060	return SDValue ();
16061
16062	ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(Val: MulLHS.getOperand(i: `1`));
16063	if (!ShiftVal \|\| ShiftVal->getAsZExtVal() != `32` \|\|
16064	MulLHS.getOperand(i: `0`) != AddRHS)
16065	return SDValue ();
16066
16067	ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Val: MulRHS.getNode());
16068	if (!Const \|\| Hi_32(Value: Const->getZExtValue()) != uint32_t(-`1`))
16069	return SDValue ();
16070
16071	SDValue ConstMul =
16072	DAG.getConstant(Val: Lo_32(Value: Const->getZExtValue()), DL: SL, VT: MVT::i32);
16073	return getMad64_32(DAG, SL, VT: MVT::i64,
16074	N0: DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulLHS), N1: ConstMul,
16075	N2: DAG.getZeroExtendInReg(Op: AddRHS, DL: SL, VT: MVT::i32), Signed: false);
16076	}
16077
16078	// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
16079	// multiplies, if any.
16080	//
16081	// Full 64-bit multiplies that feed into an addition are lowered here instead
16082	// of using the generic expansion. The generic expansion ends up with
16083	// a tree of ADD nodes that prevents us from using the "add" part of the
16084	// MAD instruction. The expansion produced here results in a chain of ADDs
16085	// instead of a tree.
16086	SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
16087	DAGCombinerInfo &DCI) const {
16088	assert(N->isAnyAdd());
16089
16090	SelectionDAG &DAG = DCI.DAG;
16091	EVT VT = N->getValueType(ResNo: `0`);
16092	SDLoc SL(N);
16093	SDValue LHS = N->getOperand(Num: `0`);
16094	SDValue RHS = N->getOperand(Num: `1`);
16095
16096	if (VT.isVector())
16097	return SDValue ();
16098
16099	// S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
16100	// result in scalar registers for uniform values.
16101	if (!N->isDivergent() && Subtarget->hasSMulHi())
16102	return SDValue ();
16103
16104	unsigned NumBits = VT.getScalarSizeInBits();
16105	if (NumBits <= `32` \|\| NumBits > `64`)
16106	return SDValue ();
16107
16108	if (LHS.getOpcode() != ISD::MUL) {
16109	assert(RHS.getOpcode() == ISD::MUL);
16110	std::swap(a&: LHS, b&: RHS);
16111	}
16112
16113	// Avoid the fold if it would unduly increase the number of multiplies due to
16114	// multiple uses, except on hardware with full-rate multiply-add (which is
16115	// part of full-rate 64-bit ops).
16116	if (!Subtarget->hasFullRate64Ops()) {
16117	unsigned NumUsers = `0`;
16118	for (SDNode *User : LHS ->users()) {
16119	// There is a use that does not feed into addition, so the multiply can't
16120	// be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
16121	if (!User->isAnyAdd())
16122	return SDValue ();
16123
16124	// We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
16125	// MUL + 3xADD + 3xADDC over 3xMAD.
16126	++NumUsers;
16127	if (NumUsers >= `3`)
16128	return SDValue ();
16129	}
16130	}
16131
16132	SDValue MulLHS = LHS.getOperand(i: `0`);
16133	SDValue MulRHS = LHS.getOperand(i: `1`);
16134	SDValue AddRHS = RHS;
16135
16136	if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
16137	return FoldedMAD;
16138
16139	// Always check whether operands are small unsigned values, since that
16140	// knowledge is useful in more cases. Check for small signed values only if
16141	// doing so can unlock a shorter code sequence.
16142	bool MulLHSUnsigned32 = numBitsUnsigned(Op: MulLHS, DAG) <= `32`;
16143	bool MulRHSUnsigned32 = numBitsUnsigned(Op: MulRHS, DAG) <= `32`;
16144
16145	bool MulSignedLo = false;
16146	if (!MulLHSUnsigned32 \|\| !MulRHSUnsigned32) {
16147	MulSignedLo =
16148	numBitsSigned(Op: MulLHS, DAG) <= `32` && numBitsSigned(Op: MulRHS, DAG) <= `32`;
16149	}
16150
16151	// The operands and final result all have the same number of bits. If
16152	// operands need to be extended, they can be extended with garbage. The
16153	// resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
16154	// truncated away in the end.
16155	if (VT != MVT::i64) {
16156	MulLHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: MulLHS);
16157	MulRHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: MulRHS);
16158	AddRHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: AddRHS);
16159	}
16160
16161	// The basic code generated is conceptually straightforward. Pseudo code:
16162	//
16163	// accum = mad_64_32 lhs.lo, rhs.lo, accum
16164	// accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
16165	// accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
16166	//
16167	// The second and third lines are optional, depending on whether the factors
16168	// are {sign,zero}-extended or not.
16169	//
16170	// The actual DAG is noisier than the pseudo code, but only due to
16171	// instructions that disassemble values into low and high parts, and
16172	// assemble the final result.
16173	SDValue One = DAG.getConstant(Val: `1`, DL: SL, VT: MVT::i32);
16174
16175	auto MulLHSLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulLHS);
16176	auto MulRHSLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulRHS);
16177	SDValue Accum =
16178	getMad64_32(DAG, SL, VT: MVT::i64, N0: MulLHSLo, N1: MulRHSLo, N2: AddRHS, Signed: MulSignedLo);
16179
16180	if (!MulSignedLo && (!MulLHSUnsigned32 \|\| !MulRHSUnsigned32)) {
16181	auto [AccumLo, AccumHi] = DAG.SplitScalar(N: Accum, DL: SL, LoVT: MVT::i32, HiVT: MVT::i32);
16182
16183	if (!MulLHSUnsigned32) {
16184	auto MulLHSHi =
16185	DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: SL, VT: MVT::i32, N1: MulLHS, N2: One);
16186	SDValue MulHi = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT: MVT::i32, N1: MulLHSHi, N2: MulRHSLo);
16187	AccumHi = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: MulHi, N2: AccumHi);
16188	}
16189
16190	if (!MulRHSUnsigned32) {
16191	auto MulRHSHi =
16192	DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: SL, VT: MVT::i32, N1: MulRHS, N2: One);
16193	SDValue MulHi = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT: MVT::i32, N1: MulLHSLo, N2: MulRHSHi);
16194	AccumHi = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: MulHi, N2: AccumHi);
16195	}
16196
16197	Accum = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {AccumLo, AccumHi});
16198	Accum = DAG.getBitcast(VT: MVT::i64, V: Accum);
16199	}
16200
16201	if (VT != MVT::i64)
16202	Accum = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Accum);
16203	return Accum;
16204	}
16205
16206	SDValue
16207	SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
16208	DAGCombinerInfo &DCI) const {
16209	SDValue RHS = N->getOperand(Num: `1`);
16210	auto *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
16211	if (!CRHS)
16212	return SDValue ();
16213
16214	// TODO: Worth using computeKnownBits? Maybe expensive since it's so
16215	// common.
16216	uint64_t Val = CRHS->getZExtValue();
16217	if (countr_zero(Val) >= `32`) {
16218	SelectionDAG &DAG = DCI.DAG;
16219	SDLoc SL(N);
16220	SDValue LHS = N->getOperand(Num: `0`);
16221
16222	// Avoid carry machinery if we know the low half of the add does not
16223	// contribute to the final result.
16224	//
16225	// add i64:x, K if computeTrailingZeros(K) >= 32
16226	// => build_pair (add x.hi, K.hi), x.lo
16227
16228	// Breaking the 64-bit add here with this strange constant is unlikely
16229	// to interfere with addressing mode patterns.
16230
16231	SDValue Hi = getHiHalf64(Op: LHS, DAG);
16232	SDValue ConstHi32 = DAG.getConstant(Val: Hi_32(Value: Val), DL: SL, VT: MVT::i32);
16233	unsigned Opcode = N->getOpcode();
16234	if (Opcode == ISD::PTRADD)
16235	Opcode = ISD::ADD;
16236	SDValue AddHi =
16237	DAG.getNode(Opcode, DL: SL, VT: MVT::i32, N1: Hi, N2: ConstHi32, Flags: N->getFlags());
16238
16239	SDValue Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: LHS);
16240	return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SL, VT: MVT::i64, N1: Lo, N2: AddHi);
16241	}
16242
16243	return SDValue ();
16244	}
16245
16246	// Collect the ultimate src of each of the mul node's operands, and confirm
16247	// each operand is 8 bytes.
16248	static std::optional<ByteProvider<SDValue>>
16249	handleMulOperand(const SDValue &MulOperand) {
16250	auto Byte0 = calculateByteProvider(Op: MulOperand, Index: `0`, Depth: `0`);
16251	if (!Byte0 \|\| Byte0 ->isConstantZero()) {
16252	return std::nullopt;
16253	}
16254	auto Byte1 = calculateByteProvider(Op: MulOperand, Index: `1`, Depth: `0`);
16255	if (Byte1 && !Byte1 ->isConstantZero()) {
16256	return std::nullopt;
16257	}
16258	return Byte0;
16259	}
16260
/// Merge two byte-select masks into one.
///
/// In every byte lane the bits outside 0x0c are OR-ed together, while the
/// 0x0c bits survive only if both inputs have them set. In these masks a lane
/// value of 0x0c is the "zero this byte" selector, so a lane stays zeroed only
/// when neither mask selects a real byte for it.
///
/// Asserts that in each lane at least one of the two masks has 0x0c bits set.
static unsigned addPermMasks(unsigned First, unsigned Second) {
  constexpr unsigned ZeroSelectors = 0x0c0c0c0c;

  const unsigned FirstZeros = First & ZeroSelectors;
  const unsigned SecondZeros = Second & ZeroSelectors;

  for (unsigned Lane = 0; Lane < 4; ++Lane) {
    const unsigned LaneBits = 0xFFu << (8 * Lane);
    assert((FirstZeros | SecondZeros) & LaneBits);
    (void)LaneBits;
  }

  const unsigned Selectors = (First | Second) & ~ZeroSelectors;
  return Selectors | (FirstZeros & SecondZeros);
}
16274
16275	struct DotSrc {
16276	SDValue SrcOp;
16277	int64_t PermMask;
16278	int64_t DWordOffset;
16279	};
16280
16281	static void placeSources(ByteProvider<SDValue> &Src0,
16282	ByteProvider<SDValue> &Src1,
16283	SmallVectorImpl<DotSrc> &Src0s,
16284	SmallVectorImpl<DotSrc> &Src1s, int Step) {
16285
16286	assert(Src0.Src.has_value() && Src1.Src.has_value());
16287	// Src0s and Src1s are empty, just place arbitrarily.
16288	if (Step == `0`) {
16289	Src0s.push_back(Elt: {.SrcOp: *Src0.Src, .PermMask: ((Src0.SrcOffset % `4`) << `24`) + `0x0c0c0c`,
16290	.DWordOffset: Src0.SrcOffset / `4`});
16291	Src1s.push_back(Elt: {.SrcOp: *Src1.Src, .PermMask: ((Src1.SrcOffset % `4`) << `24`) + `0x0c0c0c`,
16292	.DWordOffset: Src1.SrcOffset / `4`});
16293	return;
16294	}
16295
16296	for (int BPI = `0`; BPI < `2`; BPI++) {
16297	std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
16298	if (BPI == `1`) {
16299	BPP = {Src1, Src0};
16300	}
16301	unsigned ZeroMask = `0x0c0c0c0c`;
16302	unsigned FMask = `0xFF` << (`8` * (`3` - Step));
16303
16304	unsigned FirstMask =
16305	(BPP.first.SrcOffset % `4`) << (`8` * (`3` - Step)) \| (ZeroMask & ~FMask);
16306	unsigned SecondMask =
16307	(BPP.second.SrcOffset % `4`) << (`8` * (`3` - Step)) \| (ZeroMask & ~FMask);
16308	// Attempt to find Src vector which contains our SDValue, if so, add our
16309	// perm mask to the existing one. If we are unable to find a match for the
16310	// first SDValue, attempt to find match for the second.
16311	int FirstGroup = -`1`;
16312	for (int I = `0`; I < `2`; I++) {
16313	SmallVectorImpl<DotSrc> &Srcs = I == `0` ? Src0s : Src1s;
16314	auto MatchesFirst = [&BPP](DotSrc &IterElt) {
16315	return IterElt.SrcOp == *BPP.first.Src &&
16316	(IterElt.DWordOffset == (BPP.first.SrcOffset / `4`));
16317	};
16318
16319	auto *Match = llvm::find_if(Range&: Srcs, P: MatchesFirst);
16320	if (Match != Srcs.end()) {
16321	Match->PermMask = addPermMasks(First: FirstMask, Second: Match->PermMask);
16322	FirstGroup = I;
16323	break;
16324	}
16325	}
16326	if (FirstGroup != -`1`) {
16327	SmallVectorImpl<DotSrc> &Srcs = FirstGroup == `1` ? Src0s : Src1s;
16328	auto MatchesSecond = [&BPP](DotSrc &IterElt) {
16329	return IterElt.SrcOp == *BPP.second.Src &&
16330	(IterElt.DWordOffset == (BPP.second.SrcOffset / `4`));
16331	};
16332	auto *Match = llvm::find_if(Range&: Srcs, P: MatchesSecond);
16333	if (Match != Srcs.end()) {
16334	Match->PermMask = addPermMasks(First: SecondMask, Second: Match->PermMask);
16335	} else
16336	Srcs.push_back(Elt: {.SrcOp: *BPP.second.Src, .PermMask: SecondMask, .DWordOffset: BPP.second.SrcOffset / `4`});
16337	return;
16338	}
16339	}
16340
16341	// If we have made it here, then we could not find a match in Src0s or Src1s
16342	// for either Src0 or Src1, so just place them arbitrarily.
16343
16344	unsigned ZeroMask = `0x0c0c0c0c`;
16345	unsigned FMask = `0xFF` << (`8` * (`3` - Step));
16346
16347	Src0s.push_back(
16348	Elt: {.SrcOp: *Src0.Src,
16349	.PermMask: ((Src0.SrcOffset % `4`) << (`8` * (`3` - Step)) \| (ZeroMask & ~FMask)),
16350	.DWordOffset: Src0.SrcOffset / `4`});
16351	Src1s.push_back(
16352	Elt: {.SrcOp: *Src1.Src,
16353	.PermMask: ((Src1.SrcOffset % `4`) << (`8` * (`3` - Step)) \| (ZeroMask & ~FMask)),
16354	.DWordOffset: Src1.SrcOffset / `4`});
16355	}
16356
16357	static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
16358	SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
16359	bool IsAny) {
16360
16361	// If we just have one source, just permute it accordingly.
16362	if (Srcs.size() == `1`) {
16363	auto *Elt = Srcs.begin();
16364	auto EltOp = getDWordFromOffset(DAG, SL, Src: Elt->SrcOp, DWordOffset: Elt->DWordOffset);
16365
16366	// v_perm will produce the original value
16367	if (Elt->PermMask == `0x3020100`)
16368	return EltOp;
16369
16370	return DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: EltOp, N2: EltOp,
16371	N3: DAG.getConstant(Val: Elt->PermMask, DL: SL, VT: MVT::i32));
16372	}
16373
16374	auto *FirstElt = Srcs.begin();
16375	auto *SecondElt = std::next(x: FirstElt);
16376
16377	SmallVector<SDValue, `2`> Perms;
16378
16379	// If we have multiple sources in the chain, combine them via perms (using
16380	// calculated perm mask) and Ors.
16381	while (true) {
16382	auto FirstMask = FirstElt->PermMask;
16383	auto SecondMask = SecondElt->PermMask;
16384
16385	unsigned FirstCs = FirstMask & `0x0c0c0c0c`;
16386	unsigned FirstPlusFour = FirstMask \| `0x04040404`;
16387	// 0x0c + 0x04 = 0x10, so anding with 0x0F will produced 0x00 for any
16388	// original 0x0C.
16389	FirstMask = (FirstPlusFour & `0x0F0F0F0F`) \| FirstCs;
16390
16391	auto PermMask = addPermMasks(First: FirstMask, Second: SecondMask);
16392	auto FirstVal =
16393	getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
16394	auto SecondVal =
16395	getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp, DWordOffset: SecondElt->DWordOffset);
16396
16397	Perms.push_back(Elt: DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: FirstVal,
16398	N2: SecondVal,
16399	N3: DAG.getConstant(Val: PermMask, DL: SL, VT: MVT::i32)));
16400
16401	FirstElt = std::next(x: SecondElt);
16402	if (FirstElt == Srcs.end())
16403	break;
16404
16405	SecondElt = std::next(x: FirstElt);
16406	// If we only have a FirstElt, then just combine that into the cumulative
16407	// source node.
16408	if (SecondElt == Srcs.end()) {
16409	auto EltOp =
16410	getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
16411
16412	Perms.push_back(
16413	Elt: DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: EltOp, N2: EltOp,
16414	N3: DAG.getConstant(Val: FirstElt->PermMask, DL: SL, VT: MVT::i32)));
16415	break;
16416	}
16417	}
16418
16419	assert(Perms.size() == `1` \|\| Perms.size() == `2`);
16420	return Perms.size() == `2`
16421	? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Perms [`0`], N2: Perms [`1`])
16422	: Perms [`0`];
16423	}
16424
16425	static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
16426	for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
16427	EntryMask = EntryMask >> ((`4` - ChainLength) * `8`);
16428	auto ZeroMask = ChainLength == `2` ? `0x0c0c0000` : `0x0c000000`;
16429	EntryMask += ZeroMask;
16430	}
16431	}
16432
16433	static bool isMul(const SDValue Op) {
16434	auto Opcode = Op.getOpcode();
16435
16436	return (Opcode == ISD::MUL \|\| Opcode == AMDGPUISD::MUL_U24 \|\|
16437	Opcode == AMDGPUISD::MUL_I24);
16438	}
16439
16440	static std::optional<bool>
16441	checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
16442	ByteProvider<SDValue> &Src1, const SDValue &S0Op,
16443	const SDValue &S1Op, const SelectionDAG &DAG) {
16444	// If we both ops are i8s (pre legalize-dag), then the signedness semantics
16445	// of the dot4 is irrelevant.
16446	if (S0Op.getValueSizeInBits() == `8` && S1Op.getValueSizeInBits() == `8`)
16447	return false;
16448
16449	auto Known0 = DAG.computeKnownBits(Op: S0Op, Depth: `0`);
16450	bool S0IsUnsigned = Known0.countMinLeadingZeros() > `0`;
16451	bool S0IsSigned = Known0.countMinLeadingOnes() > `0`;
16452	auto Known1 = DAG.computeKnownBits(Op: S1Op, Depth: `0`);
16453	bool S1IsUnsigned = Known1.countMinLeadingZeros() > `0`;
16454	bool S1IsSigned = Known1.countMinLeadingOnes() > `0`;
16455
16456	assert(!(S0IsUnsigned && S0IsSigned));
16457	assert(!(S1IsUnsigned && S1IsSigned));
16458
16459	// There are 9 possible permutations of
16460	// {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
16461
16462	// In two permutations, the sign bits are known to be the same for both Ops,
16463	// so simply return Signed / Unsigned corresponding to the MSB
16464
16465	if ((S0IsUnsigned && S1IsUnsigned) \|\| (S0IsSigned && S1IsSigned))
16466	return S0IsSigned;
16467
16468	// In another two permutations, the sign bits are known to be opposite. In
16469	// this case return std::nullopt to indicate a bad match.
16470
16471	if ((S0IsUnsigned && S1IsSigned) \|\| (S0IsSigned && S1IsUnsigned))
16472	return std::nullopt;
16473
16474	// In the remaining five permutations, we don't know the value of the sign
16475	// bit for at least one Op. Since we have a valid ByteProvider, we know that
16476	// the upper bits must be extension bits. Thus, the only ways for the sign
16477	// bit to be unknown is if it was sign extended from unknown value, or if it
16478	// was any extended. In either case, it is correct to use the signed
16479	// version of the signedness semantics of dot4
16480
16481	// In two of such permutations, we known the sign bit is set for
16482	// one op, and the other is unknown. It is okay to used signed version of
16483	// dot4.
16484	if ((S0IsSigned && !(S1IsSigned \|\| S1IsUnsigned)) \|\|
16485	((S1IsSigned && !(S0IsSigned \|\| S0IsUnsigned))))
16486	return true;
16487
16488	// In one such permutation, we don't know either of the sign bits. It is okay
16489	// to used the signed version of dot4.
16490	if ((!(S1IsSigned \|\| S1IsUnsigned) && !(S0IsSigned \|\| S0IsUnsigned)))
16491	return true;
16492
16493	// In two of such permutations, we known the sign bit is unset for
16494	// one op, and the other is unknown. Return std::nullopt to indicate a
16495	// bad match.
16496	if ((S0IsUnsigned && !(S1IsSigned \|\| S1IsUnsigned)) \|\|
16497	((S1IsUnsigned && !(S0IsSigned \|\| S0IsUnsigned))))
16498	return std::nullopt;
16499
16500	llvm_unreachable("Fully covered condition");
16501	}
16502
16503	SDValue SITargetLowering::performAddCombine(SDNode *N,
16504	DAGCombinerInfo &DCI) const {
16505	SelectionDAG &DAG = DCI.DAG;
16506	EVT VT = N->getValueType(ResNo: `0`);
16507	SDLoc SL(N);
16508	SDValue LHS = N->getOperand(Num: `0`);
16509	SDValue RHS = N->getOperand(Num: `1`);
16510
16511	if (LHS.getOpcode() == ISD::MUL \|\| RHS.getOpcode() == ISD::MUL) {
16512	if (Subtarget->hasMad64_32()) {
16513	if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16514	return Folded;
16515	}
16516	}
16517
16518	if (SDValue V = reassociateScalarOps(N, DAG)) {
16519	return V;
16520	}
16521
16522	if (VT == MVT::i64) {
16523	if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16524	return Folded;
16525	}
16526
16527	if ((isMul(Op: LHS) \|\| isMul(Op: RHS)) && Subtarget->hasDot7Insts() &&
16528	(Subtarget->hasDot1Insts() \|\| Subtarget->hasDot8Insts())) {
16529	SDValue TempNode(N, `0`);
16530	std::optional<bool> IsSigned;
16531	SmallVector<DotSrc, `4`> Src0s;
16532	SmallVector<DotSrc, `4`> Src1s;
16533	SmallVector<SDValue, `4`> Src2s;
16534
16535	// Match the v_dot4 tree, while collecting src nodes.
16536	int ChainLength = `0`;
16537	for (int I = `0`; I < `4`; I++) {
16538	auto MulIdx = isMul(Op: LHS) ? `0` : isMul(Op: RHS) ? `1` : -`1`;
16539	if (MulIdx == -`1`)
16540	break;
16541	auto Src0 = handleMulOperand(MulOperand: TempNode ->getOperand(Num: MulIdx)->getOperand(Num: `0`));
16542	if (!Src0)
16543	break;
16544	auto Src1 = handleMulOperand(MulOperand: TempNode ->getOperand(Num: MulIdx)->getOperand(Num: `1`));
16545	if (!Src1)
16546	break;
16547
16548	auto IterIsSigned = checkDot4MulSignedness(
16549	N: TempNode ->getOperand(Num: MulIdx), Src0&: Src0, Src1&: Src1,
16550	S0Op: TempNode ->getOperand(Num: MulIdx)->getOperand(Num: `0`),
16551	S1Op: TempNode ->getOperand(Num: MulIdx)->getOperand(Num: `1`), DAG);
16552	if (!IterIsSigned)
16553	break;
16554	if (!IsSigned)
16555	IsSigned = *IterIsSigned;
16556	if (IterIsSigned != IsSigned)
16557	break;
16558	placeSources(Src0&: Src0, Src1&: Src1, Src0s, Src1s, Step: I);
16559	auto AddIdx = `1` - MulIdx;
16560	// Allow the special case where add (add (mul24, 0), mul24) became ->
16561	// add (mul24, mul24).
16562	if (I == `2` && isMul(Op: TempNode ->getOperand(Num: AddIdx))) {
16563	Src2s.push_back(Elt: TempNode ->getOperand(Num: AddIdx));
16564	auto Src0 =
16565	handleMulOperand(MulOperand: TempNode ->getOperand(Num: AddIdx)->getOperand(Num: `0`));
16566	if (!Src0)
16567	break;
16568	auto Src1 =
16569	handleMulOperand(MulOperand: TempNode ->getOperand(Num: AddIdx)->getOperand(Num: `1`));
16570	if (!Src1)
16571	break;
16572	auto IterIsSigned = checkDot4MulSignedness(
16573	N: TempNode ->getOperand(Num: AddIdx), Src0&: Src0, Src1&: Src1,
16574	S0Op: TempNode ->getOperand(Num: AddIdx)->getOperand(Num: `0`),
16575	S1Op: TempNode ->getOperand(Num: AddIdx)->getOperand(Num: `1`), DAG);
16576	if (!IterIsSigned)
16577	break;
16578	assert(IsSigned);
16579	if (IterIsSigned != IsSigned)
16580	break;
16581	placeSources(Src0&: Src0, Src1&: Src1, Src0s, Src1s, Step: I + `1`);
16582	Src2s.push_back(Elt: DAG.getConstant(Val: `0`, DL: SL, VT: MVT::i32));
16583	ChainLength = I + `2`;
16584	break;
16585	}
16586
16587	TempNode = TempNode ->getOperand(Num: AddIdx);
16588	Src2s.push_back(Elt: TempNode);
16589	ChainLength = I + `1`;
16590	if (TempNode ->getNumOperands() < `2`)
16591	break;
16592	LHS = TempNode ->getOperand(Num: `0`);
16593	RHS = TempNode ->getOperand(Num: `1`);
16594	}
16595
16596	if (ChainLength < `2`)
16597	return SDValue ();
16598
16599	// Masks were constructed with assumption that we would find a chain of
16600	// length 4. If not, then we need to 0 out the MSB bits (via perm mask of
16601	// 0x0c) so they do not affect dot calculation.
16602	if (ChainLength < `4`) {
16603	fixMasks(Srcs&: Src0s, ChainLength);
16604	fixMasks(Srcs&: Src1s, ChainLength);
16605	}
16606
16607	SDValue Src0, Src1;
16608
16609	// If we are just using a single source for both, and have permuted the
16610	// bytes consistently, we can just use the sources without permuting
16611	// (commutation).
16612	bool UseOriginalSrc = false;
16613	if (ChainLength == `4` && Src0s.size() == `1` && Src1s.size() == `1` &&
16614	Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
16615	Src0s.begin()->SrcOp.getValueSizeInBits() >= `32` &&
16616	Src1s.begin()->SrcOp.getValueSizeInBits() >= `32`) {
16617	SmallVector<unsigned, `4`> SrcBytes;
16618	auto Src0Mask = Src0s.begin()->PermMask;
16619	SrcBytes.push_back(Elt: Src0Mask & `0xFF000000`);
16620	bool UniqueEntries = true;
16621	for (auto I = `1`; I < `4`; I++) {
16622	auto NextByte = Src0Mask & (`0xFF` << ((`3` - I) * `8`));
16623
16624	if (is_contained(Range&: SrcBytes, Element: NextByte)) {
16625	UniqueEntries = false;
16626	break;
16627	}
16628	SrcBytes.push_back(Elt: NextByte);
16629	}
16630
16631	if (UniqueEntries) {
16632	UseOriginalSrc = true;
16633
16634	auto *FirstElt = Src0s.begin();
16635	auto FirstEltOp =
16636	getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
16637
16638	auto *SecondElt = Src1s.begin();
16639	auto SecondEltOp = getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp,
16640	DWordOffset: SecondElt->DWordOffset);
16641
16642	Src0 = DAG.getBitcastedAnyExtOrTrunc(Op: FirstEltOp, DL: SL,
16643	VT: MVT::getIntegerVT(BitWidth: `32`));
16644	Src1 = DAG.getBitcastedAnyExtOrTrunc(Op: SecondEltOp, DL: SL,
16645	VT: MVT::getIntegerVT(BitWidth: `32`));
16646	}
16647	}
16648
16649	if (!UseOriginalSrc) {
16650	Src0 = resolveSources(DAG, SL, Srcs&: Src0s, IsSigned: false, IsAny: true);
16651	Src1 = resolveSources(DAG, SL, Srcs&: Src1s, IsSigned: false, IsAny: true);
16652	}
16653
16654	assert(IsSigned);
16655	SDValue Src2 =
16656	DAG.getExtOrTrunc(IsSigned: *IsSigned, Op: Src2s [ChainLength - `1`], DL: SL, VT: MVT::i32);
16657
16658	SDValue IID = DAG.getTargetConstant(Val: *IsSigned ? Intrinsic::amdgcn_sdot4
16659	: Intrinsic::amdgcn_udot4,
16660	DL: SL, VT: MVT::i64);
16661
16662	assert(!VT.isVector());
16663	auto Dot = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32, N1: IID, N2: Src0,
16664	N3: Src1, N4: Src2, N5: DAG.getTargetConstant(Val: `0`, DL: SL, VT: MVT::i1));
16665
16666	return DAG.getExtOrTrunc(IsSigned: *IsSigned, Op: Dot, DL: SL, VT);
16667	}
16668
16669	if (VT != MVT::i32 \|\| !DCI.isAfterLegalizeDAG())
16670	return SDValue ();
16671
16672	// add x, zext (setcc) => uaddo_carry x, 0, setcc
16673	// add x, sext (setcc) => usubo_carry x, 0, setcc
16674	unsigned Opc = LHS.getOpcode();
16675	if (Opc == ISD::ZERO_EXTEND \|\| Opc == ISD::SIGN_EXTEND \|\|
16676	Opc == ISD::ANY_EXTEND \|\| Opc == ISD::UADDO_CARRY)
16677	std::swap(a&: RHS, b&: LHS);
16678
16679	Opc = RHS.getOpcode();
16680	switch (Opc) {
16681	default:
16682	break;
16683	case ISD::ZERO_EXTEND:
16684	case ISD::SIGN_EXTEND:
16685	case ISD::ANY_EXTEND: {
16686	auto Cond = RHS.getOperand(i: `0`);
16687	// If this won't be a real VOPC output, we would still need to insert an
16688	// extra instruction anyway.
16689	if (!isBoolSGPR(V: Cond))
16690	break;
16691	SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1);
16692	SDValue Args[] = {LHS, DAG.getConstant(Val: `0`, DL: SL, VT: MVT::i32), Cond};
16693	Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
16694	return DAG.getNode(Opcode: Opc, DL: SL, VTList, Ops: Args);
16695	}
16696	case ISD::UADDO_CARRY: {
16697	// add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
16698	if (!isNullConstant(V: RHS.getOperand(i: `1`)))
16699	break;
16700	SDValue Args[] = {LHS, RHS.getOperand(i: `0`), RHS.getOperand(i: `2`)};
16701	return DAG.getNode(Opcode: ISD::UADDO_CARRY, DL: SDLoc (N), VTList: RHS ->getVTList(), Ops: Args);
16702	}
16703	}
16704	return SDValue ();
16705	}
16706
16707	SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16708	DAGCombinerInfo &DCI) const {
16709	SelectionDAG &DAG = DCI.DAG;
16710	SDLoc DL(N);
16711	EVT VT = N->getValueType(ResNo: `0`);
16712	SDValue N0 = N->getOperand(Num: `0`);
16713	SDValue N1 = N->getOperand(Num: `1`);
16714
16715	// The following folds transform PTRADDs into regular arithmetic in cases
16716	// where the PTRADD wouldn't be folded as an immediate offset into memory
16717	// instructions anyway. They are target-specific in that other targets might
16718	// prefer to not lose information about the pointer arithmetic.
16719
16720	// Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16721	// Adapted from DAGCombiner::visitADDLikeCommutative.
16722	SDValue V, K;
16723	if (sd_match(N: N1, P: m_Shl(L: m_Neg(V: m_Value(N&: V)), R: m_Value(N&: K)))) {
16724	SDNodeFlags ShlFlags = N1 ->getFlags();
16725	// If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16726	// so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16727	// preserved.
16728	SDNodeFlags NewShlFlags =
16729	ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16730	? SDNodeFlags::NoSignedWrap
16731	: SDNodeFlags ();
16732	SDValue Inner = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: V, N2: K, Flags: NewShlFlags);
16733	DCI.AddToWorklist(N: Inner.getNode());
16734	return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: N0, N2: Inner);
16735	}
16736
16737	// Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16738	// performAddCombine.
16739	if (N1.getOpcode() == ISD::MUL) {
16740	if (Subtarget->hasMad64_32()) {
16741	if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16742	return Folded;
16743	}
16744	}
16745
16746	// If the 32 low bits of the constant are all zero, there is nothing to fold
16747	// into an immediate offset, so it's better to eliminate the unnecessary
16748	// addition for the lower 32 bits than to preserve the PTRADD.
16749	// Analogous to a fold in performAddCombine.
16750	if (VT == MVT::i64) {
16751	if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16752	return Folded;
16753	}
16754
16755	if (N1.getOpcode() != ISD::ADD \|\| !N1.hasOneUse())
16756	return SDValue ();
16757
16758	SDValue X = N0;
16759	SDValue Y = N1.getOperand(i: `0`);
16760	SDValue Z = N1.getOperand(i: `1`);
16761	bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(N: Y);
16762	bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(N: Z);
16763
16764	if (!YIsConstant && !ZIsConstant && !X ->isDivergent() &&
16765	Y ->isDivergent() != Z ->isDivergent()) {
16766	// Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16767	// y are uniform and z isn't.
16768	// Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16769	// z are uniform and y isn't.
16770	// The goal is to push uniform operands up in the computation, so that they
16771	// can be handled with scalar operations. We can't use reassociateScalarOps
16772	// for this since it requires two identical commutative operations to
16773	// reassociate.
16774	if (Y ->isDivergent())
16775	std::swap(a&: Y, b&: Z);
16776	// If both additions in the original were NUW, reassociation preserves that.
16777	SDNodeFlags ReassocFlags =
16778	(N->getFlags() & N1 ->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16779	SDValue UniformInner = DAG.getMemBasePlusOffset(Base: X, Offset: Y, DL, Flags: ReassocFlags);
16780	DCI.AddToWorklist(N: UniformInner.getNode());
16781	return DAG.getMemBasePlusOffset(Base: UniformInner, Offset: Z, DL, Flags: ReassocFlags);
16782	}
16783
16784	return SDValue ();
16785	}
16786
16787	SDValue SITargetLowering::performSubCombine(SDNode *N,
16788	DAGCombinerInfo &DCI) const {
16789	SelectionDAG &DAG = DCI.DAG;
16790	EVT VT = N->getValueType(ResNo: `0`);
16791
16792	if (VT == MVT::i64) {
16793	if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16794	return Folded;
16795	}
16796
16797	if (VT != MVT::i32)
16798	return SDValue ();
16799
16800	SDLoc SL(N);
16801	SDValue LHS = N->getOperand(Num: `0`);
16802	SDValue RHS = N->getOperand(Num: `1`);
16803
16804	// sub x, zext (setcc) => usubo_carry x, 0, setcc
16805	// sub x, sext (setcc) => uaddo_carry x, 0, setcc
16806	unsigned Opc = RHS.getOpcode();
16807	switch (Opc) {
16808	default:
16809	break;
16810	case ISD::ZERO_EXTEND:
16811	case ISD::SIGN_EXTEND:
16812	case ISD::ANY_EXTEND: {
16813	auto Cond = RHS.getOperand(i: `0`);
16814	// If this won't be a real VOPC output, we would still need to insert an
16815	// extra instruction anyway.
16816	if (!isBoolSGPR(V: Cond))
16817	break;
16818	SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1);
16819	SDValue Args[] = {LHS, DAG.getConstant(Val: `0`, DL: SL, VT: MVT::i32), Cond};
16820	Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
16821	return DAG.getNode(Opcode: Opc, DL: SL, VTList, Ops: Args);
16822	}
16823	}
16824
16825	if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16826	// sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16827	if (!isNullConstant(V: LHS.getOperand(i: `1`)))
16828	return SDValue ();
16829	SDValue Args[] = {LHS.getOperand(i: `0`), RHS, LHS.getOperand(i: `2`)};
16830	return DAG.getNode(Opcode: ISD::USUBO_CARRY, DL: SDLoc (N), VTList: LHS ->getVTList(), Ops: Args);
16831	}
16832	return SDValue ();
16833	}
16834
16835	SDValue
16836	SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16837	DAGCombinerInfo &DCI) const {
16838
16839	if (N->getValueType(ResNo: `0`) != MVT::i32)
16840	return SDValue ();
16841
16842	if (!isNullConstant(V: N->getOperand(Num: `1`)))
16843	return SDValue ();
16844
16845	SelectionDAG &DAG = DCI.DAG;
16846	SDValue LHS = N->getOperand(Num: `0`);
16847
16848	// uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16849	// usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16850	unsigned LHSOpc = LHS.getOpcode();
16851	unsigned Opc = N->getOpcode();
16852	if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) \|\|
16853	(LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16854	SDValue Args[] = {LHS.getOperand(i: `0`), LHS.getOperand(i: `1`), N->getOperand(Num: `2`)};
16855	return DAG.getNode(Opcode: Opc, DL: SDLoc (N), VTList: N->getVTList(), Ops: Args);
16856	}
16857	return SDValue ();
16858	}
16859
16860	SDValue SITargetLowering::performFAddCombine(SDNode *N,
16861	DAGCombinerInfo &DCI) const {
16862	if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16863	return SDValue ();
16864
16865	SelectionDAG &DAG = DCI.DAG;
16866	EVT VT = N->getValueType(ResNo: `0`);
16867
16868	SDLoc SL(N);
16869	SDValue LHS = N->getOperand(Num: `0`);
16870	SDValue RHS = N->getOperand(Num: `1`);
16871
16872	// These should really be instruction patterns, but writing patterns with
16873	// source modifiers is a pain.
16874
16875	// fadd (fadd (a, a), b) -> mad 2.0, a, b
16876	if (LHS.getOpcode() == ISD::FADD) {
16877	SDValue A = LHS.getOperand(i: `0`);
16878	if (A == LHS.getOperand(i: `1`)) {
16879	unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode());
16880	if (FusedOp != `0`) {
16881	const SDValue Two = DAG.getConstantFP(Val: `2.0`, DL: SL, VT);
16882	return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: RHS);
16883	}
16884	}
16885	}
16886
16887	// fadd (b, fadd (a, a)) -> mad 2.0, a, b
16888	if (RHS.getOpcode() == ISD::FADD) {
16889	SDValue A = RHS.getOperand(i: `0`);
16890	if (A == RHS.getOperand(i: `1`)) {
16891	unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode());
16892	if (FusedOp != `0`) {
16893	const SDValue Two = DAG.getConstantFP(Val: `2.0`, DL: SL, VT);
16894	return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: LHS);
16895	}
16896	}
16897	}
16898
16899	return SDValue ();
16900	}
16901
16902	SDValue SITargetLowering::performFSubCombine(SDNode *N,
16903	DAGCombinerInfo &DCI) const {
16904	if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16905	return SDValue ();
16906
16907	SelectionDAG &DAG = DCI.DAG;
16908	SDLoc SL(N);
16909	EVT VT = N->getValueType(ResNo: `0`);
16910	assert(!VT.isVector());
16911
16912	// Try to get the fneg to fold into the source modifier. This undoes generic
16913	// DAG combines and folds them into the mad.
16914	//
16915	// Only do this if we are not trying to support denormals. v_mad_f32 does
16916	// not support denormals ever.
16917	SDValue LHS = N->getOperand(Num: `0`);
16918	SDValue RHS = N->getOperand(Num: `1`);
16919	if (LHS.getOpcode() == ISD::FADD) {
16920	// (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16921	SDValue A = LHS.getOperand(i: `0`);
16922	if (A == LHS.getOperand(i: `1`)) {
16923	unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode());
16924	if (FusedOp != `0`) {
16925	const SDValue Two = DAG.getConstantFP(Val: `2.0`, DL: SL, VT);
16926	SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
16927
16928	return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: NegRHS);
16929	}
16930	}
16931	}
16932
16933	if (RHS.getOpcode() == ISD::FADD) {
16934	// (fsub c, (fadd a, a)) -> mad -2.0, a, c
16935
16936	SDValue A = RHS.getOperand(i: `0`);
16937	if (A == RHS.getOperand(i: `1`)) {
16938	unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode());
16939	if (FusedOp != `0`) {
16940	const SDValue NegTwo = DAG.getConstantFP(Val: -`2.0`, DL: SL, VT);
16941	return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: NegTwo, N3: LHS);
16942	}
16943	}
16944	}
16945
16946	return SDValue ();
16947	}
16948
16949	SDValue SITargetLowering::performFDivCombine(SDNode *N,
16950	DAGCombinerInfo &DCI) const {
16951	SelectionDAG &DAG = DCI.DAG;
16952	SDLoc SL(N);
16953	EVT VT = N->getValueType(ResNo: `0`);
16954
16955	// fsqrt legality correlates to rsq availability.
16956	if ((VT != MVT::f16 && VT != MVT::bf16) \|\| !isOperationLegal(Op: ISD::FSQRT, VT))
16957	return SDValue ();
16958
16959	SDValue LHS = N->getOperand(Num: `0`);
16960	SDValue RHS = N->getOperand(Num: `1`);
16961
16962	SDNodeFlags Flags = N->getFlags();
16963	SDNodeFlags RHSFlags = RHS ->getFlags();
16964	if (!Flags.hasAllowContract() \|\| !RHSFlags.hasAllowContract() \|\|
16965	!RHS ->hasOneUse())
16966	return SDValue ();
16967
16968	if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) {
16969	bool IsNegative = false;
16970	if (CLHS->isExactlyValue(V: `1.0`) \|\|
16971	(IsNegative = CLHS->isExactlyValue(V: -`1.0`))) {
16972	// fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16973	// fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16974	if (RHS.getOpcode() == ISD::FSQRT) {
16975	// TODO: Or in RHS flags, somehow missing from SDNodeFlags
16976	SDValue Rsq =
16977	DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SL, VT, Operand: RHS.getOperand(i: `0`), Flags);
16978	return IsNegative ? DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Rsq, Flags) : Rsq;
16979	}
16980	}
16981	}
16982
16983	return SDValue ();
16984	}
16985
16986	SDValue SITargetLowering::performFMulCombine(SDNode *N,
16987	DAGCombinerInfo &DCI) const {
16988	SelectionDAG &DAG = DCI.DAG;
16989	EVT VT = N->getValueType(ResNo: `0`);
16990	EVT ScalarVT = VT.getScalarType();
16991	EVT IntVT = VT.changeElementType(Context&: *DAG.getContext(), EltVT: MVT::i32);
16992
16993	if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16994	(ScalarVT == MVT::f32 \|\| ScalarVT == MVT::f16)) {
16995	// Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16996	return SDValue ();
16997	}
16998
16999	SDValue LHS = N->getOperand(Num: `0`);
17000	SDValue RHS = N->getOperand(Num: `1`);
17001
17002	// It is cheaper to realize i32 inline constants as compared against
17003	// materializing f16 or f64 (or even non-inline f32) values,
17004	// possible via ldexp usage, as shown below :
17005	//
17006	// Given : A = 2^a & B = 2^b ; where a and b are integers.
17007	// fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
17008	// fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
17009	if ((ScalarVT == MVT::f64 \|\| ScalarVT == MVT::f32 \|\| ScalarVT == MVT::f16) &&
17010	(RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
17011	const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(N: RHS.getOperand(i: `1`));
17012	if (!TrueNode)
17013	return SDValue ();
17014	const ConstantFPSDNode *FalseNode =
17015	isConstOrConstSplatFP(N: RHS.getOperand(i: `2`));
17016	if (!FalseNode)
17017	return SDValue ();
17018
17019	if (TrueNode->isNegative() != FalseNode->isNegative())
17020	return SDValue ();
17021
17022	// For f32, only non-inline constants should be transformed.
17023	const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17024	if (ScalarVT == MVT::f32 &&
17025	TII->isInlineConstant(Imm: TrueNode->getValueAPF()) &&
17026	TII->isInlineConstant(Imm: FalseNode->getValueAPF()))
17027	return SDValue ();
17028
17029	int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
17030	if (TrueNodeExpVal == INT_MIN)
17031	return SDValue ();
17032	int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
17033	if (FalseNodeExpVal == INT_MIN)
17034	return SDValue ();
17035
17036	SDLoc SL(N);
17037	SDValue SelectNode =
17038	DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: IntVT, N1: RHS.getOperand(i: `0`),
17039	N2: DAG.getSignedConstant(Val: TrueNodeExpVal, DL: SL, VT: IntVT),
17040	N3: DAG.getSignedConstant(Val: FalseNodeExpVal, DL: SL, VT: IntVT));
17041
17042	LHS = TrueNode->isNegative()
17043	? DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS, Flags: LHS ->getFlags())
17044	: LHS;
17045
17046	return DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT, N1: LHS, N2: SelectNode, Flags: N->getFlags());
17047	}
17048
17049	return SDValue ();
17050	}
17051
17052	SDValue SITargetLowering::performFMACombine(SDNode *N,
17053	DAGCombinerInfo &DCI) const {
17054	SelectionDAG &DAG = DCI.DAG;
17055	EVT VT = N->getValueType(ResNo: `0`);
17056	SDLoc SL(N);
17057
17058	if (!Subtarget->hasDot10Insts() \|\| VT != MVT::f32)
17059	return SDValue ();
17060
17061	// FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
17062	// FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
17063	SDValue Op1 = N->getOperand(Num: `0`);
17064	SDValue Op2 = N->getOperand(Num: `1`);
17065	SDValue FMA = N->getOperand(Num: `2`);
17066
17067	if (FMA.getOpcode() != ISD::FMA \|\| Op1.getOpcode() != ISD::FP_EXTEND \|\|
17068	Op2.getOpcode() != ISD::FP_EXTEND)
17069	return SDValue ();
17070
17071	// fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
17072	// regardless of the denorm mode setting. Therefore,
17073	// fp-contract is sufficient to allow generating fdot2.
17074	const TargetOptions &Options = DAG.getTarget().Options;
17075	if (Options.AllowFPOpFusion == FPOpFusion::Fast \|\|
17076	(N->getFlags().hasAllowContract() &&
17077	FMA ->getFlags().hasAllowContract())) {
17078	Op1 = Op1.getOperand(i: `0`);
17079	Op2 = Op2.getOperand(i: `0`);
17080	if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
17081	Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
17082	return SDValue ();
17083
17084	SDValue Vec1 = Op1.getOperand(i: `0`);
17085	SDValue Idx1 = Op1.getOperand(i: `1`);
17086	SDValue Vec2 = Op2.getOperand(i: `0`);
17087
17088	SDValue FMAOp1 = FMA.getOperand(i: `0`);
17089	SDValue FMAOp2 = FMA.getOperand(i: `1`);
17090	SDValue FMAAcc = FMA.getOperand(i: `2`);
17091
17092	if (FMAOp1.getOpcode() != ISD::FP_EXTEND \|\|
17093	FMAOp2.getOpcode() != ISD::FP_EXTEND)
17094	return SDValue ();
17095
17096	FMAOp1 = FMAOp1.getOperand(i: `0`);
17097	FMAOp2 = FMAOp2.getOperand(i: `0`);
17098	if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
17099	FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
17100	return SDValue ();
17101
17102	SDValue Vec3 = FMAOp1.getOperand(i: `0`);
17103	SDValue Vec4 = FMAOp2.getOperand(i: `0`);
17104	SDValue Idx2 = FMAOp1.getOperand(i: `1`);
17105
17106	if (Idx1 != Op2.getOperand(i: `1`) \|\| Idx2 != FMAOp2.getOperand(i: `1`) \|\|
17107	// Idx1 and Idx2 cannot be the same.
17108	Idx1 == Idx2)
17109	return SDValue ();
17110
17111	if (Vec1 == Vec2 \|\| Vec3 == Vec4)
17112	return SDValue ();
17113
17114	if (Vec1.getValueType() != MVT::v2f16 \|\| Vec2.getValueType() != MVT::v2f16)
17115	return SDValue ();
17116
17117	if ((Vec1 == Vec3 && Vec2 == Vec4) \|\| (Vec1 == Vec4 && Vec2 == Vec3)) {
17118	return DAG.getNode(Opcode: AMDGPUISD::FDOT2, DL: SL, VT: MVT::f32, N1: Vec1, N2: Vec2, N3: FMAAcc,
17119	N4: DAG.getTargetConstant(Val: `0`, DL: SL, VT: MVT::i1));
17120	}
17121	}
17122	return SDValue ();
17123	}
17124
17125	SDValue SITargetLowering::performSetCCCombine(SDNode *N,
17126	DAGCombinerInfo &DCI) const {
17127	SelectionDAG &DAG = DCI.DAG;
17128	SDLoc SL(N);
17129
17130	SDValue LHS = N->getOperand(Num: `0`);
17131	SDValue RHS = N->getOperand(Num: `1`);
17132	EVT VT = LHS.getValueType();
17133	ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: `2`))->get();
17134
17135	auto *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
17136	if (!CRHS) {
17137	CRHS = dyn_cast<ConstantSDNode>(Val&: LHS);
17138	if (CRHS) {
17139	std::swap(a&: LHS, b&: RHS);
17140	CC = getSetCCSwappedOperands(Operation: CC);
17141	}
17142	}
17143
17144	if (CRHS) {
17145	if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
17146	isBoolSGPR(V: LHS.getOperand(i: `0`))) {
17147	// setcc (sext from i1 cc), -1, ne\|sgt\|ult) => not cc => xor cc, -1
17148	// setcc (sext from i1 cc), -1, eq\|sle\|uge) => cc
17149	// setcc (sext from i1 cc), 0, eq\|sge\|ule) => not cc => xor cc, -1
17150	// setcc (sext from i1 cc), 0, ne\|ugt\|slt) => cc
17151	if ((CRHS->isAllOnes() &&
17152	(CC == ISD::SETNE \|\| CC == ISD::SETGT \|\| CC == ISD::SETULT)) \|\|
17153	(CRHS->isZero() &&
17154	(CC == ISD::SETEQ \|\| CC == ISD::SETGE \|\| CC == ISD::SETULE)))
17155	return DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: `0`),
17156	N2: DAG.getAllOnesConstant(DL: SL, VT: MVT::i1));
17157	if ((CRHS->isAllOnes() &&
17158	(CC == ISD::SETEQ \|\| CC == ISD::SETLE \|\| CC == ISD::SETUGE)) \|\|
17159	(CRHS->isZero() &&
17160	(CC == ISD::SETNE \|\| CC == ISD::SETUGT \|\| CC == ISD::SETLT)))
17161	return LHS.getOperand(i: `0`);
17162	}
17163
17164	const APInt &CRHSVal = CRHS->getAPIntValue();
17165	if ((CC == ISD::SETEQ \|\| CC == ISD::SETNE) &&
17166	LHS.getOpcode() == ISD::SELECT &&
17167	isa<ConstantSDNode>(Val: LHS.getOperand(i: `1`)) &&
17168	isa<ConstantSDNode>(Val: LHS.getOperand(i: `2`)) &&
17169	isBoolSGPR(V: LHS.getOperand(i: `0`))) {
17170	// Given CT != FT:
17171	// setcc (select cc, CT, CF), CF, eq => xor cc, -1
17172	// setcc (select cc, CT, CF), CF, ne => cc
17173	// setcc (select cc, CT, CF), CT, ne => xor cc, -1
17174	// setcc (select cc, CT, CF), CT, eq => cc
17175	const APInt &CT = LHS.getConstantOperandAPInt(i: `1`);
17176	const APInt &CF = LHS.getConstantOperandAPInt(i: `2`);
17177
17178	if (CT != CF) {
17179	if ((CF == CRHSVal && CC == ISD::SETEQ) \|\|
17180	(CT == CRHSVal && CC == ISD::SETNE))
17181	return DAG.getNOT(DL: SL, Val: LHS.getOperand(i: `0`), VT: MVT::i1);
17182	if ((CF == CRHSVal && CC == ISD::SETNE) \|\|
17183	(CT == CRHSVal && CC == ISD::SETEQ))
17184	return LHS.getOperand(i: `0`);
17185	}
17186	}
17187	}
17188
17189	// Truncate 64-bit setcc to test only upper 32-bits of its operands in the
17190	// following cases where information about the lower 32-bits of its operands
17191	// is known:
17192	//
17193	// If LHS.lo32 == RHS.lo32:
17194	// setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, eq/ne
17195	// If LHS.lo32 != RHS.lo32:
17196	// setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, false/true
17197	// If LHS.lo32 >= RHS.lo32 (unsigned):
17198	// setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]lt/ge
17199	// If LHS.lo32 > RHS.lo32 (unsigned):
17200	// setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]lt/ge
17201	// If LHS.lo32 <= RHS.lo32 (unsigned):
17202	// setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]le/gt
17203	// If LHS.lo32 < RHS.lo32 (unsigned):
17204	// setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]le/gt
17205	if (VT == MVT::i64) {
17206	const KnownBits LHSKnownLo32 = DAG.computeKnownBits(Op: LHS).trunc(BitWidth: `32`);
17207	const KnownBits RHSKnownLo32 = DAG.computeKnownBits(Op: RHS).trunc(BitWidth: `32`);
17208
17209	// NewCC is valid iff we can truncate the setcc to only test the upper 32
17210	// bits
17211	ISD::CondCode NewCC = ISD::SETCC_INVALID;
17212
17213	switch (CC) {
17214	default:
17215	break;
17216	case ISD::SETEQ: {
17217	const std::optional<bool> KnownEq =
17218	KnownBits::eq(LHS: LHSKnownLo32, RHS: RHSKnownLo32);
17219	if (KnownEq)
17220	NewCC = *KnownEq ? ISD::SETEQ : ISD::SETFALSE;
17221
17222	break;
17223	}
17224	case ISD::SETNE: {
17225	const std::optional<bool> KnownEq =
17226	KnownBits::eq(LHS: LHSKnownLo32, RHS: RHSKnownLo32);
17227	if (KnownEq)
17228	NewCC = *KnownEq ? ISD::SETNE : ISD::SETTRUE;
17229
17230	break;
17231	}
17232	case ISD::SETULT:
17233	case ISD::SETUGE:
17234	case ISD::SETLT:
17235	case ISD::SETGE: {
17236	const std::optional<bool> KnownUge =
17237	KnownBits::uge(LHS: LHSKnownLo32, RHS: RHSKnownLo32);
17238	if (KnownUge) {
17239	if (*KnownUge) {
17240	// LHS.lo32 uge RHS.lo32, so LHS >= RHS iff LHS.hi32 >= RHS.hi32
17241	NewCC = CC;
17242	} else {
17243	// LHS.lo32 ult RHS.lo32, so LHS >= RHS iff LHS.hi32 > RHS.hi32
17244	NewCC = CC == ISD::SETULT ? ISD::SETULE
17245	: CC == ISD::SETUGE ? ISD::SETUGT
17246	: CC == ISD::SETLT ? ISD::SETLE
17247	: ISD::SETGT;
17248	}
17249	}
17250	break;
17251	}
17252	case ISD::SETULE:
17253	case ISD::SETUGT:
17254	case ISD::SETLE:
17255	case ISD::SETGT: {
17256	const std::optional<bool> KnownUle =
17257	KnownBits::ule(LHS: LHSKnownLo32, RHS: RHSKnownLo32);
17258	if (KnownUle) {
17259	if (*KnownUle) {
17260	// LHS.lo32 ule RHS.lo32, so LHS <= RHS iff LHS.hi32 <= RHS.hi32
17261	NewCC = CC;
17262	} else {
17263	// LHS.lo32 ugt RHS.lo32, so LHS <= RHS iff LHS.hi32 < RHS.hi32
17264	NewCC = CC == ISD::SETULE ? ISD::SETULT
17265	: CC == ISD::SETUGT ? ISD::SETUGE
17266	: CC == ISD::SETLE ? ISD::SETLT
17267	: ISD::SETGE;
17268	}
17269	}
17270	break;
17271	}
17272	}
17273
17274	if (NewCC != ISD::SETCC_INVALID)
17275	return DAG.getSetCC(DL: SL, VT: N->getValueType(ResNo: `0`), LHS: getHiHalf64(Op: LHS, DAG),
17276	RHS: getHiHalf64(Op: RHS, DAG), Cond: NewCC);
17277	}
17278
17279	// Eliminate setcc by using carryout from add/sub instruction
17280
17281	// LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo
17282	// setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi
17283	// similarly for subtraction
17284
17285	// LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1
17286	// setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0
17287
17288	if (VT == MVT::i64 && ((CC == ISD::SETULT &&
17289	sd_match(N: LHS, P: m_Add(L: m_Specific(N: RHS), R: m_Value()))) \|\|
17290	(CC == ISD::SETUGT &&
17291	sd_match(N: LHS, P: m_Sub(L: m_Specific(N: RHS), R: m_Value()))) \|\|
17292	(CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
17293	sd_match(N: LHS, P: m_Add(L: m_Value(), R: m_One()))))) {
17294	bool IsAdd = LHS.getOpcode() == ISD::ADD;
17295
17296	SDValue Op0 = LHS.getOperand(i: `0`);
17297	SDValue Op1 = LHS.getOperand(i: `1`);
17298
17299	SDValue Op0Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Op0);
17300	SDValue Op1Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Op1);
17301
17302	SDValue Op0Hi = getHiHalf64(Op: Op0, DAG);
17303	SDValue Op1Hi = getHiHalf64(Op: Op1, DAG);
17304
17305	SDValue NodeLo =
17306	DAG.getNode(Opcode: IsAdd ? ISD::UADDO : ISD::USUBO, DL: SL,
17307	VTList: DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1), Ops: {Op0Lo, Op1Lo});
17308
17309	SDValue CarryInHi = NodeLo.getValue(R: `1`);
17310	SDValue NodeHi = DAG.getNode(Opcode: IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
17311	DL: SL, VTList: DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1),
17312	Ops: {Op0Hi, Op1Hi, CarryInHi});
17313
17314	SDValue ResultLo = NodeLo.getValue(R: `0`);
17315	SDValue ResultHi = NodeHi.getValue(R: `0`);
17316
17317	SDValue JoinedResult =
17318	DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {ResultLo, ResultHi});
17319
17320	SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: JoinedResult);
17321	SDValue Overflow = NodeHi.getValue(R: `1`);
17322	DCI.CombineTo(N: LHS.getNode(), Res: Result);
17323	return Overflow;
17324	}
17325
17326	if (VT != MVT::f32 && VT != MVT::f64 &&
17327	(!Subtarget->has16BitInsts() \|\| VT != MVT::f16))
17328	return SDValue ();
17329
17330	// Match isinf/isfinite pattern
17331	// (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity \| n_infinity))
17332	// (fcmp one (fabs x), inf) -> (fp_class x,
17333	// (p_normal \| n_normal \| p_subnormal \| n_subnormal \| p_zero \| n_zero)
17334	if ((CC == ISD::SETOEQ \|\| CC == ISD::SETONE) &&
17335	LHS.getOpcode() == ISD::FABS) {
17336	const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
17337	if (!CRHS)
17338	return SDValue ();
17339
17340	const APFloat &APF = CRHS->getValueAPF();
17341	if (APF.isInfinity() && !APF.isNegative()) {
17342	const unsigned IsInfMask =
17343	SIInstrFlags::P_INFINITY \| SIInstrFlags::N_INFINITY;
17344	const unsigned IsFiniteMask =
17345	SIInstrFlags::N_ZERO \| SIInstrFlags::P_ZERO \| SIInstrFlags::N_NORMAL \|
17346	SIInstrFlags::P_NORMAL \| SIInstrFlags::N_SUBNORMAL \|
17347	SIInstrFlags::P_SUBNORMAL;
17348	unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
17349	return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: `0`),
17350	N2: DAG.getConstant(Val: Mask, DL: SL, VT: MVT::i32));
17351	}
17352	}
17353
17354	return SDValue ();
17355	}
17356
17357	SDValue
17358	SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
17359	DAGCombinerInfo &DCI) const {
17360	SelectionDAG &DAG = DCI.DAG;
17361	SDLoc SL(N);
17362	unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
17363
17364	SDValue Src = N->getOperand(Num: `0`);
17365	SDValue Shift = N->getOperand(Num: `0`);
17366
17367	// TODO: Extend type shouldn't matter (assuming legal types).
17368	if (Shift.getOpcode() == ISD::ZERO_EXTEND)
17369	Shift = Shift.getOperand(i: `0`);
17370
17371	if (Shift.getOpcode() == ISD::SRL \|\| Shift.getOpcode() == ISD::SHL) {
17372	// cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
17373	// cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
17374	// cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
17375	// cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
17376	// cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
17377	if (auto *C = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: `1`))) {
17378	SDValue Shifted = DAG.getZExtOrTrunc(
17379	Op: Shift.getOperand(i: `0`), DL: SDLoc (Shift.getOperand(i: `0`)), VT: MVT::i32);
17380
17381	unsigned ShiftOffset = `8` * Offset;
17382	if (Shift.getOpcode() == ISD::SHL)
17383	ShiftOffset -= C->getZExtValue();
17384	else
17385	ShiftOffset += C->getZExtValue();
17386
17387	if (ShiftOffset < `32` && (ShiftOffset % `8`) == `0`) {
17388	return DAG.getNode(Opcode: AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / `8`, DL: SL,
17389	VT: MVT::f32, Operand: Shifted);
17390	}
17391	}
17392	}
17393
17394	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17395	APInt DemandedBits = APInt::getBitsSet(numBits: `32`, loBit: `8` * Offset, hiBit: `8` * Offset + `8`);
17396	if (TLI.SimplifyDemandedBits(Op: Src, DemandedBits, DCI)) {
17397	// We simplified Src. If this node is not dead, visit it again so it is
17398	// folded properly.
17399	if (N->getOpcode() != ISD::DELETED_NODE)
17400	DCI.AddToWorklist(N);
17401	return SDValue (N, `0`);
17402	}
17403
17404	// Handle (or x, (srl y, 8)) pattern when known bits are zero.
17405	if (SDValue DemandedSrc =
17406	TLI.SimplifyMultipleUseDemandedBits(Op: Src, DemandedBits, DAG))
17407	return DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: MVT::f32, Operand: DemandedSrc);
17408
17409	return SDValue ();
17410	}
17411
17412	SDValue SITargetLowering::performClampCombine(SDNode *N,
17413	DAGCombinerInfo &DCI) const {
17414	ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: `0`));
17415	if (!CSrc)
17416	return SDValue ();
17417
17418	const MachineFunction &MF = DCI.DAG.getMachineFunction();
17419	const APFloat &F = CSrc->getValueAPF();
17420	APFloat Zero = APFloat::getZero(Sem: F.getSemantics());
17421	if (F < Zero \|\|
17422	(F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
17423	return DCI.DAG.getConstantFP(Val: Zero, DL: SDLoc (N), VT: N->getValueType(ResNo: `0`));
17424	}
17425
17426	APFloat One(F.getSemantics(), "1.0");
17427	if (F > One)
17428	return DCI.DAG.getConstantFP(Val: One, DL: SDLoc (N), VT: N->getValueType(ResNo: `0`));
17429
17430	return SDValue (CSrc, `0`);
17431	}
17432
17433	SDValue SITargetLowering::performSelectCombine(SDNode *N,
17434	DAGCombinerInfo &DCI) const {
17435
17436	// Try to fold CMP + SELECT patterns with shared constants (both FP and
17437	// integer).
17438	// Detect when CMP and SELECT use the same constant and fold them to avoid
17439	// loading the constant twice. Specifically handles patterns like:
17440	// %cmp = icmp eq i32 %val, 4242
17441	// %sel = select i1 %cmp, i32 4242, i32 %other
17442	// It can be optimized to reuse %val instead of 4242 in select.
17443	SDValue Cond = N->getOperand(Num: `0`);
17444	SDValue TrueVal = N->getOperand(Num: `1`);
17445	SDValue FalseVal = N->getOperand(Num: `2`);
17446
17447	// Check if condition is a comparison.
17448	if (Cond.getOpcode() != ISD::SETCC)
17449	return SDValue ();
17450
17451	SDValue LHS = Cond.getOperand(i: `0`);
17452	SDValue RHS = Cond.getOperand(i: `1`);
17453	ISD::CondCode CC = cast<CondCodeSDNode>(Val: Cond.getOperand(i: `2`))->get();
17454
17455	bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
17456	bool isInteger = LHS.getValueType().isInteger();
17457
17458	// Handle simple floating-point and integer types only.
17459	if (!isFloatingPoint && !isInteger)
17460	return SDValue ();
17461
17462	bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
17463	bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
17464	if (!isEquality && !isNonEquality)
17465	return SDValue ();
17466
17467	SDValue ArgVal, ConstVal;
17468	if ((isFloatingPoint && isa<ConstantFPSDNode>(Val: RHS)) \|\|
17469	(isInteger && isa<ConstantSDNode>(Val: RHS))) {
17470	ConstVal = RHS;
17471	ArgVal = LHS;
17472	} else if ((isFloatingPoint && isa<ConstantFPSDNode>(Val: LHS)) \|\|
17473	(isInteger && isa<ConstantSDNode>(Val: LHS))) {
17474	ConstVal = LHS;
17475	ArgVal = RHS;
17476	} else {
17477	return SDValue ();
17478	}
17479
17480	// Skip optimization for inlinable immediates.
17481	if (isFloatingPoint) {
17482	const APFloat &Val = cast<ConstantFPSDNode>(Val&: ConstVal)->getValueAPF();
17483	if (!Val.isNormal() \|\| Subtarget->getInstrInfo()->isInlineConstant(Imm: Val))
17484	return SDValue ();
17485	} else {
17486	if (AMDGPU::isInlinableIntLiteral(
17487	Literal: cast<ConstantSDNode>(Val&: ConstVal)->getSExtValue()))
17488	return SDValue ();
17489	}
17490
17491	// For equality and non-equality comparisons, patterns:
17492	// select (setcc x, const), const, y -> select (setcc x, const), x, y
17493	// select (setccinv x, const), y, const -> select (setccinv x, const), y, x
17494	if (!(isEquality && TrueVal == ConstVal) &&
17495	!(isNonEquality && FalseVal == ConstVal))
17496	return SDValue ();
17497
17498	SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
17499	SDValue SelectRHS =
17500	(isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
17501	return DCI.DAG.getNode(Opcode: ISD::SELECT, DL: SDLoc (N), VT: N->getValueType(ResNo: `0`), N1: Cond,
17502	N2: SelectLHS, N3: SelectRHS);
17503	}
17504
17505	SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
17506	DAGCombinerInfo &DCI) const {
17507	switch (N->getOpcode()) {
17508	case ISD::ADD:
17509	case ISD::SUB:
17510	case ISD::SHL:
17511	case ISD::SRL:
17512	case ISD::SRA:
17513	case ISD::AND:
17514	case ISD::OR:
17515	case ISD::XOR:
17516	case ISD::MUL:
17517	case ISD::SETCC:
17518	case ISD::SELECT:
17519	case ISD::SMIN:
17520	case ISD::SMAX:
17521	case ISD::UMIN:
17522	case ISD::UMAX:
17523	if (auto Res = promoteUniformOpToI32(Op: SDValue (N, `0`), DCI))
17524	return Res;
17525	break;
17526	default:
17527	break;
17528	}
17529
17530	if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
17531	return SDValue ();
17532
17533	switch (N->getOpcode()) {
17534	case ISD::ADD:
17535	return performAddCombine(N, DCI);
17536	case ISD::PTRADD:
17537	return performPtrAddCombine(N, DCI);
17538	case ISD::SUB:
17539	return performSubCombine(N, DCI);
17540	case ISD::UADDO_CARRY:
17541	case ISD::USUBO_CARRY:
17542	return performAddCarrySubCarryCombine(N, DCI);
17543	case ISD::FADD:
17544	return performFAddCombine(N, DCI);
17545	case ISD::FSUB:
17546	return performFSubCombine(N, DCI);
17547	case ISD::FDIV:
17548	return performFDivCombine(N, DCI);
17549	case ISD::FMUL:
17550	return performFMulCombine(N, DCI);
17551	case ISD::SETCC:
17552	return performSetCCCombine(N, DCI);
17553	case ISD::SELECT:
17554	if (auto Res = performSelectCombine(N, DCI))
17555	return Res;
17556	break;
17557	case ISD::FMAXNUM:
17558	case ISD::FMINNUM:
17559	case ISD::FMAXNUM_IEEE:
17560	case ISD::FMINNUM_IEEE:
17561	case ISD::FMAXIMUM:
17562	case ISD::FMINIMUM:
17563	case ISD::FMAXIMUMNUM:
17564	case ISD::FMINIMUMNUM:
17565	case ISD::SMAX:
17566	case ISD::SMIN:
17567	case ISD::UMAX:
17568	case ISD::UMIN:
17569	case AMDGPUISD::FMIN_LEGACY:
17570	case AMDGPUISD::FMAX_LEGACY:
17571	return performMinMaxCombine(N, DCI);
17572	case ISD::FMA:
17573	return performFMACombine(N, DCI);
17574	case ISD::AND:
17575	return performAndCombine(N, DCI);
17576	case ISD::OR:
17577	return performOrCombine(N, DCI);
17578	case ISD::FSHR: {
17579	const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17580	if (N->getValueType(ResNo: `0`) == MVT::i32 && N->isDivergent() &&
17581	TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -`1`) {
17582	return matchPERM(N, DCI);
17583	}
17584	break;
17585	}
17586	case ISD::XOR:
17587	return performXorCombine(N, DCI);
17588	case ISD::ANY_EXTEND:
17589	case ISD::ZERO_EXTEND:
17590	return performZeroOrAnyExtendCombine(N, DCI);
17591	case ISD::SIGN_EXTEND_INREG:
17592	return performSignExtendInRegCombine(N, DCI);
17593	case AMDGPUISD::FP_CLASS:
17594	return performClassCombine(N, DCI);
17595	case ISD::FCANONICALIZE:
17596	return performFCanonicalizeCombine(N, DCI);
17597	case AMDGPUISD::RCP:
17598	return performRcpCombine(N, DCI);
17599	case ISD::FLDEXP:
17600	case AMDGPUISD::FRACT:
17601	case AMDGPUISD::RSQ:
17602	case AMDGPUISD::RCP_LEGACY:
17603	case AMDGPUISD::RCP_IFLAG:
17604	case AMDGPUISD::RSQ_CLAMP: {
17605	// FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
17606	SDValue Src = N->getOperand(Num: `0`);
17607	if (Src.isUndef())
17608	return Src;
17609	break;
17610	}
17611	case ISD::SINT_TO_FP:
17612	case ISD::UINT_TO_FP:
17613	return performUCharToFloatCombine(N, DCI);
17614	case ISD::FCOPYSIGN:
17615	return performFCopySignCombine(N, DCI);
17616	case AMDGPUISD::CVT_F32_UBYTE0:
17617	case AMDGPUISD::CVT_F32_UBYTE1:
17618	case AMDGPUISD::CVT_F32_UBYTE2:
17619	case AMDGPUISD::CVT_F32_UBYTE3:
17620	return performCvtF32UByteNCombine(N, DCI);
17621	case AMDGPUISD::FMED3:
17622	return performFMed3Combine(N, DCI);
17623	case AMDGPUISD::CVT_PKRTZ_F16_F32:
17624	return performCvtPkRTZCombine(N, DCI);
17625	case AMDGPUISD::CLAMP:
17626	return performClampCombine(N, DCI);
17627	case ISD::SCALAR_TO_VECTOR: {
17628	SelectionDAG &DAG = DCI.DAG;
17629	EVT VT = N->getValueType(ResNo: `0`);
17630
17631	// v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
17632	if (VT == MVT::v2i16 \|\| VT == MVT::v2f16 \|\| VT == MVT::v2bf16) {
17633	SDLoc SL(N);
17634	SDValue Src = N->getOperand(Num: `0`);
17635	EVT EltVT = Src.getValueType();
17636	if (EltVT != MVT::i16)
17637	Src = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Src);
17638
17639	SDValue Ext = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Src);
17640	return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Ext);
17641	}
17642
17643	break;
17644	}
17645	case ISD::EXTRACT_VECTOR_ELT:
17646	return performExtractVectorEltCombine(N, DCI);
17647	case ISD::INSERT_VECTOR_ELT:
17648	return performInsertVectorEltCombine(N, DCI);
17649	case ISD::FP_ROUND:
17650	return performFPRoundCombine(N, DCI);
17651	case ISD::LOAD: {
17652	if (SDValue Widened = widenLoad(Ld: cast<LoadSDNode>(Val: N), DCI))
17653	return Widened;
17654	[[fallthrough]];
17655	}
17656	default: {
17657	if (!DCI.isBeforeLegalize()) {
17658	if (MemSDNode *MemNode = dyn_cast<MemSDNode>(Val: N))
17659	return performMemSDNodeCombine(N: MemNode, DCI);
17660	}
17661
17662	break;
17663	}
17664	}
17665
17666	return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
17667	}
17668
17669	/// Helper function for adjustWritemask
17670	static unsigned SubIdx2Lane(unsigned Idx) {
17671	switch (Idx) {
17672	default:
17673	return ~`0u`;
17674	case AMDGPU::sub0:
17675	return `0`;
17676	case AMDGPU::sub1:
17677	return `1`;
17678	case AMDGPU::sub2:
17679	return `2`;
17680	case AMDGPU::sub3:
17681	return `3`;
17682	case AMDGPU::sub4:
17683	return `4`; // Possible with TFE/LWE
17684	}
17685	}
17686
17687	/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
17688	SDNode SITargetLowering::adjustWritemask(MachineSDNode &Node,
17689	SelectionDAG &DAG) const {
17690	unsigned Opcode = Node->getMachineOpcode();
17691
17692	// Subtract 1 because the vdata output is not a MachineSDNode operand.
17693	int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::d16) - `1`;
17694	if (D16Idx >= `0` && Node->getConstantOperandVal(Num: D16Idx))
17695	return Node; // not implemented for D16
17696
17697	SDNode Users[`5`] = {nullptr*};
17698	unsigned Lane = `0`;
17699	unsigned DmaskIdx =
17700	AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::dmask) - `1`;
17701	unsigned OldDmask = Node->getConstantOperandVal(Num: DmaskIdx);
17702	unsigned NewDmask = `0`;
17703	unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::tfe) - `1`;
17704	unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::lwe) - `1`;
17705	bool UsesTFC = (int(TFEIdx) >= `0` && Node->getConstantOperandVal(Num: TFEIdx)) \|\|
17706	(int(LWEIdx) >= `0` && Node->getConstantOperandVal(Num: LWEIdx));
17707	unsigned TFCLane = `0`;
17708	bool HasChain = Node->getNumValues() > `1`;
17709
17710	if (OldDmask == `0`) {
17711	// These are folded out, but on the chance it happens don't assert.
17712	return Node;
17713	}
17714
17715	unsigned OldBitsSet = llvm::popcount(Value: OldDmask);
17716	// Work out which is the TFE/LWE lane if that is enabled.
17717	if (UsesTFC) {
17718	TFCLane = OldBitsSet;
17719	}
17720
17721	// Try to figure out the used register components
17722	for (SDUse &Use : Node->uses()) {
17723
17724	// Don't look at users of the chain.
17725	if (Use.getResNo() != `0`)
17726	continue;
17727
17728	SDNode *User = Use.getUser();
17729
17730	// Abort if we can't understand the usage
17731	if (!User->isMachineOpcode() \|\|
17732	User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17733	return Node;
17734
17735	// Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
17736	// Note that subregs are packed, i.e. Lane==0 is the first bit set
17737	// in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
17738	// set, etc.
17739	Lane = SubIdx2Lane(Idx: User->getConstantOperandVal(Num: `1`));
17740	if (Lane == ~`0u`)
17741	return Node;
17742
17743	// Check if the use is for the TFE/LWE generated result at VGPRn+1.
17744	if (UsesTFC && Lane == TFCLane) {
17745	Users[Lane] = User;
17746	} else {
17747	// Set which texture component corresponds to the lane.
17748	unsigned Comp;
17749	for (unsigned i = `0`, Dmask = OldDmask; (i <= Lane) && (Dmask != `0`); i++) {
17750	Comp = llvm::countr_zero(Val: Dmask);
17751	Dmask &= ~(`1` << Comp);
17752	}
17753
17754	// Abort if we have more than one user per component.
17755	if (Users[Lane])
17756	return Node;
17757
17758	Users[Lane] = User;
17759	NewDmask \|= `1` << Comp;
17760	}
17761	}
17762
17763	// Don't allow 0 dmask, as hardware assumes one channel enabled.
17764	bool NoChannels = !NewDmask;
17765	if (NoChannels) {
17766	if (!UsesTFC) {
17767	// No uses of the result and not using TFC. Then do nothing.
17768	return Node;
17769	}
17770	// If the original dmask has one channel - then nothing to do
17771	if (OldBitsSet == `1`)
17772	return Node;
17773	// Use an arbitrary dmask - required for the instruction to work
17774	NewDmask = `1`;
17775	}
17776	// Abort if there's no change
17777	if (NewDmask == OldDmask)
17778	return Node;
17779
17780	unsigned BitsSet = llvm::popcount(Value: NewDmask);
17781
17782	// Check for TFE or LWE - increase the number of channels by one to account
17783	// for the extra return value
17784	// This will need adjustment for D16 if this is also included in
17785	// adjustWriteMask (this function) but at present D16 are excluded.
17786	unsigned NewChannels = BitsSet + UsesTFC;
17787
17788	int NewOpcode =
17789	AMDGPU::getMaskedMIMGOp(Opc: Node->getMachineOpcode(), NewChannels);
17790	assert(NewOpcode != -`1` &&
17791	NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17792	"failed to find equivalent MIMG op");
17793
17794	// Adjust the writemask in the node
17795	SmallVector<SDValue, `12`> Ops;
17796	llvm::append_range(C&: Ops, R: Node->ops().take_front(N: DmaskIdx));
17797	Ops.push_back(Elt: DAG.getTargetConstant(Val: NewDmask, DL: SDLoc (Node), VT: MVT::i32));
17798	llvm::append_range(C&: Ops, R: Node->ops().drop_front(N: DmaskIdx + `1`));
17799
17800	MVT SVT = Node->getValueType(ResNo: `0`).getVectorElementType().getSimpleVT();
17801
17802	MVT ResultVT = NewChannels == `1`
17803	? SVT
17804	: MVT::getVectorVT(VT: SVT, NumElements: NewChannels == `3` ? `4`
17805	: NewChannels == `5` ? `8`
17806	: NewChannels);
17807	SDVTList NewVTList =
17808	HasChain ? DAG.getVTList(VT1: ResultVT, VT2: MVT::Other) : DAG.getVTList(VT: ResultVT);
17809
17810	MachineSDNode *NewNode =
17811	DAG.getMachineNode(Opcode: NewOpcode, dl: SDLoc (Node), VTs: NewVTList, Ops);
17812
17813	if (HasChain) {
17814	// Update chain.
17815	DAG.setNodeMemRefs(N: NewNode, NewMemRefs: Node->memoperands());
17816	DAG.ReplaceAllUsesOfValueWith(From: SDValue (Node, `1`), To: SDValue (NewNode, `1`));
17817	}
17818
17819	if (NewChannels == `1`) {
17820	assert(Node->hasNUsesOfValue(`1`, `0`));
17821	SDNode *Copy =
17822	DAG.getMachineNode(Opcode: TargetOpcode::COPY, dl: SDLoc (Node),
17823	VT: Users[Lane]->getValueType(ResNo: `0`), Op1: SDValue (NewNode, `0`));
17824	DAG.ReplaceAllUsesWith(From: Users[Lane], To: Copy);
17825	return nullptr;
17826	}
17827
17828	// Update the users of the node with the new indices
17829	for (unsigned i = `0`, Idx = AMDGPU::sub0; i < `5`; ++i) {
17830	SDNode *User = Users[i];
17831	if (!User) {
17832	// Handle the special case of NoChannels. We set NewDmask to 1 above, but
17833	// Users[0] is still nullptr because channel 0 doesn't really have a use.
17834	if (i \|\| !NoChannels)
17835	continue;
17836	} else {
17837	SDValue Op = DAG.getTargetConstant(Val: Idx, DL: SDLoc (User), VT: MVT::i32);
17838	SDNode *NewUser = DAG.UpdateNodeOperands(N: User, Op1: SDValue (NewNode, `0`), Op2: Op);
17839	if (NewUser != User) {
17840	DAG.ReplaceAllUsesWith(From: SDValue (User, `0`), To: SDValue (NewUser, `0`));
17841	DAG.RemoveDeadNode(N: User);
17842	}
17843	}
17844
17845	switch (Idx) {
17846	default:
17847	break;
17848	case AMDGPU::sub0:
17849	Idx = AMDGPU::sub1;
17850	break;
17851	case AMDGPU::sub1:
17852	Idx = AMDGPU::sub2;
17853	break;
17854	case AMDGPU::sub2:
17855	Idx = AMDGPU::sub3;
17856	break;
17857	case AMDGPU::sub3:
17858	Idx = AMDGPU::sub4;
17859	break;
17860	}
17861	}
17862
17863	DAG.RemoveDeadNode(N: Node);
17864	return nullptr;
17865	}
17866
17867	static bool isFrameIndexOp(SDValue Op) {
17868	if (Op.getOpcode() == ISD::AssertZext)
17869	Op = Op.getOperand(i: `0`);
17870
17871	return isa<FrameIndexSDNode>(Val: Op);
17872	}
17873
17874	/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17875	/// with frame index operands.
17876	/// LLVM assumes that inputs are to these instructions are registers.
17877	SDNode *
17878	SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
17879	SelectionDAG &DAG) const {
17880	if (Node->getOpcode() == ISD::CopyToReg) {
17881	RegisterSDNode *DestReg = cast<RegisterSDNode>(Val: Node->getOperand(Num: `1`));
17882	SDValue SrcVal = Node->getOperand(Num: `2`);
17883
17884	// Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17885	// to try understanding copies to physical registers.
17886	if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17887	SDLoc SL(Node);
17888	MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17889	SDValue VReg = DAG.getRegister(
17890	Reg: MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_1RegClass), VT: MVT::i1);
17891
17892	SDNode *Glued = Node->getGluedNode();
17893	SDValue ToVReg = DAG.getCopyToReg(
17894	Chain: Node->getOperand(Num: `0`), dl: SL, Reg: VReg, N: SrcVal,
17895	Glue: SDValue (Glued, Glued ? Glued->getNumValues() - `1` : `0`));
17896	SDValue ToResultReg = DAG.getCopyToReg(Chain: ToVReg, dl: SL, Reg: SDValue (DestReg, `0`),
17897	N: VReg, Glue: ToVReg.getValue(R: `1`));
17898	DAG.ReplaceAllUsesWith(From: Node, To: ToResultReg.getNode());
17899	DAG.RemoveDeadNode(N: Node);
17900	return ToResultReg.getNode();
17901	}
17902	}
17903
17904	SmallVector<SDValue, `8`> Ops;
17905	for (unsigned i = `0`; i < Node->getNumOperands(); ++i) {
17906	if (!isFrameIndexOp(Op: Node->getOperand(Num: i))) {
17907	Ops.push_back(Elt: Node->getOperand(Num: i));
17908	continue;
17909	}
17910
17911	SDLoc DL(Node);
17912	Ops.push_back(Elt: SDValue (DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL,
17913	VT: Node->getOperand(Num: i).getValueType(),
17914	Op1: Node->getOperand(Num: i)),
17915	`0`));
17916	}
17917
17918	return DAG.UpdateNodeOperands(N: Node, Ops);
17919	}
17920
17921	/// Fold the instructions after selecting them.
17922	/// Returns null if users were already updated.
17923	SDNode SITargetLowering::PostISelFolding(MachineSDNode Node,
17924	SelectionDAG &DAG) const {
17925	const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17926	unsigned Opcode = Node->getMachineOpcode();
17927
17928	if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17929	!TII->isGather4(Opcode) &&
17930	AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::dmask)) {
17931	return adjustWritemask(Node, DAG);
17932	}
17933
17934	if (Opcode == AMDGPU::INSERT_SUBREG \|\| Opcode == AMDGPU::REG_SEQUENCE) {
17935	legalizeTargetIndependentNode(Node, DAG);
17936	return Node;
17937	}
17938
17939	switch (Opcode) {
17940	case AMDGPU::V_DIV_SCALE_F32_e64:
17941	case AMDGPU::V_DIV_SCALE_F64_e64: {
17942	// Satisfy the operand register constraint when one of the inputs is
17943	// undefined. Ordinarily each undef value will have its own implicit_def of
17944	// a vreg, so force these to use a single register.
17945	SDValue Src0 = Node->getOperand(Num: `1`);
17946	SDValue Src1 = Node->getOperand(Num: `3`);
17947	SDValue Src2 = Node->getOperand(Num: `5`);
17948
17949	if ((Src0.isMachineOpcode() &&
17950	Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17951	(Src0 == Src1 \|\| Src0 == Src2))
17952	break;
17953
17954	MVT VT = Src0.getValueType().getSimpleVT();
17955	const TargetRegisterClass *RC =
17956	getRegClassFor(VT, isDivergent: Src0.getNode()->isDivergent());
17957
17958	MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17959	SDValue UndefReg = DAG.getRegister(Reg: MRI.createVirtualRegister(RegClass: RC), VT);
17960
17961	SDValue ImpDef = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: SDLoc (Node), Reg: UndefReg,
17962	N: Src0, Glue: SDValue ());
17963
17964	// src0 must be the same register as src1 or src2, even if the value is
17965	// undefined, so make sure we don't violate this constraint.
17966	if (Src0.isMachineOpcode() &&
17967	Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17968	if (Src1.isMachineOpcode() &&
17969	Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17970	Src0 = Src1;
17971	else if (Src2.isMachineOpcode() &&
17972	Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17973	Src0 = Src2;
17974	else {
17975	assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17976	Src0 = UndefReg;
17977	Src1 = UndefReg;
17978	}
17979	} else
17980	break;
17981
17982	SmallVector<SDValue, `9`> Ops(Node->ops());
17983	Ops [`1`] = Src0;
17984	Ops [`3`] = Src1;
17985	Ops [`5`] = Src2;
17986	Ops.push_back(Elt: ImpDef.getValue(R: `1`));
17987	return DAG.getMachineNode(Opcode, dl: SDLoc (Node), VTs: Node->getVTList(), Ops);
17988	}
17989	default:
17990	break;
17991	}
17992
17993	return Node;
17994	}
17995
17996	// Any MIMG instructions that use tfe or lwe require an initialization of the
17997	// result register that will be written in the case of a memory access failure.
17998	// The required code is also added to tie this init code to the result of the
17999	// img instruction.
18000	void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
18001	const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18002	const SIRegisterInfo &TRI = TII->getRegisterInfo();
18003	MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
18004	MachineBasicBlock &MBB = *MI.getParent();
18005
18006	int DstIdx =
18007	AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::vdata);
18008	unsigned InitIdx = `0`;
18009
18010	if (TII->isImage(MI)) {
18011	MachineOperand *TFE = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe);
18012	MachineOperand *LWE = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::lwe);
18013	MachineOperand *D16 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::d16);
18014
18015	if (!TFE && !LWE) // intersect_ray
18016	return;
18017
18018	unsigned TFEVal = TFE ? TFE->getImm() : `0`;
18019	unsigned LWEVal = LWE ? LWE->getImm() : `0`;
18020	unsigned D16Val = D16 ? D16->getImm() : `0`;
18021
18022	if (!TFEVal && !LWEVal)
18023	return;
18024
18025	// At least one of TFE or LWE are non-zero
18026	// We have to insert a suitable initialization of the result value and
18027	// tie this to the dest of the image instruction.
18028
18029	// Calculate which dword we have to initialize to 0.
18030	MachineOperand *MO_Dmask = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dmask);
18031
18032	// check that dmask operand is found.
18033	assert(MO_Dmask && "Expected dmask operand in instruction");
18034
18035	unsigned dmask = MO_Dmask->getImm();
18036	// Determine the number of active lanes taking into account the
18037	// Gather4 special case
18038	unsigned ActiveLanes = TII->isGather4(MI) ? `4` : llvm::popcount(Value: dmask);
18039
18040	bool Packed = !Subtarget->hasUnpackedD16VMem();
18041
18042	InitIdx = D16Val && Packed ? ((ActiveLanes + `1`) >> `1`) + `1` : ActiveLanes + `1`;
18043
18044	// Abandon attempt if the dst size isn't large enough
18045	// - this is in fact an error but this is picked up elsewhere and
18046	// reported correctly.
18047	const TargetRegisterClass *DstRC = TII->getRegClass(MCID: MI.getDesc(), OpNum: DstIdx);
18048
18049	uint32_t DstSize = TRI.getRegSizeInBits(RC: *DstRC) / `32`;
18050	if (DstSize < InitIdx)
18051	return;
18052	} else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(Opc: MI.getOpcode())) {
18053	const TargetRegisterClass *DstRC = TII->getRegClass(MCID: MI.getDesc(), OpNum: DstIdx);
18054	InitIdx = TRI.getRegSizeInBits(RC: *DstRC) / `32`;
18055	} else {
18056	return;
18057	}
18058
18059	const DebugLoc &DL = MI.getDebugLoc();
18060
18061	// Create a register for the initialization value.
18062	Register PrevDst = MRI.cloneVirtualRegister(VReg: MI.getOperand(i: DstIdx).getReg());
18063	unsigned NewDst = `0`; // Final initialized value will be in here
18064
18065	// If PRTStrictNull feature is enabled (the default) then initialize
18066	// all the result registers to 0, otherwise just the error indication
18067	// register (VGPRn+1)
18068	unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : `1`;
18069	unsigned CurrIdx = Subtarget->usePRTStrictNull() ? `0` : (InitIdx - `1`);
18070
18071	BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: PrevDst);
18072	for (; SizeLeft; SizeLeft--, CurrIdx++) {
18073	NewDst = MRI.createVirtualRegister(RegClass: TII->getOpRegClass(MI, OpNo: DstIdx));
18074	// Initialize dword
18075	Register SubReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
18076	// clang-format off
18077	BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: SubReg)
18078	.addImm(Val: `0`);
18079	// clang-format on
18080	// Insert into the super-reg
18081	BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: NewDst)
18082	.addReg(RegNo: PrevDst)
18083	.addReg(RegNo: SubReg)
18084	.addImm(Val: SIRegisterInfo::getSubRegFromChannel(Channel: CurrIdx));
18085
18086	PrevDst = NewDst;
18087	}
18088
18089	// Add as an implicit operand
18090	MI.addOperand(Op: MachineOperand::CreateReg(Reg: NewDst, isDef: false, isImp: true));
18091
18092	// Tie the just added implicit operand to the dst
18093	MI.tieOperands(DefIdx: DstIdx, UseIdx: MI.getNumOperands() - `1`);
18094	}
18095
18096	/// Assign the register class depending on the number of
18097	/// bits set in the writemask
18098	void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
18099	SDNode Node) const* {
18100	const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18101
18102	MachineFunction *MF = MI.getMF();
18103	MachineRegisterInfo &MRI = MF->getRegInfo();
18104
18105	if (TII->isVOP3(Opcode: MI.getOpcode())) {
18106	// Make sure constant bus requirements are respected.
18107	TII->legalizeOperandsVOP3(MRI, MI);
18108
18109	if (TII->isMAI(MI)) {
18110	// The ordinary src0, src1, src2 were legalized above.
18111	//
18112	// We have to also legalize the appended v_mfma_ld_scale_b32 operands,
18113	// as a separate instruction.
18114	int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
18115	Name: AMDGPU::OpName::scale_src0);
18116	if (Src0Idx != -`1`) {
18117	int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
18118	Name: AMDGPU::OpName::scale_src1);
18119	if (TII->usesConstantBus(MRI, MI, OpIdx: Src0Idx) &&
18120	TII->usesConstantBus(MRI, MI, OpIdx: Src1Idx))
18121	TII->legalizeOpWithMove(MI, OpIdx: Src1Idx);
18122	}
18123	}
18124
18125	return;
18126	}
18127
18128	if (TII->isImage(MI))
18129	TII->enforceOperandRCAlignment(MI, OpName: AMDGPU::OpName::vaddr);
18130	}
18131
18132	static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
18133	uint64_t Val) {
18134	SDValue K = DAG.getTargetConstant(Val, DL, VT: MVT::i32);
18135	return SDValue (DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: K), `0`);
18136	}
18137
18138	MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
18139	const SDLoc &DL,
18140	SDValue Ptr) const {
18141	const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18142
18143	// Build the half of the subregister with the constants before building the
18144	// full 128-bit register. If we are building multiple resource descriptors,
18145	// this will allow CSEing of the 2-component register.
18146	const SDValue Ops0[] = {
18147	DAG.getTargetConstant(Val: AMDGPU::SGPR_64RegClassID, DL, VT: MVT::i32),
18148	buildSMovImm32(DAG, DL, Val: `0`),
18149	DAG.getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
18150	buildSMovImm32(DAG, DL, Val: TII->getDefaultRsrcDataFormat() >> `32`),
18151	DAG.getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32)};
18152
18153	SDValue SubRegHi = SDValue (
18154	DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v2i32, Ops: Ops0), `0`);
18155
18156	// Combine the constants and the pointer.
18157	const SDValue Ops1[] = {
18158	DAG.getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32), Ptr,
18159	DAG.getTargetConstant(Val: AMDGPU::sub0_sub1, DL, VT: MVT::i32), SubRegHi,
18160	DAG.getTargetConstant(Val: AMDGPU::sub2_sub3, DL, VT: MVT::i32)};
18161
18162	return DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v4i32, Ops: Ops1);
18163	}
18164
18165	/// Return a resource descriptor with the 'Add TID' bit enabled
18166	/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
18167	/// of the resource descriptor) to create an offset, which is added to
18168	/// the resource pointer.
18169	MachineSDNode SITargetLowering::buildRSRC(SelectionDAG &DAG, const* SDLoc &DL,
18170	SDValue Ptr, uint32_t RsrcDword1,
18171	uint64_t RsrcDword2And3) const {
18172	SDValue PtrLo = DAG.getTargetExtractSubreg(SRIdx: AMDGPU::sub0, DL, VT: MVT::i32, Operand: Ptr);
18173	SDValue PtrHi = DAG.getTargetExtractSubreg(SRIdx: AMDGPU::sub1, DL, VT: MVT::i32, Operand: Ptr);
18174	if (RsrcDword1) {
18175	PtrHi =
18176	SDValue (DAG.getMachineNode(Opcode: AMDGPU::S_OR_B32, dl: DL, VT: MVT::i32, Op1: PtrHi,
18177	Op2: DAG.getConstant(Val: RsrcDword1, DL, VT: MVT::i32)),
18178	`0`);
18179	}
18180
18181	SDValue DataLo =
18182	buildSMovImm32(DAG, DL, Val: RsrcDword2And3 & UINT64_C(`0xFFFFFFFF`));
18183	SDValue DataHi = buildSMovImm32(DAG, DL, Val: RsrcDword2And3 >> `32`);
18184
18185	const SDValue Ops[] = {
18186	DAG.getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32),
18187	PtrLo,
18188	DAG.getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
18189	PtrHi,
18190	DAG.getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32),
18191	DataLo,
18192	DAG.getTargetConstant(Val: AMDGPU::sub2, DL, VT: MVT::i32),
18193	DataHi,
18194	DAG.getTargetConstant(Val: AMDGPU::sub3, DL, VT: MVT::i32)};
18195
18196	return DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v4i32, Ops);
18197	}
18198
18199	//===----------------------------------------------------------------------===//
18200	// SI Inline Assembly Support
18201	//===----------------------------------------------------------------------===//
18202
18203	std::pair<unsigned, const TargetRegisterClass *>
18204	SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
18205	StringRef Constraint,
18206	MVT VT) const {
18207	const SIRegisterInfo TRI = static_cast<const* SIRegisterInfo *>(TRI_);
18208
18209	const TargetRegisterClass RC = nullptr*;
18210	if (Constraint.size() == `1`) {
18211	// Check if we cannot determine the bit size of the given value type. This
18212	// can happen, for example, in this situation where we have an empty struct
18213	// (size 0): `call void asm "", "v"({} poison)`-
18214	if (VT == MVT::Other)
18215	return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
18216	const unsigned BitWidth = VT.getSizeInBits();
18217	switch (Constraint [`0`]) {
18218	default:
18219	return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
18220	case `'s'`:
18221	case `'r'`:
18222	switch (BitWidth) {
18223	case `16`:
18224	RC = &AMDGPU::SReg_32RegClass;
18225	break;
18226	case `64`:
18227	RC = &AMDGPU::SGPR_64RegClass;
18228	break;
18229	default:
18230	RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth);
18231	if (!RC)
18232	return std::pair(`0U`, nullptr);
18233	break;
18234	}
18235	break;
18236	case `'v'`:
18237	switch (BitWidth) {
18238	case `1`:
18239	return std::pair(`0U`, nullptr);
18240	case `16`:
18241	RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
18242	: &AMDGPU::VGPR_32_Lo256RegClass;
18243	break;
18244	default:
18245	RC = Subtarget->has1024AddressableVGPRs()
18246	? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
18247	: TRI->getVGPRClassForBitWidth(BitWidth);
18248	if (!RC)
18249	return std::pair(`0U`, nullptr);
18250	break;
18251	}
18252	break;
18253	case `'a'`:
18254	if (!Subtarget->hasMAIInsts())
18255	break;
18256	switch (BitWidth) {
18257	case `1`:
18258	return std::pair(`0U`, nullptr);
18259	case `16`:
18260	RC = &AMDGPU::AGPR_32RegClass;
18261	break;
18262	default:
18263	RC = TRI->getAGPRClassForBitWidth(BitWidth);
18264	if (!RC)
18265	return std::pair(`0U`, nullptr);
18266	break;
18267	}
18268	break;
18269	}
18270	} else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
18271	const unsigned BitWidth = VT.getSizeInBits();
18272	switch (BitWidth) {
18273	case `16`:
18274	RC = &AMDGPU::AV_32RegClass;
18275	break;
18276	default:
18277	RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
18278	if (!RC)
18279	return std::pair(`0U`, nullptr);
18280	break;
18281	}
18282	}
18283
18284	// We actually support i128, i16 and f16 as inline parameters
18285	// even if they are not reported as legal
18286	if (RC && (isTypeLegal(VT) \|\| VT.SimpleTy == MVT::i128 \|\|
18287	VT.SimpleTy == MVT::i16 \|\| VT.SimpleTy == MVT::f16))
18288	return std::pair(`0U`, RC);
18289
18290	auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
18291	if (Kind != `'\0'`) {
18292	if (Kind == `'v'`) {
18293	RC = &AMDGPU::VGPR_32_Lo256RegClass;
18294	} else if (Kind == `'s'`) {
18295	RC = &AMDGPU::SGPR_32RegClass;
18296	} else if (Kind == `'a'`) {
18297	RC = &AMDGPU::AGPR_32RegClass;
18298	}
18299
18300	if (RC) {
18301	if (NumRegs > `1`) {
18302	if (Idx >= RC->getNumRegs() \|\| Idx + NumRegs - `1` >= RC->getNumRegs())
18303	return std::pair(`0U`, nullptr);
18304
18305	uint32_t Width = NumRegs * `32`;
18306	// Prohibit constraints for register ranges with a width that does not
18307	// match the required type.
18308	if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
18309	return std::pair(`0U`, nullptr);
18310
18311	MCRegister Reg = RC->getRegister(i: Idx);
18312	if (SIRegisterInfo::isVGPRClass(RC))
18313	RC = TRI->getVGPRClassForBitWidth(BitWidth: Width);
18314	else if (SIRegisterInfo::isSGPRClass(RC))
18315	RC = TRI->getSGPRClassForBitWidth(BitWidth: Width);
18316	else if (SIRegisterInfo::isAGPRClass(RC))
18317	RC = TRI->getAGPRClassForBitWidth(BitWidth: Width);
18318	if (RC) {
18319	Reg = TRI->getMatchingSuperReg(Reg, SubIdx: AMDGPU::sub0, RC);
18320	if (!Reg) {
18321	// The register class does not contain the requested register,
18322	// e.g., because it is an SGPR pair that would violate alignment
18323	// requirements.
18324	return std::pair(`0U`, nullptr);
18325	}
18326	return std::pair(Reg, RC);
18327	}
18328	}
18329
18330	// Check for lossy scalar/vector conversions.
18331	if (VT.isVector() && VT.getSizeInBits() != `32`)
18332	return std::pair(`0U`, nullptr);
18333	if (Idx < RC->getNumRegs())
18334	return std::pair(RC->getRegister(i: Idx), RC);
18335	return std::pair(`0U`, nullptr);
18336	}
18337	}
18338
18339	auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
18340	if (Ret.first)
18341	Ret.second = TRI->getPhysRegBaseClass(Reg: Ret.first);
18342
18343	return Ret;
18344	}
18345
18346	static bool isImmConstraint(StringRef Constraint) {
18347	if (Constraint.size() == `1`) {
18348	switch (Constraint [`0`]) {
18349	default:
18350	break;
18351	case `'I'`:
18352	case `'J'`:
18353	case `'A'`:
18354	case `'B'`:
18355	case `'C'`:
18356	return true;
18357	}
18358	} else if (Constraint == "DA" \|\| Constraint == "DB") {
18359	return true;
18360	}
18361	return false;
18362	}
18363
18364	SITargetLowering::ConstraintType
18365	SITargetLowering::getConstraintType(StringRef Constraint) const {
18366	if (Constraint.size() == `1`) {
18367	switch (Constraint [`0`]) {
18368	default:
18369	break;
18370	case `'s'`:
18371	case `'v'`:
18372	case `'a'`:
18373	return C_RegisterClass;
18374	}
18375	} else if (Constraint.size() == `2`) {
18376	if (Constraint == "VA")
18377	return C_RegisterClass;
18378	}
18379	if (isImmConstraint(Constraint)) {
18380	return C_Other;
18381	}
18382	return TargetLowering::getConstraintType(Constraint);
18383	}
18384
18385	static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
18386	if (!AMDGPU::isInlinableIntLiteral(Literal: Val)) {
18387	Val = Val & maskTrailingOnes<uint64_t>(N: Size);
18388	}
18389	return Val;
18390	}
18391
18392	void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
18393	StringRef Constraint,
18394	std::vector<SDValue> &Ops,
18395	SelectionDAG &DAG) const {
18396	if (isImmConstraint(Constraint)) {
18397	uint64_t Val;
18398	if (getAsmOperandConstVal(Op, Val) &&
18399	checkAsmConstraintVal(Op, Constraint, Val)) {
18400	Val = clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits());
18401	Ops.push_back(x: DAG.getTargetConstant(Val, DL: SDLoc (Op), VT: MVT::i64));
18402	}
18403	} else {
18404	TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
18405	}
18406	}
18407
18408	bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
18409	unsigned Size = Op.getScalarValueSizeInBits();
18410	if (Size > `64`)
18411	return false;
18412
18413	if (Size == `16` && !Subtarget->has16BitInsts())
18414	return false;
18415
18416	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op)) {
18417	Val = C->getSExtValue();
18418	return true;
18419	}
18420	if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
18421	Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
18422	return true;
18423	}
18424	if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Val&: Op)) {
18425	if (Size != `16` \|\| Op.getNumOperands() != `2`)
18426	return false;
18427	if (Op.getOperand(i: `0`).isUndef() \|\| Op.getOperand(i: `1`).isUndef())
18428	return false;
18429	if (ConstantSDNode *C = V->getConstantSplatNode()) {
18430	Val = C->getSExtValue();
18431	return true;
18432	}
18433	if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
18434	Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
18435	return true;
18436	}
18437	}
18438
18439	return false;
18440	}
18441
18442	bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
18443	uint64_t Val) const {
18444	if (Constraint.size() == `1`) {
18445	switch (Constraint [`0`]) {
18446	case `'I'`:
18447	return AMDGPU::isInlinableIntLiteral(Literal: Val);
18448	case `'J'`:
18449	return isInt<`16`>(x: Val);
18450	case `'A'`:
18451	return checkAsmConstraintValA(Op, Val);
18452	case `'B'`:
18453	return isInt<`32`>(x: Val);
18454	case `'C'`:
18455	return isUInt<`32`>(x: clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits())) \|\|
18456	AMDGPU::isInlinableIntLiteral(Literal: Val);
18457	default:
18458	break;
18459	}
18460	} else if (Constraint.size() == `2`) {
18461	if (Constraint == "DA") {
18462	int64_t HiBits = static_cast<int32_t>(Val >> `32`);
18463	int64_t LoBits = static_cast<int32_t>(Val);
18464	return checkAsmConstraintValA(Op, Val: HiBits, MaxSize: `32`) &&
18465	checkAsmConstraintValA(Op, Val: LoBits, MaxSize: `32`);
18466	}
18467	if (Constraint == "DB") {
18468	return true;
18469	}
18470	}
18471	llvm_unreachable("Invalid asm constraint");
18472	}
18473
18474	bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
18475	unsigned MaxSize) const {
18476	unsigned Size = std::min<unsigned>(a: Op.getScalarValueSizeInBits(), b: MaxSize);
18477	bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
18478	if (Size == `16`) {
18479	MVT VT = Op.getSimpleValueType();
18480	switch (VT.SimpleTy) {
18481	default:
18482	return false;
18483	case MVT::i16:
18484	return AMDGPU::isInlinableLiteralI16(Literal: Val, HasInv2Pi);
18485	case MVT::f16:
18486	return AMDGPU::isInlinableLiteralFP16(Literal: Val, HasInv2Pi);
18487	case MVT::bf16:
18488	return AMDGPU::isInlinableLiteralBF16(Literal: Val, HasInv2Pi);
18489	case MVT::v2i16:
18490	return AMDGPU::getInlineEncodingV2I16(Literal: Val).has_value();
18491	case MVT::v2f16:
18492	return AMDGPU::getInlineEncodingV2F16(Literal: Val).has_value();
18493	case MVT::v2bf16:
18494	return AMDGPU::getInlineEncodingV2BF16(Literal: Val).has_value();
18495	}
18496	}
18497	if ((Size == `32` && AMDGPU::isInlinableLiteral32(Literal: Val, HasInv2Pi)) \|\|
18498	(Size == `64` && AMDGPU::isInlinableLiteral64(Literal: Val, HasInv2Pi)))
18499	return true;
18500	return false;
18501	}
18502
18503	static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
18504	switch (UnalignedClassID) {
18505	case AMDGPU::VReg_64RegClassID:
18506	return AMDGPU::VReg_64_Align2RegClassID;
18507	case AMDGPU::VReg_96RegClassID:
18508	return AMDGPU::VReg_96_Align2RegClassID;
18509	case AMDGPU::VReg_128RegClassID:
18510	return AMDGPU::VReg_128_Align2RegClassID;
18511	case AMDGPU::VReg_160RegClassID:
18512	return AMDGPU::VReg_160_Align2RegClassID;
18513	case AMDGPU::VReg_192RegClassID:
18514	return AMDGPU::VReg_192_Align2RegClassID;
18515	case AMDGPU::VReg_224RegClassID:
18516	return AMDGPU::VReg_224_Align2RegClassID;
18517	case AMDGPU::VReg_256RegClassID:
18518	return AMDGPU::VReg_256_Align2RegClassID;
18519	case AMDGPU::VReg_288RegClassID:
18520	return AMDGPU::VReg_288_Align2RegClassID;
18521	case AMDGPU::VReg_320RegClassID:
18522	return AMDGPU::VReg_320_Align2RegClassID;
18523	case AMDGPU::VReg_352RegClassID:
18524	return AMDGPU::VReg_352_Align2RegClassID;
18525	case AMDGPU::VReg_384RegClassID:
18526	return AMDGPU::VReg_384_Align2RegClassID;
18527	case AMDGPU::VReg_512RegClassID:
18528	return AMDGPU::VReg_512_Align2RegClassID;
18529	case AMDGPU::VReg_1024RegClassID:
18530	return AMDGPU::VReg_1024_Align2RegClassID;
18531	case AMDGPU::AReg_64RegClassID:
18532	return AMDGPU::AReg_64_Align2RegClassID;
18533	case AMDGPU::AReg_96RegClassID:
18534	return AMDGPU::AReg_96_Align2RegClassID;
18535	case AMDGPU::AReg_128RegClassID:
18536	return AMDGPU::AReg_128_Align2RegClassID;
18537	case AMDGPU::AReg_160RegClassID:
18538	return AMDGPU::AReg_160_Align2RegClassID;
18539	case AMDGPU::AReg_192RegClassID:
18540	return AMDGPU::AReg_192_Align2RegClassID;
18541	case AMDGPU::AReg_256RegClassID:
18542	return AMDGPU::AReg_256_Align2RegClassID;
18543	case AMDGPU::AReg_512RegClassID:
18544	return AMDGPU::AReg_512_Align2RegClassID;
18545	case AMDGPU::AReg_1024RegClassID:
18546	return AMDGPU::AReg_1024_Align2RegClassID;
18547	default:
18548	return -`1`;
18549	}
18550	}
18551
18552	// Figure out which registers should be reserved for stack access. Only after
18553	// the function is legalized do we know all of the non-spill stack objects or if
18554	// calls are present.
18555	void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
18556	MachineRegisterInfo &MRI = MF.getRegInfo();
18557	SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
18558	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
18559	const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18560	const SIInstrInfo *TII = ST.getInstrInfo();
18561
18562	if (Info->isEntryFunction()) {
18563	// Callable functions have fixed registers used for stack access.
18564	reservePrivateMemoryRegs(TM: getTargetMachine(), MF, TRI: TRI, Info&: Info);
18565	}
18566
18567	// TODO: Move this logic to getReservedRegs()
18568	// Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
18569	unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
18570	Register SReg = ST.isWave32()
18571	? AMDGPU::SGPR_32RegClass.getRegister(i: MaxNumSGPRs - `1`)
18572	: TRI->getAlignedHighSGPRForRC(MF, /Align=/`2`,
18573	RC: &AMDGPU::SGPR_64RegClass);
18574	Info->setSGPRForEXECCopy(SReg);
18575
18576	assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
18577	Info->getStackPtrOffsetReg()));
18578	if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
18579	MRI.replaceRegWith(FromReg: AMDGPU::SP_REG, ToReg: Info->getStackPtrOffsetReg());
18580
18581	// We need to worry about replacing the default register with itself in case
18582	// of MIR testcases missing the MFI.
18583	if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
18584	MRI.replaceRegWith(FromReg: AMDGPU::PRIVATE_RSRC_REG, ToReg: Info->getScratchRSrcReg());
18585
18586	if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
18587	MRI.replaceRegWith(FromReg: AMDGPU::FP_REG, ToReg: Info->getFrameOffsetReg());
18588
18589	Info->limitOccupancy(MF);
18590
18591	if (ST.isWave32() && !MF.empty()) {
18592	for (auto &MBB : MF) {
18593	for (auto &MI : MBB) {
18594	TII->fixImplicitOperands(MI);
18595	}
18596	}
18597	}
18598
18599	// FIXME: This is a hack to fixup AGPR classes to use the properly aligned
18600	// classes if required. Ideally the register class constraints would differ
18601	// per-subtarget, but there's no easy way to achieve that right now. This is
18602	// not a problem for VGPRs because the correctly aligned VGPR class is implied
18603	// from using them as the register class for legal types.
18604	if (ST.needsAlignedVGPRs()) {
18605	for (unsigned I = `0`, E = MRI.getNumVirtRegs(); I != E; ++I) {
18606	const Register Reg = Register::index2VirtReg(Index: I);
18607	const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
18608	if (!RC)
18609	continue;
18610	int NewClassID = getAlignedAGPRClassID(UnalignedClassID: RC->getID());
18611	if (NewClassID != -`1`)
18612	MRI.setRegClass(Reg, RC: TRI->getRegClass(i: NewClassID));
18613	}
18614	}
18615
18616	TargetLoweringBase::finalizeLowering(MF);
18617	}
18618
18619	void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
18620	KnownBits &Known,
18621	const APInt &DemandedElts,
18622	const SelectionDAG &DAG,
18623	unsigned Depth) const {
18624	Known.resetAll();
18625	unsigned Opc = Op.getOpcode();
18626	switch (Opc) {
18627	case ISD::INTRINSIC_WO_CHAIN: {
18628	unsigned IID = Op.getConstantOperandVal(i: `0`);
18629	switch (IID) {
18630	case Intrinsic::amdgcn_mbcnt_lo:
18631	case Intrinsic::amdgcn_mbcnt_hi: {
18632	const GCNSubtarget &ST =
18633	DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
18634	// Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18635	// most 31 + src1.
18636	Known.Zero.setBitsFrom(
18637	IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : `5`);
18638	KnownBits Known2 = DAG.computeKnownBits(Op: Op.getOperand(i: `2`), Depth: Depth + `1`);
18639	Known = KnownBits::add(LHS: Known, RHS: Known2);
18640	return;
18641	}
18642	}
18643	break;
18644	}
18645	}
18646	return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
18647	Op, Known, DemandedElts, DAG, Depth);
18648	}
18649
18650	void SITargetLowering::computeKnownBitsForFrameIndex(
18651	const int FI, KnownBits &Known, const MachineFunction &MF) const {
18652	TargetLowering::computeKnownBitsForFrameIndex(FIOp: FI, Known, MF);
18653
18654	// Set the high bits to zero based on the maximum allowed scratch size per
18655	// wave. We can't use vaddr in MUBUF instructions if we don't know the address
18656	// calculation won't overflow, so assume the sign bit is never set.
18657	Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
18658	}
18659
18660	static void knownBitsForWorkitemID(const GCNSubtarget &ST,
18661	GISelValueTracking &VT, KnownBits &Known,
18662	unsigned Dim) {
18663	unsigned MaxValue =
18664	ST.getMaxWorkitemID(Kernel: VT.getMachineFunction().getFunction(), Dimension: Dim);
18665	Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue));
18666	}
18667
18668	static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
18669	KnownBits &Known, const APInt &DemandedElts,
18670	unsigned BFEWidth, bool SExt, unsigned Depth) {
18671	const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo();
18672	const MachineOperand &Src1 = MI.getOperand(i: `2`);
18673
18674	unsigned Src1Cst = `0`;
18675	if (Src1.isImm()) {
18676	Src1Cst = Src1.getImm();
18677	} else if (Src1.isReg()) {
18678	auto Cst = getIConstantVRegValWithLookThrough(VReg: Src1.getReg(), MRI);
18679	if (!Cst)
18680	return;
18681	Src1Cst = Cst ->Value.getZExtValue();
18682	} else {
18683	return;
18684	}
18685
18686	// Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
18687	// Width is always [22:16].
18688	const unsigned Offset =
18689	Src1Cst & maskTrailingOnes<unsigned>(N: (BFEWidth == `32`) ? `5` : `6`);
18690	const unsigned Width = (Src1Cst >> `16`) & maskTrailingOnes<unsigned>(N: `6`);
18691
18692	if (Width >= BFEWidth) // Ill-formed.
18693	return;
18694
18695	VT.computeKnownBitsImpl(R: MI.getOperand(i: `1`).getReg(), Known, DemandedElts,
18696	Depth: Depth + `1`);
18697
18698	Known = Known.extractBits(NumBits: Width, BitPosition: Offset);
18699
18700	if (SExt)
18701	Known = Known.sext(BitWidth: BFEWidth);
18702	else
18703	Known = Known.zext(BitWidth: BFEWidth);
18704	}
18705
18706	void SITargetLowering::computeKnownBitsForTargetInstr(
18707	GISelValueTracking &VT, Register R, KnownBits &Known,
18708	const APInt &DemandedElts, const MachineRegisterInfo &MRI,
18709	unsigned Depth) const {
18710	Known.resetAll();
18711	const MachineInstr *MI = MRI.getVRegDef(Reg: R);
18712	switch (MI->getOpcode()) {
18713	case AMDGPU::S_BFE_I32:
18714	return knownBitsForSBFE(MI: MI, VT, Known, DemandedElts, /Width=/*BFEWidth: `32`,
18715	/SExt=/true, Depth);
18716	case AMDGPU::S_BFE_U32:
18717	return knownBitsForSBFE(MI: MI, VT, Known, DemandedElts, /Width=/*BFEWidth: `32`,
18718	/SExt=/false, Depth);
18719	case AMDGPU::S_BFE_I64:
18720	return knownBitsForSBFE(MI: MI, VT, Known, DemandedElts, /Width=/*BFEWidth: `64`,
18721	/SExt=/true, Depth);
18722	case AMDGPU::S_BFE_U64:
18723	return knownBitsForSBFE(MI: MI, VT, Known, DemandedElts, /Width=/*BFEWidth: `64`,
18724	/SExt=/false, Depth);
18725	case AMDGPU::G_INTRINSIC:
18726	case AMDGPU::G_INTRINSIC_CONVERGENT: {
18727	Intrinsic::ID IID = cast<GIntrinsic>(Val: MI)->getIntrinsicID();
18728	switch (IID) {
18729	case Intrinsic::amdgcn_workitem_id_x:
18730	knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: `0`);
18731	break;
18732	case Intrinsic::amdgcn_workitem_id_y:
18733	knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: `1`);
18734	break;
18735	case Intrinsic::amdgcn_workitem_id_z:
18736	knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: `2`);
18737	break;
18738	case Intrinsic::amdgcn_mbcnt_lo:
18739	case Intrinsic::amdgcn_mbcnt_hi: {
18740	// Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18741	// most 31 + src1.
18742	Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18743	? getSubtarget()->getWavefrontSizeLog2()
18744	: `5`);
18745	KnownBits Known2;
18746	VT.computeKnownBitsImpl(R: MI->getOperand(i: `3`).getReg(), Known&: Known2, DemandedElts,
18747	Depth: Depth + `1`);
18748	Known = KnownBits::add(LHS: Known, RHS: Known2);
18749	break;
18750	}
18751	case Intrinsic::amdgcn_groupstaticsize: {
18752	// We can report everything over the maximum size as 0. We can't report
18753	// based on the actual size because we don't know if it's accurate or not
18754	// at any given point.
18755	Known.Zero.setHighBits(
18756	llvm::countl_zero(Val: getSubtarget()->getAddressableLocalMemorySize()));
18757	break;
18758	}
18759	}
18760	break;
18761	}
18762	case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18763	Known.Zero.setHighBits(`24`);
18764	break;
18765	case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18766	Known.Zero.setHighBits(`16`);
18767	break;
18768	case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
18769	// G_AMDGPU_COPY_SCC_VCC converts a uniform boolean in VCC to SGPR s32,
18770	// producing exactly 0 or 1.
18771	Known.Zero.setHighBits(Known.getBitWidth() - `1`);
18772	break;
18773	case AMDGPU::G_AMDGPU_SMED3:
18774	case AMDGPU::G_AMDGPU_UMED3: {
18775	auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18776
18777	KnownBits Known2;
18778	VT.computeKnownBitsImpl(R: Src2, Known&: Known2, DemandedElts, Depth: Depth + `1`);
18779	if (Known2.isUnknown())
18780	break;
18781
18782	KnownBits Known1;
18783	VT.computeKnownBitsImpl(R: Src1, Known&: Known1, DemandedElts, Depth: Depth + `1`);
18784	if (Known1.isUnknown())
18785	break;
18786
18787	KnownBits Known0;
18788	VT.computeKnownBitsImpl(R: Src0, Known&: Known0, DemandedElts, Depth: Depth + `1`);
18789	if (Known0.isUnknown())
18790	break;
18791
18792	// TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18793	Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18794	Known.One = Known0.One & Known1.One & Known2.One;
18795	break;
18796	}
18797	}
18798	}
18799
18800	Align SITargetLowering::computeKnownAlignForTargetInstr(
18801	GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
18802	unsigned Depth) const {
18803	const MachineInstr *MI = MRI.getVRegDef(Reg: R);
18804	if (auto *GI = dyn_cast<GIntrinsic>(Val: MI)) {
18805	// FIXME: Can this move to generic code? What about the case where the call
18806	// site specifies a lower alignment?
18807	Intrinsic::ID IID = GI->getIntrinsicID();
18808	LLVMContext &Ctx = VT.getMachineFunction().getFunction().getContext();
18809	AttributeList Attrs =
18810	Intrinsic::getAttributes(C&: Ctx, id: IID, FT: Intrinsic::getType(Context&: Ctx, id: IID));
18811	if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18812	return *RetAlign;
18813	}
18814	return Align (`1`);
18815	}
18816
18817	Align SITargetLowering::getPrefLoopAlignment(MachineLoop ML) const* {
18818	const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
18819	const Align CacheLineAlign = Align (`64`);
18820
18821	// GFX950: Prevent an 8-byte instruction at loop header from being split by
18822	// the 32-byte instruction fetch window boundary. This avoids a significant
18823	// fetch delay after backward branch. We use 32-byte alignment with max
18824	// padding of 4 bytes (one s_nop), see getMaxPermittedBytesForAlignment().
18825	if (ML && !DisableLoopAlignment &&
18826	getSubtarget()->hasLoopHeadInstSplitSensitivity()) {
18827	const MachineBasicBlock *Header = ML->getHeader();
18828	// Respect user-specified or previously set alignment.
18829	if (Header->getAlignment() != PrefAlign)
18830	return Header->getAlignment();
18831	if (needsFetchWindowAlignment(MBB: *Header))
18832	return Align (`32`);
18833	}
18834
18835	// Pre-GFX10 target did not benefit from loop alignment
18836	if (!ML \|\| DisableLoopAlignment \|\| !getSubtarget()->hasInstPrefetch() \|\|
18837	getSubtarget()->hasInstFwdPrefetchBug())
18838	return PrefAlign;
18839
18840	// On GFX10 I$ is 4 x 64 bytes cache lines.
18841	// By default prefetcher keeps one cache line behind and reads two ahead.
18842	// We can modify it with S_INST_PREFETCH for larger loops to have two lines
18843	// behind and one ahead.
18844	// Therefor we can benefit from aligning loop headers if loop fits 192 bytes.
18845	// If loop fits 64 bytes it always spans no more than two cache lines and
18846	// does not need an alignment.
18847	// Else if loop is less or equal 128 bytes we do not need to modify prefetch,
18848	// Else if loop is less or equal 192 bytes we need two lines behind.
18849
18850	const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18851	const MachineBasicBlock *Header = ML->getHeader();
18852	if (Header->getAlignment() != PrefAlign)
18853	return Header->getAlignment(); // Already processed.
18854
18855	unsigned LoopSize = `0`;
18856	for (const MachineBasicBlock *MBB : ML->blocks()) {
18857	// If inner loop block is aligned assume in average half of the alignment
18858	// size to be added as nops.
18859	if (MBB != Header)
18860	LoopSize += MBB->getAlignment().value() / `2`;
18861
18862	for (const MachineInstr &MI : *MBB) {
18863	LoopSize += TII->getInstSizeInBytes(MI);
18864	if (LoopSize > `192`)
18865	return PrefAlign;
18866	}
18867	}
18868
18869	if (LoopSize <= `64`)
18870	return PrefAlign;
18871
18872	if (LoopSize <= `128`)
18873	return CacheLineAlign;
18874
18875	// If any of parent loops is surrounded by prefetch instructions do not
18876	// insert new for inner loop, which would reset parent's settings.
18877	for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18878	if (MachineBasicBlock *Exit = P->getExitBlock()) {
18879	auto I = Exit->getFirstNonDebugInstr();
18880	if (I != Exit->end() && I ->getOpcode() == AMDGPU::S_INST_PREFETCH)
18881	return CacheLineAlign;
18882	}
18883	}
18884
18885	MachineBasicBlock *Pre = ML->getLoopPreheader();
18886	MachineBasicBlock *Exit = ML->getExitBlock();
18887
18888	if (Pre && Exit) {
18889	auto PreTerm = Pre->getFirstTerminator();
18890	if (PreTerm == Pre->begin() \|\|
18891	std::prev(x: PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18892	BuildMI(BB&: *Pre, I: PreTerm, MIMD: DebugLoc (), MCID: TII->get(Opcode: AMDGPU::S_INST_PREFETCH))
18893	.addImm(Val: `1`); // prefetch 2 lines behind PC
18894
18895	auto ExitHead = Exit->getFirstNonDebugInstr();
18896	if (ExitHead == Exit->end() \|\|
18897	ExitHead ->getOpcode() != AMDGPU::S_INST_PREFETCH)
18898	BuildMI(BB&: *Exit, I: ExitHead, MIMD: DebugLoc (), MCID: TII->get(Opcode: AMDGPU::S_INST_PREFETCH))
18899	.addImm(Val: `2`); // prefetch 1 line behind PC
18900	}
18901
18902	return CacheLineAlign;
18903	}
18904
18905	unsigned SITargetLowering::getMaxPermittedBytesForAlignment(
18906	MachineBasicBlock MBB) const* {
18907	// GFX950: Limit padding to 4 bytes (one s_nop) for blocks where an 8-byte
18908	// instruction could be split by the 32-byte fetch window boundary.
18909	// See getPrefLoopAlignment() for context.
18910	if (needsFetchWindowAlignment(MBB: *MBB))
18911	return `4`;
18912	return TargetLowering::getMaxPermittedBytesForAlignment(MBB);
18913	}
18914
18915	bool SITargetLowering::needsFetchWindowAlignment(
18916	const MachineBasicBlock &MBB) const {
18917	if (!getSubtarget()->hasLoopHeadInstSplitSensitivity())
18918	return false;
18919	const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18920	for (const MachineInstr &MI : MBB) {
18921	if (MI.isMetaInstruction())
18922	continue;
18923	// Instructions larger than 4 bytes can be split by a 32-byte boundary.
18924	return TII->getInstSizeInBytes(MI) > `4`;
18925	}
18926	return false;
18927	}
18928
18929	[[maybe_unused]]
18930	static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18931	assert(N->getOpcode() == ISD::CopyFromReg);
18932	do {
18933	// Follow the chain until we find an INLINEASM node.
18934	N = N->getOperand(Num: `0`).getNode();
18935	if (N->getOpcode() == ISD::INLINEASM \|\| N->getOpcode() == ISD::INLINEASM_BR)
18936	return true;
18937	} while (N->getOpcode() == ISD::CopyFromReg);
18938	return false;
18939	}
18940
18941	bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
18942	FunctionLoweringInfo *FLI,
18943	UniformityInfo UA) const* {
18944	switch (N->getOpcode()) {
18945	case ISD::CopyFromReg: {
18946	const RegisterSDNode *R = cast<RegisterSDNode>(Val: N->getOperand(Num: `1`));
18947	const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18948	const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18949	Register Reg = R->getReg();
18950
18951	// FIXME: Why does this need to consider isLiveIn?
18952	if (Reg.isPhysical() \|\| MRI.isLiveIn(Reg))
18953	return !TRI->isSGPRReg(MRI, Reg);
18954
18955	if (const Value *V = FLI->getValueFromVirtualReg(Vreg: R->getReg()))
18956	return UA->isDivergent(V);
18957
18958	assert(Reg == FLI->DemoteRegister \|\| isCopyFromRegOfInlineAsm(N));
18959	return !TRI->isSGPRReg(MRI, Reg);
18960	}
18961	case ISD::LOAD: {
18962	const LoadSDNode *L = cast<LoadSDNode>(Val: N);
18963	unsigned AS = L->getAddressSpace();
18964	// A flat load may access private memory.
18965	return AS == AMDGPUAS::PRIVATE_ADDRESS \|\| AS == AMDGPUAS::FLAT_ADDRESS;
18966	}
18967	case ISD::CALLSEQ_END:
18968	return true;
18969	case ISD::INTRINSIC_WO_CHAIN:
18970	return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: `0`));
18971	case ISD::INTRINSIC_W_CHAIN:
18972	return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: `1`));
18973	case AMDGPUISD::ATOMIC_CMP_SWAP:
18974	case AMDGPUISD::BUFFER_ATOMIC_SWAP:
18975	case AMDGPUISD::BUFFER_ATOMIC_ADD:
18976	case AMDGPUISD::BUFFER_ATOMIC_SUB:
18977	case AMDGPUISD::BUFFER_ATOMIC_SMIN:
18978	case AMDGPUISD::BUFFER_ATOMIC_UMIN:
18979	case AMDGPUISD::BUFFER_ATOMIC_SMAX:
18980	case AMDGPUISD::BUFFER_ATOMIC_UMAX:
18981	case AMDGPUISD::BUFFER_ATOMIC_AND:
18982	case AMDGPUISD::BUFFER_ATOMIC_OR:
18983	case AMDGPUISD::BUFFER_ATOMIC_XOR:
18984	case AMDGPUISD::BUFFER_ATOMIC_INC:
18985	case AMDGPUISD::BUFFER_ATOMIC_DEC:
18986	case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
18987	case AMDGPUISD::BUFFER_ATOMIC_FADD:
18988	case AMDGPUISD::BUFFER_ATOMIC_FMIN:
18989	case AMDGPUISD::BUFFER_ATOMIC_FMAX:
18990	// Target-specific read-modify-write atomics are sources of divergence.
18991	return true;
18992	default:
18993	if (auto *A = dyn_cast<AtomicSDNode>(Val: N)) {
18994	// Generic read-modify-write atomics are sources of divergence.
18995	return A->readMem() && A->writeMem();
18996	}
18997	return false;
18998	}
18999	}
19000
19001	bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
19002	EVT VT) const {
19003	switch (VT.getScalarType().getSimpleVT().SimpleTy) {
19004	case MVT::f32:
19005	return !denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
19006	case MVT::f64:
19007	case MVT::f16:
19008	return !denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction());
19009	default:
19010	return false;
19011	}
19012	}
19013
19014	bool SITargetLowering::denormalsEnabledForType(
19015	LLT Ty, const MachineFunction &MF) const {
19016	switch (Ty.getScalarSizeInBits()) {
19017	case `32`:
19018	return !denormalModeIsFlushAllF32(MF);
19019	case `64`:
19020	case `16`:
19021	return !denormalModeIsFlushAllF64F16(MF);
19022	default:
19023	return false;
19024	}
19025	}
19026
19027	bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
19028	const APInt &DemandedElts,
19029	const SelectionDAG &DAG,
19030	bool SNaN,
19031	unsigned Depth) const {
19032	if (Op.getOpcode() == AMDGPUISD::CLAMP) {
19033	const MachineFunction &MF = DAG.getMachineFunction();
19034	const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
19035
19036	if (Info->getMode().DX10Clamp)
19037	return true; // Clamped to 0.
19038	return DAG.isKnownNeverNaN(Op: Op.getOperand(i: `0`), SNaN, Depth: Depth + `1`);
19039	}
19040
19041	return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
19042	DAG, SNaN, Depth);
19043	}
19044
19045	// On older subtargets, global FP atomic instructions have a hardcoded FP mode
19046	// and do not support FP32 denormals, and only support v2f16/f64 denormals.
19047	static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
19048	if (RMW->hasMetadata(Kind: "amdgpu.ignore.denormal.mode"))
19049	return true;
19050
19051	const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
19052	auto DenormMode = RMW->getFunction()->getDenormalMode(FPType: Flt);
19053	if (DenormMode == DenormalMode::getPreserveSign())
19054	return true;
19055
19056	// TODO: Remove this.
19057	return RMW->getFunction()
19058	->getFnAttribute(Kind: "amdgpu-unsafe-fp-atomics")
19059	.getValueAsBool();
19060	}
19061
19062	static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
19063	LLVMContext &Ctx = RMW->getContext();
19064	StringRef MemScope =
19065	Ctx.getSyncScopeName(Id: RMW->getSyncScopeID()).value_or(u: "system");
19066
19067	return OptimizationRemark (DEBUG_TYPE, "Passed", RMW)
19068	<< "Hardware instruction generated for atomic "
19069	<< RMW->getOperationName(Op: RMW->getOperation())
19070	<< " operation at memory scope " << MemScope;
19071	}
19072
19073	static bool isV2F16OrV2BF16(Type *Ty) {
19074	if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
19075	Type *EltTy = VT->getElementType();
19076	return VT->getNumElements() == `2` &&
19077	(EltTy->isHalfTy() \|\| EltTy->isBFloatTy());
19078	}
19079
19080	return false;
19081	}
19082
19083	static bool isV2F16(Type *Ty) {
19084	FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty);
19085	return VT && VT->getNumElements() == `2` && VT->getElementType()->isHalfTy();
19086	}
19087
19088	static bool isV2BF16(Type *Ty) {
19089	FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty);
19090	return VT && VT->getNumElements() == `2` && VT->getElementType()->isBFloatTy();
19091	}
19092
19093	/// \return true if atomicrmw integer ops work for the type.
19094	static bool isAtomicRMWLegalIntTy(Type *Ty) {
19095	if (auto *IT = dyn_cast<IntegerType>(Val: Ty)) {
19096	unsigned BW = IT->getBitWidth();
19097	return BW == `32` \|\| BW == `64`;
19098	}
19099
19100	return false;
19101	}
19102
19103	/// \return true if this atomicrmw xchg type can be selected.
19104	static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
19105	Type *Ty = RMW->getType();
19106	if (isAtomicRMWLegalIntTy(Ty))
19107	return true;
19108
19109	if (PointerType *PT = dyn_cast<PointerType>(Val: Ty)) {
19110	const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
19111	unsigned BW = DL.getPointerSizeInBits(AS: PT->getAddressSpace());
19112	return BW == `32` \|\| BW == `64`;
19113	}
19114
19115	if (Ty->isFloatTy() \|\| Ty->isDoubleTy())
19116	return true;
19117
19118	if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
19119	return VT->getNumElements() == `2` &&
19120	VT->getElementType()->getPrimitiveSizeInBits() == `16`;
19121	}
19122
19123	return false;
19124	}
19125
19126	/// \returns true if it's valid to emit a native instruction for \p RMW, based
19127	/// on the properties of the target memory.
19128	static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
19129	const AtomicRMWInst *RMW,
19130	bool HasSystemScope) {
19131	// The remote/fine-grained access logic is different from the integer
19132	// atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
19133	// fine-grained access does not work, even for a device local allocation.
19134	//
19135	// With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
19136	// allocations work.
19137	if (HasSystemScope) {
19138	if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
19139	RMW->hasMetadata(Kind: "amdgpu.no.remote.memory"))
19140	return true;
19141	if (Subtarget.hasEmulatedSystemScopeAtomics())
19142	return true;
19143	} else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
19144	return true;
19145
19146	return RMW->hasMetadata(Kind: "amdgpu.no.fine.grained.memory");
19147	}
19148
19149	/// \return Action to perform on AtomicRMWInsts for integer operations.
19150	static TargetLowering::AtomicExpansionKind
19151	atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
19152	return isAtomicRMWLegalIntTy(Ty: RMW->getType())
19153	? TargetLowering::AtomicExpansionKind::None
19154	: TargetLowering::AtomicExpansionKind::CmpXChg;
19155	}
19156
19157	/// Return if a flat address space atomicrmw can access private memory.
19158	static bool flatInstrMayAccessPrivate(const Instruction *I) {
19159	const MDNode *MD = I->getMetadata(KindID: LLVMContext::MD_noalias_addrspace);
19160	return !MD \|\|
19161	!AMDGPU::hasValueInRangeLikeMetadata(MD: *MD, Val: AMDGPUAS::PRIVATE_ADDRESS);
19162	}
19163
19164	static TargetLowering::AtomicExpansionKind
19165	getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
19166	// For GAS, lower to flat atomic.
19167	return STI.hasGloballyAddressableScratch()
19168	? TargetLowering::AtomicExpansionKind::CustomExpand
19169	: TargetLowering::AtomicExpansionKind::NotAtomic;
19170	}
19171
19172	TargetLowering::AtomicExpansionKind
19173	SITargetLowering::shouldExpandAtomicRMWInIR(const AtomicRMWInst RMW) const* {
19174	unsigned AS = RMW->getPointerAddressSpace();
19175	if (AS == AMDGPUAS::PRIVATE_ADDRESS)
19176	return getPrivateAtomicExpansionKind(STI: *getSubtarget());
19177
19178	// 64-bit flat atomics that dynamically reside in private memory will silently
19179	// be dropped.
19180	//
19181	// Note that we will emit a new copy of the original atomic in the expansion,
19182	// which will be incrementally relegalized.
19183	const DataLayout &DL = RMW->getFunction()->getDataLayout();
19184	if (AS == AMDGPUAS::FLAT_ADDRESS &&
19185	DL.getTypeSizeInBits(Ty: RMW->getType()) == `64` &&
19186	flatInstrMayAccessPrivate(I: RMW))
19187	return AtomicExpansionKind::CustomExpand;
19188
19189	auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
19190	OptimizationRemarkEmitter ORE(RMW->getFunction());
19191	ORE.emit(RemarkBuilder: [=]() {
19192	return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
19193	});
19194	return Kind;
19195	};
19196
19197	auto SSID = RMW->getSyncScopeID();
19198	bool HasSystemScope =
19199	SSID == SyncScope::System \|\|
19200	SSID == RMW->getContext().getOrInsertSyncScopeID(SSN: "one-as");
19201
19202	auto Op = RMW->getOperation();
19203	switch (Op) {
19204	case AtomicRMWInst::Xchg:
19205	// PCIe supports add and xchg for system atomics.
19206	return isAtomicRMWLegalXChgTy(RMW)
19207	? TargetLowering::AtomicExpansionKind::None
19208	: TargetLowering::AtomicExpansionKind::CmpXChg;
19209	case AtomicRMWInst::Add:
19210	// PCIe supports add and xchg for system atomics.
19211	return atomicSupportedIfLegalIntType(RMW);
19212	case AtomicRMWInst::Sub:
19213	case AtomicRMWInst::And:
19214	case AtomicRMWInst::Or:
19215	case AtomicRMWInst::Xor:
19216	case AtomicRMWInst::Max:
19217	case AtomicRMWInst::Min:
19218	case AtomicRMWInst::UMax:
19219	case AtomicRMWInst::UMin:
19220	case AtomicRMWInst::UIncWrap:
19221	case AtomicRMWInst::UDecWrap:
19222	case AtomicRMWInst::USubCond:
19223	case AtomicRMWInst::USubSat: {
19224	if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
19225	return AtomicExpansionKind::CmpXChg;
19226	if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
19227	return AtomicExpansionKind::CmpXChg;
19228	if (Op == AtomicRMWInst::USubCond \|\| Op == AtomicRMWInst::USubSat) {
19229	auto *IT = dyn_cast<IntegerType>(Val: RMW->getType());
19230	if (!IT \|\| IT->getBitWidth() != `32`)
19231	return AtomicExpansionKind::CmpXChg;
19232	}
19233
19234	if (AMDGPU::isFlatGlobalAddrSpace(AS) \|\|
19235	AS == AMDGPUAS::BUFFER_FAT_POINTER) {
19236	if (Subtarget->hasEmulatedSystemScopeAtomics())
19237	return atomicSupportedIfLegalIntType(RMW);
19238
19239	// On most subtargets, for atomicrmw operations other than add/xchg,
19240	// whether or not the instructions will behave correctly depends on where
19241	// the address physically resides and what interconnect is used in the
19242	// system configuration. On some some targets the instruction will nop,
19243	// and in others synchronization will only occur at degraded device scope.
19244	//
19245	// If the allocation is known local to the device, the instructions should
19246	// work correctly.
19247	if (RMW->hasMetadata(Kind: "amdgpu.no.remote.memory"))
19248	return atomicSupportedIfLegalIntType(RMW);
19249
19250	// If fine-grained remote memory works at device scope, we don't need to
19251	// do anything.
19252	if (!HasSystemScope &&
19253	Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
19254	return atomicSupportedIfLegalIntType(RMW);
19255
19256	// If we are targeting a remote allocated address, it depends what kind of
19257	// allocation the address belongs to.
19258	//
19259	// If the allocation is fine-grained (in host memory, or in PCIe peer
19260	// device memory), the operation will fail depending on the target.
19261	//
19262	// Note fine-grained host memory access does work on APUs or if XGMI is
19263	// used, but we do not know if we are targeting an APU or the system
19264	// configuration from the ISA version/target-cpu.
19265	if (RMW->hasMetadata(Kind: "amdgpu.no.fine.grained.memory"))
19266	return atomicSupportedIfLegalIntType(RMW);
19267
19268	if (Op == AtomicRMWInst::Sub \|\| Op == AtomicRMWInst::Or \|\|
19269	Op == AtomicRMWInst::Xor) {
19270	// Atomic sub/or/xor do not work over PCI express, but atomic add
19271	// does. InstCombine transforms these with 0 to or, so undo that.
19272	if (const Constant *ConstVal = dyn_cast<Constant>(Val: RMW->getValOperand());
19273	ConstVal && ConstVal->isNullValue())
19274	return AtomicExpansionKind::CustomExpand;
19275	}
19276
19277	// If the allocation could be in remote, fine-grained memory, the rmw
19278	// instructions may fail. cmpxchg should work, so emit that. On some
19279	// system configurations, PCIe atomics aren't supported so cmpxchg won't
19280	// even work, so you're out of luck anyway.
19281
19282	// In summary:
19283	//
19284	// Cases that may fail:
19285	// - fine-grained pinned host memory
19286	// - fine-grained migratable host memory
19287	// - fine-grained PCIe peer device
19288	//
19289	// Cases that should work, but may be treated overly conservatively.
19290	// - fine-grained host memory on an APU
19291	// - fine-grained XGMI peer device
19292	return AtomicExpansionKind::CmpXChg;
19293	}
19294
19295	return atomicSupportedIfLegalIntType(RMW);
19296	}
19297	case AtomicRMWInst::FAdd: {
19298	Type *Ty = RMW->getType();
19299
19300	// TODO: Handle REGION_ADDRESS
19301	if (AS == AMDGPUAS::LOCAL_ADDRESS) {
19302	// DS F32 FP atomics do respect the denormal mode, but the rounding mode
19303	// is fixed to round-to-nearest-even.
19304	//
19305	// F64 / PK_F16 / PK_BF16 never flush and are also fixed to
19306	// round-to-nearest-even.
19307	//
19308	// We ignore the rounding mode problem, even in strictfp. The C++ standard
19309	// suggests it is OK if the floating-point mode may not match the calling
19310	// thread.
19311	if (Ty->isFloatTy()) {
19312	return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
19313	: AtomicExpansionKind::CmpXChg;
19314	}
19315
19316	if (Ty->isDoubleTy()) {
19317	// Ignores denormal mode, but we don't consider flushing mandatory.
19318	return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
19319	: AtomicExpansionKind::CmpXChg;
19320	}
19321
19322	if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
19323	return AtomicExpansionKind::None;
19324
19325	return AtomicExpansionKind::CmpXChg;
19326	}
19327
19328	// LDS atomics respect the denormal mode from the mode register.
19329	//
19330	// Traditionally f32 global/buffer memory atomics would unconditionally
19331	// flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
19332	// flush.
19333	//
19334	// On targets with flat atomic fadd, denormals would flush depending on
19335	// whether the target address resides in LDS or global memory. We consider
19336	// this flat-maybe-flush as will-flush.
19337	if (Ty->isFloatTy() &&
19338	!Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
19339	!atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
19340	return AtomicExpansionKind::CmpXChg;
19341
19342	// FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
19343	// safe. The message phrasing also should be better.
19344	if (globalMemoryFPAtomicIsLegal(Subtarget: *Subtarget, RMW, HasSystemScope)) {
19345	if (AS == AMDGPUAS::FLAT_ADDRESS) {
19346	// gfx942, gfx12
19347	if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
19348	return ReportUnsafeHWInst (AtomicExpansionKind::None);
19349	} else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
19350	// gfx90a, gfx942, gfx12
19351	if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
19352	return ReportUnsafeHWInst (AtomicExpansionKind::None);
19353
19354	// gfx942, gfx12
19355	if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
19356	return ReportUnsafeHWInst (AtomicExpansionKind::None);
19357	} else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
19358	// gfx90a, gfx942, gfx12
19359	if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
19360	return ReportUnsafeHWInst (AtomicExpansionKind::None);
19361
19362	// While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
19363	// buffer. gfx12 does have the buffer version.
19364	if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
19365	return ReportUnsafeHWInst (AtomicExpansionKind::None);
19366	}
19367
19368	// global and flat atomic fadd f64: gfx90a, gfx942.
19369	if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
19370	return ReportUnsafeHWInst (AtomicExpansionKind::None);
19371
19372	if (AS != AMDGPUAS::FLAT_ADDRESS) {
19373	if (Ty->isFloatTy()) {
19374	// global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
19375	// gfx11+.
19376	if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19377	return ReportUnsafeHWInst (AtomicExpansionKind::None);
19378	// global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
19379	if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19380	return ReportUnsafeHWInst (AtomicExpansionKind::None);
19381	} else {
19382	// gfx908
19383	if (RMW->use_empty() &&
19384	Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
19385	isV2F16(Ty))
19386	return ReportUnsafeHWInst (AtomicExpansionKind::None);
19387	}
19388	}
19389
19390	// flat atomic fadd f32: gfx942, gfx11+.
19391	if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
19392	if (Subtarget->hasFlatAtomicFaddF32Inst())
19393	return ReportUnsafeHWInst (AtomicExpansionKind::None);
19394
19395	// If it is in flat address space, and the type is float, we will try to
19396	// expand it, if the target supports global and lds atomic fadd. The
19397	// reason we need that is, in the expansion, we emit the check of
19398	// address space. If it is in global address space, we emit the global
19399	// atomic fadd; if it is in shared address space, we emit the LDS atomic
19400	// fadd.
19401	if (Subtarget->hasLDSFPAtomicAddF32()) {
19402	if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19403	return AtomicExpansionKind::CustomExpand;
19404	if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19405	return AtomicExpansionKind::CustomExpand;
19406	}
19407	}
19408	}
19409
19410	return AtomicExpansionKind::CmpXChg;
19411	}
19412	case AtomicRMWInst::FMin:
19413	case AtomicRMWInst::FMax: {
19414	Type *Ty = RMW->getType();
19415
19416	// LDS float and double fmin/fmax were always supported.
19417	if (AS == AMDGPUAS::LOCAL_ADDRESS) {
19418	return Ty->isFloatTy() \|\| Ty->isDoubleTy() ? AtomicExpansionKind::None
19419	: AtomicExpansionKind::CmpXChg;
19420	}
19421
19422	if (globalMemoryFPAtomicIsLegal(Subtarget: *Subtarget, RMW, HasSystemScope)) {
19423	// For flat and global cases:
19424	// float, double in gfx7. Manual claims denormal support.
19425	// Removed in gfx8.
19426	// float, double restored in gfx10.
19427	// double removed again in gfx11, so only f32 for gfx11/gfx12.
19428	//
19429	// For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
19430	// no f32.
19431	if (AS == AMDGPUAS::FLAT_ADDRESS) {
19432	if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
19433	return ReportUnsafeHWInst (AtomicExpansionKind::None);
19434	if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
19435	return ReportUnsafeHWInst (AtomicExpansionKind::None);
19436	} else if (AMDGPU::isExtendedGlobalAddrSpace(AS) \|\|
19437	AS == AMDGPUAS::BUFFER_FAT_POINTER) {
19438	if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
19439	return ReportUnsafeHWInst (AtomicExpansionKind::None);
19440	if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
19441	return ReportUnsafeHWInst (AtomicExpansionKind::None);
19442	}
19443	}
19444
19445	return AtomicExpansionKind::CmpXChg;
19446	}
19447	case AtomicRMWInst::Nand:
19448	case AtomicRMWInst::FSub:
19449	default:
19450	return AtomicExpansionKind::CmpXChg;
19451	}
19452
19453	llvm_unreachable("covered atomicrmw op switch");
19454	}
19455
19456	TargetLowering::AtomicExpansionKind
19457	SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst LI) const* {
19458	return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
19459	? getPrivateAtomicExpansionKind(STI: *getSubtarget())
19460	: AtomicExpansionKind::None;
19461	}
19462
19463	TargetLowering::AtomicExpansionKind
19464	SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst SI) const* {
19465	return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
19466	? getPrivateAtomicExpansionKind(STI: *getSubtarget())
19467	: AtomicExpansionKind::None;
19468	}
19469
19470	TargetLowering::AtomicExpansionKind
19471	SITargetLowering::shouldExpandAtomicCmpXchgInIR(
19472	const AtomicCmpXchgInst CmpX) const* {
19473	unsigned AddrSpace = CmpX->getPointerAddressSpace();
19474	if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
19475	return getPrivateAtomicExpansionKind(STI: *getSubtarget());
19476
19477	if (AddrSpace != AMDGPUAS::FLAT_ADDRESS \|\| !flatInstrMayAccessPrivate(I: CmpX))
19478	return AtomicExpansionKind::None;
19479
19480	const DataLayout &DL = CmpX->getDataLayout();
19481
19482	Type *ValTy = CmpX->getNewValOperand()->getType();
19483
19484	// If a 64-bit flat atomic may alias private, we need to avoid using the
19485	// atomic in the private case.
19486	return DL.getTypeSizeInBits(Ty: ValTy) == `64` ? AtomicExpansionKind::CustomExpand
19487	: AtomicExpansionKind::None;
19488	}
19489
19490	const TargetRegisterClass *
19491	SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
19492	const TargetRegisterClass RC = TargetLoweringBase::getRegClassFor(VT, isDivergent: false*);
19493	const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
19494	if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
19495	return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
19496	: &AMDGPU::SReg_32RegClass;
19497	if (!TRI->isSGPRClass(RC) && !isDivergent)
19498	return TRI->getEquivalentSGPRClass(VRC: RC);
19499	if (TRI->isSGPRClass(RC) && isDivergent) {
19500	if (Subtarget->hasGFX90AInsts())
19501	return TRI->getEquivalentAVClass(SRC: RC);
19502	return TRI->getEquivalentVGPRClass(SRC: RC);
19503	}
19504
19505	return RC;
19506	}
19507
19508	// FIXME: This is a workaround for DivergenceAnalysis not understanding always
19509	// uniform values (as produced by the mask results of control flow intrinsics)
19510	// used outside of divergent blocks. The phi users need to also be treated as
19511	// always uniform.
19512	//
19513	// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
19514	static bool hasCFUser(const Value V, SmallPtrSet<const* Value *, `16`> &Visited,
19515	unsigned WaveSize) {
19516	// FIXME: We assume we never cast the mask results of a control flow
19517	// intrinsic.
19518	// Early exit if the type won't be consistent as a compile time hack.
19519	IntegerType *IT = dyn_cast<IntegerType>(Val: V->getType());
19520	if (!IT \|\| IT->getBitWidth() != WaveSize)
19521	return false;
19522
19523	if (!isa<Instruction>(Val: V))
19524	return false;
19525	if (!Visited.insert(Ptr: V).second)
19526	return false;
19527	bool Result = false;
19528	for (const auto *U : V->users()) {
19529	if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: U)) {
19530	if (V == U->getOperand(i: `1`)) {
19531	switch (Intrinsic->getIntrinsicID()) {
19532	default:
19533	Result = false;
19534	break;
19535	case Intrinsic::amdgcn_if_break:
19536	case Intrinsic::amdgcn_if:
19537	case Intrinsic::amdgcn_else:
19538	Result = true;
19539	break;
19540	}
19541	}
19542	if (V == U->getOperand(i: `0`)) {
19543	switch (Intrinsic->getIntrinsicID()) {
19544	default:
19545	Result = false;
19546	break;
19547	case Intrinsic::amdgcn_end_cf:
19548	case Intrinsic::amdgcn_loop:
19549	Result = true;
19550	break;
19551	}
19552	}
19553	} else {
19554	Result = hasCFUser(V: U, Visited, WaveSize);
19555	}
19556	if (Result)
19557	break;
19558	}
19559	return Result;
19560	}
19561
19562	bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
19563	const Value V) const* {
19564	if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
19565	if (CI->isInlineAsm()) {
19566	// FIXME: This cannot give a correct answer. This should only trigger in
19567	// the case where inline asm returns mixed SGPR and VGPR results, used
19568	// outside the defining block. We don't have a specific result to
19569	// consider, so this assumes if any value is SGPR, the overall register
19570	// also needs to be SGPR.
19571	const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
19572	TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
19573	DL: MF.getDataLayout(), TRI: Subtarget->getRegisterInfo(), Call: *CI);
19574	for (auto &TC : TargetConstraints) {
19575	if (TC.Type == InlineAsm::isOutput) {
19576	ComputeConstraintToUse(OpInfo&: TC, Op: SDValue ());
19577	const TargetRegisterClass *RC =
19578	getRegForInlineAsmConstraint(TRI_: SIRI, Constraint: TC.ConstraintCode,
19579	VT: TC.ConstraintVT)
19580	.second;
19581	if (RC && SIRI->isSGPRClass(RC))
19582	return true;
19583	}
19584	}
19585	}
19586	}
19587	SmallPtrSet<const Value *, `16`> Visited;
19588	return hasCFUser(V, Visited, WaveSize: Subtarget->getWavefrontSize());
19589	}
19590
19591	bool SITargetLowering::hasMemSDNodeUser(SDNode N) const* {
19592	for (SDUse &Use : N->uses()) {
19593	if (MemSDNode *M = dyn_cast<MemSDNode>(Val: Use.getUser())) {
19594	if (getBasePtrIndex(N: M) == Use.getOperandNo())
19595	return true;
19596	}
19597	}
19598	return false;
19599	}
19600
19601	bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
19602	SDValue N1) const {
19603	if (!N0.hasOneUse())
19604	return false;
19605	// Take care of the opportunity to keep N0 uniform
19606	if (N0 ->isDivergent() \|\| !N1 ->isDivergent())
19607	return true;
19608	// Check if we have a good chance to form the memory access pattern with the
19609	// base and offset
19610	return (DAG.isBaseWithConstantOffset(Op: N0) &&
19611	hasMemSDNodeUser(N: *N0 ->user_begin()));
19612	}
19613
19614	bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
19615	Register N0, Register N1) const {
19616	return MRI.hasOneNonDBGUse(RegNo: N0); // FIXME: handle regbanks
19617	}
19618
19619	MachineMemOperand::Flags
19620	SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
19621	// Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
19622	MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
19623	if (I.getMetadata(Kind: "amdgpu.noclobber"))
19624	Flags \|= MONoClobber;
19625	if (I.getMetadata(Kind: "amdgpu.last.use"))
19626	Flags \|= MOLastUse;
19627	return Flags;
19628	}
19629
19630	void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
19631	Instruction AI) const* {
19632	// Given: atomicrmw fadd ptr %addr, float %val ordering
19633	//
19634	// With this expansion we produce the following code:
19635	// [...]
19636	// %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
19637	// br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
19638	//
19639	// atomicrmw.shared:
19640	// %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
19641	// %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
19642	// float %val ordering
19643	// br label %atomicrmw.phi
19644	//
19645	// atomicrmw.check.private:
19646	// %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
19647	// br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
19648	//
19649	// atomicrmw.private:
19650	// %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
19651	// %loaded.private = load float, ptr addrspace(5) %cast.private
19652	// %val.new = fadd float %loaded.private, %val
19653	// store float %val.new, ptr addrspace(5) %cast.private
19654	// br label %atomicrmw.phi
19655	//
19656	// atomicrmw.global:
19657	// %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
19658	// %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
19659	// float %val ordering
19660	// br label %atomicrmw.phi
19661	//
19662	// atomicrmw.phi:
19663	// %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
19664	// [ %loaded.private, %atomicrmw.private ],
19665	// [ %loaded.global, %atomicrmw.global ]
19666	// br label %atomicrmw.end
19667	//
19668	// atomicrmw.end:
19669	// [...]
19670	//
19671	//
19672	// For 64-bit atomics which may reside in private memory, we perform a simpler
19673	// version that only inserts the private check, and uses the flat operation.
19674
19675	IRBuilder<> Builder(AI);
19676	LLVMContext &Ctx = Builder.getContext();
19677
19678	auto *RMW = dyn_cast<AtomicRMWInst>(Val: AI);
19679	const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
19680	: AtomicCmpXchgInst::getPointerOperandIndex();
19681	Value *Addr = AI->getOperand(i: PtrOpIdx);
19682
19683	/// TODO: Only need to check private, then emit flat-known-not private (no
19684	/// need for shared block, or cast to global).
19685	AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(Val: AI);
19686
19687	Align Alignment;
19688	if (RMW)
19689	Alignment = RMW->getAlign();
19690	else if (CX)
19691	Alignment = CX->getAlign();
19692	else
19693	llvm_unreachable("unhandled atomic operation");
19694
19695	// FullFlatEmulation is true if we need to issue the private, shared, and
19696	// global cases.
19697	//
19698	// If this is false, we are only dealing with the flat-targeting-private case,
19699	// where we only insert a check for private and still use the flat instruction
19700	// for global and shared.
19701
19702	bool FullFlatEmulation =
19703	RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
19704	((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) \|\|
19705	(Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19706	RMW->getType()->isDoubleTy()));
19707
19708	// If the return value isn't used, do not introduce a false use in the phi.
19709	bool ReturnValueIsUsed = !AI->use_empty();
19710
19711	BasicBlock *BB = Builder.GetInsertBlock();
19712	Function *F = BB->getParent();
19713	BasicBlock *ExitBB =
19714	BB->splitBasicBlock(I: Builder.GetInsertPoint(), BBName: "atomicrmw.end");
19715	BasicBlock SharedBB = nullptr*;
19716
19717	BasicBlock *CheckPrivateBB = BB;
19718	if (FullFlatEmulation) {
19719	SharedBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.shared", Parent: F, InsertBefore: ExitBB);
19720	CheckPrivateBB =
19721	BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.check.private", Parent: F, InsertBefore: ExitBB);
19722	}
19723
19724	BasicBlock *PrivateBB =
19725	BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.private", Parent: F, InsertBefore: ExitBB);
19726	BasicBlock *GlobalBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.global", Parent: F, InsertBefore: ExitBB);
19727	BasicBlock *PhiBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.phi", Parent: F, InsertBefore: ExitBB);
19728
19729	std::prev(x: BB->end())->eraseFromParent();
19730	Builder.SetInsertPoint(BB);
19731
19732	Value LoadedShared = nullptr*;
19733	if (FullFlatEmulation) {
19734	CallInst *IsShared = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_is_shared,
19735	Args: {Addr}, FMFSource: nullptr, Name: "is.shared");
19736	Builder.CreateCondBr(Cond: IsShared, True: SharedBB, False: CheckPrivateBB);
19737	Builder.SetInsertPoint(SharedBB);
19738	Value *CastToLocal = Builder.CreateAddrSpaceCast(
19739	V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS));
19740
19741	Instruction *Clone = AI->clone();
19742	Clone->insertInto(ParentBB: SharedBB, It: SharedBB->end());
19743	Clone->getOperandUse(i: PtrOpIdx).set(CastToLocal);
19744	LoadedShared = Clone;
19745
19746	Builder.CreateBr(Dest: PhiBB);
19747	Builder.SetInsertPoint(CheckPrivateBB);
19748	}
19749
19750	CallInst *IsPrivate = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_is_private,
19751	Args: {Addr}, FMFSource: nullptr, Name: "is.private");
19752	Builder.CreateCondBr(Cond: IsPrivate, True: PrivateBB, False: GlobalBB);
19753
19754	Builder.SetInsertPoint(PrivateBB);
19755
19756	Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19757	V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::PRIVATE_ADDRESS));
19758
19759	Value *LoadedPrivate;
19760	if (RMW) {
19761	LoadedPrivate = Builder.CreateAlignedLoad(
19762	Ty: RMW->getType(), Ptr: CastToPrivate, Align: RMW->getAlign(), Name: "loaded.private");
19763
19764	Value *NewVal = buildAtomicRMWValue(Op: RMW->getOperation(), Builder,
19765	Loaded: LoadedPrivate, Val: RMW->getValOperand());
19766
19767	Builder.CreateAlignedStore(Val: NewVal, Ptr: CastToPrivate, Align: RMW->getAlign());
19768	} else {
19769	auto [ResultLoad, Equal] =
19770	buildCmpXchgValue(Builder, Ptr: CastToPrivate, Cmp: CX->getCompareOperand(),
19771	Val: CX->getNewValOperand(), Alignment: CX->getAlign());
19772
19773	Value *Insert = Builder.CreateInsertValue(Agg: PoisonValue::get(T: CX->getType()),
19774	Val: ResultLoad, Idxs: `0`);
19775	LoadedPrivate = Builder.CreateInsertValue(Agg: Insert, Val: Equal, Idxs: `1`);
19776	}
19777
19778	Builder.CreateBr(Dest: PhiBB);
19779
19780	Builder.SetInsertPoint(GlobalBB);
19781
19782	// Continue using a flat instruction if we only emitted the check for private.
19783	Instruction *LoadedGlobal = AI;
19784	if (FullFlatEmulation) {
19785	Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19786	V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
19787	AI->getOperandUse(i: PtrOpIdx).set(CastToGlobal);
19788	}
19789
19790	AI->removeFromParent();
19791	AI->insertInto(ParentBB: GlobalBB, It: GlobalBB->end());
19792
19793	// The new atomicrmw may go through another round of legalization later.
19794	if (!FullFlatEmulation) {
19795	// We inserted the runtime check already, make sure we do not try to
19796	// re-expand this.
19797	// TODO: Should union with any existing metadata.
19798	MDBuilder MDB(F->getContext());
19799	MDNode *RangeNotPrivate =
19800	MDB.createRange(Lo: APInt (`32`, AMDGPUAS::PRIVATE_ADDRESS),
19801	Hi: APInt (`32`, AMDGPUAS::PRIVATE_ADDRESS + `1`));
19802	LoadedGlobal->setMetadata(KindID: LLVMContext::MD_noalias_addrspace,
19803	Node: RangeNotPrivate);
19804	}
19805
19806	Builder.CreateBr(Dest: PhiBB);
19807
19808	Builder.SetInsertPoint(PhiBB);
19809
19810	if (ReturnValueIsUsed) {
19811	PHINode *Loaded = Builder.CreatePHI(Ty: AI->getType(), NumReservedValues: `3`);
19812	AI->replaceAllUsesWith(V: Loaded);
19813	if (FullFlatEmulation)
19814	Loaded->addIncoming(V: LoadedShared, BB: SharedBB);
19815	Loaded->addIncoming(V: LoadedPrivate, BB: PrivateBB);
19816	Loaded->addIncoming(V: LoadedGlobal, BB: GlobalBB);
19817	Loaded->takeName(V: AI);
19818	}
19819
19820	Builder.CreateBr(Dest: ExitBB);
19821	}
19822
19823	static void convertScratchAtomicToFlatAtomic(Instruction *I,
19824	unsigned PtrOpIdx) {
19825	Value *PtrOp = I->getOperand(i: PtrOpIdx);
19826	assert(PtrOp->getType()->getPointerAddressSpace() ==
19827	AMDGPUAS::PRIVATE_ADDRESS);
19828
19829	Type *FlatPtr = PointerType::get(C&: I->getContext(), AddressSpace: AMDGPUAS::FLAT_ADDRESS);
19830	Value *ASCast = CastInst::CreatePointerCast(S: PtrOp, Ty: FlatPtr, Name: "scratch.ascast",
19831	InsertBefore: I->getIterator());
19832	I->setOperand(i: PtrOpIdx, Val: ASCast);
19833	}
19834
19835	void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst AI) const* {
19836	AtomicRMWInst::BinOp Op = AI->getOperation();
19837
19838	if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19839	return convertScratchAtomicToFlatAtomic(I: AI, PtrOpIdx: AI->getPointerOperandIndex());
19840
19841	if (Op == AtomicRMWInst::Sub \|\| Op == AtomicRMWInst::Or \|\|
19842	Op == AtomicRMWInst::Xor) {
19843	if (const auto *ConstVal = dyn_cast<Constant>(Val: AI->getValOperand());
19844	ConstVal && ConstVal->isNullValue()) {
19845	// atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19846	AI->setOperation(AtomicRMWInst::Add);
19847
19848	// We may still need the private-alias-flat handling below.
19849
19850	// TODO: Skip this for cases where we cannot access remote memory.
19851	}
19852	}
19853
19854	// The non-flat expansions should only perform the de-canonicalization of
19855	// identity values.
19856	if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
19857	return;
19858
19859	emitExpandAtomicAddrSpacePredicate(AI);
19860	}
19861
19862	void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst CI) const* {
19863	if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19864	return convertScratchAtomicToFlatAtomic(I: CI, PtrOpIdx: CI->getPointerOperandIndex());
19865
19866	emitExpandAtomicAddrSpacePredicate(AI: CI);
19867	}
19868
19869	void SITargetLowering::emitExpandAtomicLoad(LoadInst LI) const* {
19870	if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19871	return convertScratchAtomicToFlatAtomic(I: LI, PtrOpIdx: LI->getPointerOperandIndex());
19872
19873	llvm_unreachable(
19874	"Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19875	}
19876
19877	void SITargetLowering::emitExpandAtomicStore(StoreInst SI) const* {
19878	if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19879	return convertScratchAtomicToFlatAtomic(I: SI, PtrOpIdx: SI->getPointerOperandIndex());
19880
19881	llvm_unreachable(
19882	"Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19883	}
19884
19885	LoadInst *
19886	SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst AI) const* {
19887	IRBuilder<> Builder(AI);
19888	auto Order = AI->getOrdering();
19889
19890	// The optimization removes store aspect of the atomicrmw. Therefore, cache
19891	// must be flushed if the atomic ordering had a release semantics. This is
19892	// not necessary a fence, a release fence just coincides to do that flush.
19893	// Avoid replacing of an atomicrmw with a release semantics.
19894	if (isReleaseOrStronger(AO: Order))
19895	return nullptr;
19896
19897	LoadInst *LI = Builder.CreateAlignedLoad(
19898	Ty: AI->getType(), Ptr: AI->getPointerOperand(), Align: AI->getAlign());
19899	LI->setAtomic(Ordering: Order, SSID: AI->getSyncScopeID());
19900	LI->copyMetadata(SrcInst: *AI);
19901	LI->takeName(V: AI);
19902	AI->replaceAllUsesWith(V: LI);
19903	AI->eraseFromParent();
19904	return LI;
19905	}
19906

Browse the source code of llvm_projects/llvm/lib/Target/AMDGPU/SIISelLowering.cpp