1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
19#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20#include "SIMachineFunctionInfo.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
23#include "llvm/ADT/FloatingPointMode.h"
24#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/OptimizationRemarkEmitter.h"
26#include "llvm/Analysis/UniformityAnalysis.h"
27#include "llvm/CodeGen/Analysis.h"
28#include "llvm/CodeGen/ByteProvider.h"
29#include "llvm/CodeGen/FunctionLoweringInfo.h"
30#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
31#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
32#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
33#include "llvm/CodeGen/MachineFrameInfo.h"
34#include "llvm/CodeGen/MachineFunction.h"
35#include "llvm/CodeGen/MachineLoopInfo.h"
36#include "llvm/IR/DiagnosticInfo.h"
37#include "llvm/IR/IRBuilder.h"
38#include "llvm/IR/IntrinsicInst.h"
39#include "llvm/IR/IntrinsicsAMDGPU.h"
40#include "llvm/IR/IntrinsicsR600.h"
41#include "llvm/IR/MDBuilder.h"
42#include "llvm/Support/CommandLine.h"
43#include "llvm/Support/KnownBits.h"
44#include "llvm/Support/ModRef.h"
45#include "llvm/Transforms/Utils/LowerAtomic.h"
46#include <optional>
47
48using namespace llvm;
49
50#define DEBUG_TYPE "si-lower"
51
52STATISTIC(NumTailCalls, "Number of tail calls");
53
54static cl::opt<bool>
55 DisableLoopAlignment("amdgpu-disable-loop-alignment",
56 cl::desc("Do not align and prefetch loops"),
57 cl::init(Val: false));
58
59static cl::opt<bool> UseDivergentRegisterIndexing(
60 "amdgpu-use-divergent-register-indexing", cl::Hidden,
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(Val: false));
63
64// TODO: This option should be removed once we switch to always using PTRADD in
65// the SelectionDAG.
66static cl::opt<bool> UseSelectionDAGPTRADD(
67 "amdgpu-use-sdag-ptradd", cl::Hidden,
68 cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
69 "SelectionDAG ISel"),
70 cl::init(Val: false));
71
72static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
73 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
74 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
75}
76
77static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
78 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
79 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
80}
81
82static unsigned findFirstFreeSGPR(CCState &CCInfo) {
83 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
84 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
85 if (!CCInfo.isAllocated(Reg: AMDGPU::SGPR0 + Reg)) {
86 return AMDGPU::SGPR0 + Reg;
87 }
88 }
89 llvm_unreachable("Cannot allocate sgpr");
90}
91
92SITargetLowering::SITargetLowering(const TargetMachine &TM,
93 const GCNSubtarget &STI)
94 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
95 addRegisterClass(VT: MVT::i1, RC: &AMDGPU::VReg_1RegClass);
96 addRegisterClass(VT: MVT::i64, RC: &AMDGPU::SReg_64RegClass);
97
98 addRegisterClass(VT: MVT::i32, RC: &AMDGPU::SReg_32RegClass);
99 addRegisterClass(VT: MVT::f32, RC: &AMDGPU::VGPR_32RegClass);
100
101 addRegisterClass(VT: MVT::v2i32, RC: &AMDGPU::SReg_64RegClass);
102
103 const SIRegisterInfo *TRI = STI.getRegisterInfo();
104 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
105
106 addRegisterClass(VT: MVT::f64, RC: V64RegClass);
107 addRegisterClass(VT: MVT::v2f32, RC: V64RegClass);
108 addRegisterClass(VT: MVT::Untyped, RC: V64RegClass);
109
110 addRegisterClass(VT: MVT::v3i32, RC: &AMDGPU::SGPR_96RegClass);
111 addRegisterClass(VT: MVT::v3f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 96));
112
113 addRegisterClass(VT: MVT::v2i64, RC: &AMDGPU::SGPR_128RegClass);
114 addRegisterClass(VT: MVT::v2f64, RC: &AMDGPU::SGPR_128RegClass);
115
116 addRegisterClass(VT: MVT::v4i32, RC: &AMDGPU::SGPR_128RegClass);
117 addRegisterClass(VT: MVT::v4f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 128));
118
119 addRegisterClass(VT: MVT::v5i32, RC: &AMDGPU::SGPR_160RegClass);
120 addRegisterClass(VT: MVT::v5f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 160));
121
122 addRegisterClass(VT: MVT::v6i32, RC: &AMDGPU::SGPR_192RegClass);
123 addRegisterClass(VT: MVT::v6f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 192));
124
125 addRegisterClass(VT: MVT::v3i64, RC: &AMDGPU::SGPR_192RegClass);
126 addRegisterClass(VT: MVT::v3f64, RC: TRI->getVGPRClassForBitWidth(BitWidth: 192));
127
128 addRegisterClass(VT: MVT::v7i32, RC: &AMDGPU::SGPR_224RegClass);
129 addRegisterClass(VT: MVT::v7f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 224));
130
131 addRegisterClass(VT: MVT::v8i32, RC: &AMDGPU::SGPR_256RegClass);
132 addRegisterClass(VT: MVT::v8f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 256));
133
134 addRegisterClass(VT: MVT::v4i64, RC: &AMDGPU::SGPR_256RegClass);
135 addRegisterClass(VT: MVT::v4f64, RC: TRI->getVGPRClassForBitWidth(BitWidth: 256));
136
137 addRegisterClass(VT: MVT::v9i32, RC: &AMDGPU::SGPR_288RegClass);
138 addRegisterClass(VT: MVT::v9f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 288));
139
140 addRegisterClass(VT: MVT::v10i32, RC: &AMDGPU::SGPR_320RegClass);
141 addRegisterClass(VT: MVT::v10f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 320));
142
143 addRegisterClass(VT: MVT::v11i32, RC: &AMDGPU::SGPR_352RegClass);
144 addRegisterClass(VT: MVT::v11f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 352));
145
146 addRegisterClass(VT: MVT::v12i32, RC: &AMDGPU::SGPR_384RegClass);
147 addRegisterClass(VT: MVT::v12f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 384));
148
149 addRegisterClass(VT: MVT::v16i32, RC: &AMDGPU::SGPR_512RegClass);
150 addRegisterClass(VT: MVT::v16f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 512));
151
152 addRegisterClass(VT: MVT::v8i64, RC: &AMDGPU::SGPR_512RegClass);
153 addRegisterClass(VT: MVT::v8f64, RC: TRI->getVGPRClassForBitWidth(BitWidth: 512));
154
155 addRegisterClass(VT: MVT::v16i64, RC: &AMDGPU::SGPR_1024RegClass);
156 addRegisterClass(VT: MVT::v16f64, RC: TRI->getVGPRClassForBitWidth(BitWidth: 1024));
157
158 if (Subtarget->has16BitInsts()) {
159 if (Subtarget->useRealTrue16Insts()) {
160 addRegisterClass(VT: MVT::i16, RC: &AMDGPU::VGPR_16RegClass);
161 addRegisterClass(VT: MVT::f16, RC: &AMDGPU::VGPR_16RegClass);
162 addRegisterClass(VT: MVT::bf16, RC: &AMDGPU::VGPR_16RegClass);
163 } else {
164 addRegisterClass(VT: MVT::i16, RC: &AMDGPU::SReg_32RegClass);
165 addRegisterClass(VT: MVT::f16, RC: &AMDGPU::SReg_32RegClass);
166 addRegisterClass(VT: MVT::bf16, RC: &AMDGPU::SReg_32RegClass);
167 }
168
169 // Unless there are also VOP3P operations, not operations are really legal.
170 addRegisterClass(VT: MVT::v2i16, RC: &AMDGPU::SReg_32RegClass);
171 addRegisterClass(VT: MVT::v2f16, RC: &AMDGPU::SReg_32RegClass);
172 addRegisterClass(VT: MVT::v2bf16, RC: &AMDGPU::SReg_32RegClass);
173 addRegisterClass(VT: MVT::v4i16, RC: &AMDGPU::SReg_64RegClass);
174 addRegisterClass(VT: MVT::v4f16, RC: &AMDGPU::SReg_64RegClass);
175 addRegisterClass(VT: MVT::v4bf16, RC: &AMDGPU::SReg_64RegClass);
176 addRegisterClass(VT: MVT::v8i16, RC: &AMDGPU::SGPR_128RegClass);
177 addRegisterClass(VT: MVT::v8f16, RC: &AMDGPU::SGPR_128RegClass);
178 addRegisterClass(VT: MVT::v8bf16, RC: &AMDGPU::SGPR_128RegClass);
179 addRegisterClass(VT: MVT::v16i16, RC: &AMDGPU::SGPR_256RegClass);
180 addRegisterClass(VT: MVT::v16f16, RC: &AMDGPU::SGPR_256RegClass);
181 addRegisterClass(VT: MVT::v16bf16, RC: &AMDGPU::SGPR_256RegClass);
182 addRegisterClass(VT: MVT::v32i16, RC: &AMDGPU::SGPR_512RegClass);
183 addRegisterClass(VT: MVT::v32f16, RC: &AMDGPU::SGPR_512RegClass);
184 addRegisterClass(VT: MVT::v32bf16, RC: &AMDGPU::SGPR_512RegClass);
185 }
186
187 addRegisterClass(VT: MVT::v32i32, RC: &AMDGPU::VReg_1024RegClass);
188 addRegisterClass(VT: MVT::v32f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 1024));
189
190 computeRegisterProperties(TRI: Subtarget->getRegisterInfo());
191
192 // The boolean content concept here is too inflexible. Compares only ever
193 // really produce a 1-bit result. Any copy/extend from these will turn into a
194 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
195 // it's what most targets use.
196 setBooleanContents(ZeroOrOneBooleanContent);
197 setBooleanVectorContents(ZeroOrOneBooleanContent);
198
199 // We need to custom lower vector stores from local memory
200 setOperationAction(Ops: ISD::LOAD,
201 VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
202 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
203 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
204 MVT::i1, MVT::v32i32},
205 Action: Custom);
206
207 setOperationAction(Ops: ISD::STORE,
208 VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
209 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
210 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
211 MVT::i1, MVT::v32i32},
212 Action: Custom);
213
214 if (isTypeLegal(VT: MVT::bf16)) {
215 for (unsigned Opc :
216 {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
217 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
218 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
219 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
220 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
221 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
222 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
223 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
224 ISD::SETCC}) {
225 // FIXME: The promoted to type shouldn't need to be explicit
226 setOperationAction(Op: Opc, VT: MVT::bf16, Action: Promote);
227 AddPromotedToType(Opc, OrigVT: MVT::bf16, DestVT: MVT::f32);
228 }
229
230 setOperationAction(Op: ISD::FP_ROUND, VT: MVT::bf16, Action: Expand);
231
232 setOperationAction(Op: ISD::SELECT, VT: MVT::bf16, Action: Promote);
233 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::bf16, DestVT: MVT::i16);
234
235 setOperationAction(Op: ISD::FABS, VT: MVT::bf16, Action: Legal);
236 setOperationAction(Op: ISD::FNEG, VT: MVT::bf16, Action: Legal);
237 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::bf16, Action: Legal);
238
239 // We only need to custom lower because we can't specify an action for bf16
240 // sources.
241 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i32, Action: Custom);
242 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i32, Action: Custom);
243 }
244
245 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i16, Action: Expand);
246 setTruncStoreAction(ValVT: MVT::v3i32, MemVT: MVT::v3i16, Action: Expand);
247 setTruncStoreAction(ValVT: MVT::v4i32, MemVT: MVT::v4i16, Action: Expand);
248 setTruncStoreAction(ValVT: MVT::v8i32, MemVT: MVT::v8i16, Action: Expand);
249 setTruncStoreAction(ValVT: MVT::v16i32, MemVT: MVT::v16i16, Action: Expand);
250 setTruncStoreAction(ValVT: MVT::v32i32, MemVT: MVT::v32i16, Action: Expand);
251 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i8, Action: Expand);
252 setTruncStoreAction(ValVT: MVT::v4i32, MemVT: MVT::v4i8, Action: Expand);
253 setTruncStoreAction(ValVT: MVT::v8i32, MemVT: MVT::v8i8, Action: Expand);
254 setTruncStoreAction(ValVT: MVT::v16i32, MemVT: MVT::v16i8, Action: Expand);
255 setTruncStoreAction(ValVT: MVT::v32i32, MemVT: MVT::v32i8, Action: Expand);
256 setTruncStoreAction(ValVT: MVT::v2i16, MemVT: MVT::v2i8, Action: Expand);
257 setTruncStoreAction(ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Expand);
258 setTruncStoreAction(ValVT: MVT::v8i16, MemVT: MVT::v8i8, Action: Expand);
259 setTruncStoreAction(ValVT: MVT::v16i16, MemVT: MVT::v16i8, Action: Expand);
260 setTruncStoreAction(ValVT: MVT::v32i16, MemVT: MVT::v32i8, Action: Expand);
261
262 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i16, Action: Expand);
263 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i32, Action: Expand);
264 setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i8, Action: Expand);
265 setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i8, Action: Expand);
266 setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i16, Action: Expand);
267 setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i32, Action: Expand);
268 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i32, Action: Expand);
269
270 setOperationAction(Ops: ISD::GlobalAddress, VTs: {MVT::i32, MVT::i64}, Action: Custom);
271
272 setOperationAction(Op: ISD::SELECT, VT: MVT::i1, Action: Promote);
273 setOperationAction(Op: ISD::SELECT, VT: MVT::i64, Action: Custom);
274 setOperationAction(Op: ISD::SELECT, VT: MVT::f64, Action: Promote);
275 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::f64, DestVT: MVT::i64);
276
277 setOperationAction(Ops: ISD::FSQRT, VTs: {MVT::f32, MVT::f64}, Action: Custom);
278
279 setOperationAction(Ops: ISD::SELECT_CC,
280 VTs: {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Action: Expand);
281
282 setOperationAction(Op: ISD::SETCC, VT: MVT::i1, Action: Promote);
283 setOperationAction(Ops: ISD::SETCC, VTs: {MVT::v2i1, MVT::v4i1}, Action: Expand);
284 AddPromotedToType(Opc: ISD::SETCC, OrigVT: MVT::i1, DestVT: MVT::i32);
285
286 setOperationAction(Ops: ISD::TRUNCATE,
287 VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
288 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
289 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
290 Action: Expand);
291 setOperationAction(Ops: ISD::FP_ROUND,
292 VTs: {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
293 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
294 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
295 Action: Expand);
296
297 setOperationAction(Ops: ISD::SIGN_EXTEND_INREG,
298 VTs: {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
299 MVT::v3i16, MVT::v4i16, MVT::Other},
300 Action: Custom);
301
302 setOperationAction(Op: ISD::BRCOND, VT: MVT::Other, Action: Custom);
303 setOperationAction(Ops: ISD::BR_CC,
304 VTs: {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Action: Expand);
305
306 setOperationAction(Ops: {ISD::UADDO, ISD::USUBO}, VT: MVT::i32, Action: Legal);
307
308 setOperationAction(Ops: {ISD::UADDO_CARRY, ISD::USUBO_CARRY}, VT: MVT::i32, Action: Legal);
309
310 setOperationAction(Ops: {ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, VT: MVT::i64,
311 Action: Expand);
312
313#if 0
314 setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal);
315#endif
316
317 // We only support LOAD/STORE and vector manipulation ops for vectors
318 // with > 4 elements.
319 for (MVT VT :
320 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
321 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
322 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
323 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
324 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
325 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
326 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
327 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
328 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
329 switch (Op) {
330 case ISD::LOAD:
331 case ISD::STORE:
332 case ISD::BUILD_VECTOR:
333 case ISD::BITCAST:
334 case ISD::UNDEF:
335 case ISD::EXTRACT_VECTOR_ELT:
336 case ISD::INSERT_VECTOR_ELT:
337 case ISD::SCALAR_TO_VECTOR:
338 case ISD::IS_FPCLASS:
339 break;
340 case ISD::EXTRACT_SUBVECTOR:
341 case ISD::INSERT_SUBVECTOR:
342 case ISD::CONCAT_VECTORS:
343 setOperationAction(Op, VT, Action: Custom);
344 break;
345 default:
346 setOperationAction(Op, VT, Action: Expand);
347 break;
348 }
349 }
350 }
351
352 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::v4f32, Action: Expand);
353
354 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
355 // is expanded to avoid having two separate loops in case the index is a VGPR.
356
357 // Most operations are naturally 32-bit vector operations. We only support
358 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
359 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
360 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
361 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v4i32);
362
363 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
364 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v4i32);
365
366 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
367 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v4i32);
368
369 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
370 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v4i32);
371 }
372
373 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
374 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
375 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v6i32);
376
377 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
378 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v6i32);
379
380 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
381 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v6i32);
382
383 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
384 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v6i32);
385 }
386
387 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
388 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
389 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v8i32);
390
391 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
392 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v8i32);
393
394 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
395 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v8i32);
396
397 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
398 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v8i32);
399 }
400
401 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
402 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
403 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v16i32);
404
405 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
406 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v16i32);
407
408 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
409 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v16i32);
410
411 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
412 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v16i32);
413 }
414
415 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
416 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
417 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v32i32);
418
419 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
420 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v32i32);
421
422 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
423 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v32i32);
424
425 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
426 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v32i32);
427 }
428
429 setOperationAction(Ops: ISD::VECTOR_SHUFFLE,
430 VTs: {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
431 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
432 Action: Custom);
433
434 if (Subtarget->hasPkMovB32()) {
435 // TODO: 16-bit element vectors should be legal with even aligned elements.
436 // TODO: Can be legal with wider source types than the result with
437 // subregister extracts.
438 setOperationAction(Ops: ISD::VECTOR_SHUFFLE, VTs: {MVT::v2i32, MVT::v2f32}, Action: Legal);
439 }
440
441 setOperationAction(Ops: ISD::BUILD_VECTOR, VTs: {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
442 Action: Custom);
443
444 // Avoid stack access for these.
445 // TODO: Generalize to more vector types.
446 setOperationAction(Ops: {ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
447 VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
448 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
449 Action: Custom);
450
451 // Deal with vec3 vector operations when widened to vec4.
452 setOperationAction(Ops: ISD::INSERT_SUBVECTOR,
453 VTs: {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Action: Custom);
454
455 // Deal with vec5/6/7 vector operations when widened to vec8.
456 setOperationAction(Ops: ISD::INSERT_SUBVECTOR,
457 VTs: {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
458 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
459 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
460 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
461 Action: Custom);
462
463 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
464 // and output demarshalling
465 setOperationAction(Ops: ISD::ATOMIC_CMP_SWAP, VTs: {MVT::i32, MVT::i64}, Action: Custom);
466
467 // We can't return success/failure, only the old value,
468 // let LLVM add the comparison
469 setOperationAction(Ops: ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VTs: {MVT::i32, MVT::i64},
470 Action: Expand);
471
472 setOperationAction(Ops: ISD::ADDRSPACECAST, VTs: {MVT::i32, MVT::i64}, Action: Custom);
473
474 setOperationAction(Ops: ISD::BITREVERSE, VTs: {MVT::i32, MVT::i64}, Action: Legal);
475
476 // FIXME: This should be narrowed to i32, but that only happens if i64 is
477 // illegal.
478 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
479 setOperationAction(Ops: ISD::BSWAP, VTs: {MVT::i64, MVT::i32}, Action: Legal);
480
481 // On SI this is s_memtime and s_memrealtime on VI.
482 setOperationAction(Op: ISD::READCYCLECOUNTER, VT: MVT::i64, Action: Legal);
483
484 if (Subtarget->hasSMemRealTime() ||
485 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
486 setOperationAction(Op: ISD::READSTEADYCOUNTER, VT: MVT::i64, Action: Legal);
487 setOperationAction(Ops: {ISD::TRAP, ISD::DEBUGTRAP}, VT: MVT::Other, Action: Custom);
488
489 if (Subtarget->has16BitInsts()) {
490 setOperationAction(Ops: {ISD::FPOW, ISD::FPOWI}, VT: MVT::f16, Action: Promote);
491 setOperationAction(Ops: {ISD::FLOG, ISD::FEXP, ISD::FLOG10}, VT: MVT::f16, Action: Custom);
492 } else {
493 setOperationAction(Op: ISD::FSQRT, VT: MVT::f16, Action: Custom);
494 }
495
496 if (Subtarget->hasMadMacF32Insts())
497 setOperationAction(Op: ISD::FMAD, VT: MVT::f32, Action: Legal);
498
499 if (!Subtarget->hasBFI())
500 // fcopysign can be done in a single instruction with BFI.
501 setOperationAction(Ops: ISD::FCOPYSIGN, VTs: {MVT::f32, MVT::f64}, Action: Expand);
502
503 if (!Subtarget->hasBCNT(Size: 32))
504 setOperationAction(Op: ISD::CTPOP, VT: MVT::i32, Action: Expand);
505
506 if (!Subtarget->hasBCNT(Size: 64))
507 setOperationAction(Op: ISD::CTPOP, VT: MVT::i64, Action: Expand);
508
509 if (Subtarget->hasFFBH())
510 setOperationAction(Ops: {ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT: MVT::i32, Action: Custom);
511
512 if (Subtarget->hasFFBL())
513 setOperationAction(Ops: {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, VT: MVT::i32, Action: Custom);
514
515 // We only really have 32-bit BFE instructions (and 16-bit on VI).
516 //
517 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
518 // effort to match them now. We want this to be false for i64 cases when the
519 // extraction isn't restricted to the upper or lower half. Ideally we would
520 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
521 // span the midpoint are probably relatively rare, so don't worry about them
522 // for now.
523 if (Subtarget->hasBFE())
524 setHasExtractBitsInsn(true);
525
526 // Clamp modifier on add/sub
527 if (Subtarget->hasIntClamp())
528 setOperationAction(Ops: {ISD::UADDSAT, ISD::USUBSAT}, VT: MVT::i32, Action: Legal);
529
530 if (Subtarget->hasAddNoCarry())
531 setOperationAction(Ops: {ISD::SADDSAT, ISD::SSUBSAT}, VTs: {MVT::i16, MVT::i32},
532 Action: Legal);
533
534 setOperationAction(
535 Ops: {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
536 VTs: {MVT::f32, MVT::f64}, Action: Custom);
537
538 // These are really only legal for ieee_mode functions. We should be avoiding
539 // them for functions that don't have ieee_mode enabled, so just say they are
540 // legal.
541 setOperationAction(Ops: {ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
542 VTs: {MVT::f32, MVT::f64}, Action: Legal);
543
544 if (Subtarget->haveRoundOpsF64())
545 setOperationAction(Ops: {ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, VT: MVT::f64,
546 Action: Legal);
547 else
548 setOperationAction(Ops: {ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
549 VT: MVT::f64, Action: Custom);
550
551 setOperationAction(Op: ISD::FFLOOR, VT: MVT::f64, Action: Legal);
552 setOperationAction(Ops: {ISD::FLDEXP, ISD::STRICT_FLDEXP}, VTs: {MVT::f32, MVT::f64},
553 Action: Legal);
554 setOperationAction(Ops: ISD::FFREXP, VTs: {MVT::f32, MVT::f64}, Action: Custom);
555
556 setOperationAction(Ops: {ISD::FSIN, ISD::FCOS, ISD::FDIV}, VT: MVT::f32, Action: Custom);
557 setOperationAction(Op: ISD::FDIV, VT: MVT::f64, Action: Custom);
558
559 setOperationAction(Ops: ISD::BF16_TO_FP, VTs: {MVT::i16, MVT::f32, MVT::f64}, Action: Expand);
560 setOperationAction(Ops: ISD::FP_TO_BF16, VTs: {MVT::i16, MVT::f32, MVT::f64}, Action: Expand);
561
562 // Custom lower these because we can't specify a rule based on an illegal
563 // source bf16.
564 setOperationAction(Ops: {ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, VT: MVT::f32, Action: Custom);
565 setOperationAction(Ops: {ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, VT: MVT::f64, Action: Custom);
566
567 if (Subtarget->has16BitInsts()) {
568 setOperationAction(Ops: {ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
569 ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
570 VT: MVT::i16, Action: Legal);
571
572 AddPromotedToType(Opc: ISD::SIGN_EXTEND, OrigVT: MVT::i16, DestVT: MVT::i32);
573
574 setOperationAction(Ops: {ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC},
575 VT: MVT::i16, Action: Expand);
576
577 setOperationAction(Ops: {ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
578 ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
579 ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
580 ISD::CTPOP},
581 VT: MVT::i16, Action: Promote);
582
583 setOperationAction(Op: ISD::LOAD, VT: MVT::i16, Action: Custom);
584
585 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i16, Action: Expand);
586
587 setOperationAction(Op: ISD::FP16_TO_FP, VT: MVT::i16, Action: Promote);
588 AddPromotedToType(Opc: ISD::FP16_TO_FP, OrigVT: MVT::i16, DestVT: MVT::i32);
589 setOperationAction(Op: ISD::FP_TO_FP16, VT: MVT::i16, Action: Promote);
590 AddPromotedToType(Opc: ISD::FP_TO_FP16, OrigVT: MVT::i16, DestVT: MVT::i32);
591
592 setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT: MVT::i16, Action: Custom);
593 setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i16, Action: Custom);
594 setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i1, Action: Custom);
595
596 setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i32, Action: Custom);
597
598 // F16 - Constant Actions.
599 setOperationAction(Op: ISD::ConstantFP, VT: MVT::f16, Action: Legal);
600 setOperationAction(Op: ISD::ConstantFP, VT: MVT::bf16, Action: Legal);
601
602 // F16 - Load/Store Actions.
603 setOperationAction(Op: ISD::LOAD, VT: MVT::f16, Action: Promote);
604 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f16, DestVT: MVT::i16);
605 setOperationAction(Op: ISD::STORE, VT: MVT::f16, Action: Promote);
606 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f16, DestVT: MVT::i16);
607
608 // BF16 - Load/Store Actions.
609 setOperationAction(Op: ISD::LOAD, VT: MVT::bf16, Action: Promote);
610 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::bf16, DestVT: MVT::i16);
611 setOperationAction(Op: ISD::STORE, VT: MVT::bf16, Action: Promote);
612 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::bf16, DestVT: MVT::i16);
613
614 // F16 - VOP1 Actions.
615 setOperationAction(Ops: {ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
616 ISD::FSIN, ISD::FROUND},
617 VT: MVT::f16, Action: Custom);
618
619 setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT: MVT::f16, Action: Promote);
620 setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT: MVT::bf16, Action: Promote);
621
622 // F16 - VOP2 Actions.
623 setOperationAction(Ops: {ISD::BR_CC, ISD::SELECT_CC}, VTs: {MVT::f16, MVT::bf16},
624 Action: Expand);
625 setOperationAction(Ops: {ISD::FLDEXP, ISD::STRICT_FLDEXP}, VT: MVT::f16, Action: Custom);
626 setOperationAction(Op: ISD::FFREXP, VT: MVT::f16, Action: Custom);
627 setOperationAction(Op: ISD::FDIV, VT: MVT::f16, Action: Custom);
628
629 // F16 - VOP3 Actions.
630 setOperationAction(Op: ISD::FMA, VT: MVT::f16, Action: Legal);
631 if (STI.hasMadF16())
632 setOperationAction(Op: ISD::FMAD, VT: MVT::f16, Action: Legal);
633
634 for (MVT VT :
635 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
636 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
637 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
638 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
639 switch (Op) {
640 case ISD::LOAD:
641 case ISD::STORE:
642 case ISD::BUILD_VECTOR:
643 case ISD::BITCAST:
644 case ISD::UNDEF:
645 case ISD::EXTRACT_VECTOR_ELT:
646 case ISD::INSERT_VECTOR_ELT:
647 case ISD::INSERT_SUBVECTOR:
648 case ISD::SCALAR_TO_VECTOR:
649 case ISD::IS_FPCLASS:
650 break;
651 case ISD::EXTRACT_SUBVECTOR:
652 case ISD::CONCAT_VECTORS:
653 setOperationAction(Op, VT, Action: Custom);
654 break;
655 default:
656 setOperationAction(Op, VT, Action: Expand);
657 break;
658 }
659 }
660 }
661
662 // v_perm_b32 can handle either of these.
663 setOperationAction(Ops: ISD::BSWAP, VTs: {MVT::i16, MVT::v2i16}, Action: Legal);
664 setOperationAction(Op: ISD::BSWAP, VT: MVT::v4i16, Action: Custom);
665
666 // XXX - Do these do anything? Vector constants turn into build_vector.
667 setOperationAction(Ops: ISD::Constant, VTs: {MVT::v2i16, MVT::v2f16}, Action: Legal);
668
669 setOperationAction(Ops: ISD::UNDEF, VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
670 Action: Legal);
671
672 setOperationAction(Op: ISD::STORE, VT: MVT::v2i16, Action: Promote);
673 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2i16, DestVT: MVT::i32);
674 setOperationAction(Op: ISD::STORE, VT: MVT::v2f16, Action: Promote);
675 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f16, DestVT: MVT::i32);
676
677 setOperationAction(Op: ISD::LOAD, VT: MVT::v2i16, Action: Promote);
678 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2i16, DestVT: MVT::i32);
679 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f16, Action: Promote);
680 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f16, DestVT: MVT::i32);
681
682 setOperationAction(Op: ISD::AND, VT: MVT::v2i16, Action: Promote);
683 AddPromotedToType(Opc: ISD::AND, OrigVT: MVT::v2i16, DestVT: MVT::i32);
684 setOperationAction(Op: ISD::OR, VT: MVT::v2i16, Action: Promote);
685 AddPromotedToType(Opc: ISD::OR, OrigVT: MVT::v2i16, DestVT: MVT::i32);
686 setOperationAction(Op: ISD::XOR, VT: MVT::v2i16, Action: Promote);
687 AddPromotedToType(Opc: ISD::XOR, OrigVT: MVT::v2i16, DestVT: MVT::i32);
688
689 setOperationAction(Op: ISD::LOAD, VT: MVT::v4i16, Action: Promote);
690 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
691 setOperationAction(Op: ISD::LOAD, VT: MVT::v4f16, Action: Promote);
692 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
693 setOperationAction(Op: ISD::LOAD, VT: MVT::v4bf16, Action: Promote);
694 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4bf16, DestVT: MVT::v2i32);
695
696 setOperationAction(Op: ISD::STORE, VT: MVT::v4i16, Action: Promote);
697 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
698 setOperationAction(Op: ISD::STORE, VT: MVT::v4f16, Action: Promote);
699 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
700 setOperationAction(Op: ISD::STORE, VT: MVT::v4bf16, Action: Promote);
701 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4bf16, DestVT: MVT::v2i32);
702
703 setOperationAction(Op: ISD::LOAD, VT: MVT::v8i16, Action: Promote);
704 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8i16, DestVT: MVT::v4i32);
705 setOperationAction(Op: ISD::LOAD, VT: MVT::v8f16, Action: Promote);
706 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f16, DestVT: MVT::v4i32);
707 setOperationAction(Op: ISD::LOAD, VT: MVT::v8bf16, Action: Promote);
708 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8bf16, DestVT: MVT::v4i32);
709
710 setOperationAction(Op: ISD::STORE, VT: MVT::v4i16, Action: Promote);
711 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
712 setOperationAction(Op: ISD::STORE, VT: MVT::v4f16, Action: Promote);
713 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
714
715 setOperationAction(Op: ISD::STORE, VT: MVT::v8i16, Action: Promote);
716 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8i16, DestVT: MVT::v4i32);
717 setOperationAction(Op: ISD::STORE, VT: MVT::v8f16, Action: Promote);
718 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f16, DestVT: MVT::v4i32);
719 setOperationAction(Op: ISD::STORE, VT: MVT::v8bf16, Action: Promote);
720 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8bf16, DestVT: MVT::v4i32);
721
722 setOperationAction(Op: ISD::LOAD, VT: MVT::v16i16, Action: Promote);
723 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16i16, DestVT: MVT::v8i32);
724 setOperationAction(Op: ISD::LOAD, VT: MVT::v16f16, Action: Promote);
725 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f16, DestVT: MVT::v8i32);
726 setOperationAction(Op: ISD::LOAD, VT: MVT::v16bf16, Action: Promote);
727 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16bf16, DestVT: MVT::v8i32);
728
729 setOperationAction(Op: ISD::STORE, VT: MVT::v16i16, Action: Promote);
730 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16i16, DestVT: MVT::v8i32);
731 setOperationAction(Op: ISD::STORE, VT: MVT::v16f16, Action: Promote);
732 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f16, DestVT: MVT::v8i32);
733 setOperationAction(Op: ISD::STORE, VT: MVT::v16bf16, Action: Promote);
734 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16bf16, DestVT: MVT::v8i32);
735
736 setOperationAction(Op: ISD::LOAD, VT: MVT::v32i16, Action: Promote);
737 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32i16, DestVT: MVT::v16i32);
738 setOperationAction(Op: ISD::LOAD, VT: MVT::v32f16, Action: Promote);
739 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32f16, DestVT: MVT::v16i32);
740 setOperationAction(Op: ISD::LOAD, VT: MVT::v32bf16, Action: Promote);
741 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32bf16, DestVT: MVT::v16i32);
742
743 setOperationAction(Op: ISD::STORE, VT: MVT::v32i16, Action: Promote);
744 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32i16, DestVT: MVT::v16i32);
745 setOperationAction(Op: ISD::STORE, VT: MVT::v32f16, Action: Promote);
746 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32f16, DestVT: MVT::v16i32);
747 setOperationAction(Op: ISD::STORE, VT: MVT::v32bf16, Action: Promote);
748 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32bf16, DestVT: MVT::v16i32);
749
750 setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
751 VT: MVT::v2i32, Action: Expand);
752 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::v2f32, Action: Expand);
753
754 setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
755 VT: MVT::v4i32, Action: Expand);
756
757 setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
758 VT: MVT::v8i32, Action: Expand);
759
760 setOperationAction(Ops: ISD::BUILD_VECTOR, VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
761 Action: Subtarget->hasVOP3PInsts() ? Legal : Custom);
762
763 setOperationAction(Ops: ISD::FNEG, VTs: {MVT::v2f16, MVT::v2bf16}, Action: Legal);
764 // This isn't really legal, but this avoids the legalizer unrolling it (and
765 // allows matching fneg (fabs x) patterns)
766 setOperationAction(Ops: ISD::FABS, VTs: {MVT::v2f16, MVT::v2bf16}, Action: Legal);
767
768 // Can do this in one BFI plus a constant materialize.
769 setOperationAction(Ops: ISD::FCOPYSIGN,
770 VTs: {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
771 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
772 MVT::v32f16, MVT::v32bf16},
773 Action: Custom);
774
775 setOperationAction(
776 Ops: {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
777 VT: MVT::f16, Action: Custom);
778 setOperationAction(Ops: {ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, VT: MVT::f16, Action: Legal);
779
780 setOperationAction(Ops: {ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
781 ISD::FMAXIMUMNUM},
782 VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
783 Action: Custom);
784
785 setOperationAction(Ops: {ISD::FMINNUM, ISD::FMAXNUM},
786 VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
787 Action: Expand);
788
789 for (MVT Vec16 :
790 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
791 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
792 setOperationAction(
793 Ops: {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
794 VT: Vec16, Action: Custom);
795 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec16, Action: Expand);
796 }
797 }
798
799 if (Subtarget->hasVOP3PInsts()) {
800 setOperationAction(Ops: {ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
801 ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
802 ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
803 VT: MVT::v2i16, Action: Legal);
804
805 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
806 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
807 VT: MVT::v2f16, Action: Legal);
808
809 setOperationAction(Ops: ISD::EXTRACT_VECTOR_ELT,
810 VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Action: Custom);
811
812 setOperationAction(Ops: ISD::VECTOR_SHUFFLE,
813 VTs: {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
814 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
815 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
816 Action: Custom);
817
818 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
819 // Split vector operations.
820 setOperationAction(Ops: {ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
821 ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN,
822 ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
823 ISD::SSUBSAT},
824 VT, Action: Custom);
825
826 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
827 // Split vector operations.
828 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
829 VT, Action: Custom);
830
831 setOperationAction(
832 Ops: {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
833 VTs: {MVT::v2f16, MVT::v4f16}, Action: Custom);
834
835 setOperationAction(Op: ISD::FEXP, VT: MVT::v2f16, Action: Custom);
836 setOperationAction(Ops: ISD::SELECT, VTs: {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
837 Action: Custom);
838
839 if (Subtarget->hasPackedFP32Ops()) {
840 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
841 VT: MVT::v2f32, Action: Legal);
842 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA},
843 VTs: {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
844 Action: Custom);
845 }
846 }
847
848 setOperationAction(Ops: {ISD::FNEG, ISD::FABS}, VT: MVT::v4f16, Action: Custom);
849
850 if (Subtarget->has16BitInsts()) {
851 setOperationAction(Op: ISD::SELECT, VT: MVT::v2i16, Action: Promote);
852 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2i16, DestVT: MVT::i32);
853 setOperationAction(Op: ISD::SELECT, VT: MVT::v2f16, Action: Promote);
854 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2f16, DestVT: MVT::i32);
855 } else {
856 // Legalization hack.
857 setOperationAction(Ops: ISD::SELECT, VTs: {MVT::v2i16, MVT::v2f16}, Action: Custom);
858
859 setOperationAction(Ops: {ISD::FNEG, ISD::FABS}, VT: MVT::v2f16, Action: Custom);
860 }
861
862 setOperationAction(Ops: ISD::SELECT,
863 VTs: {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
864 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
865 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
866 MVT::v32f16, MVT::v32bf16},
867 Action: Custom);
868
869 setOperationAction(Ops: {ISD::SMULO, ISD::UMULO}, VT: MVT::i64, Action: Custom);
870
871 if (Subtarget->hasScalarSMulU64())
872 setOperationAction(Op: ISD::MUL, VT: MVT::i64, Action: Custom);
873
874 if (Subtarget->hasMad64_32())
875 setOperationAction(Ops: {ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT: MVT::i32, Action: Custom);
876
877 if (Subtarget->hasPrefetch() && Subtarget->hasSafeSmemPrefetch())
878 setOperationAction(Op: ISD::PREFETCH, VT: MVT::Other, Action: Custom);
879
880 if (Subtarget->hasIEEEMinMax()) {
881 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM},
882 VTs: {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Action: Legal);
883 } else {
884 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
885 if (Subtarget->hasMinimum3Maximum3F32())
886 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM}, VT: MVT::f32, Action: Legal);
887
888 if (Subtarget->hasMinimum3Maximum3PKF16()) {
889 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM}, VT: MVT::v2f16, Action: Legal);
890
891 // If only the vector form is available, we need to widen to a vector.
892 if (!Subtarget->hasMinimum3Maximum3F16())
893 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM}, VT: MVT::f16, Action: Custom);
894 }
895 }
896
897 if (Subtarget->hasVOP3PInsts()) {
898 // We want to break these into v2f16 pieces, not scalarize.
899 setOperationAction(Ops: {ISD::FMINIMUM, ISD::FMAXIMUM},
900 VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
901 Action: Custom);
902 }
903
904 setOperationAction(Ops: ISD::INTRINSIC_WO_CHAIN,
905 VTs: {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
906 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
907 MVT::i8},
908 Action: Custom);
909
910 setOperationAction(Ops: ISD::INTRINSIC_W_CHAIN,
911 VTs: {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
912 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
913 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
914 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
915 Action: Custom);
916
917 setOperationAction(Ops: ISD::INTRINSIC_VOID,
918 VTs: {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
919 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
920 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
921 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
922 Action: Custom);
923
924 setOperationAction(Op: ISD::STACKSAVE, VT: MVT::Other, Action: Custom);
925 setOperationAction(Op: ISD::GET_ROUNDING, VT: MVT::i32, Action: Custom);
926 setOperationAction(Op: ISD::SET_ROUNDING, VT: MVT::Other, Action: Custom);
927 setOperationAction(Op: ISD::GET_FPENV, VT: MVT::i64, Action: Custom);
928 setOperationAction(Op: ISD::SET_FPENV, VT: MVT::i64, Action: Custom);
929
930 // TODO: Could move this to custom lowering, could benefit from combines on
931 // extract of relevant bits.
932 setOperationAction(Op: ISD::GET_FPMODE, VT: MVT::i32, Action: Legal);
933
934 setOperationAction(Op: ISD::MUL, VT: MVT::i1, Action: Promote);
935
936 if (Subtarget->hasBF16ConversionInsts()) {
937 setOperationAction(Ops: ISD::FP_ROUND, VTs: {MVT::bf16, MVT::v2bf16}, Action: Custom);
938 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v2bf16, Action: Legal);
939 }
940
941 if (Subtarget->hasCvtPkF16F32Inst()) {
942 setOperationAction(Ops: ISD::FP_ROUND,
943 VTs: {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
944 Action: Custom);
945 }
946
947 setTargetDAGCombine({ISD::ADD,
948 ISD::PTRADD,
949 ISD::UADDO_CARRY,
950 ISD::SUB,
951 ISD::USUBO_CARRY,
952 ISD::MUL,
953 ISD::FADD,
954 ISD::FSUB,
955 ISD::FDIV,
956 ISD::FMUL,
957 ISD::FMINNUM,
958 ISD::FMAXNUM,
959 ISD::FMINNUM_IEEE,
960 ISD::FMAXNUM_IEEE,
961 ISD::FMINIMUM,
962 ISD::FMAXIMUM,
963 ISD::FMINIMUMNUM,
964 ISD::FMAXIMUMNUM,
965 ISD::FMA,
966 ISD::SMIN,
967 ISD::SMAX,
968 ISD::UMIN,
969 ISD::UMAX,
970 ISD::SETCC,
971 ISD::SELECT,
972 ISD::SMIN,
973 ISD::SMAX,
974 ISD::UMIN,
975 ISD::UMAX,
976 ISD::AND,
977 ISD::OR,
978 ISD::XOR,
979 ISD::SHL,
980 ISD::SRL,
981 ISD::SRA,
982 ISD::FSHR,
983 ISD::SINT_TO_FP,
984 ISD::UINT_TO_FP,
985 ISD::FCANONICALIZE,
986 ISD::SCALAR_TO_VECTOR,
987 ISD::ZERO_EXTEND,
988 ISD::SIGN_EXTEND_INREG,
989 ISD::EXTRACT_VECTOR_ELT,
990 ISD::INSERT_VECTOR_ELT,
991 ISD::FCOPYSIGN});
992
993 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
994 setTargetDAGCombine(ISD::FP_ROUND);
995
996 // All memory operations. Some folding on the pointer operand is done to help
997 // matching the constant offsets in the addressing modes.
998 setTargetDAGCombine({ISD::LOAD,
999 ISD::STORE,
1000 ISD::ATOMIC_LOAD,
1001 ISD::ATOMIC_STORE,
1002 ISD::ATOMIC_CMP_SWAP,
1003 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1004 ISD::ATOMIC_SWAP,
1005 ISD::ATOMIC_LOAD_ADD,
1006 ISD::ATOMIC_LOAD_SUB,
1007 ISD::ATOMIC_LOAD_AND,
1008 ISD::ATOMIC_LOAD_OR,
1009 ISD::ATOMIC_LOAD_XOR,
1010 ISD::ATOMIC_LOAD_NAND,
1011 ISD::ATOMIC_LOAD_MIN,
1012 ISD::ATOMIC_LOAD_MAX,
1013 ISD::ATOMIC_LOAD_UMIN,
1014 ISD::ATOMIC_LOAD_UMAX,
1015 ISD::ATOMIC_LOAD_FADD,
1016 ISD::ATOMIC_LOAD_FMIN,
1017 ISD::ATOMIC_LOAD_FMAX,
1018 ISD::ATOMIC_LOAD_UINC_WRAP,
1019 ISD::ATOMIC_LOAD_UDEC_WRAP,
1020 ISD::INTRINSIC_VOID,
1021 ISD::INTRINSIC_W_CHAIN});
1022
1023 // FIXME: In other contexts we pretend this is a per-function property.
1024 setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
1025
1026 setSchedulingPreference(Sched::RegPressure);
1027}
1028
1029const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1030
1031ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
1032 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1033 return RCRegs;
1034}
1035
1036//===----------------------------------------------------------------------===//
1037// TargetLowering queries
1038//===----------------------------------------------------------------------===//
1039
1040// v_mad_mix* support a conversion from f16 to f32.
1041//
1042// There is only one special case when denormals are enabled we don't currently,
1043// where this is OK to use.
1044bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1045 EVT DestVT, EVT SrcVT) const {
1046 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1047 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1048 DestVT.getScalarType() == MVT::f32 &&
1049 SrcVT.getScalarType() == MVT::f16 &&
1050 // TODO: This probably only requires no input flushing?
1051 denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
1052}
1053
1054bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
1055 LLT DestTy, LLT SrcTy) const {
1056 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1057 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1058 DestTy.getScalarSizeInBits() == 32 &&
1059 SrcTy.getScalarSizeInBits() == 16 &&
1060 // TODO: This probably only requires no input flushing?
1061 denormalModeIsFlushAllF32(MF: *MI.getMF());
1062}
1063
1064bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
1065 // SI has some legal vector types, but no legal vector operations. Say no
1066 // shuffles are legal in order to prefer scalarizing some vector operations.
1067 return false;
1068}
1069
1070MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1071 CallingConv::ID CC,
1072 EVT VT) const {
1073 if (CC == CallingConv::AMDGPU_KERNEL)
1074 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1075
1076 if (VT.isVector()) {
1077 EVT ScalarVT = VT.getScalarType();
1078 unsigned Size = ScalarVT.getSizeInBits();
1079 if (Size == 16) {
1080 if (Subtarget->has16BitInsts()) {
1081 if (VT.isInteger())
1082 return MVT::v2i16;
1083 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1084 }
1085 return VT.isInteger() ? MVT::i32 : MVT::f32;
1086 }
1087
1088 if (Size < 16)
1089 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1090 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1091 }
1092
1093 if (VT.getSizeInBits() > 32)
1094 return MVT::i32;
1095
1096 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1097}
1098
1099unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1100 CallingConv::ID CC,
1101 EVT VT) const {
1102 if (CC == CallingConv::AMDGPU_KERNEL)
1103 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1104
1105 if (VT.isVector()) {
1106 unsigned NumElts = VT.getVectorNumElements();
1107 EVT ScalarVT = VT.getScalarType();
1108 unsigned Size = ScalarVT.getSizeInBits();
1109
1110 // FIXME: Should probably promote 8-bit vectors to i16.
1111 if (Size == 16 && Subtarget->has16BitInsts())
1112 return (NumElts + 1) / 2;
1113
1114 if (Size <= 32)
1115 return NumElts;
1116
1117 if (Size > 32)
1118 return NumElts * ((Size + 31) / 32);
1119 } else if (VT.getSizeInBits() > 32)
1120 return (VT.getSizeInBits() + 31) / 32;
1121
1122 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1123}
1124
1125unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
1126 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1127 unsigned &NumIntermediates, MVT &RegisterVT) const {
1128 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1129 unsigned NumElts = VT.getVectorNumElements();
1130 EVT ScalarVT = VT.getScalarType();
1131 unsigned Size = ScalarVT.getSizeInBits();
1132 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1133 // support, but unless we can properly handle 3-vectors, it will be still be
1134 // inconsistent.
1135 if (Size == 16 && Subtarget->has16BitInsts()) {
1136 if (ScalarVT == MVT::bf16) {
1137 RegisterVT = MVT::i32;
1138 IntermediateVT = MVT::v2bf16;
1139 } else {
1140 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1141 IntermediateVT = RegisterVT;
1142 }
1143 NumIntermediates = (NumElts + 1) / 2;
1144 return NumIntermediates;
1145 }
1146
1147 if (Size == 32) {
1148 RegisterVT = ScalarVT.getSimpleVT();
1149 IntermediateVT = RegisterVT;
1150 NumIntermediates = NumElts;
1151 return NumIntermediates;
1152 }
1153
1154 if (Size < 16 && Subtarget->has16BitInsts()) {
1155 // FIXME: Should probably form v2i16 pieces
1156 RegisterVT = MVT::i16;
1157 IntermediateVT = ScalarVT;
1158 NumIntermediates = NumElts;
1159 return NumIntermediates;
1160 }
1161
1162 if (Size != 16 && Size <= 32) {
1163 RegisterVT = MVT::i32;
1164 IntermediateVT = ScalarVT;
1165 NumIntermediates = NumElts;
1166 return NumIntermediates;
1167 }
1168
1169 if (Size > 32) {
1170 RegisterVT = MVT::i32;
1171 IntermediateVT = RegisterVT;
1172 NumIntermediates = NumElts * ((Size + 31) / 32);
1173 return NumIntermediates;
1174 }
1175 }
1176
1177 return TargetLowering::getVectorTypeBreakdownForCallingConv(
1178 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1179}
1180
1181static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1182 const DataLayout &DL, Type *Ty,
1183 unsigned MaxNumLanes) {
1184 assert(MaxNumLanes != 0);
1185
1186 LLVMContext &Ctx = Ty->getContext();
1187 if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
1188 unsigned NumElts = std::min(a: MaxNumLanes, b: VT->getNumElements());
1189 return EVT::getVectorVT(Context&: Ctx, VT: TLI.getValueType(DL, Ty: VT->getElementType()),
1190 NumElements: NumElts);
1191 }
1192
1193 return TLI.getValueType(DL, Ty);
1194}
1195
1196// Peek through TFE struct returns to only use the data size.
1197static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1198 const DataLayout &DL, Type *Ty,
1199 unsigned MaxNumLanes) {
1200 auto *ST = dyn_cast<StructType>(Val: Ty);
1201 if (!ST)
1202 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1203
1204 // TFE intrinsics return an aggregate type.
1205 assert(ST->getNumContainedTypes() == 2 &&
1206 ST->getContainedType(1)->isIntegerTy(32));
1207 return memVTFromLoadIntrData(TLI, DL, Ty: ST->getContainedType(i: 0), MaxNumLanes);
1208}
1209
1210/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1211/// in-memory representation. This return value is a custom type because there
1212/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1213/// could cause issues during codegen, these address space 7 pointers will be
1214/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1215/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1216/// for cost modeling, to work. (This also sets us up decently for doing the
1217/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1218MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1219 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1220 return MVT::amdgpuBufferFatPointer;
1221 if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1222 DL.getPointerSizeInBits(AS) == 192)
1223 return MVT::amdgpuBufferStridedPointer;
1224 return AMDGPUTargetLowering::getPointerTy(DL, AS);
1225}
1226/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1227/// v8i32 when padding is added.
1228/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1229/// also v8i32 with padding.
1230MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1231 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1232 DL.getPointerSizeInBits(AS) == 160) ||
1233 (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1234 DL.getPointerSizeInBits(AS) == 192))
1235 return MVT::v8i32;
1236 return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1237}
1238
1239bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
1240 const CallInst &CI,
1241 MachineFunction &MF,
1242 unsigned IntrID) const {
1243 Info.flags = MachineMemOperand::MONone;
1244 if (CI.hasMetadata(KindID: LLVMContext::MD_invariant_load))
1245 Info.flags |= MachineMemOperand::MOInvariant;
1246 if (CI.hasMetadata(KindID: LLVMContext::MD_nontemporal))
1247 Info.flags |= MachineMemOperand::MONonTemporal;
1248 Info.flags |= getTargetMMOFlags(I: CI);
1249
1250 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1251 AMDGPU::lookupRsrcIntrinsic(Intr: IntrID)) {
1252 AttributeSet Attr =
1253 Intrinsic::getFnAttributes(C&: CI.getContext(), id: (Intrinsic::ID)IntrID);
1254 MemoryEffects ME = Attr.getMemoryEffects();
1255 if (ME.doesNotAccessMemory())
1256 return false;
1257
1258 // TODO: Should images get their own address space?
1259 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1260
1261 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1262 if (RsrcIntr->IsImage) {
1263 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1264 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID);
1265 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
1266 Info.align.reset();
1267 }
1268
1269 Value *RsrcArg = CI.getArgOperand(i: RsrcIntr->RsrcArg);
1270 if (auto *RsrcPtrTy = dyn_cast<PointerType>(Val: RsrcArg->getType())) {
1271 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1272 // We conservatively set the memory operand of a buffer intrinsic to the
1273 // base resource pointer, so that we can access alias information about
1274 // those pointers. Cases like "this points at the same value
1275 // but with a different offset" are handled in
1276 // areMemAccessesTriviallyDisjoint.
1277 Info.ptrVal = RsrcArg;
1278 }
1279
1280 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1281 if (!IsSPrefetch) {
1282 auto *Aux = cast<ConstantInt>(Val: CI.getArgOperand(i: CI.arg_size() - 1));
1283 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1284 Info.flags |= MachineMemOperand::MOVolatile;
1285 }
1286
1287 Info.flags |= MachineMemOperand::MODereferenceable;
1288 if (ME.onlyReadsMemory()) {
1289 if (RsrcIntr->IsImage) {
1290 unsigned MaxNumLanes = 4;
1291
1292 if (!BaseOpcode->Gather4) {
1293 // If this isn't a gather, we may have excess loaded elements in the
1294 // IR type. Check the dmask for the real number of elements loaded.
1295 unsigned DMask =
1296 cast<ConstantInt>(Val: CI.getArgOperand(i: 0))->getZExtValue();
1297 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(Value: DMask);
1298 }
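        // For example, a dmask of 0b0101 enables two components, so only two
        // lanes are loaded even if the IR return type has four elements.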
1299
1300 Info.memVT = memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(),
1301 Ty: CI.getType(), MaxNumLanes);
1302 } else {
1303 Info.memVT =
1304 memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(), Ty: CI.getType(),
1305 MaxNumLanes: std::numeric_limits<unsigned>::max());
1306 }
1307
1308 // FIXME: What does alignment mean for an image?
1309 Info.opc = ISD::INTRINSIC_W_CHAIN;
1310 Info.flags |= MachineMemOperand::MOLoad;
1311 } else if (ME.onlyWritesMemory()) {
1312 Info.opc = ISD::INTRINSIC_VOID;
1313
1314 Type *DataTy = CI.getArgOperand(i: 0)->getType();
1315 if (RsrcIntr->IsImage) {
1316 unsigned DMask = cast<ConstantInt>(Val: CI.getArgOperand(i: 1))->getZExtValue();
1317 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(Value: DMask);
1318 Info.memVT = memVTFromLoadIntrData(TLI: *this, DL: MF.getDataLayout(), Ty: DataTy,
1319 MaxNumLanes: DMaskLanes);
1320 } else
1321 Info.memVT = getValueType(DL: MF.getDataLayout(), Ty: DataTy);
1322
1323 Info.flags |= MachineMemOperand::MOStore;
1324 } else {
1325      // Atomic, no-return sampler, or prefetch.
1326 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1327 : ISD::INTRINSIC_W_CHAIN;
1328 Info.flags |=
1329 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
1330
1331 if (!IsSPrefetch)
1332 Info.flags |= MachineMemOperand::MOStore;
1333
1334 switch (IntrID) {
1335 default:
1336 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1337          // Fake memory access type for no-return sampler intrinsics.
1338 Info.memVT = MVT::i32;
1339 } else {
1340 // XXX - Should this be volatile without known ordering?
1341 Info.flags |= MachineMemOperand::MOVolatile;
1342 Info.memVT = MVT::getVT(Ty: CI.getArgOperand(i: 0)->getType());
1343 }
1344 break;
1345 case Intrinsic::amdgcn_raw_buffer_load_lds:
1346 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1347 case Intrinsic::amdgcn_struct_buffer_load_lds:
1348 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1349 unsigned Width = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getZExtValue();
1350 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: Width * 8);
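      // The width operand is a byte count, so e.g. a width of 4 gives an i32
      // memory type; the memory operand tracks the LDS destination pointer.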
1351 Info.ptrVal = CI.getArgOperand(i: 1);
1352 return true;
1353 }
1354 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1355 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1356 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1357 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1358 Info.memVT =
1359 memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(), Ty: CI.getType(),
1360 MaxNumLanes: std::numeric_limits<unsigned>::max());
1361 Info.flags &= ~MachineMemOperand::MOStore;
1362 return true;
1363 }
1364 }
1365 }
1366 return true;
1367 }
1368
1369 switch (IntrID) {
1370 case Intrinsic::amdgcn_ds_ordered_add:
1371 case Intrinsic::amdgcn_ds_ordered_swap: {
1372 Info.opc = ISD::INTRINSIC_W_CHAIN;
1373 Info.memVT = MVT::getVT(Ty: CI.getType());
1374 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1375 Info.align.reset();
1376 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1377
1378 const ConstantInt *Vol = cast<ConstantInt>(Val: CI.getOperand(i_nocapture: 4));
1379 if (!Vol->isZero())
1380 Info.flags |= MachineMemOperand::MOVolatile;
1381
1382 return true;
1383 }
1384 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1385 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1386 Info.opc = ISD::INTRINSIC_W_CHAIN;
1387 Info.memVT = MVT::getVT(Ty: CI.getOperand(i_nocapture: 0)->getType());
1388 Info.ptrVal = nullptr;
1389 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1390 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1391 return true;
1392 }
1393 case Intrinsic::amdgcn_ds_append:
1394 case Intrinsic::amdgcn_ds_consume: {
1395 Info.opc = ISD::INTRINSIC_W_CHAIN;
1396 Info.memVT = MVT::getVT(Ty: CI.getType());
1397 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1398 Info.align.reset();
1399 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1400
1401 const ConstantInt *Vol = cast<ConstantInt>(Val: CI.getOperand(i_nocapture: 1));
1402 if (!Vol->isZero())
1403 Info.flags |= MachineMemOperand::MOVolatile;
1404
1405 return true;
1406 }
1407 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1408 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1409 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1410 ? ISD::INTRINSIC_W_CHAIN
1411 : ISD::INTRINSIC_VOID;
1413 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1414 Info.memVT = MVT::i64;
1415 Info.size = 8;
1416 Info.align.reset();
1417 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1418 return true;
1419 }
1420 case Intrinsic::amdgcn_global_atomic_csub: {
1421 Info.opc = ISD::INTRINSIC_W_CHAIN;
1422 Info.memVT = MVT::getVT(Ty: CI.getType());
1423 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1424 Info.align.reset();
1425 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
1426 MachineMemOperand::MOVolatile;
1427 return true;
1428 }
1429 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1430 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1431 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1432 Info.opc = ISD::INTRINSIC_W_CHAIN;
1433 Info.memVT =
1434 MVT::getVT(Ty: IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1435 ? CI.getType()
1436 : cast<StructType>(Val: CI.getType())
1437 ->getElementType(N: 0)); // XXX: what is correct VT?
1438
1439 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1440 Info.align.reset();
1441 Info.flags |=
1442 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
1443 return true;
1444 }
1445 case Intrinsic::amdgcn_global_atomic_fmin_num:
1446 case Intrinsic::amdgcn_global_atomic_fmax_num:
1447 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1448 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1449 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1450 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1451 Info.opc = ISD::INTRINSIC_W_CHAIN;
1452 Info.memVT = MVT::getVT(Ty: CI.getType());
1453 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1454 Info.align.reset();
1455 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
1456 MachineMemOperand::MODereferenceable |
1457 MachineMemOperand::MOVolatile;
1458 return true;
1459 }
1460 case Intrinsic::amdgcn_ds_load_tr6_b96:
1461 case Intrinsic::amdgcn_ds_load_tr4_b64:
1462 case Intrinsic::amdgcn_ds_load_tr8_b64:
1463 case Intrinsic::amdgcn_ds_load_tr16_b128:
1464 case Intrinsic::amdgcn_global_load_tr6_b96:
1465 case Intrinsic::amdgcn_global_load_tr4_b64:
1466 case Intrinsic::amdgcn_global_load_tr_b64:
1467 case Intrinsic::amdgcn_global_load_tr_b128:
1468 case Intrinsic::amdgcn_ds_read_tr4_b64:
1469 case Intrinsic::amdgcn_ds_read_tr6_b96:
1470 case Intrinsic::amdgcn_ds_read_tr8_b64:
1471 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1472 Info.opc = ISD::INTRINSIC_W_CHAIN;
1473 Info.memVT = MVT::getVT(Ty: CI.getType());
1474 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1475 Info.align.reset();
1476 Info.flags |= MachineMemOperand::MOLoad;
1477 return true;
1478 }
1479 case Intrinsic::amdgcn_ds_gws_init:
1480 case Intrinsic::amdgcn_ds_gws_barrier:
1481 case Intrinsic::amdgcn_ds_gws_sema_v:
1482 case Intrinsic::amdgcn_ds_gws_sema_br:
1483 case Intrinsic::amdgcn_ds_gws_sema_p:
1484 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1485 Info.opc = ISD::INTRINSIC_VOID;
1486
1487 const GCNTargetMachine &TM =
1488 static_cast<const GCNTargetMachine &>(getTargetMachine());
1489
1490 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1491 Info.ptrVal = MFI->getGWSPSV(TM);
1492
1493 // This is an abstract access, but we need to specify a type and size.
1494 Info.memVT = MVT::i32;
1495 Info.size = 4;
1496 Info.align = Align(4);
1497
1498 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1499 Info.flags |= MachineMemOperand::MOLoad;
1500 else
1501 Info.flags |= MachineMemOperand::MOStore;
1502 return true;
1503 }
1504 case Intrinsic::amdgcn_load_to_lds:
1505 case Intrinsic::amdgcn_global_load_lds: {
1506 Info.opc = ISD::INTRINSIC_VOID;
1507 unsigned Width = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getZExtValue();
1508 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: Width * 8);
1509 Info.ptrVal = CI.getArgOperand(i: 1);
1510 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1511 return true;
1512 }
1513 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1514 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1515 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1516 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1517 Info.opc = ISD::INTRINSIC_W_CHAIN;
1518
1519 const GCNTargetMachine &TM =
1520 static_cast<const GCNTargetMachine &>(getTargetMachine());
1521
1522 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1523 Info.ptrVal = MFI->getGWSPSV(TM);
1524
1525 // This is an abstract access, but we need to specify a type and size.
1526 Info.memVT = MVT::i32;
1527 Info.size = 4;
1528 Info.align = Align(4);
1529
1530 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1531 return true;
1532 }
1533 case Intrinsic::amdgcn_s_prefetch_data: {
1534 Info.opc = ISD::INTRINSIC_VOID;
1535 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: 8);
1536 Info.ptrVal = CI.getArgOperand(i: 0);
1537 Info.flags |= MachineMemOperand::MOLoad;
1538 return true;
1539 }
1540 default:
1541 return false;
1542 }
1543}
1544
1545void SITargetLowering::CollectTargetIntrinsicOperands(
1546 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1547 switch (cast<IntrinsicInst>(Val: I).getIntrinsicID()) {
1548 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1549 // The DAG's ValueType loses the addrspaces.
1550 // Add them as 2 extra Constant operands "from" and "to".
1551 unsigned SrcAS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
1552 unsigned DstAS = I.getType()->getPointerAddressSpace();
1553 Ops.push_back(Elt: DAG.getTargetConstant(Val: SrcAS, DL: SDLoc(), VT: MVT::i32));
1554 Ops.push_back(Elt: DAG.getTargetConstant(Val: DstAS, DL: SDLoc(), VT: MVT::i32));
1555 break;
1556 }
1557 default:
1558 break;
1559 }
1560}
1561
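// For the intrinsics handled below, report which operand is the pointer and
// which type is accessed so that address computations feeding these
// intrinsics can be folded into their addressing modes, just as is done for
// ordinary loads and stores.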
1562bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
1563 SmallVectorImpl<Value *> &Ops,
1564 Type *&AccessTy) const {
1565 Value *Ptr = nullptr;
1566 switch (II->getIntrinsicID()) {
1567 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1568 case Intrinsic::amdgcn_ds_append:
1569 case Intrinsic::amdgcn_ds_consume:
1570 case Intrinsic::amdgcn_ds_load_tr8_b64:
1571 case Intrinsic::amdgcn_ds_load_tr16_b128:
1572 case Intrinsic::amdgcn_ds_load_tr4_b64:
1573 case Intrinsic::amdgcn_ds_load_tr6_b96:
1574 case Intrinsic::amdgcn_ds_read_tr4_b64:
1575 case Intrinsic::amdgcn_ds_read_tr6_b96:
1576 case Intrinsic::amdgcn_ds_read_tr8_b64:
1577 case Intrinsic::amdgcn_ds_read_tr16_b64:
1578 case Intrinsic::amdgcn_ds_ordered_add:
1579 case Intrinsic::amdgcn_ds_ordered_swap:
1580 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1581 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1582 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1583 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1584 case Intrinsic::amdgcn_global_atomic_csub:
1585 case Intrinsic::amdgcn_global_atomic_fmax_num:
1586 case Intrinsic::amdgcn_global_atomic_fmin_num:
1587 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1588 case Intrinsic::amdgcn_global_load_tr_b64:
1589 case Intrinsic::amdgcn_global_load_tr_b128:
1590 case Intrinsic::amdgcn_global_load_tr4_b64:
1591 case Intrinsic::amdgcn_global_load_tr6_b96:
1592 Ptr = II->getArgOperand(i: 0);
1593 break;
1594 case Intrinsic::amdgcn_load_to_lds:
1595 case Intrinsic::amdgcn_global_load_lds:
1596 Ptr = II->getArgOperand(i: 1);
1597 break;
1598 default:
1599 return false;
1600 }
1601 AccessTy = II->getType();
1602 Ops.push_back(Elt: Ptr);
1603 return true;
1604}
1605
1606bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1607 unsigned AddrSpace) const {
1608 if (!Subtarget->hasFlatInstOffsets()) {
1609 // Flat instructions do not have offsets, and only have the register
1610 // address.
1611 return AM.BaseOffs == 0 && AM.Scale == 0;
1612 }
1613
1614 decltype(SIInstrFlags::FLAT) FlatVariant =
1615 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ? SIInstrFlags::FlatGlobal
1616 : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch
1617 : SIInstrFlags::FLAT;
1618
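  // For example, "base + 4095" in the global address space is accepted when
  // isLegalFLATOffset allows an offset of 4095 for the FlatGlobal variant,
  // whereas any scaled-index form (AM.Scale != 0) is always rejected here.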
1619 return AM.Scale == 0 &&
1620 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1621 Offset: AM.BaseOffs, AddrSpace, FlatVariant));
1622}
1623
1624bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1625 if (Subtarget->hasFlatGlobalInsts())
1626 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS);
1627
1628 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1629    // Assume that we will use FLAT for all global memory accesses
1630 // on VI.
1631 // FIXME: This assumption is currently wrong. On VI we still use
1632 // MUBUF instructions for the r + i addressing mode. As currently
1633 // implemented, the MUBUF instructions only work on buffer < 4GB.
1634 // It may be possible to support > 4GB buffers with MUBUF instructions,
1635 // by setting the stride value in the resource descriptor which would
1636 // increase the size limit to (stride * 4GB). However, this is risky,
1637 // because it has never been validated.
1638 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS);
1639 }
1640
1641 return isLegalMUBUFAddressingMode(AM);
1642}
1643
1644bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1645 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1646 // additionally can do r + r + i with addr64. 32-bit has more addressing
1647 // mode options. Depending on the resource constant, it can also do
1648 // (i64 r0) + (i32 r1) * (i14 i).
1649 //
1650 // Private arrays end up using a scratch buffer most of the time, so also
1651 // assume those use MUBUF instructions. Scratch loads / stores are currently
1652  // implemented as MUBUF instructions with the offen bit set, so they are
1653  // slightly different from the normal addr64 form.
1654 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1655 if (!TII->isLegalMUBUFImmOffset(Imm: AM.BaseOffs))
1656 return false;
1657
1658 // FIXME: Since we can split immediate into soffset and immediate offset,
1659 // would it make sense to allow any immediate?
1660
1661 switch (AM.Scale) {
1662 case 0: // r + i or just i, depending on HasBaseReg.
1663 return true;
1664 case 1:
1665 return true; // We have r + r or r + i.
1666 case 2:
1667 if (AM.HasBaseReg) {
1668 // Reject 2 * r + r.
1669 return false;
1670 }
1671
1672 // Allow 2 * r as r + r
1673 // Or 2 * r + i is allowed as r + r + i.
1674 return true;
1675 default: // Don't allow n * r
1676 return false;
1677 }
1678}
1679
1680bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1681 const AddrMode &AM, Type *Ty,
1682 unsigned AS,
1683 Instruction *I) const {
1684 // No global is ever allowed as a base.
1685 if (AM.BaseGV)
1686 return false;
1687
1688 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1689 return isLegalGlobalAddressingMode(AM);
1690
1691 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1692 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1693 AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
1694 AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
1695 // If the offset isn't a multiple of 4, it probably isn't going to be
1696 // correctly aligned.
1697 // FIXME: Can we get the real alignment here?
1698 if (AM.BaseOffs % 4 != 0)
1699 return isLegalMUBUFAddressingMode(AM);
1700
1701 if (!Subtarget->hasScalarSubwordLoads()) {
1702 // There are no SMRD extloads, so if we have to do a small type access we
1703 // will use a MUBUF load.
1704 // FIXME?: We also need to do this if unaligned, but we don't know the
1705 // alignment here.
1706 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1707 return isLegalGlobalAddressingMode(AM);
1708 }
1709
1710 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1711 // SMRD instructions have an 8-bit, dword offset on SI.
1712 if (!isUInt<8>(x: AM.BaseOffs / 4))
1713 return false;
1714 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1715 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1716 // in 8-bits, it can use a smaller encoding.
1717 if (!isUInt<32>(x: AM.BaseOffs / 4))
1718 return false;
1719 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1720 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1721 if (!isUInt<20>(x: AM.BaseOffs))
1722 return false;
1723 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1724 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1725 // for S_BUFFER_* instructions).
1726 if (!isInt<21>(x: AM.BaseOffs))
1727 return false;
1728 } else {
1729 // On GFX12, all offsets are signed 24-bit in bytes.
1730 if (!isInt<24>(x: AM.BaseOffs))
1731 return false;
1732 }
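    // As a worked example, a byte offset of 4096 is dword offset 1024, which
    // needs 11 bits: it is rejected on SI by the 8-bit check above but
    // accepted on CI and later generations.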
1733
1734 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1735 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1736 AM.BaseOffs < 0) {
1737 // Scalar (non-buffer) loads can only use a negative offset if
1738 // soffset+offset is non-negative. Since the compiler can only prove that
1739 // in a few special cases, it is safer to claim that negative offsets are
1740 // not supported.
1741 return false;
1742 }
1743
1744 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1745 return true;
1746
1747 if (AM.Scale == 1 && AM.HasBaseReg)
1748 return true;
1749
1750 return false;
1751 }
1752
1753 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1754 return Subtarget->enableFlatScratch()
1755 ? isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS)
1756 : isLegalMUBUFAddressingMode(AM);
1757
1758 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1759 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1760 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1761 // field.
1762 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1763 // an 8-bit dword offset but we don't know the alignment here.
1764 if (!isUInt<16>(x: AM.BaseOffs))
1765 return false;
1766
1767 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1768 return true;
1769
1770 if (AM.Scale == 1 && AM.HasBaseReg)
1771 return true;
1772
1773 return false;
1774 }
1775
1776 if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1777 // For an unknown address space, this usually means that this is for some
1778 // reason being used for pure arithmetic, and not based on some addressing
1779 // computation. We don't have instructions that compute pointers with any
1780 // addressing modes, so treat them as having no offset like flat
1781 // instructions.
1782 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS);
1783 }
1784
1785 // Assume a user alias of global for unknown address spaces.
1786 return isLegalGlobalAddressingMode(AM);
1787}
1788
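// The per-address-space limits checked below: flat and global stores may be
// merged up to 128 bits, LDS/GDS stores up to 64 bits, and private stores up
// to 8 * getMaxPrivateElementSize() bits.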
1789bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1790 const MachineFunction &MF) const {
1791 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1792 return (MemVT.getSizeInBits() <= 4 * 32);
1793 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1794 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1795 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1796 }
1797 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
1798 return (MemVT.getSizeInBits() <= 2 * 32);
1799 return true;
1800}
1801
1802bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1803 unsigned Size, unsigned AddrSpace, Align Alignment,
1804 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1805 if (IsFast)
1806 *IsFast = 0;
1807
1808 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1809 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1810 // Check if alignment requirements for ds_read/write instructions are
1811 // disabled.
1812 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1813 return false;
1814
1815 Align RequiredAlignment(
1816 PowerOf2Ceil(A: divideCeil(Numerator: Size, Denominator: 8))); // Natural alignment.
1817 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1818 Alignment < RequiredAlignment)
1819 return false;
1820
1821    // Either the alignment requirements are "enabled", or there is an
1822    // unaligned-LDS-access-related hardware bug even though the alignment
1823    // requirements are "disabled". In either case, we need to check for
1824    // proper alignment.
1825 //
1826 switch (Size) {
1827 case 64:
1828 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1829 // address is negative, then the instruction is incorrectly treated as
1830 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1831 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1832 // load later in the SILoadStoreOptimizer.
1833 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1834 return false;
1835
1836      // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1837      // can do a 4-byte aligned, 8-byte access in a single operation using
1838      // ds_read2/write2_b32 with adjacent offsets.
1839 RequiredAlignment = Align(4);
1840
1841 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1842 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1843 // ds_write2_b32 depending on the alignment. In either case with either
1844 // alignment there is no faster way of doing this.
1845
1846        // The numbers returned here and below are not additive; they form a
1847        // 'speed rank', only meant to be compared to decide whether one way of
1848        // lowering an operation is faster than another. For that purpose a
1849        // naturally aligned operation gets its bitsize to indicate that "it
1850        // operates with a speed comparable to an N-bit wide load". With full
1851        // alignment ds128 is slower than ds96, for example. If underaligned, it
1852        // is comparable in speed to a single dword access, which would then
1853        // mean 32 < 128 and it is faster to issue a wide load regardless. A
1854        // value of 1 simply means "slow, don't do it": an aligned load is
1855        // faster than a wider load that would no longer be aligned.
1856 if (IsFast)
1857 *IsFast = (Alignment >= RequiredAlignment) ? 64
1858 : (Alignment < Align(4)) ? 32
1859 : 1;
1860 return true;
1861 }
1862
1863 break;
1864 case 96:
1865 if (!Subtarget->hasDS96AndDS128())
1866 return false;
1867
1868      // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1869      // gfx8 and older.
1870
1871 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1872        // Naturally aligned access is fastest. However, also report it as fast
1873        // if the memory is aligned to less than a DWORD. A narrow load or store
1874        // will be just as slow as a single ds_read_b96/ds_write_b96, but there
1875        // will be more of them, so overall we pay less penalty by issuing a
1876        // single instruction.
1877
1878 // See comment on the values above.
1879 if (IsFast)
1880 *IsFast = (Alignment >= RequiredAlignment) ? 96
1881 : (Alignment < Align(4)) ? 32
1882 : 1;
1883 return true;
1884 }
1885
1886 break;
1887 case 128:
1888 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1889 return false;
1890
1891      // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1892      // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1893      // single operation using ds_read2/write2_b64.
1894 RequiredAlignment = Align(8);
1895
1896 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1897        // Naturally aligned access is fastest. However, also report it as fast
1898        // if the memory is aligned to less than a DWORD. A narrow load or store
1899        // will be just as slow as a single ds_read_b128/ds_write_b128, but there
1900        // will be more of them, so overall we pay less penalty by issuing a
1901        // single instruction.
1902
1903 // See comment on the values above.
1904 if (IsFast)
1905 *IsFast = (Alignment >= RequiredAlignment) ? 128
1906 : (Alignment < Align(4)) ? 32
1907 : 1;
1908 return true;
1909 }
1910
1911 break;
1912 default:
1913 if (Size > 32)
1914 return false;
1915
1916 break;
1917 }
1918
1919 // See comment on the values above.
1920    // Note that we have a single-dword or sub-dword access here, so if it is
1921    // underaligned it is the slowest possible access, hence the returned value is 0.
1922 if (IsFast)
1923 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1924
1925 return Alignment >= RequiredAlignment ||
1926 Subtarget->hasUnalignedDSAccessEnabled();
1927 }
1928
1929 // FIXME: We have to be conservative here and assume that flat operations
1930 // will access scratch. If we had access to the IR function, then we
1931 // could determine if any private memory was used in the function.
1932 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1933 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
1934 bool AlignedBy4 = Alignment >= Align(4);
1935 if (IsFast)
1936 *IsFast = AlignedBy4;
1937
1938 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
1939 }
1940
1941 // So long as they are correct, wide global memory operations perform better
1942  // than multiple smaller memory ops -- even when misaligned.
1943 if (AMDGPU::isExtendedGlobalAddrSpace(AS: AddrSpace)) {
1944 if (IsFast)
1945 *IsFast = Size;
1946
1947 return Alignment >= Align(4) ||
1948 Subtarget->hasUnalignedBufferAccessEnabled();
1949 }
1950
1951 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
1952 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
1953 // out-of-bounds behavior, but in the edge case where an access starts
1954  // out-of-bounds and then enters in-bounds, the entire access would be treated
1955 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
1956 // natural alignment of buffer accesses.
1957 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
1958 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
1959 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
1960 if (!Subtarget->hasRelaxedBufferOOBMode() &&
1961 Alignment < Align(PowerOf2Ceil(A: divideCeil(Numerator: Size, Denominator: 8))))
1962 return false;
1963 }
1964
1965 // Smaller than dword value must be aligned.
1966 if (Size < 32)
1967 return false;
1968
1969 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1970 // byte-address are ignored, thus forcing Dword alignment.
1971 // This applies to private, global, and constant memory.
1972 if (IsFast)
1973 *IsFast = 1;
1974
1975 return Size >= 32 && Alignment >= Align(4);
1976}
1977
1978bool SITargetLowering::allowsMisalignedMemoryAccesses(
1979 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1980 unsigned *IsFast) const {
1981 return allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace,
1982 Alignment, Flags, IsFast);
1983}
1984
1985EVT SITargetLowering::getOptimalMemOpType(
1986 const MemOp &Op, const AttributeList &FuncAttributes) const {
1987 // FIXME: Should account for address space here.
1988
1989 // The default fallback uses the private pointer size as a guess for a type to
1990 // use. Make sure we switch these to 64-bit accesses.
1991
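  // For instance, a 32-byte memcpy whose destination is known to be 4-byte
  // aligned is expanded using v4i32 (dwordx4) operations rather than many
  // narrower accesses.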
1992 if (Op.size() >= 16 &&
1993 Op.isDstAligned(AlignCheck: Align(4))) // XXX: Should only do for global
1994 return MVT::v4i32;
1995
1996 if (Op.size() >= 8 && Op.isDstAligned(AlignCheck: Align(4)))
1997 return MVT::v2i32;
1998
1999 // Use the default.
2000 return MVT::Other;
2001}
2002
2003bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
2004 const MemSDNode *MemNode = cast<MemSDNode>(Val: N);
2005 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2006}
2007
2008bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
2009 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
2010 AS == AMDGPUAS::PRIVATE_ADDRESS;
2011}
2012
2013bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
2014 unsigned DestAS) const {
2015 // Flat -> private/local is a simple truncate.
2016  // Flat -> global is a no-op.
2017 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2018 return true;
2019
2020 const GCNTargetMachine &TM =
2021 static_cast<const GCNTargetMachine &>(getTargetMachine());
2022 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2023}
2024
2025TargetLoweringBase::LegalizeTypeAction
2026SITargetLowering::getPreferredVectorAction(MVT VT) const {
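  // Keep small-element vectors as vectors instead of scalarizing them: for
  // example, v3i16 is widened to v4i16, while a power-of-2 type such as
  // v4i16 is split into two v2i16 halves.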
2027 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2028 VT.getScalarType().bitsLE(VT: MVT::i16))
2029 return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
2030 return TargetLoweringBase::getPreferredVectorAction(VT);
2031}
2032
2033bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
2034 Type *Ty) const {
2035 // FIXME: Could be smarter if called for vector constants.
2036 return true;
2037}
2038
2039bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
2040 unsigned Index) const {
2041 if (!isOperationLegalOrCustom(Op: ISD::EXTRACT_SUBVECTOR, VT: ResVT))
2042 return false;
2043
2044 // TODO: Add more cases that are cheap.
2045 return Index == 0;
2046}
2047
2048bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2049  // TODO: This should be more aggressive, particularly for 16-bit element
2050  // vectors. However, there are some mixed improvements and regressions.
2051 EVT EltTy = VT.getVectorElementType();
2052 return EltTy.getSizeInBits() % 32 == 0;
2053}
2054
2055bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
2056 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2057 switch (Op) {
2058 case ISD::LOAD:
2059 case ISD::STORE:
2060 return true;
2061 default:
2062 return false;
2063 }
2064 }
2065
2066 // SimplifySetCC uses this function to determine whether or not it should
2067 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2068 if (VT == MVT::i1 && Op == ISD::SETCC)
2069 return false;
2070
2071 return TargetLowering::isTypeDesirableForOp(Op, VT);
2072}
2073
2074SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2075 const SDLoc &SL,
2076 SDValue Chain,
2077 uint64_t Offset) const {
2078 const DataLayout &DL = DAG.getDataLayout();
2079 MachineFunction &MF = DAG.getMachineFunction();
2080 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2081 MVT PtrVT = getPointerTy(DL, AS: AMDGPUAS::CONSTANT_ADDRESS);
2082
2083 auto [InputPtrReg, RC, ArgTy] =
2084 Info->getPreloadedValue(Value: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2085
2086 // We may not have the kernarg segment argument if we have no kernel
2087 // arguments.
2088 if (!InputPtrReg)
2089 return DAG.getConstant(Val: Offset, DL: SL, VT: PtrVT);
2090
2091 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
2092 SDValue BasePtr = DAG.getCopyFromReg(
2093 Chain, dl: SL, Reg: MRI.getLiveInVirtReg(PReg: InputPtrReg->getRegister()), VT: PtrVT);
2094
2095 return DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Offset));
2096}
2097
2098SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2099 const SDLoc &SL) const {
2100 uint64_t Offset =
2101 getImplicitParameterOffset(MF: DAG.getMachineFunction(), Param: FIRST_IMPLICIT);
2102 return lowerKernArgParameterPtr(DAG, SL, Chain: DAG.getEntryNode(), Offset);
2103}
2104
2105SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2106 const SDLoc &SL) const {
2107
2108 Function &F = DAG.getMachineFunction().getFunction();
2109 std::optional<uint32_t> KnownSize =
2110 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
2111 if (KnownSize.has_value())
2112 return DAG.getConstant(Val: *KnownSize, DL: SL, VT: MVT::i32);
2113 return SDValue();
2114}
2115
2116SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2117 const SDLoc &SL, SDValue Val,
2118 bool Signed,
2119 const ISD::InputArg *Arg) const {
2120 // First, if it is a widened vector, narrow it.
2121 if (VT.isVector() &&
2122 VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
2123 EVT NarrowedVT =
2124 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(),
2125 NumElements: VT.getVectorNumElements());
2126 Val = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: NarrowedVT, N1: Val,
2127 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
2128 }
2129
2130 // Then convert the vector elements or scalar value.
2131 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(VT: MemVT)) {
2132 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2133 Val = DAG.getNode(Opcode: Opc, DL: SL, VT: MemVT, N1: Val, N2: DAG.getValueType(VT));
2134 }
2135
2136 if (MemVT.isFloatingPoint())
2137 Val = getFPExtOrFPRound(DAG, Op: Val, DL: SL, VT);
2138 else if (Signed)
2139 Val = DAG.getSExtOrTrunc(Op: Val, DL: SL, VT);
2140 else
2141 Val = DAG.getZExtOrTrunc(Op: Val, DL: SL, VT);
2142
2143 return Val;
2144}
2145
2146SDValue SITargetLowering::lowerKernargMemParameter(
2147 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2148 uint64_t Offset, Align Alignment, bool Signed,
2149 const ISD::InputArg *Arg) const {
2150 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2151
2152 // Try to avoid using an extload by loading earlier than the argument address,
2153 // and extracting the relevant bits. The load should hopefully be merged with
2154 // the previous argument.
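  // For example, an i16 argument at byte offset 6 is loaded as the dword at
  // offset 4, shifted right by 16 bits, and truncated to i16.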
2155 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2156 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2157 int64_t AlignDownOffset = alignDown(Value: Offset, Align: 4);
2158 int64_t OffsetDiff = Offset - AlignDownOffset;
2159
2160 EVT IntVT = MemVT.changeTypeToInteger();
2161
2162 // TODO: If we passed in the base kernel offset we could have a better
2163 // alignment than 4, but we don't really need it.
2164 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset: AlignDownOffset);
2165 SDValue Load = DAG.getLoad(VT: MVT::i32, dl: SL, Chain, Ptr, PtrInfo, Alignment: Align(4),
2166 MMOFlags: MachineMemOperand::MODereferenceable |
2167 MachineMemOperand::MOInvariant);
2168
2169 SDValue ShiftAmt = DAG.getConstant(Val: OffsetDiff * 8, DL: SL, VT: MVT::i32);
2170 SDValue Extract = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: Load, N2: ShiftAmt);
2171
2172 SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: IntVT, Operand: Extract);
2173 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MemVT, Operand: ArgVal);
2174 ArgVal = convertArgType(DAG, VT, MemVT, SL, Val: ArgVal, Signed, Arg);
2175
2176 return DAG.getMergeValues(Ops: {ArgVal, Load.getValue(R: 1)}, dl: SL);
2177 }
2178
2179 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2180 SDValue Load = DAG.getLoad(VT: MemVT, dl: SL, Chain, Ptr, PtrInfo, Alignment,
2181 MMOFlags: MachineMemOperand::MODereferenceable |
2182 MachineMemOperand::MOInvariant);
2183
2184 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Val: Load, Signed, Arg);
2185 return DAG.getMergeValues(Ops: {Val, Load.getValue(R: 1)}, dl: SL);
2186}
2187
2188SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2189 CCValAssign &VA, const SDLoc &SL,
2190 SDValue Chain,
2191 const ISD::InputArg &Arg) const {
2192 MachineFunction &MF = DAG.getMachineFunction();
2193 MachineFrameInfo &MFI = MF.getFrameInfo();
2194
2195 if (Arg.Flags.isByVal()) {
2196 unsigned Size = Arg.Flags.getByValSize();
2197 int FrameIdx = MFI.CreateFixedObject(Size, SPOffset: VA.getLocMemOffset(), IsImmutable: false);
2198 return DAG.getFrameIndex(FI: FrameIdx, VT: MVT::i32);
2199 }
2200
2201 unsigned ArgOffset = VA.getLocMemOffset();
2202 unsigned ArgSize = VA.getValVT().getStoreSize();
2203
2204 int FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset, IsImmutable: true);
2205
2206 // Create load nodes to retrieve arguments from the stack.
2207 SDValue FIN = DAG.getFrameIndex(FI, VT: MVT::i32);
2208 SDValue ArgValue;
2209
2210  // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
2211 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2212 MVT MemVT = VA.getValVT();
2213
2214 switch (VA.getLocInfo()) {
2215 default:
2216 break;
2217 case CCValAssign::BCvt:
2218 MemVT = VA.getLocVT();
2219 break;
2220 case CCValAssign::SExt:
2221 ExtType = ISD::SEXTLOAD;
2222 break;
2223 case CCValAssign::ZExt:
2224 ExtType = ISD::ZEXTLOAD;
2225 break;
2226 case CCValAssign::AExt:
2227 ExtType = ISD::EXTLOAD;
2228 break;
2229 }
2230
2231 ArgValue = DAG.getExtLoad(
2232 ExtType, dl: SL, VT: VA.getLocVT(), Chain, Ptr: FIN,
2233 PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI), MemVT);
2234 return ArgValue;
2235}
2236
2237SDValue SITargetLowering::getPreloadedValue(
2238 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2239 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
2240 const ArgDescriptor *Reg = nullptr;
2241 const TargetRegisterClass *RC;
2242 LLT Ty;
2243
2244 CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
2245 const ArgDescriptor WorkGroupIDX =
2246 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP9);
2247 // If GridZ is not programmed in an entry function then the hardware will set
2248 // it to all zeros, so there is no need to mask the GridY value in the low
2249 // order bits.
2250 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2251 Reg: AMDGPU::TTMP7,
2252 Mask: AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2253 const ArgDescriptor WorkGroupIDZ =
2254 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP7, Mask: 0xFFFF0000u);
2255 if (Subtarget->hasArchitectedSGPRs() &&
2256 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
2257 switch (PVID) {
2258 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
2259 Reg = &WorkGroupIDX;
2260 RC = &AMDGPU::SReg_32RegClass;
2261 Ty = LLT::scalar(SizeInBits: 32);
2262 break;
2263 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
2264 Reg = &WorkGroupIDY;
2265 RC = &AMDGPU::SReg_32RegClass;
2266 Ty = LLT::scalar(SizeInBits: 32);
2267 break;
2268 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
2269 Reg = &WorkGroupIDZ;
2270 RC = &AMDGPU::SReg_32RegClass;
2271 Ty = LLT::scalar(SizeInBits: 32);
2272 break;
2273 default:
2274 break;
2275 }
2276 }
2277
2278 if (!Reg)
2279 std::tie(args&: Reg, args&: RC, args&: Ty) = MFI.getPreloadedValue(Value: PVID);
2280 if (!Reg) {
2281 if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
2282 // It's possible for a kernarg intrinsic call to appear in a kernel with
2283 // no allocated segment, in which case we do not add the user sgpr
2284 // argument, so just return null.
2285 return DAG.getConstant(Val: 0, DL: SDLoc(), VT);
2286 }
2287
2288 // It's undefined behavior if a function marked with the amdgpu-no-*
2289 // attributes uses the corresponding intrinsic.
2290 return DAG.getPOISON(VT);
2291 }
2292
2293 return loadInputValue(DAG, RC, VT, SL: SDLoc(DAG.getEntryNode()), Arg: *Reg);
2294}
2295
2296static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
2297 CallingConv::ID CallConv,
2298 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2299 FunctionType *FType,
2300 SIMachineFunctionInfo *Info) {
2301 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2302 const ISD::InputArg *Arg = &Ins[I];
2303
2304 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2305 "vector type argument should have been split");
2306
2307 // First check if it's a PS input addr.
2308 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2309 PSInputNum <= 15) {
2310 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(Index: PSInputNum);
2311
2312 // Inconveniently only the first part of the split is marked as isSplit,
2313 // so skip to the end. We only want to increment PSInputNum once for the
2314 // entire split argument.
2315 if (Arg->Flags.isSplit()) {
2316 while (!Arg->Flags.isSplitEnd()) {
2317 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2318 "unexpected vector split in ps argument type");
2319 if (!SkipArg)
2320 Splits.push_back(Elt: *Arg);
2321 Arg = &Ins[++I];
2322 }
2323 }
2324
2325 if (SkipArg) {
2326 // We can safely skip PS inputs.
2327 Skipped.set(Arg->getOrigArgIndex());
2328 ++PSInputNum;
2329 continue;
2330 }
2331
2332 Info->markPSInputAllocated(Index: PSInputNum);
2333 if (Arg->Used)
2334 Info->markPSInputEnabled(Index: PSInputNum);
2335
2336 ++PSInputNum;
2337 }
2338
2339 Splits.push_back(Elt: *Arg);
2340 }
2341}
2342
2343// Allocate special inputs passed in VGPRs.
2344void SITargetLowering::allocateSpecialEntryInputVGPRs(
2345 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2346 SIMachineFunctionInfo &Info) const {
2347 const LLT S32 = LLT::scalar(SizeInBits: 32);
2348 MachineRegisterInfo &MRI = MF.getRegInfo();
2349
2350 if (Info.hasWorkItemIDX()) {
2351 Register Reg = AMDGPU::VGPR0;
2352 MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);
2353
2354 CCInfo.AllocateReg(Reg);
2355 unsigned Mask =
2356 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2357 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2358 }
2359
2360 if (Info.hasWorkItemIDY()) {
2361 assert(Info.hasWorkItemIDX());
2362 if (Subtarget->hasPackedTID()) {
2363 Info.setWorkItemIDY(
2364 ArgDescriptor::createRegister(Reg: AMDGPU::VGPR0, Mask: 0x3ff << 10));
2365 } else {
2366 unsigned Reg = AMDGPU::VGPR1;
2367 MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);
2368
2369 CCInfo.AllocateReg(Reg);
2370 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2371 }
2372 }
2373
2374 if (Info.hasWorkItemIDZ()) {
2375 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2376 if (Subtarget->hasPackedTID()) {
2377 Info.setWorkItemIDZ(
2378 ArgDescriptor::createRegister(Reg: AMDGPU::VGPR0, Mask: 0x3ff << 20));
2379 } else {
2380 unsigned Reg = AMDGPU::VGPR2;
2381 MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);
2382
2383 CCInfo.AllocateReg(Reg);
2384 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2385 }
2386 }
2387}
2388
2389// Try to allocate a VGPR at the end of the argument list, or if no argument
2390// VGPRs are left, allocate a stack slot instead.
2391// If \p Mask is given, it indicates the bitfield position in the register.
2392// If \p Arg is given, reuse it with the new \p Mask instead of allocating anew.
2393static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2394 ArgDescriptor Arg = ArgDescriptor()) {
2395 if (Arg.isSet())
2396 return ArgDescriptor::createArg(Arg, Mask);
2397
2398 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2399 unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgVGPRs);
2400 if (RegIdx == ArgVGPRs.size()) {
2401 // Spill to stack required.
2402 int64_t Offset = CCInfo.AllocateStack(Size: 4, Alignment: Align(4));
2403
2404 return ArgDescriptor::createStack(Offset, Mask);
2405 }
2406
2407 unsigned Reg = ArgVGPRs[RegIdx];
2408 Reg = CCInfo.AllocateReg(Reg);
2409 assert(Reg != AMDGPU::NoRegister);
2410
2411 MachineFunction &MF = CCInfo.getMachineFunction();
2412 Register LiveInVReg = MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass);
2413 MF.getRegInfo().setType(VReg: LiveInVReg, Ty: LLT::scalar(SizeInBits: 32));
2414 return ArgDescriptor::createRegister(Reg, Mask);
2415}
2416
2417static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2418 const TargetRegisterClass *RC,
2419 unsigned NumArgRegs) {
2420 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2421 unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgSGPRs);
2422 if (RegIdx == ArgSGPRs.size())
2423 report_fatal_error(reason: "ran out of SGPRs for arguments");
2424
2425 unsigned Reg = ArgSGPRs[RegIdx];
2426 Reg = CCInfo.AllocateReg(Reg);
2427 assert(Reg != AMDGPU::NoRegister);
2428
2429 MachineFunction &MF = CCInfo.getMachineFunction();
2430 MF.addLiveIn(PReg: Reg, RC);
2431 return ArgDescriptor::createRegister(Reg);
2432}
2433
2434// If this has a fixed position, we still should allocate the register in the
2435// CCInfo state. Technically we could get away with this for values passed
2436// outside of the normal argument range.
2437static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2438 const TargetRegisterClass *RC,
2439 MCRegister Reg) {
2440 Reg = CCInfo.AllocateReg(Reg);
2441 assert(Reg != AMDGPU::NoRegister);
2442 MachineFunction &MF = CCInfo.getMachineFunction();
2443 MF.addLiveIn(PReg: Reg, RC);
2444}
2445
2446static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2447 if (Arg) {
2448 allocateFixedSGPRInputImpl(CCInfo, RC: &AMDGPU::SGPR_32RegClass,
2449 Reg: Arg.getRegister());
2450 } else
2451 Arg = allocateSGPR32InputImpl(CCInfo, RC: &AMDGPU::SGPR_32RegClass, NumArgRegs: 32);
2452}
2453
2454static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2455 if (Arg) {
2456 allocateFixedSGPRInputImpl(CCInfo, RC: &AMDGPU::SGPR_64RegClass,
2457 Reg: Arg.getRegister());
2458 } else
2459 Arg = allocateSGPR32InputImpl(CCInfo, RC: &AMDGPU::SGPR_64RegClass, NumArgRegs: 16);
2460}
2461
2462/// Allocate implicit function VGPR arguments at the end of allocated user
2463/// arguments.
2464void SITargetLowering::allocateSpecialInputVGPRs(
2465 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2466 SIMachineFunctionInfo &Info) const {
2467 const unsigned Mask = 0x3ff;
2468 ArgDescriptor Arg;
2469
2470 if (Info.hasWorkItemIDX()) {
2471 Arg = allocateVGPR32Input(CCInfo, Mask);
2472 Info.setWorkItemIDX(Arg);
2473 }
2474
2475 if (Info.hasWorkItemIDY()) {
2476 Arg = allocateVGPR32Input(CCInfo, Mask: Mask << 10, Arg);
2477 Info.setWorkItemIDY(Arg);
2478 }
2479
2480 if (Info.hasWorkItemIDZ())
2481 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask: Mask << 20, Arg));
2482}
2483
2484/// Allocate implicit function VGPR arguments in fixed registers.
2485void SITargetLowering::allocateSpecialInputVGPRsFixed(
2486 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2487 SIMachineFunctionInfo &Info) const {
2488 Register Reg = CCInfo.AllocateReg(Reg: AMDGPU::VGPR31);
2489 if (!Reg)
2490 report_fatal_error(reason: "failed to allocate VGPR for implicit arguments");
2491
2492 const unsigned Mask = 0x3ff;
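  // All three work item IDs share this one VGPR: X lives in bits [9:0],
  // Y in bits [19:10], and Z in bits [29:20].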
2493 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2494 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask: Mask << 10));
2495 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask: Mask << 20));
2496}
2497
2498void SITargetLowering::allocateSpecialInputSGPRs(
2499 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2500 SIMachineFunctionInfo &Info) const {
2501 auto &ArgInfo = Info.getArgInfo();
2502 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2503
2504 // TODO: Unify handling with private memory pointers.
2505 if (UserSGPRInfo.hasDispatchPtr())
2506 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchPtr);
2507
2508 if (UserSGPRInfo.hasQueuePtr())
2509 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.QueuePtr);
2510
2511 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2512 // constant offset from the kernarg segment.
2513 if (Info.hasImplicitArgPtr())
2514 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.ImplicitArgPtr);
2515
2516 if (UserSGPRInfo.hasDispatchID())
2517 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchID);
2518
2519 // flat_scratch_init is not applicable for non-kernel functions.
2520
2521 if (Info.hasWorkGroupIDX())
2522 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDX);
2523
2524 if (Info.hasWorkGroupIDY())
2525 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDY);
2526
2527 if (Info.hasWorkGroupIDZ())
2528 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDZ);
2529
2530 if (Info.hasLDSKernelId())
2531 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.LDSKernelId);
2532}
2533
2534// Allocate special inputs passed in user SGPRs.
2535void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2536 MachineFunction &MF,
2537 const SIRegisterInfo &TRI,
2538 SIMachineFunctionInfo &Info) const {
2539 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2540 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2541 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2542 MF.addLiveIn(PReg: ImplicitBufferPtrReg, RC: &AMDGPU::SGPR_64RegClass);
2543 CCInfo.AllocateReg(Reg: ImplicitBufferPtrReg);
2544 }
2545
2546 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2547 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2548 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2549 MF.addLiveIn(PReg: PrivateSegmentBufferReg, RC: &AMDGPU::SGPR_128RegClass);
2550 CCInfo.AllocateReg(Reg: PrivateSegmentBufferReg);
2551 }
2552
2553 if (UserSGPRInfo.hasDispatchPtr()) {
2554 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2555 MF.addLiveIn(PReg: DispatchPtrReg, RC: &AMDGPU::SGPR_64RegClass);
2556 CCInfo.AllocateReg(Reg: DispatchPtrReg);
2557 }
2558
2559 if (UserSGPRInfo.hasQueuePtr()) {
2560 Register QueuePtrReg = Info.addQueuePtr(TRI);
2561 MF.addLiveIn(PReg: QueuePtrReg, RC: &AMDGPU::SGPR_64RegClass);
2562 CCInfo.AllocateReg(Reg: QueuePtrReg);
2563 }
2564
2565 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2566 MachineRegisterInfo &MRI = MF.getRegInfo();
2567 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2568 CCInfo.AllocateReg(Reg: InputPtrReg);
2569
2570 Register VReg = MF.addLiveIn(PReg: InputPtrReg, RC: &AMDGPU::SGPR_64RegClass);
2571 MRI.setType(VReg, Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2572 }
2573
2574 if (UserSGPRInfo.hasDispatchID()) {
2575 Register DispatchIDReg = Info.addDispatchID(TRI);
2576 MF.addLiveIn(PReg: DispatchIDReg, RC: &AMDGPU::SGPR_64RegClass);
2577 CCInfo.AllocateReg(Reg: DispatchIDReg);
2578 }
2579
2580 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2581 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2582 MF.addLiveIn(PReg: FlatScratchInitReg, RC: &AMDGPU::SGPR_64RegClass);
2583 CCInfo.AllocateReg(Reg: FlatScratchInitReg);
2584 }
2585
2586 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2587 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2588 MF.addLiveIn(PReg: PrivateSegmentSizeReg, RC: &AMDGPU::SGPR_32RegClass);
2589 CCInfo.AllocateReg(Reg: PrivateSegmentSizeReg);
2590 }
2591
2592 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2593 // these from the dispatch pointer.
2594}
2595
2596// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2597// sequential, starting from the first argument.
2598void SITargetLowering::allocatePreloadKernArgSGPRs(
2599 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2600 const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
2601 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2602 Function &F = MF.getFunction();
2603 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2604 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2605 bool InPreloadSequence = true;
2606 unsigned InIdx = 0;
2607 bool AlignedForImplictArgs = false;
2608 unsigned ImplicitArgOffset = 0;
2609 for (auto &Arg : F.args()) {
2610 if (!InPreloadSequence || !Arg.hasInRegAttr())
2611 break;
2612
2613 unsigned ArgIdx = Arg.getArgNo();
2614 // Don't preload non-original args or parts not in the current preload
2615 // sequence.
2616 if (InIdx < Ins.size() &&
2617 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2618 break;
2619
2620 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2621 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2622 InIdx++) {
2623 assert(ArgLocs[ArgIdx].isMemLoc());
2624 auto &ArgLoc = ArgLocs[InIdx];
2625 const Align KernelArgBaseAlign = Align(16);
2626 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2627 Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset: ArgOffset);
2628 unsigned NumAllocSGPRs =
2629 alignTo(Value: ArgLoc.getLocVT().getFixedSizeInBits(), Align: 32) / 32;
2630
2631 // Fix alignment for hidden arguments.
2632 if (Arg.hasAttribute(Kind: "amdgpu-hidden-argument")) {
2633 if (!AlignedForImplictArgs) {
2634 ImplicitArgOffset =
2635 alignTo(Size: LastExplicitArgOffset,
2636 A: Subtarget->getAlignmentForImplicitArgPtr()) -
2637 LastExplicitArgOffset;
2638 AlignedForImplictArgs = true;
2639 }
2640 ArgOffset += ImplicitArgOffset;
2641 }
2642
2643 // Arg is preloaded into the previous SGPR.
2644 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2645 assert(InIdx >= 1 && "No previous SGPR");
2646 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2647 Elt: Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2648 continue;
2649 }
2650
2651 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2652 unsigned PaddingSGPRs = alignTo(Value: Padding, Align: 4) / 4;
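      // For example, if the previous explicit argument ended at offset 4 and
      // this argument starts at offset 12, two padding SGPRs are skipped
      // before this argument's SGPRs are assigned.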
2653 // Check for free user SGPRs for preloading.
2654 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2655 InPreloadSequence = false;
2656 break;
2657 }
2658
2659 // Preload this argument.
2660 const TargetRegisterClass *RC =
2661 TRI.getSGPRClassForBitWidth(BitWidth: NumAllocSGPRs * 32);
2662 SmallVectorImpl<MCRegister> *PreloadRegs =
2663 Info.addPreloadedKernArg(TRI, RC, AllocSizeDWord: NumAllocSGPRs, KernArgIdx: InIdx, PaddingSGPRs);
2664
2665 if (PreloadRegs->size() > 1)
2666 RC = &AMDGPU::SGPR_32RegClass;
2667 for (auto &Reg : *PreloadRegs) {
2668 assert(Reg);
2669 MF.addLiveIn(PReg: Reg, RC);
2670 CCInfo.AllocateReg(Reg);
2671 }
2672
2673 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2674 }
2675 }
2676}
2677
2678void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
2679 const SIRegisterInfo &TRI,
2680 SIMachineFunctionInfo &Info) const {
2681 // Always allocate this last since it is a synthetic preload.
2682 if (Info.hasLDSKernelId()) {
2683 Register Reg = Info.addLDSKernelId();
2684 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
2685 CCInfo.AllocateReg(Reg);
2686 }
2687}
2688
2689// Allocate special input registers that are initialized per-wave.
2690void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
2691 SIMachineFunctionInfo &Info,
2692 CallingConv::ID CallConv,
2693 bool IsShader) const {
2694 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2695 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2696 // Note: user SGPRs are handled by the front-end for graphics shaders
2697 // Pad up the used user SGPRs with dead inputs.
2698
2699 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2700 // before enabling architected SGPRs for workgroup IDs.
2701 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2702
2703 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2704 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2705 // rely on it to reach 16 since if we end up having no stack usage, it will
2706 // not really be added.
2707 unsigned NumRequiredSystemSGPRs =
2708 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2709 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
2710 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2711 Register Reg = Info.addReservedUserSGPR();
2712 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
2713 CCInfo.AllocateReg(Reg);
2714 }
2715 }
2716
2717 if (!HasArchitectedSGPRs) {
2718 if (Info.hasWorkGroupIDX()) {
2719 Register Reg = Info.addWorkGroupIDX();
2720 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
2721 CCInfo.AllocateReg(Reg);
2722 }
2723
2724 if (Info.hasWorkGroupIDY()) {
2725 Register Reg = Info.addWorkGroupIDY();
2726 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
2727 CCInfo.AllocateReg(Reg);
2728 }
2729
2730 if (Info.hasWorkGroupIDZ()) {
2731 Register Reg = Info.addWorkGroupIDZ();
2732 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
2733 CCInfo.AllocateReg(Reg);
2734 }
2735 }
2736
2737 if (Info.hasWorkGroupInfo()) {
2738 Register Reg = Info.addWorkGroupInfo();
2739 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
2740 CCInfo.AllocateReg(Reg);
2741 }
2742
2743 if (Info.hasPrivateSegmentWaveByteOffset()) {
2744 // Scratch wave offset passed in system SGPR.
2745 unsigned PrivateSegmentWaveByteOffsetReg;
2746
2747 if (IsShader) {
2748 PrivateSegmentWaveByteOffsetReg =
2749 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2750
2751 // This is true if the scratch wave byte offset doesn't have a fixed
2752 // location.
2753 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2754 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2755 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2756 }
2757 } else
2758 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2759
2760 MF.addLiveIn(PReg: PrivateSegmentWaveByteOffsetReg, RC: &AMDGPU::SGPR_32RegClass);
2761 CCInfo.AllocateReg(Reg: PrivateSegmentWaveByteOffsetReg);
2762 }
2763
2764 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2765 Info.getNumPreloadedSGPRs() >= 16);
2766}
2767
2768static void reservePrivateMemoryRegs(const TargetMachine &TM,
2769 MachineFunction &MF,
2770 const SIRegisterInfo &TRI,
2771 SIMachineFunctionInfo &Info) {
2772 // Now that we've figured out where the scratch register inputs are, see if
2773 // we should reserve the arguments and use them directly.
2774 MachineFrameInfo &MFI = MF.getFrameInfo();
2775 bool HasStackObjects = MFI.hasStackObjects();
2776 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2777
2778 // Record that we know we have non-spill stack objects so we don't need to
2779 // check all stack objects later.
2780 if (HasStackObjects)
2781 Info.setHasNonSpillStackObjects(true);
2782
2783 // Everything live out of a block is spilled with fast regalloc, so it's
2784 // almost certain that spilling will be required.
2785 if (TM.getOptLevel() == CodeGenOptLevel::None)
2786 HasStackObjects = true;
2787
2788 // For now, assume stack access is needed in any callee function, so we need
2789 // the scratch registers to pass in.
2790 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2791
2792 if (!ST.enableFlatScratch()) {
2793 if (RequiresStackAccess && ST.isAmdHsaOrMesa(F: MF.getFunction())) {
2794 // If we have stack objects, we unquestionably need the private buffer
2795 // resource. For the Code Object V2 ABI, this will be the first 4 user
2796 // SGPR inputs. We can reserve those and use them directly.
2797
2798 Register PrivateSegmentBufferReg =
2799 Info.getPreloadedReg(Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
2800 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2801 } else {
2802 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2803 // We tentatively reserve the last available registers (skipping those
2804 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2805 // we'll replace these with the ones immediately after those which were
2806 // really allocated. In the prologue, copies will be inserted from the
2807 // argument to these reserved registers.
2808
2809 // Without HSA, relocations are used for the scratch pointer and the
2810 // buffer resource setup is always inserted in the prologue. Scratch wave
2811 // offset is still in an input SGPR.
2812 Info.setScratchRSrcReg(ReservedBufferReg);
2813 }
2814 }
2815
2816 MachineRegisterInfo &MRI = MF.getRegInfo();
2817
2818 // For entry functions we have to set up the stack pointer if we use it,
2819 // whereas non-entry functions get this "for free". This means there is no
2820 // intrinsic advantage to using S32 over S34 in cases where we do not have
2821 // calls but do need a frame pointer (i.e. if we are requested to have one
2822 // because frame pointer elimination is disabled). To keep things simple we
2823 // only ever use S32 as the call ABI stack pointer, and so using it does not
2824 // imply we need a separate frame pointer.
2825 //
2826 // Try to use s32 as the SP, but move it if it would interfere with input
2827 // arguments. This won't work with calls though.
2828 //
2829 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2830 // registers.
2831 if (!MRI.isLiveIn(Reg: AMDGPU::SGPR32)) {
2832 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2833 } else {
2834 assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
2835
2836 if (MFI.hasCalls())
2837 report_fatal_error(reason: "call in graphics shader with too many input SGPRs");
2838
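    // Fall back to the first SGPR that is not already a live-in input argument.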
2839 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2840 if (!MRI.isLiveIn(Reg)) {
2841 Info.setStackPtrOffsetReg(Reg);
2842 break;
2843 }
2844 }
2845
2846 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2847 report_fatal_error(reason: "failed to find register for SP");
2848 }
2849
2850 // hasFP should be accurate for entry functions even before the frame is
2851 // finalized, because it does not rely on the known stack size, only
2852 // properties like whether variable sized objects are present.
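  // For entry functions, s33 is used as the frame offset register whenever a
  // frame pointer is required.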
2853 if (ST.getFrameLowering()->hasFP(MF)) {
2854 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2855 }
2856}
2857
2858bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
2859 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2860 return !Info->isEntryFunction();
2861}
2862
2863void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {}
2864
2865void SITargetLowering::insertCopiesSplitCSR(
2866 MachineBasicBlock *Entry,
2867 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2868 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2869
2870 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(MF: Entry->getParent());
2871 if (!IStart)
2872 return;
2873
2874 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2875 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2876 MachineBasicBlock::iterator MBBI = Entry->begin();
2877 for (const MCPhysReg *I = IStart; *I; ++I) {
2878 const TargetRegisterClass *RC = nullptr;
2879 if (AMDGPU::SReg_64RegClass.contains(Reg: *I))
2880 RC = &AMDGPU::SGPR_64RegClass;
2881 else if (AMDGPU::SReg_32RegClass.contains(Reg: *I))
2882 RC = &AMDGPU::SGPR_32RegClass;
2883 else
2884 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2885
2886 Register NewVR = MRI->createVirtualRegister(RegClass: RC);
2887 // Create copy from CSR to a virtual register.
2888 Entry->addLiveIn(PhysReg: *I);
2889 BuildMI(BB&: *Entry, I: MBBI, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NewVR)
2890 .addReg(RegNo: *I);
2891
2892 // Insert the copy-back instructions right before the terminator.
2893 for (auto *Exit : Exits)
2894 BuildMI(BB&: *Exit, I: Exit->getFirstTerminator(), MIMD: DebugLoc(),
2895 MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: *I)
2896 .addReg(RegNo: NewVR);
2897 }
2898}
2899
2900SDValue SITargetLowering::LowerFormalArguments(
2901 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2902 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2903 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2904 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2905
2906 MachineFunction &MF = DAG.getMachineFunction();
2907 const Function &Fn = MF.getFunction();
2908 FunctionType *FType = MF.getFunction().getFunctionType();
2909 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2910 bool IsError = false;
2911
2912 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CC: CallConv)) {
2913 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
2914 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
2915 IsError = true;
2916 }
2917
2918 SmallVector<ISD::InputArg, 16> Splits;
2919 SmallVector<CCValAssign, 16> ArgLocs;
2920 BitVector Skipped(Ins.size());
2921 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2922 *DAG.getContext());
2923
2924 bool IsGraphics = AMDGPU::isGraphics(CC: CallConv);
2925 bool IsKernel = AMDGPU::isKernel(CC: CallConv);
2926 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC: CallConv);
2927
2928 if (IsGraphics) {
2929 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2930 assert(!UserSGPRInfo.hasDispatchPtr() &&
2931 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2932 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2933 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2934 (void)UserSGPRInfo;
2935 if (!Subtarget->enableFlatScratch())
2936 assert(!UserSGPRInfo.hasFlatScratchInit());
2937 if ((CallConv != CallingConv::AMDGPU_CS &&
2938 CallConv != CallingConv::AMDGPU_Gfx) ||
2939 !Subtarget->hasArchitectedSGPRs())
2940 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2941 !Info->hasWorkGroupIDZ());
2942 }
2943
2944 if (CallConv == CallingConv::AMDGPU_PS) {
2945 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2946
2947 // At least one interpolation mode must be enabled or else the GPU will
2948 // hang.
2949 //
2950 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2951 // set PSInputAddr, the user wants to enable some bits after the compilation
2952 // based on run-time states. Since we can't know what the final PSInputEna
2953 // will look like, we shouldn't do anything here, and the user should take
2954 // responsibility for the correct programming.
2955 //
2956 // Otherwise, the following restrictions apply:
2957 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2958 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2959 // enabled too.
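    // If these restrictions are not already satisfied, force-enable the first
    // input (PERSP_SAMPLE) and reserve VGPR0/VGPR1 for its I/J values.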
2960 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2961 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(Index: 11))) {
2962 CCInfo.AllocateReg(Reg: AMDGPU::VGPR0);
2963 CCInfo.AllocateReg(Reg: AMDGPU::VGPR1);
2964 Info->markPSInputAllocated(Index: 0);
2965 Info->markPSInputEnabled(Index: 0);
2966 }
2967 if (Subtarget->isAmdPalOS()) {
2968 // For isAmdPalOS, the user does not enable some bits after compilation
2969 // based on run-time states; the register values being generated here are
2970 // the final ones set in hardware. Therefore we need to apply the
2971 // workaround to PSInputAddr and PSInputEnable together. (The case where
2972 // a bit is set in PSInputAddr but not PSInputEnable is where the
2973 // frontend set up an input arg for a particular interpolation mode, but
2974 // nothing uses that input arg. Really we should have an earlier pass
2975 // that removes such an arg.)
2976 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2977 if ((PsInputBits & 0x7F) == 0 ||
2978 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2979 Info->markPSInputEnabled(Index: llvm::countr_zero(Val: Info->getPSInputAddr()));
2980 }
2981 } else if (IsKernel) {
2982 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2983 } else {
2984 Splits.append(in_start: Ins.begin(), in_end: Ins.end());
2985 }
2986
2987 if (IsKernel)
2988 analyzeFormalArgumentsCompute(State&: CCInfo, Ins);
2989
2990 if (IsEntryFunc) {
2991 allocateSpecialEntryInputVGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
2992 allocateHSAUserSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
2993 if (IsKernel && Subtarget->hasKernargPreload())
2994 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, TRI: *TRI, Info&: *Info);
2995
2996 allocateLDSKernelId(CCInfo, MF, TRI: *TRI, Info&: *Info);
2997 } else if (!IsGraphics) {
2998 // For the fixed ABI, pass workitem IDs in the last argument register.
2999 allocateSpecialInputVGPRsFixed(CCInfo, MF, TRI: *TRI, Info&: *Info);
3000
3001 // FIXME: Sink this into allocateSpecialInputSGPRs
3002 if (!Subtarget->enableFlatScratch())
3003 CCInfo.AllocateReg(Reg: Info->getScratchRSrcReg());
3004
3005 allocateSpecialInputSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
3006 }
3007
3008 if (!IsKernel) {
3009 CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg: isVarArg);
3010 CCInfo.AnalyzeFormalArguments(Ins: Splits, Fn: AssignFn);
3011 }
3012
3013 SmallVector<SDValue, 16> Chains;
3014
3015 // FIXME: This is the minimum kernel argument alignment. We should improve
3016 // this to the maximum alignment of the arguments.
3017 //
3018 // FIXME: Alignment of explicit arguments is totally broken with a non-0
3019 // explicit kernarg offset.
3020 const Align KernelArgBaseAlign = Align(16);
3021
3022 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
3023 const ISD::InputArg &Arg = Ins[i];
3024 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3025 InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
3026 continue;
3027 }
3028
3029 CCValAssign &VA = ArgLocs[ArgIdx++];
3030 MVT VT = VA.getLocVT();
3031
3032 if (IsEntryFunc && VA.isMemLoc()) {
3033 VT = Ins[i].VT;
3034 EVT MemVT = VA.getLocVT();
3035
3036 const uint64_t Offset = VA.getLocMemOffset();
3037 Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset);
3038
3039 if (Arg.Flags.isByRef()) {
3040 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain, Offset);
3041
3042 const GCNTargetMachine &TM =
3043 static_cast<const GCNTargetMachine &>(getTargetMachine());
3044 if (!TM.isNoopAddrSpaceCast(SrcAS: AMDGPUAS::CONSTANT_ADDRESS,
3045 DestAS: Arg.Flags.getPointerAddrSpace())) {
3046 Ptr = DAG.getAddrSpaceCast(dl: DL, VT, Ptr, SrcAS: AMDGPUAS::CONSTANT_ADDRESS,
3047 DestAS: Arg.Flags.getPointerAddrSpace());
3048 }
3049
3050 InVals.push_back(Elt: Ptr);
3051 continue;
3052 }
3053
3054 SDValue NewArg;
3055 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(Val: i)) {
3056 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3057 // In this case the argument is packed into the previous preload SGPR.
3058 int64_t AlignDownOffset = alignDown(Value: Offset, Align: 4);
3059 int64_t OffsetDiff = Offset - AlignDownOffset;
3060 EVT IntVT = MemVT.changeTypeToInteger();
3061
3062 const SIMachineFunctionInfo *Info =
3063 MF.getInfo<SIMachineFunctionInfo>();
3064 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
3065 Register Reg =
3066 Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs[0];
3067
3068 assert(Reg);
3069 Register VReg = MRI.getLiveInVirtReg(PReg: Reg);
3070 SDValue Copy = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i32);
3071
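          // Shift the packed dword right by the argument's byte offset within
          // it, then truncate down to the argument's in-memory integer type.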
3072 SDValue ShiftAmt = DAG.getConstant(Val: OffsetDiff * 8, DL, VT: MVT::i32);
3073 SDValue Extract = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: Copy, N2: ShiftAmt);
3074
3075 SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: Extract);
3076 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MemVT, Operand: ArgVal);
3077 NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: ArgVal,
3078 Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
3079
3080 NewArg = DAG.getMergeValues(Ops: {NewArg, Copy.getValue(R: 1)}, dl: DL);
3081 } else {
3082 const SIMachineFunctionInfo *Info =
3083 MF.getInfo<SIMachineFunctionInfo>();
3084 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
3085 const SmallVectorImpl<MCRegister> &PreloadRegs =
3086 Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs;
3087
3088 SDValue Copy;
3089 if (PreloadRegs.size() == 1) {
3090 Register VReg = MRI.getLiveInVirtReg(PReg: PreloadRegs[0]);
3091 const TargetRegisterClass *RC = MRI.getRegClass(Reg: VReg);
3092 NewArg = DAG.getCopyFromReg(
3093 Chain, dl: DL, Reg: VReg,
3094 VT: EVT::getIntegerVT(Context&: *DAG.getContext(),
3095 BitWidth: TRI->getRegSizeInBits(RC: *RC)));
3096
3097 } else {
3098 // If the kernarg alignment does not match the alignment of the SGPR
3099 // tuple RC that can accommodate this argument, it will be built up
3100 // via copies from the individual SGPRs that the argument was
3101 // preloaded to.
3102 SmallVector<SDValue, 4> Elts;
3103 for (auto Reg : PreloadRegs) {
3104 Register VReg = MRI.getLiveInVirtReg(PReg: Reg);
3105 Copy = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i32);
3106 Elts.push_back(Elt: Copy);
3107 }
3108 NewArg =
3109 DAG.getBuildVector(VT: EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
3110 NumElements: PreloadRegs.size()),
3111 DL, Ops: Elts);
3112 }
3113
3114 // If the argument was preloaded to multiple consecutive 32-bit
3115 // registers because of misalignment between addressable SGPR tuples
3116 // and the argument size, the kernarg segment alignment restrictions
3117 // still let us assume that NewArg's size is the same as MemVT, so we
3118 // can just do a bitcast. If MemVT is less than 32 bits we add a
3119 // truncate first, since we cannot preload to less than a single SGPR
3120 // and the MemVT may be smaller.
3121 EVT MemVTInt =
3122 EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
3123 if (MemVT.bitsLT(VT: NewArg.getSimpleValueType()))
3124 NewArg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVTInt, Operand: NewArg);
3125
3126 NewArg = DAG.getBitcast(VT: MemVT, V: NewArg);
3127 NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: NewArg,
3128 Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
3129 NewArg = DAG.getMergeValues(Ops: {NewArg, Chain}, dl: DL);
3130 }
3131 } else {
3132 // Hidden arguments that are in the kernel signature must be preloaded
3133 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3134 // the argument list and is not preloaded.
3135 if (Arg.isOrigArg()) {
3136 Argument *OrigArg = Fn.getArg(i: Arg.getOrigArgIndex());
3137 if (OrigArg->hasAttribute(Kind: "amdgpu-hidden-argument")) {
3138 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
3139 *OrigArg->getParent(),
3140 "hidden argument in kernel signature was not preloaded",
3141 DL.getDebugLoc()));
3142 }
3143 }
3144
3145 NewArg =
3146 lowerKernargMemParameter(DAG, VT, MemVT, SL: DL, Chain, Offset,
3147 Alignment, Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
3148 }
3149 Chains.push_back(Elt: NewArg.getValue(R: 1));
3150
3151 auto *ParamTy =
3152 dyn_cast<PointerType>(Val: FType->getParamType(i: Ins[i].getOrigArgIndex()));
3153 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3154 ParamTy &&
3155 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3156 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3157 // On SI, local pointers are just offsets into LDS, so they always
3158 // fit in 16 bits. On CI and newer they could potentially be
3159 // real pointers, so we can't guarantee their size.
3160 NewArg = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: NewArg.getValueType(), N1: NewArg,
3161 N2: DAG.getValueType(MVT::i16));
3162 }
3163
3164 InVals.push_back(Elt: NewArg);
3165 continue;
3166 }
3167 if (!IsEntryFunc && VA.isMemLoc()) {
3168 SDValue Val = lowerStackParameter(DAG, VA, SL: DL, Chain, Arg);
3169 InVals.push_back(Elt: Val);
3170 if (!Arg.Flags.isByVal())
3171 Chains.push_back(Elt: Val.getValue(R: 1));
3172 continue;
3173 }
3174
3175 assert(VA.isRegLoc() && "Parameter must be in a register!");
3176
3177 Register Reg = VA.getLocReg();
3178 const TargetRegisterClass *RC = nullptr;
3179 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3180 RC = &AMDGPU::VGPR_32RegClass;
3181 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3182 RC = &AMDGPU::SGPR_32RegClass;
3183 else
3184 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3185 EVT ValVT = VA.getValVT();
3186
3187 Reg = MF.addLiveIn(PReg: Reg, RC);
3188 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT);
3189
3190 if (Arg.Flags.isSRet()) {
3191 // The return object should be reasonably addressable.
3192
3193 // FIXME: This helps when the return is a real sret. If it is an
3194 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3195 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3196 unsigned NumBits =
3197 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3198 Val = DAG.getNode(
3199 Opcode: ISD::AssertZext, DL, VT, N1: Val,
3200 N2: DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: NumBits)));
3201 }
3202
3203 // If this is an 8 or 16-bit value, it is really passed promoted
3204 // to 32 bits. Insert an assert[sz]ext to capture this, then
3205 // truncate to the right size.
3206 switch (VA.getLocInfo()) {
3207 case CCValAssign::Full:
3208 break;
3209 case CCValAssign::BCvt:
3210 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ValVT, Operand: Val);
3211 break;
3212 case CCValAssign::SExt:
3213 Val = DAG.getNode(Opcode: ISD::AssertSext, DL, VT, N1: Val, N2: DAG.getValueType(ValVT));
3214 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ValVT, Operand: Val);
3215 break;
3216 case CCValAssign::ZExt:
3217 Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT, N1: Val, N2: DAG.getValueType(ValVT));
3218 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ValVT, Operand: Val);
3219 break;
3220 case CCValAssign::AExt:
3221 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ValVT, Operand: Val);
3222 break;
3223 default:
3224 llvm_unreachable("Unknown loc info!");
3225 }
3226
3227 InVals.push_back(Elt: Val);
3228 }
3229
3230 // Start adding system SGPRs.
3231 if (IsEntryFunc)
3232 allocateSystemSGPRs(CCInfo, MF, Info&: *Info, CallConv, IsShader: IsGraphics);
3233
3234 // DAG.getPass() returns nullptr when using new pass manager.
3235 // TODO: Use DAG.getMFAM() to access analysis result.
3236 if (DAG.getPass()) {
3237 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3238 ArgUsageInfo.setFuncArgInfo(F: Fn, ArgInfo: Info->getArgInfo());
3239 }
3240
3241 unsigned StackArgSize = CCInfo.getStackSize();
3242 Info->setBytesInStackArgArea(StackArgSize);
3243
3244 return Chains.empty() ? Chain
3245 : DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: Chains);
3246}
3247
3248// TODO: If return values can't fit in registers, we should return as many as
3249// possible in registers before passing on stack.
3250bool SITargetLowering::CanLowerReturn(
3251 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3252 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3253 const Type *RetTy) const {
3254 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3255 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3256 // for shaders. Vector types should be explicitly handled by CC.
3257 if (AMDGPU::isEntryFunctionCC(CC: CallConv))
3258 return true;
3259
3260 SmallVector<CCValAssign, 16> RVLocs;
3261 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3262 if (!CCInfo.CheckReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg)))
3263 return false;
3264
3265 // We must use the stack if the return would require unavailable registers.
3266 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3267 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3268 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3269 if (CCInfo.isAllocated(Reg: AMDGPU::VGPR_32RegClass.getRegister(i)))
3270 return false;
3271
3272 return true;
3273}
3274
3275SDValue
3276SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3277 bool isVarArg,
3278 const SmallVectorImpl<ISD::OutputArg> &Outs,
3279 const SmallVectorImpl<SDValue> &OutVals,
3280 const SDLoc &DL, SelectionDAG &DAG) const {
3281 MachineFunction &MF = DAG.getMachineFunction();
3282 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3283 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3284
3285 if (AMDGPU::isKernel(CC: CallConv)) {
3286 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3287 OutVals, DL, DAG);
3288 }
3289
3290 bool IsShader = AMDGPU::isShader(CC: CallConv);
3291
3292 Info->setIfReturnsVoid(Outs.empty());
3293 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3294
3295 // CCValAssign - represents the assignment of the return value to a location.
3296 SmallVector<CCValAssign, 48> RVLocs;
3297
3298 // CCState - Info about the registers and stack slots.
3299 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3300 *DAG.getContext());
3301
3302 // Analyze outgoing return values.
3303 CCInfo.AnalyzeReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg: isVarArg));
3304
3305 SDValue Glue;
3306 SmallVector<SDValue, 48> RetOps;
3307 RetOps.push_back(Elt: Chain); // Operand #0 = Chain (updated below)
3308
3309 SDValue ReadFirstLane =
3310 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
3311 // Copy the result values into the output registers.
3312 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3313 ++I, ++RealRVLocIdx) {
3314 CCValAssign &VA = RVLocs[I];
3315 assert(VA.isRegLoc() && "Can only return in registers!");
3316 // TODO: Partially return in registers if return values don't fit.
3317 SDValue Arg = OutVals[RealRVLocIdx];
3318
3319 // Copied from other backends.
3320 switch (VA.getLocInfo()) {
3321 case CCValAssign::Full:
3322 break;
3323 case CCValAssign::BCvt:
3324 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
3325 break;
3326 case CCValAssign::SExt:
3327 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3328 break;
3329 case CCValAssign::ZExt:
3330 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3331 break;
3332 case CCValAssign::AExt:
3333 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3334 break;
3335 default:
3336 llvm_unreachable("Unknown loc info!");
3337 }
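    // Values returned in SGPRs must be uniform, so read the first lane in case
    // the value was actually produced in a VGPR.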
3338 if (TRI->isSGPRPhysReg(Reg: VA.getLocReg()))
3339 Arg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Arg.getValueType(),
3340 N1: ReadFirstLane, N2: Arg);
3341 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: VA.getLocReg(), N: Arg, Glue);
3342 Glue = Chain.getValue(R: 1);
3343 RetOps.push_back(Elt: DAG.getRegister(Reg: VA.getLocReg(), VT: VA.getLocVT()));
3344 }
3345
3346 // FIXME: Does sret work properly?
3347 if (!Info->isEntryFunction()) {
3348 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3349 const MCPhysReg *I =
3350 TRI->getCalleeSavedRegsViaCopy(MF: &DAG.getMachineFunction());
3351 if (I) {
3352 for (; *I; ++I) {
3353 if (AMDGPU::SReg_64RegClass.contains(Reg: *I))
3354 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i64));
3355 else if (AMDGPU::SReg_32RegClass.contains(Reg: *I))
3356 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i32));
3357 else
3358 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3359 }
3360 }
3361 }
3362
3363 // Update chain and glue.
3364 RetOps[0] = Chain;
3365 if (Glue.getNode())
3366 RetOps.push_back(Elt: Glue);
3367
3368 unsigned Opc = AMDGPUISD::ENDPGM;
3369 if (!IsWaveEnd)
3370 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE;
3371 return DAG.getNode(Opcode: Opc, DL, VT: MVT::Other, Ops: RetOps);
3372}
3373
3374SDValue SITargetLowering::LowerCallResult(
3375 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3376 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3377 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3378 SDValue ThisVal) const {
3379 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv, IsVarArg);
3380
3381 // Assign locations to each value returned by this call.
3382 SmallVector<CCValAssign, 16> RVLocs;
3383 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3384 *DAG.getContext());
3385 CCInfo.AnalyzeCallResult(Ins, Fn: RetCC);
3386
3387 // Copy all of the result registers out of their specified physreg.
3388 for (CCValAssign VA : RVLocs) {
3389 SDValue Val;
3390
3391 if (VA.isRegLoc()) {
3392 Val =
3393 DAG.getCopyFromReg(Chain, dl: DL, Reg: VA.getLocReg(), VT: VA.getLocVT(), Glue: InGlue);
3394 Chain = Val.getValue(R: 1);
3395 InGlue = Val.getValue(R: 2);
3396 } else if (VA.isMemLoc()) {
3397 report_fatal_error(reason: "TODO: return values in memory");
3398 } else
3399 llvm_unreachable("unknown argument location type");
3400
3401 switch (VA.getLocInfo()) {
3402 case CCValAssign::Full:
3403 break;
3404 case CCValAssign::BCvt:
3405 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: Val);
3406 break;
3407 case CCValAssign::ZExt:
3408 Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: VA.getLocVT(), N1: Val,
3409 N2: DAG.getValueType(VA.getValVT()));
3410 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3411 break;
3412 case CCValAssign::SExt:
3413 Val = DAG.getNode(Opcode: ISD::AssertSext, DL, VT: VA.getLocVT(), N1: Val,
3414 N2: DAG.getValueType(VA.getValVT()));
3415 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3416 break;
3417 case CCValAssign::AExt:
3418 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3419 break;
3420 default:
3421 llvm_unreachable("Unknown loc info!");
3422 }
3423
3424 InVals.push_back(Elt: Val);
3425 }
3426
3427 return Chain;
3428}
3429
3430 // Add code to pass the special inputs required by the features in use,
3431 // separate from the explicit user arguments present in the IR.
3432void SITargetLowering::passSpecialInputs(
3433 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3434 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3435 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3436 // If we don't have a call site, this was a call inserted by
3437 // legalization. These can never use special inputs.
3438 if (!CLI.CB)
3439 return;
3440
3441 SelectionDAG &DAG = CLI.DAG;
3442 const SDLoc &DL = CLI.DL;
3443 const Function &F = DAG.getMachineFunction().getFunction();
3444
3445 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3446 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3447
3448 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3449 &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
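  // Default to the fixed-ABI argument layout; it is refined below when the
  // callee's actual argument usage information is available.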
3450 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3451 // DAG.getPass() returns nullptr when using new pass manager.
3452 // TODO: Use DAG.getMFAM() to access analysis result.
3453 if (DAG.getPass()) {
3454 auto &ArgUsageInfo =
3455 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3456 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(F: *CalleeFunc);
3457 }
3458 }
3459
3460 // TODO: Unify with private memory register handling. This is complicated by
3461 // the fact that at least in kernels, the input argument is not necessarily
3462 // in the same location as the input.
3463 // clang-format off
3464 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3465 StringLiteral> ImplicitAttrs[] = {
3466 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3467 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr"},
3468 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3469 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3470 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3471 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
3472 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
3473 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id"},
3474 };
3475 // clang-format on
3476
3477 for (auto [InputID, Attr] : ImplicitAttrs) {
3478 // If the callee does not use the attribute value, skip copying the value.
3479 if (CLI.CB->hasFnAttr(Kind: Attr))
3480 continue;
3481
3482 const auto [OutgoingArg, ArgRC, ArgTy] =
3483 CalleeArgInfo->getPreloadedValue(Value: InputID);
3484 if (!OutgoingArg)
3485 continue;
3486
3487 const auto [IncomingArg, IncomingArgRC, Ty] =
3488 CallerArgInfo.getPreloadedValue(Value: InputID);
3489 assert(IncomingArgRC == ArgRC);
3490
3491 // All special arguments are ints for now.
3492 EVT ArgVT = TRI->getSpillSize(RC: *ArgRC) == 8 ? MVT::i64 : MVT::i32;
3493 SDValue InputReg;
3494
3495 if (IncomingArg) {
3496 InputReg = loadInputValue(DAG, RC: ArgRC, VT: ArgVT, SL: DL, Arg: *IncomingArg);
3497 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3498 // The implicit arg ptr is special because it doesn't have a corresponding
3499 // input for kernels, and is computed from the kernarg segment pointer.
3500 InputReg = getImplicitArgPtr(DAG, SL: DL);
3501 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3502 std::optional<uint32_t> Id =
3503 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3504 if (Id.has_value()) {
3505 InputReg = DAG.getConstant(Val: *Id, DL, VT: ArgVT);
3506 } else {
3507 InputReg = DAG.getPOISON(VT: ArgVT);
3508 }
3509 } else {
3510 // We may have proven the input wasn't needed, although the ABI still
3511 // requires it. We just need to allocate the register appropriately.
3512 InputReg = DAG.getPOISON(VT: ArgVT);
3513 }
3514
3515 if (OutgoingArg->isRegister()) {
3516 RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
3517 if (!CCInfo.AllocateReg(Reg: OutgoingArg->getRegister()))
3518 report_fatal_error(reason: "failed to allocate implicit input argument");
3519 } else {
3520 unsigned SpecialArgOffset =
3521 CCInfo.AllocateStack(Size: ArgVT.getStoreSize(), Alignment: Align(4));
3522 SDValue ArgStore =
3523 storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg, Offset: SpecialArgOffset);
3524 MemOpChains.push_back(Elt: ArgStore);
3525 }
3526 }
3527
3528 // Pack workitem IDs into a single register, or pass them as-is if already
3529 // packed.
3530
3531 auto [OutgoingArg, ArgRC, Ty] =
3532 CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3533 if (!OutgoingArg)
3534 std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
3535 CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3536 if (!OutgoingArg)
3537 std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
3538 CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3539 if (!OutgoingArg)
3540 return;
3541
3542 const ArgDescriptor *IncomingArgX = std::get<0>(
3543 t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X));
3544 const ArgDescriptor *IncomingArgY = std::get<0>(
3545 t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
3546 const ArgDescriptor *IncomingArgZ = std::get<0>(
3547 t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3548
3549 SDValue InputReg;
3550 SDLoc SL;
3551
3552 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-x");
3553 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-y");
3554 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-z");
3555
3556 // If the incoming IDs are not packed, we need to pack them.
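  // The packed layout places X in bits [9:0], Y in bits [19:10] and Z in
  // bits [29:20] of a single 32-bit value.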
3557 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3558 NeedWorkItemIDX) {
3559 if (Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 0) != 0) {
3560 InputReg = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgX);
3561 } else {
3562 InputReg = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
3563 }
3564 }
3565
3566 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3567 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 1) != 0) {
3568 SDValue Y = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgY);
3569 Y = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Y,
3570 N2: DAG.getShiftAmountConstant(Val: 10, VT: MVT::i32, DL: SL));
3571 InputReg = InputReg.getNode()
3572 ? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: InputReg, N2: Y)
3573 : Y;
3574 }
3575
3576 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3577 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 2) != 0) {
3578 SDValue Z = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgZ);
3579 Z = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Z,
3580 N2: DAG.getShiftAmountConstant(Val: 20, VT: MVT::i32, DL: SL));
3581 InputReg = InputReg.getNode()
3582 ? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: InputReg, N2: Z)
3583 : Z;
3584 }
3585
3586 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3587 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3588 // We're in a situation where the outgoing function requires the workitem
3589 // ID, but the calling function does not have it (e.g. a graphics function
3590 // calling a C calling convention function). This is illegal, but we need
3591 // to produce something.
3592 InputReg = DAG.getPOISON(VT: MVT::i32);
3593 } else {
3594 // Workitem IDs are already packed, so any of the present incoming
3595 // arguments will carry all required fields.
3596 ArgDescriptor IncomingArg =
3597 ArgDescriptor::createArg(Arg: IncomingArgX ? *IncomingArgX
3598 : IncomingArgY ? *IncomingArgY
3599 : *IncomingArgZ,
3600 Mask: ~0u);
3601 InputReg = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: IncomingArg);
3602 }
3603 }
3604
3605 if (OutgoingArg->isRegister()) {
3606 if (InputReg)
3607 RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
3608
3609 CCInfo.AllocateReg(Reg: OutgoingArg->getRegister());
3610 } else {
3611 unsigned SpecialArgOffset = CCInfo.AllocateStack(Size: 4, Alignment: Align(4));
3612 if (InputReg) {
3613 SDValue ArgStore =
3614 storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg, Offset: SpecialArgOffset);
3615 MemOpChains.push_back(Elt: ArgStore);
3616 }
3617 }
3618}
3619
3620bool SITargetLowering::isEligibleForTailCallOptimization(
3621 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3622 const SmallVectorImpl<ISD::OutputArg> &Outs,
3623 const SmallVectorImpl<SDValue> &OutVals,
3624 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3625 if (AMDGPU::isChainCC(CC: CalleeCC))
3626 return true;
3627
3628 if (!AMDGPU::mayTailCallThisCC(CC: CalleeCC))
3629 return false;
3630
3631 // For a divergent call target, we need to do a waterfall loop over the
3632 // possible callees which precludes us from using a simple jump.
3633 if (Callee->isDivergent())
3634 return false;
3635
3636 MachineFunction &MF = DAG.getMachineFunction();
3637 const Function &CallerF = MF.getFunction();
3638 CallingConv::ID CallerCC = CallerF.getCallingConv();
3639 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3640 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3641
3642 // Kernels aren't callable, and don't have a live-in return address, so it
3643 // doesn't make sense to do a tail call with entry functions.
3644 if (!CallerPreserved)
3645 return false;
3646
3647 bool CCMatch = CallerCC == CalleeCC;
3648
3649 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3650 if (AMDGPU::canGuaranteeTCO(CC: CalleeCC) && CCMatch)
3651 return true;
3652 return false;
3653 }
3654
3655 // TODO: Can we handle var args?
3656 if (IsVarArg)
3657 return false;
3658
3659 for (const Argument &Arg : CallerF.args()) {
3660 if (Arg.hasByValAttr())
3661 return false;
3662 }
3663
3664 LLVMContext &Ctx = *DAG.getContext();
3665
3666 // Check that the call results are passed in the same way.
3667 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C&: Ctx, Ins,
3668 CalleeFn: CCAssignFnForCall(CC: CalleeCC, IsVarArg),
3669 CallerFn: CCAssignFnForCall(CC: CallerCC, IsVarArg)))
3670 return false;
3671
3672 // The callee has to preserve all registers the caller needs to preserve.
3673 if (!CCMatch) {
3674 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3675 if (!TRI->regmaskSubsetEqual(mask0: CallerPreserved, mask1: CalleePreserved))
3676 return false;
3677 }
3678
3679 // Nothing more to check if the callee is taking no arguments.
3680 if (Outs.empty())
3681 return true;
3682
3683 SmallVector<CCValAssign, 16> ArgLocs;
3684 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3685
3686 // FIXME: We are not allocating special input registers, so we will be
3687 // deciding based on incorrect register assignments.
3688 CCInfo.AnalyzeCallOperands(Outs, Fn: CCAssignFnForCall(CC: CalleeCC, IsVarArg));
3689
3690 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3691 // If the stack arguments for this call do not fit into our own save area
3692 // then the call cannot be made a tail call.
3693 // TODO: Is this really necessary?
3694 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3695 return false;
3696
3697 for (const auto &[CCVA, ArgVal] : zip_equal(t&: ArgLocs, u: OutVals)) {
3698 // FIXME: What about inreg arguments that end up passed in memory?
3699 if (!CCVA.isRegLoc())
3700 continue;
3701
3702 // If we are passing an argument in an SGPR, and the value is divergent,
3703 // this call requires a waterfall loop.
3704 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(Reg: CCVA.getLocReg())) {
3705 LLVM_DEBUG(
3706 dbgs() << "Cannot tail call due to divergent outgoing argument in "
3707 << printReg(CCVA.getLocReg(), TRI) << '\n');
3708 return false;
3709 }
3710 }
3711
3712 const MachineRegisterInfo &MRI = MF.getRegInfo();
3713 return parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals);
3714}
3715
3716bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3717 if (!CI->isTailCall())
3718 return false;
3719
3720 const Function *ParentFn = CI->getParent()->getParent();
3721 if (AMDGPU::isEntryFunctionCC(CC: ParentFn->getCallingConv()))
3722 return false;
3723 return true;
3724}
3725
3726namespace {
3727// Chain calls have special arguments that we need to handle. These are
3728// tagging along at the end of the arguments list(s), after the SGPR and VGPR
3729// arguments (index 0 and 1 respectively).
3730enum ChainCallArgIdx {
3731 Exec = 2,
3732 Flags,
3733 NumVGPRs,
3734 FallbackExec,
3735 FallbackCallee
3736};
3737} // anonymous namespace
3738
3739// The wave scratch offset register is used as the global base pointer.
3740SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3741 SmallVectorImpl<SDValue> &InVals) const {
3742 CallingConv::ID CallConv = CLI.CallConv;
3743 bool IsChainCallConv = AMDGPU::isChainCC(CC: CallConv);
3744
3745 SelectionDAG &DAG = CLI.DAG;
3746
3747 const SDLoc &DL = CLI.DL;
3748 SDValue Chain = CLI.Chain;
3749 SDValue Callee = CLI.Callee;
3750
3751 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
3752 bool UsesDynamicVGPRs = false;
3753 if (IsChainCallConv) {
3754 // The last arguments should be the value that we need to put in EXEC,
3755 // followed by the flags and any other arguments with special meanings.
3756 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
3757 // we don't treat them like the "real" arguments.
3758 auto RequestedExecIt =
3759 llvm::find_if(Range&: CLI.Outs, P: [](const ISD::OutputArg &Arg) {
3760 return Arg.OrigArgIndex == 2;
3761 });
3762 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
3763
3764 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
3765 CLI.OutVals.erase(CS: CLI.OutVals.begin() + SpecialArgsBeginIdx,
3766 CE: CLI.OutVals.end());
3767 CLI.Outs.erase(CS: RequestedExecIt, CE: CLI.Outs.end());
3768
3769 assert(CLI.Outs.back().OrigArgIndex < 2 &&
3770 "Haven't popped all the special args");
3771
3772 TargetLowering::ArgListEntry RequestedExecArg =
3773 CLI.Args[ChainCallArgIdx::Exec];
3774 if (!RequestedExecArg.Ty->isIntegerTy(Bitwidth: Subtarget->getWavefrontSize()))
3775 return lowerUnhandledCall(CLI, InVals, Reason: "Invalid value for EXEC");
3776
3777 // Convert constants into TargetConstants, so they become immediate operands
3778 // instead of being selected into S_MOV.
3779 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
3780 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Val&: Arg.Node)) {
3781 ChainCallSpecialArgs.push_back(Elt: DAG.getTargetConstant(
3782 Val: ArgNode->getAPIntValue(), DL, VT: ArgNode->getValueType(ResNo: 0)));
3783 } else
3784 ChainCallSpecialArgs.push_back(Elt: Arg.Node);
3785 };
3786
3787 PushNodeOrTargetConstant(RequestedExecArg);
3788
3789 // Process any other special arguments depending on the value of the flags.
3790 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
3791
3792 const APInt &FlagsValue = cast<ConstantSDNode>(Val&: Flags.Node)->getAPIntValue();
3793 if (FlagsValue.isZero()) {
3794 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
3795 return lowerUnhandledCall(CLI, InVals,
3796 Reason: "no additional args allowed if flags == 0");
3797 } else if (FlagsValue.isOneBitSet(BitNo: 0)) {
3798 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
3799 return lowerUnhandledCall(CLI, InVals, Reason: "expected 3 additional args");
3800 }
3801
3802 if (!Subtarget->isWave32()) {
3803 return lowerUnhandledCall(
3804 CLI, InVals, Reason: "dynamic VGPR mode is only supported for wave32");
3805 }
3806
3807 UsesDynamicVGPRs = true;
3808 std::for_each(first: CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
3809 last: CLI.Args.end(), f: PushNodeOrTargetConstant);
3810 }
3811 }
3812
3813 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3814 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3815 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3816 bool &IsTailCall = CLI.IsTailCall;
3817 bool IsVarArg = CLI.IsVarArg;
3818 bool IsSibCall = false;
3819 MachineFunction &MF = DAG.getMachineFunction();
3820
3821 if (Callee.isUndef() || isNullConstant(V: Callee)) {
3822 if (!CLI.IsTailCall) {
3823 for (ISD::InputArg &Arg : CLI.Ins)
3824 InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
3825 }
3826
3827 return Chain;
3828 }
3829
3830 if (IsVarArg) {
3831 return lowerUnhandledCall(CLI, InVals,
3832 Reason: "unsupported call to variadic function ");
3833 }
3834
3835 if (!CLI.CB)
3836 return lowerUnhandledCall(CLI, InVals, Reason: "unsupported libcall legalization");
3837
3838 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3839 return lowerUnhandledCall(CLI, InVals,
3840 Reason: "unsupported required tail call to function ");
3841 }
3842
3843 if (IsTailCall) {
3844 IsTailCall = isEligibleForTailCallOptimization(Callee, CalleeCC: CallConv, IsVarArg,
3845 Outs, OutVals, Ins, DAG);
3846 if (!IsTailCall &&
3847 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3848 report_fatal_error(reason: "failed to perform tail call elimination on a call "
3849 "site marked musttail or on llvm.amdgcn.cs.chain");
3850 }
3851
3852 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3853
3854 // A sibling call is one where we're under the usual C ABI and not planning
3855 // to change that, but can still do a tail call.
3856 if (!TailCallOpt && IsTailCall)
3857 IsSibCall = true;
3858
3859 if (IsTailCall)
3860 ++NumTailCalls;
3861 }
3862
3863 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3864 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3865 SmallVector<SDValue, 8> MemOpChains;
3866
3867 // Analyze operands of the call, assigning locations to each operand.
3868 SmallVector<CCValAssign, 16> ArgLocs;
3869 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3870 CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg);
3871
3872 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CC: CallConv)) {
3873 // With a fixed ABI, allocate fixed registers before user arguments.
3874 passSpecialInputs(CLI, CCInfo, Info: *Info, RegsToPass, MemOpChains, Chain);
3875 }
3876
3877 CCInfo.AnalyzeCallOperands(Outs, Fn: AssignFn);
3878
3879 // Get a count of how many bytes are to be pushed on the stack.
3880 unsigned NumBytes = CCInfo.getStackSize();
3881
3882 if (IsSibCall) {
3883 // Since we're not changing the ABI to make this a tail call, the memory
3884 // operands are already available in the caller's incoming argument space.
3885 NumBytes = 0;
3886 }
3887
3888 // FPDiff is the byte offset of the call's argument area from the callee's.
3889 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3890 // by this amount for a tail call. In a sibling call it must be 0 because the
3891 // caller will deallocate the entire stack and the callee still expects its
3892 // arguments to begin at SP+0. Completely unused for non-tail calls.
3893 int32_t FPDiff = 0;
3894 MachineFrameInfo &MFI = MF.getFrameInfo();
3895 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3896
3897 // Adjust the stack pointer for the new arguments...
3898 // These operations are automatically eliminated by the prolog/epilog pass
3899 if (!IsSibCall)
3900 Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL);
3901
3902 if (!IsSibCall || IsChainCallConv) {
3903 if (!Subtarget->enableFlatScratch()) {
3904 SmallVector<SDValue, 4> CopyFromChains;
3905
3906 // In the HSA case, this should be an identity copy.
3907 SDValue ScratchRSrcReg =
3908 DAG.getCopyFromReg(Chain, dl: DL, Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);
3909 RegsToPass.emplace_back(Args: IsChainCallConv
3910 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3911 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3912 Args&: ScratchRSrcReg);
3913 CopyFromChains.push_back(Elt: ScratchRSrcReg.getValue(R: 1));
3914 Chain = DAG.getTokenFactor(DL, Vals&: CopyFromChains);
3915 }
3916 }
3917
3918 const unsigned NumSpecialInputs = RegsToPass.size();
3919
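  // Outgoing stack argument addresses are 32-bit offsets in the private
  // (scratch) address space.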
3920 MVT PtrVT = MVT::i32;
3921
3922 // Walk the register/memloc assignments, inserting copies/loads.
3923 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3924 CCValAssign &VA = ArgLocs[i];
3925 SDValue Arg = OutVals[i];
3926
3927 // Promote the value if needed.
3928 switch (VA.getLocInfo()) {
3929 case CCValAssign::Full:
3930 break;
3931 case CCValAssign::BCvt:
3932 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
3933 break;
3934 case CCValAssign::ZExt:
3935 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3936 break;
3937 case CCValAssign::SExt:
3938 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3939 break;
3940 case CCValAssign::AExt:
3941 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3942 break;
3943 case CCValAssign::FPExt:
3944 Arg = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3945 break;
3946 default:
3947 llvm_unreachable("Unknown loc info!");
3948 }
3949
3950 if (VA.isRegLoc()) {
3951 RegsToPass.push_back(Elt: std::pair(VA.getLocReg(), Arg));
3952 } else {
3953 assert(VA.isMemLoc());
3954
3955 SDValue DstAddr;
3956 MachinePointerInfo DstInfo;
3957
3958 unsigned LocMemOffset = VA.getLocMemOffset();
3959 int32_t Offset = LocMemOffset;
3960
3961 SDValue PtrOff = DAG.getConstant(Val: Offset, DL, VT: PtrVT);
3962 MaybeAlign Alignment;
3963
3964 if (IsTailCall) {
3965 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3966 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
3967 : VA.getValVT().getStoreSize();
3968
3969 // FIXME: We could do better than the minimum required byval alignment.
3970 Alignment =
3971 Flags.isByVal()
3972 ? Flags.getNonZeroByValAlign()
3973 : commonAlignment(A: Subtarget->getStackAlignment(), Offset);
3974
3975 Offset = Offset + FPDiff;
3976 int FI = MFI.CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true);
3977
3978 DstAddr = DAG.getFrameIndex(FI, VT: PtrVT);
3979 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3980
3981 // Make sure any stack arguments overlapping with where we're storing
3982 // are loaded before this eventual operation. Otherwise they'll be
3983 // clobbered.
3984
3985 // FIXME: Why is this really necessary? This seems to just result in a
3986 // lot of code to copy stack values and write them back to the same
3987 // locations, which are supposed to be immutable?
3988 Chain = addTokenForArgument(Chain, DAG, MFI, ClobberedFI: FI);
3989 } else {
3990 // Stores to the argument stack area are relative to the stack pointer.
3991 SDValue SP = DAG.getCopyFromReg(Chain, dl: DL, Reg: Info->getStackPtrOffsetReg(),
3992 VT: MVT::i32);
3993 DstAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SP, N2: PtrOff);
3994 DstInfo = MachinePointerInfo::getStack(MF, Offset: LocMemOffset);
3995 Alignment =
3996 commonAlignment(A: Subtarget->getStackAlignment(), Offset: LocMemOffset);
3997 }
3998
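      // Byval aggregates are copied into the outgoing argument area with an
      // always-inline memcpy; other arguments are stored directly.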
3999 if (Outs[i].Flags.isByVal()) {
4000 SDValue SizeNode =
4001 DAG.getConstant(Val: Outs[i].Flags.getByValSize(), DL, VT: MVT::i32);
4002 SDValue Cpy =
4003 DAG.getMemcpy(Chain, dl: DL, Dst: DstAddr, Src: Arg, Size: SizeNode,
4004 Alignment: Outs[i].Flags.getNonZeroByValAlign(),
4005 /*isVol = */ false, /*AlwaysInline = */ true,
4006 /*CI=*/nullptr, OverrideTailCall: std::nullopt, DstPtrInfo: DstInfo,
4007 SrcPtrInfo: MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
4008
4009 MemOpChains.push_back(Elt: Cpy);
4010 } else {
4011 SDValue Store =
4012 DAG.getStore(Chain, dl: DL, Val: Arg, Ptr: DstAddr, PtrInfo: DstInfo, Alignment);
4013 MemOpChains.push_back(Elt: Store);
4014 }
4015 }
4016 }
4017
4018 if (!MemOpChains.empty())
4019 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOpChains);
4020
4021 SDValue ReadFirstLaneID =
4022 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
4023
4024 SDValue TokenGlue;
4025 if (CLI.ConvergenceControlToken) {
4026 TokenGlue = DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL, VT: MVT::Glue,
4027 Operand: CLI.ConvergenceControlToken);
4028 }
4029
4030 // Build a sequence of copy-to-reg nodes chained together with token chain
4031 // and flag operands which copy the outgoing args into the appropriate regs.
4032 SDValue InGlue;
4033
4034 unsigned ArgIdx = 0;
4035 for (auto [Reg, Val] : RegsToPass) {
4036 if (ArgIdx++ >= NumSpecialInputs &&
4037 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4038 // For chain calls, the inreg arguments are required to be
4039 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4040 // they are uniform.
4041 //
4042 // For other calls, if an inreg argument is known to be uniform,
4043 // speculatively insert a readfirstlane in case it is in a VGPR.
4044 //
4045 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4046 // value, so let that continue to produce invalid code.
4047
4048 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4049 if (TokenGlue)
4050 ReadfirstlaneArgs.push_back(Elt: TokenGlue);
4051 Val = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Val.getValueType(),
4052 Ops: ReadfirstlaneArgs);
4053 }
4054
4055 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg, N: Val, Glue: InGlue);
4056 InGlue = Chain.getValue(R: 1);
4057 }
4058
4059 // We don't usually want to end the call-sequence here because we would tidy
4060 // the frame up *after* the call. However, in the ABI-changing tail-call case
4061 // we've carefully laid out the parameters so that when sp is reset they'll be
4062 // in the correct location.
4063 if (IsTailCall && !IsSibCall) {
4064 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: 0, Glue: InGlue, DL);
4065 InGlue = Chain.getValue(R: 1);
4066 }
4067
4068 std::vector<SDValue> Ops({Chain});
4069
4070 // Add a redundant copy of the callee global which will not be legalized, as
4071 // we need direct access to the callee later.
4072 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
4073 const GlobalValue *GV = GSD->getGlobal();
4074 Ops.push_back(x: Callee);
4075 Ops.push_back(x: DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i64));
4076 } else {
4077 if (IsTailCall) {
4078 // isEligibleForTailCallOptimization considered whether the call target is
4079 // divergent, but we may still end up with a uniform value in a VGPR.
4080 // Insert a readfirstlane just in case.
4081 SDValue ReadFirstLaneID =
4082 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
4083
4084 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4085 if (TokenGlue)
4086 ReadfirstlaneArgs.push_back(Elt: TokenGlue); // Wire up convergence token.
4087 Callee = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Callee.getValueType(),
4088 Ops: ReadfirstlaneArgs);
4089 }
4090
4091 Ops.push_back(x: Callee);
4092 Ops.push_back(x: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i64));
4093 }
4094
4095 if (IsTailCall) {
4096 // Each tail call may have to adjust the stack by a different amount, so
4097 // this information must travel along with the operation for eventual
4098 // consumption by emitEpilogue.
4099 Ops.push_back(x: DAG.getTargetConstant(Val: FPDiff, DL, VT: MVT::i32));
4100 }
4101
4102 if (IsChainCallConv)
4103 llvm::append_range(C&: Ops, R&: ChainCallSpecialArgs);
4104
4105 // Add argument registers to the end of the list so that they are known live
4106 // into the call.
4107 for (auto &[Reg, Val] : RegsToPass)
4108 Ops.push_back(x: DAG.getRegister(Reg, VT: Val.getValueType()));
4109
4110 // Add a register mask operand representing the call-preserved registers.
4111 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4112 assert(Mask && "Missing call preserved mask for calling convention");
4113 Ops.push_back(x: DAG.getRegisterMask(RegMask: Mask));
4114
4115 if (SDValue Token = CLI.ConvergenceControlToken) {
4116 SmallVector<SDValue, 2> GlueOps;
4117 GlueOps.push_back(Elt: Token);
4118 if (InGlue)
4119 GlueOps.push_back(Elt: InGlue);
4120
4121 InGlue = SDValue(DAG.getMachineNode(Opcode: TargetOpcode::CONVERGENCECTRL_GLUE, dl: DL,
4122 VT: MVT::Glue, Ops: GlueOps),
4123 0);
4124 }
4125
4126 if (InGlue)
4127 Ops.push_back(x: InGlue);
4128
4129 // If we're doing a tail call, use a TC_RETURN here rather than an
4130 // actual call instruction.
4131 if (IsTailCall) {
4132 MFI.setHasTailCall();
4133 unsigned OPC = AMDGPUISD::TC_RETURN;
4134 switch (CallConv) {
4135 case CallingConv::AMDGPU_Gfx:
4136 OPC = AMDGPUISD::TC_RETURN_GFX;
4137 break;
4138 case CallingConv::AMDGPU_CS_Chain:
4139 case CallingConv::AMDGPU_CS_ChainPreserve:
4140 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4141 : AMDGPUISD::TC_RETURN_CHAIN;
4142 break;
4143 }
4144
4145 return DAG.getNode(Opcode: OPC, DL, VT: MVT::Other, Ops);
4146 }
4147
4148 // Returns a chain and a flag for retval copy to use.
4149 SDValue Call = DAG.getNode(Opcode: AMDGPUISD::CALL, DL, ResultTys: {MVT::Other, MVT::Glue}, Ops);
4150 Chain = Call.getValue(R: 0);
4151 InGlue = Call.getValue(R: 1);
4152
4153 uint64_t CalleePopBytes = NumBytes;
4154 Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: CalleePopBytes, Glue: InGlue, DL);
4155 if (!Ins.empty())
4156 InGlue = Chain.getValue(R: 1);
4157
4158 // Handle result values, copying them out of physregs into vregs that we
4159 // return.
4160 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4161 InVals, /*IsThisReturn=*/false, ThisVal: SDValue());
4162}
4163
4164// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4165// except for:
4166 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4167 // 2. Scaled size, where scaled-size = wave-reduction(alloca-size) * wave-size
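// Illustrative example (not taken from the code below): on a wave64 target, an
// alloca of at most 16 bytes per lane becomes wave-reduce-umax(16) << 6 = 1024
// bytes of stack growth, so every lane's swizzled slice of the allocation fits.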
4168SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
4169 SelectionDAG &DAG) const {
4170 const MachineFunction &MF = DAG.getMachineFunction();
4171 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4172
4173 SDLoc dl(Op);
4174 EVT VT = Op.getValueType();
4175 SDValue Chain = Op.getOperand(i: 0);
4176 Register SPReg = Info->getStackPtrOffsetReg();
4177
4178 // Chain the dynamic stack allocation so that it doesn't modify the stack
4179 // pointer when other instructions are using the stack.
4180 Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL: dl);
4181
4182 SDValue Size = Op.getOperand(i: 1);
4183 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, Reg: SPReg, VT);
4184 Align Alignment = cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getAlignValue();
4185
4186 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4187 assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
4188 "Stack grows upwards for AMDGPU");
4189
4190 Chain = BaseAddr.getValue(R: 1);
4191 Align StackAlign = TFL->getStackAlign();
4192 if (Alignment > StackAlign) {
4193 uint64_t ScaledAlignment = (uint64_t)Alignment.value()
4194 << Subtarget->getWavefrontSizeLog2();
4195 uint64_t StackAlignMask = ScaledAlignment - 1;
4196 SDValue TmpAddr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr,
4197 N2: DAG.getConstant(Val: StackAlignMask, DL: dl, VT));
4198 BaseAddr = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: TmpAddr,
4199 N2: DAG.getSignedConstant(Val: -ScaledAlignment, DL: dl, VT));
4200 }
4201
4202 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4203 SDValue NewSP;
4204 if (isa<ConstantSDNode>(Val: Size)) {
4205 // For a constant-sized alloca, scale the alloca size by the wave size.
4206 SDValue ScaledSize = DAG.getNode(
4207 Opcode: ISD::SHL, DL: dl, VT, N1: Size,
4208 N2: DAG.getConstant(Val: Subtarget->getWavefrontSizeLog2(), DL: dl, VT: MVT::i32));
4209 NewSP = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr, N2: ScaledSize); // Value
4210 } else {
4211 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4212 // max of the (divergent) alloca size, then scale it by the wave size.
4213 SDValue WaveReduction =
4214 DAG.getTargetConstant(Val: Intrinsic::amdgcn_wave_reduce_umax, DL: dl, VT: MVT::i32);
4215 Size = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: MVT::i32, N1: WaveReduction,
4216 N2: Size, N3: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32));
4217 SDValue ScaledSize = DAG.getNode(
4218 Opcode: ISD::SHL, DL: dl, VT, N1: Size,
4219 N2: DAG.getConstant(Val: Subtarget->getWavefrontSizeLog2(), DL: dl, VT: MVT::i32));
4220 NewSP =
4221 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr, N2: ScaledSize); // Value in vgpr.
4222 SDValue ReadFirstLaneID =
4223 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: dl, VT: MVT::i32);
4224 NewSP = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: MVT::i32, N1: ReadFirstLaneID,
4225 N2: NewSP);
4226 }
4227
4228 Chain = DAG.getCopyToReg(Chain, dl, Reg: SPReg, N: NewSP); // Output chain
4229 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: SDValue(), DL: dl);
4230
4231 return DAG.getMergeValues(Ops: {BaseAddr, CallSeqEnd}, dl);
4232}
4233
4234SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4235 if (Op.getValueType() != MVT::i32)
4236 return Op; // Defer to cannot select error.
4237
4238 Register SP = getStackPointerRegisterToSaveRestore();
4239 SDLoc SL(Op);
4240
4241 SDValue CopyFromSP = DAG.getCopyFromReg(Chain: Op->getOperand(Num: 0), dl: SL, Reg: SP, VT: MVT::i32);
4242
4243 // Convert from wave uniform to swizzled vector address. This should protect
4244 // against any edge cases where the stacksave result isn't directly used with
4245 // stackrestore.
4246 SDValue VectorAddress =
4247 DAG.getNode(Opcode: AMDGPUISD::WAVE_ADDRESS, DL: SL, VT: MVT::i32, Operand: CopyFromSP);
4248 return DAG.getMergeValues(Ops: {VectorAddress, CopyFromSP.getValue(R: 1)}, dl: SL);
4249}
4250
4251SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4252 SelectionDAG &DAG) const {
4253 SDLoc SL(Op);
4254 assert(Op.getValueType() == MVT::i32);
4255
4256 uint32_t BothRoundHwReg =
4257 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 4);
4258 SDValue GetRoundBothImm = DAG.getTargetConstant(Val: BothRoundHwReg, DL: SL, VT: MVT::i32);
4259
4260 SDValue IntrinID =
4261 DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_getreg, DL: SL, VT: MVT::i32);
4262 SDValue GetReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList: Op->getVTList(),
4263 N1: Op.getOperand(i: 0), N2: IntrinID, N3: GetRoundBothImm);
4264
4265 // There are two rounding modes, one for f32 and one for f64/f16. We only
4266 // report in the standard value range if both are the same.
4267 //
4268 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4269 // ties away from zero is not supported, and the other values are rotated by
4270 // 1.
4271 //
4272 // If the two rounding modes are not the same, report a target defined value.
4273
4274 // Mode register rounding mode fields:
4275 //
4276 // [1:0] Single-precision round mode.
4277 // [3:2] Double/Half-precision round mode.
4278 //
4279 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4280 //
4281 // Hardware Spec
4282 // Toward-0 3 0
4283 // Nearest Even 0 1
4284 // +Inf 1 2
4285 // -Inf 2 3
4286 // NearestAway0 N/A 4
4287 //
4288 // We have to handle all 16 values of the 4-bit field, so we create a 64-bit
4289 // table we can index by the raw hardware mode.
4290 //
4291 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
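// Worked example (illustrative; the table contents are defined elsewhere in
// the backend): for MODE.fp_round == 0b0000, i.e. both fields nearest-even,
// the lookup is (FltRoundConversionTable >> (0 << 2)) & 0xf, the low nibble,
// which per the table above is the standard value 1 (round to nearest). Any
// extracted entry >= 4 is an extended value and gets the +4 offset applied
// below.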
4292
4293 SDValue BitTable =
4294 DAG.getConstant(Val: AMDGPU::FltRoundConversionTable, DL: SL, VT: MVT::i64);
4295
4296 SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
4297 SDValue RoundModeTimesNumBits =
4298 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: GetReg, N2: Two);
4299
4300 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4301 // knew only one mode was demanded.
4302 SDValue TableValue =
4303 DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i64, N1: BitTable, N2: RoundModeTimesNumBits);
4304 SDValue TruncTable = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: TableValue);
4305
4306 SDValue EntryMask = DAG.getConstant(Val: 0xf, DL: SL, VT: MVT::i32);
4307 SDValue TableEntry =
4308 DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: TruncTable, N2: EntryMask);
4309
4310 // There's a gap between the 4-bit encoded table entries and the actual enum
4311 // values, so offset the result if it's an extended value.
4312 SDValue Four = DAG.getConstant(Val: 4, DL: SL, VT: MVT::i32);
4313 SDValue IsStandardValue =
4314 DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: TableEntry, RHS: Four, Cond: ISD::SETULT);
4315 SDValue EnumOffset = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: TableEntry, N2: Four);
4316 SDValue Result = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i32, N1: IsStandardValue,
4317 N2: TableEntry, N3: EnumOffset);
4318
4319 return DAG.getMergeValues(Ops: {Result, GetReg.getValue(R: 1)}, dl: SL);
4320}
4321
4322SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4323 SelectionDAG &DAG) const {
4324 SDLoc SL(Op);
4325
4326 SDValue NewMode = Op.getOperand(i: 1);
4327 assert(NewMode.getValueType() == MVT::i32);
4328
4329 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4330 // hardware MODE.fp_round values.
4331 if (auto *ConstMode = dyn_cast<ConstantSDNode>(Val&: NewMode)) {
4332 uint32_t ClampedVal = std::min(
4333 a: static_cast<uint32_t>(ConstMode->getZExtValue()),
4334 b: static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
4335 NewMode = DAG.getConstant(
4336 Val: AMDGPU::decodeFltRoundToHWConversionTable(FltRounds: ClampedVal), DL: SL, VT: MVT::i32);
4337 } else {
4338 // If we know the input can only be one of the supported standard modes in
4339 // the range 0-3, we can use a simplified mapping to hardware values.
4340 KnownBits KB = DAG.computeKnownBits(Op: NewMode);
4341 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4342 // The supported standard values are 0-3. The extended values start at 8. We
4343 // need to offset by 4 if the value is in the extended range.
4344
4345 if (UseReducedTable) {
4346 // Only the low 16 bits of the table (the four standard entries) are
4346 // needed, so a 32-bit constant suffices.
4347 SDValue BitTable = DAG.getConstant(
4348 Val: AMDGPU::FltRoundToHWConversionTable & 0xffff, DL: SL, VT: MVT::i32);
4349
4350 SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
4351 SDValue RoundModeTimesNumBits =
4352 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: NewMode, N2: Two);
4353
4354 NewMode =
4355 DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: BitTable, N2: RoundModeTimesNumBits);
4356
4357 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4358 // the table extracted bits into inline immediates.
4359 } else {
4360 // table_index = umin(value, value - 4)
4361 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
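// For example (illustrative): a standard input of 2 (+inf) gives
// umin(2, 2 - 4) = umin(2, 0xfffffffe) = 2, while the first extended input, 8,
// gives umin(8, 4) = 4, so the extended entries follow the four standard ones
// in the table.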
4362 SDValue BitTable =
4363 DAG.getConstant(Val: AMDGPU::FltRoundToHWConversionTable, DL: SL, VT: MVT::i64);
4364
4365 SDValue Four = DAG.getConstant(Val: 4, DL: SL, VT: MVT::i32);
4366 SDValue OffsetEnum = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: NewMode, N2: Four);
4367 SDValue IndexVal =
4368 DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewMode, N2: OffsetEnum);
4369
4370 SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
4371 SDValue RoundModeTimesNumBits =
4372 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: IndexVal, N2: Two);
4373
4374 SDValue TableValue =
4375 DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i64, N1: BitTable, N2: RoundModeTimesNumBits);
4376 SDValue TruncTable = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: TableValue);
4377
4378 // No need to mask out the high bits since the setreg will ignore them
4379 // anyway.
4380 NewMode = TruncTable;
4381 }
4382
4383 // Insert a readfirstlane in case the value is a VGPR. We could do this
4384 // earlier and keep more operations scalar, but that interferes with
4385 // combining the source.
4386 SDValue ReadFirstLaneID =
4387 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: SL, VT: MVT::i32);
4388 NewMode = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
4389 N1: ReadFirstLaneID, N2: NewMode);
4390 }
4391
4392 // N.B. The setreg will be later folded into s_round_mode on supported
4393 // targets.
4394 SDValue IntrinID =
4395 DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_setreg, DL: SL, VT: MVT::i32);
4396 uint32_t BothRoundHwReg =
4397 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 4);
4398 SDValue RoundBothImm = DAG.getTargetConstant(Val: BothRoundHwReg, DL: SL, VT: MVT::i32);
4399
4400 SDValue SetReg =
4401 DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VTList: Op->getVTList(), N1: Op.getOperand(i: 0),
4402 N2: IntrinID, N3: RoundBothImm, N4: NewMode);
4403
4404 return SetReg;
4405}
4406
4407SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4408 if (Op->isDivergent())
4409 return SDValue();
4410
4411 switch (cast<MemSDNode>(Val&: Op)->getAddressSpace()) {
4412 case AMDGPUAS::FLAT_ADDRESS:
4413 case AMDGPUAS::GLOBAL_ADDRESS:
4414 case AMDGPUAS::CONSTANT_ADDRESS:
4415 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4416 break;
4417 default:
4418 return SDValue();
4419 }
4420
4421 return Op;
4422}
4423
4424 // Work around DAG legality rules that are based only on the result type.
4425SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4426 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4427 SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0);
4428 EVT SrcVT = Src.getValueType();
4429
4430 if (SrcVT.getScalarType() != MVT::bf16)
4431 return Op;
4432
4433 SDLoc SL(Op);
4434 SDValue BitCast =
4435 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: SrcVT.changeTypeToInteger(), Operand: Src);
4436
4437 EVT DstVT = Op.getValueType();
4438 if (IsStrict)
4439 llvm_unreachable("Need STRICT_BF16_TO_FP");
4440
4441 return DAG.getNode(Opcode: ISD::BF16_TO_FP, DL: SL, VT: DstVT, Operand: BitCast);
4442}
4443
4444SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4445 SDLoc SL(Op);
4446 if (Op.getValueType() != MVT::i64)
4447 return Op;
4448
4449 uint32_t ModeHwReg =
4450 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);
4451 SDValue ModeHwRegImm = DAG.getTargetConstant(Val: ModeHwReg, DL: SL, VT: MVT::i32);
4452 uint32_t TrapHwReg =
4453 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
4454 SDValue TrapHwRegImm = DAG.getTargetConstant(Val: TrapHwReg, DL: SL, VT: MVT::i32);
4455
4456 SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other);
4457 SDValue IntrinID =
4458 DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_getreg, DL: SL, VT: MVT::i32);
4459 SDValue GetModeReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList,
4460 N1: Op.getOperand(i: 0), N2: IntrinID, N3: ModeHwRegImm);
4461 SDValue GetTrapReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList,
4462 N1: Op.getOperand(i: 0), N2: IntrinID, N3: TrapHwRegImm);
4463 SDValue TokenReg =
4464 DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: GetModeReg.getValue(R: 1),
4465 N2: GetTrapReg.getValue(R: 1));
4466
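// Element 0 of the v2i32 (the MODE read) becomes the low 32 bits of the i64
// result after the bitcast, and the TRAPSTS read the high 32 bits, matching
// the extraction order in lowerSET_FPENV below.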
4467 SDValue CvtPtr =
4468 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: GetModeReg, N2: GetTrapReg);
4469 SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);
4470
4471 return DAG.getMergeValues(Ops: {Result, TokenReg}, dl: SL);
4472}
4473
4474SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4475 SDLoc SL(Op);
4476 if (Op.getOperand(i: 1).getValueType() != MVT::i64)
4477 return Op;
4478
4479 SDValue Input = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1));
4480 SDValue NewModeReg = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Input,
4481 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
4482 SDValue NewTrapReg = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Input,
4483 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
4484
4485 SDValue ReadFirstLaneID =
4486 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: SL, VT: MVT::i32);
4487 NewModeReg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
4488 N1: ReadFirstLaneID, N2: NewModeReg);
4489 NewTrapReg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
4490 N1: ReadFirstLaneID, N2: NewTrapReg);
4491
4492 unsigned ModeHwReg =
4493 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);
4494 SDValue ModeHwRegImm = DAG.getTargetConstant(Val: ModeHwReg, DL: SL, VT: MVT::i32);
4495 unsigned TrapHwReg =
4496 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
4497 SDValue TrapHwRegImm = DAG.getTargetConstant(Val: TrapHwReg, DL: SL, VT: MVT::i32);
4498
4499 SDValue IntrinID =
4500 DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_setreg, DL: SL, VT: MVT::i32);
4501 SDValue SetModeReg =
4502 DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VT: MVT::Other, N1: Op.getOperand(i: 0),
4503 N2: IntrinID, N3: ModeHwRegImm, N4: NewModeReg);
4504 SDValue SetTrapReg =
4505 DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VT: MVT::Other, N1: Op.getOperand(i: 0),
4506 N2: IntrinID, N3: TrapHwRegImm, N4: NewTrapReg);
4507 return DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: SetTrapReg, N2: SetModeReg);
4508}
4509
4510Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4511 const MachineFunction &MF) const {
4512 const Function &Fn = MF.getFunction();
4513
4514 Register Reg = StringSwitch<Register>(RegName)
4515 .Case(S: "m0", Value: AMDGPU::M0)
4516 .Case(S: "exec", Value: AMDGPU::EXEC)
4517 .Case(S: "exec_lo", Value: AMDGPU::EXEC_LO)
4518 .Case(S: "exec_hi", Value: AMDGPU::EXEC_HI)
4519 .Case(S: "flat_scratch", Value: AMDGPU::FLAT_SCR)
4520 .Case(S: "flat_scratch_lo", Value: AMDGPU::FLAT_SCR_LO)
4521 .Case(S: "flat_scratch_hi", Value: AMDGPU::FLAT_SCR_HI)
4522 .Default(Value: Register());
4523 if (!Reg)
4524 return Reg;
4525
4526 if (!Subtarget->hasFlatScrRegister() &&
4527 Subtarget->getRegisterInfo()->regsOverlap(RegA: Reg, RegB: AMDGPU::FLAT_SCR)) {
4528 Fn.getContext().emitError(ErrorStr: Twine("invalid register \"" + StringRef(RegName) +
4529 "\" for subtarget."));
4530 }
4531
4532 switch (Reg) {
4533 case AMDGPU::M0:
4534 case AMDGPU::EXEC_LO:
4535 case AMDGPU::EXEC_HI:
4536 case AMDGPU::FLAT_SCR_LO:
4537 case AMDGPU::FLAT_SCR_HI:
4538 if (VT.getSizeInBits() == 32)
4539 return Reg;
4540 break;
4541 case AMDGPU::EXEC:
4542 case AMDGPU::FLAT_SCR:
4543 if (VT.getSizeInBits() == 64)
4544 return Reg;
4545 break;
4546 default:
4547 llvm_unreachable("missing register type checking");
4548 }
4549
4550 report_fatal_error(
4551 reason: Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4552}
4553
4554// If kill is not the last instruction, split the block so kill is always a
4555// proper terminator.
4556MachineBasicBlock *
4557SITargetLowering::splitKillBlock(MachineInstr &MI,
4558 MachineBasicBlock *BB) const {
4559 MachineBasicBlock *SplitBB = BB->splitAt(SplitInst&: MI, /*UpdateLiveIns=*/true);
4560 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4561 MI.setDesc(TII->getKillTerminatorFromPseudo(Opcode: MI.getOpcode()));
4562 return SplitBB;
4563}
4564
4565 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4566// \p MI will be the only instruction in the loop body block. Otherwise, it will
4567// be the first instruction in the remainder block.
4568//
4569/// \returns { LoopBody, Remainder }
4570static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4571splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
4572 MachineFunction *MF = MBB.getParent();
4573 MachineBasicBlock::iterator I(&MI);
4574
4575 // To insert the loop we need to split the block. Move everything after this
4576 // point to a new block, and insert a new empty block between the two.
4577 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4578 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4579 MachineFunction::iterator MBBI(MBB);
4580 ++MBBI;
4581
4582 MF->insert(MBBI, MBB: LoopBB);
4583 MF->insert(MBBI, MBB: RemainderBB);
4584
4585 LoopBB->addSuccessor(Succ: LoopBB);
4586 LoopBB->addSuccessor(Succ: RemainderBB);
4587
4588 // Move the rest of the block into a new block.
4589 RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
4590
4591 if (InstInLoop) {
4592 auto Next = std::next(x: I);
4593
4594 // Move instruction to loop body.
4595 LoopBB->splice(Where: LoopBB->begin(), Other: &MBB, From: I, To: Next);
4596
4597 // Move the rest of the block.
4598 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: Next, To: MBB.end());
4599 } else {
4600 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: I, To: MBB.end());
4601 }
4602
4603 MBB.addSuccessor(Succ: LoopBB);
4604
4605 return std::pair(LoopBB, RemainderBB);
4606}
4607
4608/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4609void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
4610 MachineBasicBlock *MBB = MI.getParent();
4611 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4612 auto I = MI.getIterator();
4613 auto E = std::next(x: I);
4614
4615 // clang-format off
4616 BuildMI(BB&: *MBB, I: E, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_WAITCNT))
4617 .addImm(Val: 0);
4618 // clang-format on
4619
4620 MIBundleBuilder Bundler(*MBB, I, E);
4621 finalizeBundle(MBB&: *MBB, FirstMI: Bundler.begin());
4622}
4623
4624MachineBasicBlock *
4625SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
4626 MachineBasicBlock *BB) const {
4627 const DebugLoc &DL = MI.getDebugLoc();
4628
4629 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4630
4631 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4632
4633 // Apparently kill flags are only valid if the def is in the same block?
4634 if (MachineOperand *Src = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::data0))
4635 Src->setIsKill(false);
4636
4637 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB&: *BB, InstInLoop: true);
4638
4639 MachineBasicBlock::iterator I = LoopBB->end();
4640
4641 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4642 Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: AMDGPU::Hwreg::OFFSET_MEM_VIOL, Values: 1);
4643
4644 // Clear TRAP_STS.MEM_VIOL
4645 BuildMI(BB&: *LoopBB, I: LoopBB->begin(), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_IMM32_B32))
4646 .addImm(Val: 0)
4647 .addImm(Val: EncodedReg);
4648
4649 bundleInstWithWaitcnt(MI);
4650
4651 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
4652
4653 // Load and check TRAP_STS.MEM_VIOL
4654 BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: Reg)
4655 .addImm(Val: EncodedReg);
4656
4657 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4658 BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
4659 .addReg(RegNo: Reg, flags: RegState::Kill)
4660 .addImm(Val: 0);
4661 // clang-format off
4662 BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
4663 .addMBB(MBB: LoopBB);
4664 // clang-format on
4665
4666 return RemainderBB;
4667}
4668
4669// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4670// wavefront. If the value is uniform and just happens to be in a VGPR, this
4671// will only do one iteration. In the worst case, this will loop 64 times.
4672//
4673// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
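// Roughly, the emitted structure is (a sketch for orientation, not the exact
// MIR; the indexed access itself is inserted by the caller at the returned
// iterator):
//
//   saved = exec
//   loop:
//     idx   = v_readfirstlane_b32(IdxReg)
//     match = v_cmp_eq_u32(idx, IdxReg)       // lanes sharing this index value
//     exec &= match                           // s_and_saveexec
//     <indexed access with M0 / SGPR index = idx + Offset>
//     exec = exec_at_iteration_entry & ~match // s_xor retires serviced lanes
//     s_cbranch_execnz loop
//
// The caller restores the saved exec mask after the loop.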
4674static MachineBasicBlock::iterator
4675emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
4676 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4677 const DebugLoc &DL, const MachineOperand &Idx,
4678 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4679 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4680 Register &SGPRIdxReg) {
4681
4682 MachineFunction *MF = OrigBB.getParent();
4683 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4684 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4685 MachineBasicBlock::iterator I = LoopBB.begin();
4686
4687 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4688 Register PhiExec = MRI.createVirtualRegister(RegClass: BoolRC);
4689 Register NewExec = MRI.createVirtualRegister(RegClass: BoolRC);
4690 Register CurrentIdxReg =
4691 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
4692 Register CondReg = MRI.createVirtualRegister(RegClass: BoolRC);
4693
4694 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiReg)
4695 .addReg(RegNo: InitReg)
4696 .addMBB(MBB: &OrigBB)
4697 .addReg(RegNo: ResultReg)
4698 .addMBB(MBB: &LoopBB);
4699
4700 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiExec)
4701 .addReg(RegNo: InitSaveExecReg)
4702 .addMBB(MBB: &OrigBB)
4703 .addReg(RegNo: NewExec)
4704 .addMBB(MBB: &LoopBB);
4705
4706 // Read the next variant <- also loop target.
4707 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurrentIdxReg)
4708 .addReg(RegNo: Idx.getReg(), flags: getUndefRegState(B: Idx.isUndef()));
4709
4710 // Compare the just read M0 value to all possible Idx values.
4711 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CMP_EQ_U32_e64), DestReg: CondReg)
4712 .addReg(RegNo: CurrentIdxReg)
4713 .addReg(RegNo: Idx.getReg(), flags: 0, SubReg: Idx.getSubReg());
4714
4715 // Update EXEC, save the original EXEC value to VCC.
4716 BuildMI(BB&: LoopBB, I, MIMD: DL,
4717 MCID: TII->get(Opcode: ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4718 : AMDGPU::S_AND_SAVEEXEC_B64),
4719 DestReg: NewExec)
4720 .addReg(RegNo: CondReg, flags: RegState::Kill);
4721
4722 MRI.setSimpleHint(VReg: NewExec, PrefReg: CondReg);
4723
4724 if (UseGPRIdxMode) {
4725 if (Offset == 0) {
4726 SGPRIdxReg = CurrentIdxReg;
4727 } else {
4728 SGPRIdxReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
4729 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: SGPRIdxReg)
4730 .addReg(RegNo: CurrentIdxReg, flags: RegState::Kill)
4731 .addImm(Val: Offset);
4732 }
4733 } else {
4734 // Move index from VCC into M0
4735 if (Offset == 0) {
4736 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
4737 .addReg(RegNo: CurrentIdxReg, flags: RegState::Kill);
4738 } else {
4739 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: AMDGPU::M0)
4740 .addReg(RegNo: CurrentIdxReg, flags: RegState::Kill)
4741 .addImm(Val: Offset);
4742 }
4743 }
4744
4745 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4746 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4747 MachineInstr *InsertPt =
4748 BuildMI(BB&: LoopBB, I, MIMD: DL,
4749 MCID: TII->get(Opcode: ST.isWave32() ? AMDGPU::S_XOR_B32_term
4750 : AMDGPU::S_XOR_B64_term),
4751 DestReg: Exec)
4752 .addReg(RegNo: Exec)
4753 .addReg(RegNo: NewExec);
4754
4755 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4756 // s_cbranch_scc0?
4757
4758 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4759 // clang-format off
4760 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
4761 .addMBB(MBB: &LoopBB);
4762 // clang-format on
4763
4764 return InsertPt->getIterator();
4765}
4766
4767 // This has slightly sub-optimal regalloc when the source vector is killed by
4768 // the read. The register allocator does not understand that the kill is
4769 // per-workitem, so the source is kept alive for the whole loop and we end up
4770 // not re-using a subregister from it, using one more VGPR than necessary. This
4771 // VGPR was saved when this was expanded after register allocation.
4772static MachineBasicBlock::iterator
4773loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
4774 unsigned InitResultReg, unsigned PhiReg, int Offset,
4775 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4776 MachineFunction *MF = MBB.getParent();
4777 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4778 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4779 MachineRegisterInfo &MRI = MF->getRegInfo();
4780 const DebugLoc &DL = MI.getDebugLoc();
4781 MachineBasicBlock::iterator I(&MI);
4782
4783 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
4784 Register DstReg = MI.getOperand(i: 0).getReg();
4785 Register SaveExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
4786 Register TmpExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
4787 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4788 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4789
4790 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: TmpExec);
4791
4792 // Save the EXEC mask
4793 // clang-format off
4794 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: MovExecOpc), DestReg: SaveExec)
4795 .addReg(RegNo: Exec);
4796 // clang-format on
4797
4798 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, InstInLoop: false);
4799
4800 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
4801
4802 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, OrigBB&: MBB, LoopBB&: *LoopBB, DL, Idx: *Idx,
4803 InitReg: InitResultReg, ResultReg: DstReg, PhiReg, InitSaveExecReg: TmpExec,
4804 Offset, UseGPRIdxMode, SGPRIdxReg);
4805
4806 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
4807 MachineFunction::iterator MBBI(LoopBB);
4808 ++MBBI;
4809 MF->insert(MBBI, MBB: LandingPad);
4810 LoopBB->removeSuccessor(Succ: RemainderBB);
4811 LandingPad->addSuccessor(Succ: RemainderBB);
4812 LoopBB->addSuccessor(Succ: LandingPad);
4813 MachineBasicBlock::iterator First = LandingPad->begin();
4814 // clang-format off
4815 BuildMI(BB&: *LandingPad, I: First, MIMD: DL, MCID: TII->get(Opcode: MovExecOpc), DestReg: Exec)
4816 .addReg(RegNo: SaveExec);
4817 // clang-format on
4818
4819 return InsPt;
4820}
4821
4822// Returns subreg index, offset
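// For example, with a 128-bit (4 x 32-bit) super-class an Offset of 2 yields
// {sub2, 0}, while an out-of-range Offset such as 5 is returned unchanged as
// {sub0, 5} so we never name a nonexistent subregister.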
4823static std::pair<unsigned, int>
4824computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
4825 const TargetRegisterClass *SuperRC, unsigned VecReg,
4826 int Offset) {
4827 int NumElts = TRI.getRegSizeInBits(RC: *SuperRC) / 32;
4828
4829 // Skip out of bounds offsets, or else we would end up using an undefined
4830 // register.
4831 if (Offset >= NumElts || Offset < 0)
4832 return std::pair(AMDGPU::sub0, Offset);
4833
4834 return std::pair(SIRegisterInfo::getSubRegFromChannel(Channel: Offset), 0);
4835}
4836
4837static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
4838 MachineRegisterInfo &MRI, MachineInstr &MI,
4839 int Offset) {
4840 MachineBasicBlock *MBB = MI.getParent();
4841 const DebugLoc &DL = MI.getDebugLoc();
4842 MachineBasicBlock::iterator I(&MI);
4843
4844 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
4845
4846 assert(Idx->getReg() != AMDGPU::NoRegister);
4847
4848 if (Offset == 0) {
4849 // clang-format off
4850 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
4851 .add(MO: *Idx);
4852 // clang-format on
4853 } else {
4854 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: AMDGPU::M0)
4855 .add(MO: *Idx)
4856 .addImm(Val: Offset);
4857 }
4858}
4859
4860static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
4861 MachineRegisterInfo &MRI, MachineInstr &MI,
4862 int Offset) {
4863 MachineBasicBlock *MBB = MI.getParent();
4864 const DebugLoc &DL = MI.getDebugLoc();
4865 MachineBasicBlock::iterator I(&MI);
4866
4867 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
4868
4869 if (Offset == 0)
4870 return Idx->getReg();
4871
4872 Register Tmp = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
4873 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: Tmp)
4874 .add(MO: *Idx)
4875 .addImm(Val: Offset);
4876 return Tmp;
4877}
4878
4879static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
4880 MachineBasicBlock &MBB,
4881 const GCNSubtarget &ST) {
4882 const SIInstrInfo *TII = ST.getInstrInfo();
4883 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4884 MachineFunction *MF = MBB.getParent();
4885 MachineRegisterInfo &MRI = MF->getRegInfo();
4886
4887 Register Dst = MI.getOperand(i: 0).getReg();
4888 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
4889 Register SrcReg = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src)->getReg();
4890 int Offset = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm();
4891
4892 const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcReg);
4893 const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg());
4894
4895 unsigned SubReg;
4896 std::tie(args&: SubReg, args&: Offset) =
4897 computeIndirectRegAndOffset(TRI, SuperRC: VecRC, VecReg: SrcReg, Offset);
4898
4899 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4900
4901 // Check for a SGPR index.
4902 if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) {
4903 MachineBasicBlock::iterator I(&MI);
4904 const DebugLoc &DL = MI.getDebugLoc();
4905
4906 if (UseGPRIdxMode) {
4907 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4908 // to avoid interfering with other uses, so probably requires a new
4909 // optimization pass.
4910 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
4911
4912 const MCInstrDesc &GPRIDXDesc =
4913 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: true);
4914 BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
4915 .addReg(RegNo: SrcReg)
4916 .addReg(RegNo: Idx)
4917 .addImm(Val: SubReg);
4918 } else {
4919 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
4920
4921 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: Dst)
4922 .addReg(RegNo: SrcReg, flags: 0, SubReg)
4923 .addReg(RegNo: SrcReg, flags: RegState::Implicit);
4924 }
4925
4926 MI.eraseFromParent();
4927
4928 return &MBB;
4929 }
4930
4931 // Control flow needs to be inserted if indexing with a VGPR.
4932 const DebugLoc &DL = MI.getDebugLoc();
4933 MachineBasicBlock::iterator I(&MI);
4934
4935 Register PhiReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
4936 Register InitReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
4937
4938 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: InitReg);
4939
4940 Register SGPRIdxReg;
4941 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: InitReg, PhiReg, Offset,
4942 UseGPRIdxMode, SGPRIdxReg);
4943
4944 MachineBasicBlock *LoopBB = InsPt->getParent();
4945
4946 if (UseGPRIdxMode) {
4947 const MCInstrDesc &GPRIDXDesc =
4948 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: true);
4949
4950 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
4951 .addReg(RegNo: SrcReg)
4952 .addReg(RegNo: SGPRIdxReg)
4953 .addImm(Val: SubReg);
4954 } else {
4955 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: Dst)
4956 .addReg(RegNo: SrcReg, flags: 0, SubReg)
4957 .addReg(RegNo: SrcReg, flags: RegState::Implicit);
4958 }
4959
4960 MI.eraseFromParent();
4961
4962 return LoopBB;
4963}
4964
4965static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
4966 MachineBasicBlock &MBB,
4967 const GCNSubtarget &ST) {
4968 const SIInstrInfo *TII = ST.getInstrInfo();
4969 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4970 MachineFunction *MF = MBB.getParent();
4971 MachineRegisterInfo &MRI = MF->getRegInfo();
4972
4973 Register Dst = MI.getOperand(i: 0).getReg();
4974 const MachineOperand *SrcVec = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src);
4975 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
4976 const MachineOperand *Val = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::val);
4977 int Offset = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm();
4978 const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcVec->getReg());
4979 const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg());
4980
4981 // This can be an immediate, but will be folded later.
4982 assert(Val->getReg());
4983
4984 unsigned SubReg;
4985 std::tie(args&: SubReg, args&: Offset) =
4986 computeIndirectRegAndOffset(TRI, SuperRC: VecRC, VecReg: SrcVec->getReg(), Offset);
4987 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4988
4989 if (Idx->getReg() == AMDGPU::NoRegister) {
4990 MachineBasicBlock::iterator I(&MI);
4991 const DebugLoc &DL = MI.getDebugLoc();
4992
4993 assert(Offset == 0);
4994
4995 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: Dst)
4996 .add(MO: *SrcVec)
4997 .add(MO: *Val)
4998 .addImm(Val: SubReg);
4999
5000 MI.eraseFromParent();
5001 return &MBB;
5002 }
5003
5004 // Check for a SGPR index.
5005 if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) {
5006 MachineBasicBlock::iterator I(&MI);
5007 const DebugLoc &DL = MI.getDebugLoc();
5008
5009 if (UseGPRIdxMode) {
5010 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5011
5012 const MCInstrDesc &GPRIDXDesc =
5013 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false);
5014 BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
5015 .addReg(RegNo: SrcVec->getReg())
5016 .add(MO: *Val)
5017 .addReg(RegNo: Idx)
5018 .addImm(Val: SubReg);
5019 } else {
5020 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5021
5022 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5023 VecSize: TRI.getRegSizeInBits(RC: *VecRC), EltSize: 32, IsSGPR: false);
5024 BuildMI(BB&: MBB, I, MIMD: DL, MCID: MovRelDesc, DestReg: Dst)
5025 .addReg(RegNo: SrcVec->getReg())
5026 .add(MO: *Val)
5027 .addImm(Val: SubReg);
5028 }
5029 MI.eraseFromParent();
5030 return &MBB;
5031 }
5032
5033 // Control flow needs to be inserted if indexing with a VGPR.
5034 if (Val->isReg())
5035 MRI.clearKillFlags(Reg: Val->getReg());
5036
5037 const DebugLoc &DL = MI.getDebugLoc();
5038
5039 Register PhiReg = MRI.createVirtualRegister(RegClass: VecRC);
5040
5041 Register SGPRIdxReg;
5042 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: SrcVec->getReg(), PhiReg, Offset,
5043 UseGPRIdxMode, SGPRIdxReg);
5044 MachineBasicBlock *LoopBB = InsPt->getParent();
5045
5046 if (UseGPRIdxMode) {
5047 const MCInstrDesc &GPRIDXDesc =
5048 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false);
5049
5050 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
5051 .addReg(RegNo: PhiReg)
5052 .add(MO: *Val)
5053 .addReg(RegNo: SGPRIdxReg)
5054 .addImm(Val: SubReg);
5055 } else {
5056 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5057 VecSize: TRI.getRegSizeInBits(RC: *VecRC), EltSize: 32, IsSGPR: false);
5058 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: MovRelDesc, DestReg: Dst)
5059 .addReg(RegNo: PhiReg)
5060 .add(MO: *Val)
5061 .addImm(Val: SubReg);
5062 }
5063
5064 MI.eraseFromParent();
5065 return LoopBB;
5066}
5067
5068static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
5069 switch (Opc) {
5070 case AMDGPU::S_MIN_U32:
5071 return std::numeric_limits<uint32_t>::max();
5072 case AMDGPU::S_MIN_I32:
5073 return std::numeric_limits<int32_t>::max();
5074 case AMDGPU::S_MAX_U32:
5075 return std::numeric_limits<uint32_t>::min();
5076 case AMDGPU::S_MAX_I32:
5077 return std::numeric_limits<int32_t>::min();
5078 case AMDGPU::S_ADD_I32:
5079 case AMDGPU::S_SUB_I32:
5080 case AMDGPU::S_OR_B32:
5081 case AMDGPU::S_XOR_B32:
5082 return std::numeric_limits<uint32_t>::min();
5083 case AMDGPU::S_AND_B32:
5084 return std::numeric_limits<uint32_t>::max();
5085 default:
5086 llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
5087 }
5088}
5089
5090static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5091 MachineBasicBlock &BB,
5092 const GCNSubtarget &ST,
5093 unsigned Opc) {
5094 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
5095 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5096 const DebugLoc &DL = MI.getDebugLoc();
5097 const SIInstrInfo *TII = ST.getInstrInfo();
5098
5099 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5100 Register SrcReg = MI.getOperand(i: 1).getReg();
5101 bool isSGPR = TRI->isSGPRClass(RC: MRI.getRegClass(Reg: SrcReg));
5102 Register DstReg = MI.getOperand(i: 0).getReg();
5103 MachineBasicBlock *RetBB = nullptr;
5104 if (isSGPR) {
5105 switch (Opc) {
5106 case AMDGPU::S_MIN_U32:
5107 case AMDGPU::S_MIN_I32:
5108 case AMDGPU::S_MAX_U32:
5109 case AMDGPU::S_MAX_I32:
5110 case AMDGPU::S_AND_B32:
5111 case AMDGPU::S_OR_B32: {
5112 // Idempotent operations.
5113 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstReg).addReg(RegNo: SrcReg);
5114 RetBB = &BB;
5115 break;
5116 }
5117 case AMDGPU::S_XOR_B32:
5118 case AMDGPU::S_ADD_I32:
5119 case AMDGPU::S_SUB_I32: {
5120 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5121 const TargetRegisterClass *DstRegClass = MRI.getRegClass(Reg: DstReg);
5122 Register ExecMask = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5123 Register ActiveLanes = MRI.createVirtualRegister(RegClass: DstRegClass);
5124
5125 bool IsWave32 = ST.isWave32();
5126 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5127 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5128 unsigned CountReg =
5129 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5130
5131 auto Exec =
5132 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: ExecMask).addReg(RegNo: ExecReg);
5133
5134 auto NewAccumulator = BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: CountReg), DestReg: ActiveLanes)
5135 .addReg(RegNo: Exec->getOperand(i: 0).getReg());
5136
5137 switch (Opc) {
5138 case AMDGPU::S_XOR_B32: {
5139 // Performing an XOR operation on a uniform value
5140 // depends on the parity of the number of active lanes.
5141 // For even parity the result will be 0; for odd
5142 // parity the result will be the same as the input value.
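// For example, with a uniform input x and 5 active lanes the wave-wide XOR
// is (5 & 1) * x = x, while with 6 active lanes it is 0; that is exactly
// what the S_AND_B32 / S_MUL_I32 pair below computes.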
5143 Register ParityRegister = MRI.createVirtualRegister(RegClass: DstRegClass);
5144
5145 auto ParityReg =
5146 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_AND_B32), DestReg: ParityRegister)
5147 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg())
5148 .addImm(Val: 1);
5149 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
5150 .addReg(RegNo: SrcReg)
5151 .addReg(RegNo: ParityReg->getOperand(i: 0).getReg());
5152 break;
5153 }
5154 case AMDGPU::S_SUB_I32: {
5155 Register NegatedVal = MRI.createVirtualRegister(RegClass: DstRegClass);
5156
5157 // Take the negation of the source operand.
5158 auto InvertedValReg =
5159 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: NegatedVal)
5160 .addImm(Val: -1)
5161 .addReg(RegNo: SrcReg);
5162 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
5163 .addReg(RegNo: InvertedValReg->getOperand(i: 0).getReg())
5164 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg());
5165 break;
5166 }
5167 case AMDGPU::S_ADD_I32: {
5168 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
5169 .addReg(RegNo: SrcReg)
5170 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg());
5171 break;
5172 }
5173 }
5174 RetBB = &BB;
5175 }
5176 }
5177 } else {
5178 // TODO: Implement the DPP strategy and switch based on the immediate strategy
5179 // operand. For now, for all the cases (default, Iterative and DPP), we use
5180 // the iterative approach by default.
5181
5182 // To reduce the VGPR using the iterative approach, we need to iterate over
5183 // all the active lanes. Lowering consists of a ComputeLoop, which iterates
5184 // over only the active lanes. We use a copy of the EXEC register as the
5185 // induction variable; each iteration clears one bit with bitset0 so that we
5186 // get the next active lane in the next iteration.
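// As a rough sketch (not the exact MIR the code below builds):
//
//   acc  = identity(Opc); bits = exec
//   ComputeLoop:
//     lane = s_ff1(bits)                  // lowest remaining active lane
//     val  = v_readlane_b32(Src, lane)
//     Dst  = Opc(acc, val); acc = Dst     // running reduction, fed by PHIs
//     bits = s_bitset0(bits, lane)
//     s_cmp_lg bits, 0
//     s_cbranch_scc1 ComputeLoop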
5187 MachineBasicBlock::iterator I = BB.end();
5188 Register SrcReg = MI.getOperand(i: 1).getReg();
5189
5190 // Create control flow for the loop by splitting MI's basic block into a
5191 // loop body (ComputeLoop) and a remainder block (ComputeEnd).
5192 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, MBB&: BB, InstInLoop: true);
5193
5194 // Create virtual registers required for lowering.
5195 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5196 const TargetRegisterClass *DstRegClass = MRI.getRegClass(Reg: DstReg);
5197 Register LoopIterator = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5198 Register InitalValReg = MRI.createVirtualRegister(RegClass: DstRegClass);
5199
5200 Register AccumulatorReg = MRI.createVirtualRegister(RegClass: DstRegClass);
5201 Register ActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5202 Register NewActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5203
5204 Register FF1Reg = MRI.createVirtualRegister(RegClass: DstRegClass);
5205 Register LaneValueReg =
5206 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5207
5208 bool IsWave32 = ST.isWave32();
5209 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5210 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5211
5212 // Create the initial values of the induction variable (from EXEC) and the
5213 // accumulator, and insert a branch to the newly created ComputeLoop block.
5214 uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
5215 auto TmpSReg =
5216 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: LoopIterator).addReg(RegNo: ExecReg);
5217 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: InitalValReg)
5218 .addImm(Val: InitalValue);
5219 // clang-format off
5220 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_BRANCH))
5221 .addMBB(MBB: ComputeLoop);
5222 // clang-format on
5223
5224 // Start constructing ComputeLoop
5225 I = ComputeLoop->end();
5226 auto Accumulator =
5227 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::PHI), DestReg: AccumulatorReg)
5228 .addReg(RegNo: InitalValReg)
5229 .addMBB(MBB: &BB);
5230 auto ActiveBits =
5231 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::PHI), DestReg: ActiveBitsReg)
5232 .addReg(RegNo: TmpSReg->getOperand(i: 0).getReg())
5233 .addMBB(MBB: &BB);
5234
5235 // Perform the computations
5236 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5237 auto FF1 = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: SFFOpc), DestReg: FF1Reg)
5238 .addReg(RegNo: ActiveBits->getOperand(i: 0).getReg());
5239 auto LaneValue = BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
5240 MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32), DestReg: LaneValueReg)
5241 .addReg(RegNo: SrcReg)
5242 .addReg(RegNo: FF1->getOperand(i: 0).getReg());
5243 auto NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstReg)
5244 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg())
5245 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg());
5246
5247 // Manipulate the iterator to get the next active lane
5248 unsigned BITSETOpc =
5249 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5250 auto NewActiveBits =
5251 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: BITSETOpc), DestReg: NewActiveBitsReg)
5252 .addReg(RegNo: FF1->getOperand(i: 0).getReg())
5253 .addReg(RegNo: ActiveBits->getOperand(i: 0).getReg());
5254
5255 // Add phi nodes
5256 Accumulator.addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg())
5257 .addMBB(MBB: ComputeLoop);
5258 ActiveBits.addReg(RegNo: NewActiveBits->getOperand(i: 0).getReg())
5259 .addMBB(MBB: ComputeLoop);
5260
5261 // Create the conditional loop-back branch.
5262 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5263 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: CMPOpc))
5264 .addReg(RegNo: NewActiveBits->getOperand(i: 0).getReg())
5265 .addImm(Val: 0);
5266 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
5267 .addMBB(MBB: ComputeLoop);
5268
5269 RetBB = ComputeEnd;
5270 }
5271 MI.eraseFromParent();
5272 return RetBB;
5273}
5274
5275MachineBasicBlock *
5276SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
5277 MachineBasicBlock *BB) const {
5278
5279 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5280 MachineFunction *MF = BB->getParent();
5281 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
5282
5283 switch (MI.getOpcode()) {
5284 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5285 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MIN_U32);
5286 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5287 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MIN_I32);
5288 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5289 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MAX_U32);
5290 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5291 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MAX_I32);
5292 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5293 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_ADD_I32);
5294 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5295 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_SUB_I32);
5296 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5297 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_AND_B32);
5298 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5299 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_OR_B32);
5300 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5301 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_XOR_B32);
5302 case AMDGPU::S_UADDO_PSEUDO:
5303 case AMDGPU::S_USUBO_PSEUDO: {
5304 const DebugLoc &DL = MI.getDebugLoc();
5305 MachineOperand &Dest0 = MI.getOperand(i: 0);
5306 MachineOperand &Dest1 = MI.getOperand(i: 1);
5307 MachineOperand &Src0 = MI.getOperand(i: 2);
5308 MachineOperand &Src1 = MI.getOperand(i: 3);
5309
5310 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5311 ? AMDGPU::S_ADD_I32
5312 : AMDGPU::S_SUB_I32;
5313 // clang-format off
5314 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest0.getReg())
5315 .add(MO: Src0)
5316 .add(MO: Src1);
5317 // clang-format on
5318
5319 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B64), DestReg: Dest1.getReg())
5320 .addImm(Val: 1)
5321 .addImm(Val: 0);
5322
5323 MI.eraseFromParent();
5324 return BB;
5325 }
5326 case AMDGPU::S_ADD_U64_PSEUDO:
5327 case AMDGPU::S_SUB_U64_PSEUDO: {
5328 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5329 // For GFX12, we emit s_add_u64 and s_sub_u64.
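// Concretely, the pre-GFX12 path below splits both operands with
// buildExtractSubRegOrImm and emits s_add_u32/s_sub_u32 on the low halves
// followed by s_addc_u32/s_subb_u32 on the high halves, threading the carry
// through SCC before recombining with REG_SEQUENCE.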
5330 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5331 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5332 const DebugLoc &DL = MI.getDebugLoc();
5333 MachineOperand &Dest = MI.getOperand(i: 0);
5334 MachineOperand &Src0 = MI.getOperand(i: 1);
5335 MachineOperand &Src1 = MI.getOperand(i: 2);
5336 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5337 if (Subtarget->hasScalarAddSub64()) {
5338 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5339 // clang-format off
5340 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest.getReg())
5341 .add(MO: Src0)
5342 .add(MO: Src1);
5343 // clang-format on
5344 } else {
5345 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5346 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5347
5348 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5349 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5350
5351 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5352 MI, MRI, SuperReg: Src0, SuperRC: BoolRC, SubIdx: AMDGPU::sub0, SubRC: &AMDGPU::SReg_32RegClass);
5353 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5354 MI, MRI, SuperReg: Src0, SuperRC: BoolRC, SubIdx: AMDGPU::sub1, SubRC: &AMDGPU::SReg_32RegClass);
5355
5356 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5357 MI, MRI, SuperReg: Src1, SuperRC: BoolRC, SubIdx: AMDGPU::sub0, SubRC: &AMDGPU::SReg_32RegClass);
5358 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5359 MI, MRI, SuperReg: Src1, SuperRC: BoolRC, SubIdx: AMDGPU::sub1, SubRC: &AMDGPU::SReg_32RegClass);
5360
5361 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5362 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5363 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: LoOpc), DestReg: DestSub0)
5364 .add(MO: Src0Sub0)
5365 .add(MO: Src1Sub0);
5366 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: HiOpc), DestReg: DestSub1)
5367 .add(MO: Src0Sub1)
5368 .add(MO: Src1Sub1);
5369 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: Dest.getReg())
5370 .addReg(RegNo: DestSub0)
5371 .addImm(Val: AMDGPU::sub0)
5372 .addReg(RegNo: DestSub1)
5373 .addImm(Val: AMDGPU::sub1);
5374 }
5375 MI.eraseFromParent();
5376 return BB;
5377 }
5378 case AMDGPU::V_ADD_U64_PSEUDO:
5379 case AMDGPU::V_SUB_U64_PSEUDO: {
5380 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5381 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5382 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5383 const DebugLoc &DL = MI.getDebugLoc();
5384
5385 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5386
5387 MachineOperand &Dest = MI.getOperand(i: 0);
5388 MachineOperand &Src0 = MI.getOperand(i: 1);
5389 MachineOperand &Src1 = MI.getOperand(i: 2);
5390
5391 if (IsAdd && ST.hasLshlAddU64Inst()) {
5392 auto Add = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_LSHL_ADD_U64_e64),
5393 DestReg: Dest.getReg())
5394 .add(MO: Src0)
5395 .addImm(Val: 0)
5396 .add(MO: Src1);
5397 TII->legalizeOperands(MI&: *Add);
5398 MI.eraseFromParent();
5399 return BB;
5400 }
5401
5402 const auto *CarryRC = TRI->getWaveMaskRegClass();
5403
5404 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5405 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5406
5407 Register CarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
5408 Register DeadCarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
5409
5410 const TargetRegisterClass *Src0RC = Src0.isReg()
5411 ? MRI.getRegClass(Reg: Src0.getReg())
5412 : &AMDGPU::VReg_64RegClass;
5413 const TargetRegisterClass *Src1RC = Src1.isReg()
5414 ? MRI.getRegClass(Reg: Src1.getReg())
5415 : &AMDGPU::VReg_64RegClass;
5416
5417 const TargetRegisterClass *Src0SubRC =
5418 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5419 const TargetRegisterClass *Src1SubRC =
5420 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5421
5422 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5423 MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
5424 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5425 MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
5426
5427 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5428 MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
5429 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5430 MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
5431
5432 unsigned LoOpc =
5433 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5434 MachineInstr *LoHalf = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: LoOpc), DestReg: DestSub0)
5435 .addReg(RegNo: CarryReg, flags: RegState::Define)
5436 .add(MO: SrcReg0Sub0)
5437 .add(MO: SrcReg1Sub0)
5438 .addImm(Val: 0); // clamp bit
5439
5440 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5441 MachineInstr *HiHalf =
5442 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: HiOpc), DestReg: DestSub1)
5443 .addReg(RegNo: DeadCarryReg, flags: RegState::Define | RegState::Dead)
5444 .add(MO: SrcReg0Sub1)
5445 .add(MO: SrcReg1Sub1)
5446 .addReg(RegNo: CarryReg, flags: RegState::Kill)
5447 .addImm(Val: 0); // clamp bit
5448
5449 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: Dest.getReg())
5450 .addReg(RegNo: DestSub0)
5451 .addImm(Val: AMDGPU::sub0)
5452 .addReg(RegNo: DestSub1)
5453 .addImm(Val: AMDGPU::sub1);
5454 TII->legalizeOperands(MI&: *LoHalf);
5455 TII->legalizeOperands(MI&: *HiHalf);
5456 MI.eraseFromParent();
5457 return BB;
5458 }
5459 case AMDGPU::S_ADD_CO_PSEUDO:
5460 case AMDGPU::S_SUB_CO_PSEUDO: {
5461 // This pseudo can only be selected from a uniform add/subcarry node, so all
5462 // of its VGPR operands are assumed to be splat vectors; any such operand is
5463 // read back to an SGPR with V_READFIRSTLANE below.
5464 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5465 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5466 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5467 MachineBasicBlock::iterator MII = MI;
5468 const DebugLoc &DL = MI.getDebugLoc();
5469 MachineOperand &Dest = MI.getOperand(i: 0);
5470 MachineOperand &CarryDest = MI.getOperand(i: 1);
5471 MachineOperand &Src0 = MI.getOperand(i: 2);
5472 MachineOperand &Src1 = MI.getOperand(i: 3);
5473 MachineOperand &Src2 = MI.getOperand(i: 4);
5474 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5475 ? AMDGPU::S_ADDC_U32
5476 : AMDGPU::S_SUBB_U32;
5477 if (Src0.isReg() && TRI->isVectorRegister(MRI, Reg: Src0.getReg())) {
5478 Register RegOp0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5479 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp0)
5480 .addReg(RegNo: Src0.getReg());
5481 Src0.setReg(RegOp0);
5482 }
5483 if (Src1.isReg() && TRI->isVectorRegister(MRI, Reg: Src1.getReg())) {
5484 Register RegOp1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5485 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp1)
5486 .addReg(RegNo: Src1.getReg());
5487 Src1.setReg(RegOp1);
5488 }
5489 Register RegOp2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5490 if (TRI->isVectorRegister(MRI, Reg: Src2.getReg())) {
5491 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp2)
5492 .addReg(RegNo: Src2.getReg());
5493 Src2.setReg(RegOp2);
5494 }
5495
5496 const TargetRegisterClass *Src2RC = MRI.getRegClass(Reg: Src2.getReg());
5497 unsigned WaveSize = TRI->getRegSizeInBits(RC: *Src2RC);
5498 assert(WaveSize == 64 || WaveSize == 32);
5499
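// Materialize the carry-in (Src2) into SCC by comparing it against zero so the
// S_ADDC_U32/S_SUBB_U32 built below can consume it.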
5500 if (WaveSize == 64) {
5501 if (ST.hasScalarCompareEq64()) {
5502 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U64))
5503 .addReg(RegNo: Src2.getReg())
5504 .addImm(Val: 0);
5505 } else {
5506 const TargetRegisterClass *SubRC =
5507 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5508 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5509 MI: MII, MRI, SuperReg: Src2, SuperRC: Src2RC, SubIdx: AMDGPU::sub0, SubRC);
5510 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5511 MI: MII, MRI, SuperReg: Src2, SuperRC: Src2RC, SubIdx: AMDGPU::sub1, SubRC);
5512 Register Src2_32 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5513
5514 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_OR_B32), DestReg: Src2_32)
5515 .add(MO: Src2Sub0)
5516 .add(MO: Src2Sub1);
5517
5518 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
5519 .addReg(RegNo: Src2_32, flags: RegState::Kill)
5520 .addImm(Val: 0);
5521 }
5522 } else {
5523 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
5524 .addReg(RegNo: Src2.getReg())
5525 .addImm(Val: 0);
5526 }
5527
5528 // clang-format off
5529 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest.getReg())
5530 .add(MO: Src0)
5531 .add(MO: Src1);
5532 // clang-format on
5533
5534 unsigned SelOpc =
5535 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5536
5537 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: SelOpc), DestReg: CarryDest.getReg())
5538 .addImm(Val: -1)
5539 .addImm(Val: 0);
5540
5541 MI.eraseFromParent();
5542 return BB;
5543 }
5544 case AMDGPU::SI_INIT_M0: {
5545 MachineOperand &M0Init = MI.getOperand(i: 0);
5546 BuildMI(BB&: *BB, I: MI.getIterator(), MIMD: MI.getDebugLoc(),
5547 MCID: TII->get(Opcode: M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
5548 DestReg: AMDGPU::M0)
5549 .add(MO: M0Init);
5550 MI.eraseFromParent();
5551 return BB;
5552 }
5553 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
5554 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
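// S_CMP_EQ_U32 0, 0 unconditionally sets SCC to 1.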
5555 BuildMI(BB&: *BB, I: MI.getIterator(), MIMD: MI.getDebugLoc(),
5556 MCID: TII->get(Opcode: AMDGPU::S_CMP_EQ_U32))
5557 .addImm(Val: 0)
5558 .addImm(Val: 0);
5559 return BB;
5560 }
5561 case AMDGPU::GET_GROUPSTATICSIZE: {
5562 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5563 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5564 DebugLoc DL = MI.getDebugLoc();
5565 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32))
5566 .add(MO: MI.getOperand(i: 0))
5567 .addImm(Val: MFI->getLDSSize());
5568 MI.eraseFromParent();
5569 return BB;
5570 }
5571 case AMDGPU::GET_SHADERCYCLESHILO: {
5572 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
5573 MachineRegisterInfo &MRI = MF->getRegInfo();
5574 const DebugLoc &DL = MI.getDebugLoc();
5575 // The algorithm is:
5576 //
5577 // hi1 = getreg(SHADER_CYCLES_HI)
5578 // lo1 = getreg(SHADER_CYCLES_LO)
5579 // hi2 = getreg(SHADER_CYCLES_HI)
5580 //
5581 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5582 // Otherwise there was overflow and the result is hi2:0. In both cases the
5583 // result should represent the actual time at some point during the sequence
5584 // of three getregs.
5585 using namespace AMDGPU::Hwreg;
5586 Register RegHi1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5587 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegHi1)
5588 .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES_HI, Values: 0, Values: 32));
5589 Register RegLo1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5590 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegLo1)
5591 .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES, Values: 0, Values: 32));
5592 Register RegHi2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5593 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegHi2)
5594 .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES_HI, Values: 0, Values: 32));
5595 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_EQ_U32))
5596 .addReg(RegNo: RegHi1)
5597 .addReg(RegNo: RegHi2);
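// SCC = (hi1 == hi2): select lo1 if the high half did not change between the
// two reads, otherwise 0.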
5598 Register RegLo = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5599 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: RegLo)
5600 .addReg(RegNo: RegLo1)
5601 .addImm(Val: 0);
5602 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE))
5603 .add(MO: MI.getOperand(i: 0))
5604 .addReg(RegNo: RegLo)
5605 .addImm(Val: AMDGPU::sub0)
5606 .addReg(RegNo: RegHi2)
5607 .addImm(Val: AMDGPU::sub1);
5608 MI.eraseFromParent();
5609 return BB;
5610 }
5611 case AMDGPU::SI_INDIRECT_SRC_V1:
5612 case AMDGPU::SI_INDIRECT_SRC_V2:
5613 case AMDGPU::SI_INDIRECT_SRC_V4:
5614 case AMDGPU::SI_INDIRECT_SRC_V8:
5615 case AMDGPU::SI_INDIRECT_SRC_V9:
5616 case AMDGPU::SI_INDIRECT_SRC_V10:
5617 case AMDGPU::SI_INDIRECT_SRC_V11:
5618 case AMDGPU::SI_INDIRECT_SRC_V12:
5619 case AMDGPU::SI_INDIRECT_SRC_V16:
5620 case AMDGPU::SI_INDIRECT_SRC_V32:
5621 return emitIndirectSrc(MI, MBB&: *BB, ST: *getSubtarget());
5622 case AMDGPU::SI_INDIRECT_DST_V1:
5623 case AMDGPU::SI_INDIRECT_DST_V2:
5624 case AMDGPU::SI_INDIRECT_DST_V4:
5625 case AMDGPU::SI_INDIRECT_DST_V8:
5626 case AMDGPU::SI_INDIRECT_DST_V9:
5627 case AMDGPU::SI_INDIRECT_DST_V10:
5628 case AMDGPU::SI_INDIRECT_DST_V11:
5629 case AMDGPU::SI_INDIRECT_DST_V12:
5630 case AMDGPU::SI_INDIRECT_DST_V16:
5631 case AMDGPU::SI_INDIRECT_DST_V32:
5632 return emitIndirectDst(MI, MBB&: *BB, ST: *getSubtarget());
5633 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5634 case AMDGPU::SI_KILL_I1_PSEUDO:
5635 return splitKillBlock(MI, BB);
5636 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5637 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5638 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5639 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5640
5641 Register Dst = MI.getOperand(i: 0).getReg();
5642 const MachineOperand &Src0 = MI.getOperand(i: 1);
5643 const MachineOperand &Src1 = MI.getOperand(i: 2);
5644 const DebugLoc &DL = MI.getDebugLoc();
5645 Register SrcCond = MI.getOperand(i: 3).getReg();
5646
5647 Register DstLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5648 Register DstHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5649 const auto *CondRC = TRI->getWaveMaskRegClass();
5650 Register SrcCondCopy = MRI.createVirtualRegister(RegClass: CondRC);
5651
5652 const TargetRegisterClass *Src0RC = Src0.isReg()
5653 ? MRI.getRegClass(Reg: Src0.getReg())
5654 : &AMDGPU::VReg_64RegClass;
5655 const TargetRegisterClass *Src1RC = Src1.isReg()
5656 ? MRI.getRegClass(Reg: Src1.getReg())
5657 : &AMDGPU::VReg_64RegClass;
5658
5659 const TargetRegisterClass *Src0SubRC =
5660 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5661 const TargetRegisterClass *Src1SubRC =
5662 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5663
5664 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5665 MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
5666 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5667 MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
5668
5669 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5670 MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
5671 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5672 MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
5673
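// Select each 32-bit half with the same condition mask, then recombine the
// halves into the 64-bit result.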
5674 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: SrcCondCopy).addReg(RegNo: SrcCond);
5675 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstLo)
5676 .addImm(Val: 0)
5677 .add(MO: Src0Sub0)
5678 .addImm(Val: 0)
5679 .add(MO: Src1Sub0)
5680 .addReg(RegNo: SrcCondCopy);
5681 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstHi)
5682 .addImm(Val: 0)
5683 .add(MO: Src0Sub1)
5684 .addImm(Val: 0)
5685 .add(MO: Src1Sub1)
5686 .addReg(RegNo: SrcCondCopy);
5687
5688 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
5689 .addReg(RegNo: DstLo)
5690 .addImm(Val: AMDGPU::sub0)
5691 .addReg(RegNo: DstHi)
5692 .addImm(Val: AMDGPU::sub1);
5693 MI.eraseFromParent();
5694 return BB;
5695 }
5696 case AMDGPU::SI_BR_UNDEF: {
5697 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5698 const DebugLoc &DL = MI.getDebugLoc();
5699 MachineInstr *Br = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
5700 .add(MO: MI.getOperand(i: 0));
5701 Br->getOperand(i: 1).setIsUndef(); // read undef SCC
5702 MI.eraseFromParent();
5703 return BB;
5704 }
5705 case AMDGPU::ADJCALLSTACKUP:
5706 case AMDGPU::ADJCALLSTACKDOWN: {
5707 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5708 MachineInstrBuilder MIB(*MF, &MI);
5709 MIB.addReg(RegNo: Info->getStackPtrOffsetReg(), flags: RegState::ImplicitDefine)
5710 .addReg(RegNo: Info->getStackPtrOffsetReg(), flags: RegState::Implicit);
5711 return BB;
5712 }
5713 case AMDGPU::SI_CALL_ISEL: {
5714 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5715 const DebugLoc &DL = MI.getDebugLoc();
5716
5717 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(MF: *MF);
5718
5719 MachineInstrBuilder MIB;
5720 MIB = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_CALL), DestReg: ReturnAddrReg);
5721
5722 for (const MachineOperand &MO : MI.operands())
5723 MIB.add(MO);
5724
5725 MIB.cloneMemRefs(OtherMI: MI);
5726 MI.eraseFromParent();
5727 return BB;
5728 }
5729 case AMDGPU::V_ADD_CO_U32_e32:
5730 case AMDGPU::V_SUB_CO_U32_e32:
5731 case AMDGPU::V_SUBREV_CO_U32_e32: {
5732 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5733 const DebugLoc &DL = MI.getDebugLoc();
5734 unsigned Opc = MI.getOpcode();
5735
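// If the e32 form has no MC equivalent on this subtarget, switch to the e64
// (VOP3) encoding, which takes an explicit carry-out definition and a clamp
// operand.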
5736 bool NeedClampOperand = false;
5737 if (TII->pseudoToMCOpcode(Opcode: Opc) == -1) {
5738 Opc = AMDGPU::getVOPe64(Opcode: Opc);
5739 NeedClampOperand = true;
5740 }
5741
5742 auto I = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: MI.getOperand(i: 0).getReg());
5743 if (TII->isVOP3(MI: *I)) {
5744 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5745 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5746 I.addReg(RegNo: TRI->getVCC(), flags: RegState::Define);
5747 }
5748 I.add(MO: MI.getOperand(i: 1)).add(MO: MI.getOperand(i: 2));
5749 if (NeedClampOperand)
5750 I.addImm(Val: 0); // clamp bit for e64 encoding
5751
5752 TII->legalizeOperands(MI&: *I);
5753
5754 MI.eraseFromParent();
5755 return BB;
5756 }
5757 case AMDGPU::V_ADDC_U32_e32:
5758 case AMDGPU::V_SUBB_U32_e32:
5759 case AMDGPU::V_SUBBREV_U32_e32:
5760 // These instructions have an implicit use of vcc which counts towards the
5761 // constant bus limit.
5762 TII->legalizeOperands(MI);
5763 return BB;
5764 case AMDGPU::DS_GWS_INIT:
5765 case AMDGPU::DS_GWS_SEMA_BR:
5766 case AMDGPU::DS_GWS_BARRIER:
5767 TII->enforceOperandRCAlignment(MI, OpName: AMDGPU::OpName::data0);
5768 [[fallthrough]];
5769 case AMDGPU::DS_GWS_SEMA_V:
5770 case AMDGPU::DS_GWS_SEMA_P:
5771 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5772 // An s_waitcnt 0 is required to be the instruction immediately following.
5773 if (getSubtarget()->hasGWSAutoReplay()) {
5774 bundleInstWithWaitcnt(MI);
5775 return BB;
5776 }
5777
5778 return emitGWSMemViolTestLoop(MI, BB);
5779 case AMDGPU::S_SETREG_B32: {
5780 // Try to optimize cases that only set the denormal mode or rounding mode.
5781 //
5782 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5783 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5784 // instead.
5785 //
5786 // FIXME: This could be predicated on the immediate, but tablegen doesn't
5787 // allow you to have a no-side-effect instruction in the output of a
5788 // side-effecting pattern.
5789 auto [ID, Offset, Width] =
5790 AMDGPU::Hwreg::HwregEncoding::decode(Encoded: MI.getOperand(i: 1).getImm());
5791 if (ID != AMDGPU::Hwreg::ID_MODE)
5792 return BB;
5793
5794 const unsigned WidthMask = maskTrailingOnes<unsigned>(N: Width);
5795 const unsigned SetMask = WidthMask << Offset;
5796
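// Within the MODE register the FP round bits occupy [3:0] and the FP denorm
// bits occupy [7:4], which is why the immediate is masked with 0xf and shifted
// by 4 when the dedicated instructions are emitted below.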
5797 if (getSubtarget()->hasDenormModeInst()) {
5798 unsigned SetDenormOp = 0;
5799 unsigned SetRoundOp = 0;
5800
5801 // The dedicated instructions can only set the whole denorm or round mode
5802 // at once, not a subset of bits in either.
5803 if (SetMask ==
5804 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5805 // If this fully sets both the round and denorm mode, emit the two
5806 // dedicated instructions for these.
5807 SetRoundOp = AMDGPU::S_ROUND_MODE;
5808 SetDenormOp = AMDGPU::S_DENORM_MODE;
5809 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5810 SetRoundOp = AMDGPU::S_ROUND_MODE;
5811 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5812 SetDenormOp = AMDGPU::S_DENORM_MODE;
5813 }
5814
5815 if (SetRoundOp || SetDenormOp) {
5816 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5817 MachineInstr *Def = MRI.getVRegDef(Reg: MI.getOperand(i: 0).getReg());
5818 if (Def && Def->isMoveImmediate() && Def->getOperand(i: 1).isImm()) {
5819 unsigned ImmVal = Def->getOperand(i: 1).getImm();
5820 if (SetRoundOp) {
5821 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: SetRoundOp))
5822 .addImm(Val: ImmVal & 0xf);
5823
5824 // If we also have the denorm mode, get just the denorm mode bits.
5825 ImmVal >>= 4;
5826 }
5827
5828 if (SetDenormOp) {
5829 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: SetDenormOp))
5830 .addImm(Val: ImmVal & 0xf);
5831 }
5832
5833 MI.eraseFromParent();
5834 return BB;
5835 }
5836 }
5837 }
5838
5839 // If only FP bits are touched, use the no-side-effects pseudo.
5840 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5841 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5842 MI.setDesc(TII->get(Opcode: AMDGPU::S_SETREG_B32_mode));
5843
5844 return BB;
5845 }
5846 case AMDGPU::S_INVERSE_BALLOT_U32:
5847 case AMDGPU::S_INVERSE_BALLOT_U64:
5848 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5849 // necessary. After that they are equivalent to a COPY.
5850 MI.setDesc(TII->get(Opcode: AMDGPU::COPY));
5851 return BB;
5852 case AMDGPU::ENDPGM_TRAP: {
5853 const DebugLoc &DL = MI.getDebugLoc();
5854 if (BB->succ_empty() && std::next(x: MI.getIterator()) == BB->end()) {
5855 MI.setDesc(TII->get(Opcode: AMDGPU::S_ENDPGM));
5856 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0));
5857 return BB;
5858 }
5859
5860 // We need a block split to make the real endpgm a terminator. We also don't
5861 // want to break phis in successor blocks, so we can't just delete to the
5862 // end of the block.
5863
5864 MachineBasicBlock *SplitBB = BB->splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
5865 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
5866 MF->push_back(MBB: TrapBB);
5867 // clang-format off
5868 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ENDPGM))
5869 .addImm(Val: 0);
5870 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
5871 .addMBB(MBB: TrapBB);
5872 // clang-format on
5873
5874 BB->addSuccessor(Succ: TrapBB);
5875 MI.eraseFromParent();
5876 return SplitBB;
5877 }
5878 case AMDGPU::SIMULATED_TRAP: {
5879 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5880 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5881 MachineBasicBlock *SplitBB =
5882 TII->insertSimulatedTrap(MRI, MBB&: *BB, MI, DL: MI.getDebugLoc());
5883 MI.eraseFromParent();
5884 return SplitBB;
5885 }
5886 default:
5887 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5888 if (!MI.mayStore())
5889 AddMemOpInit(MI);
5890 return BB;
5891 }
5892 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, MBB: BB);
5893 }
5894}
5895
5896bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
5897 // This currently forces unfolding various combinations of fsub into fma with
5898 // free fneg'd operands. As long as we have fast FMA (controlled by
5899 // isFMAFasterThanFMulAndFAdd), we should perform these.
5900
5901 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5902 // most of these combines appear to be cycle neutral but save on instruction
5903 // count / code size.
5904 return true;
5905}
5906
5907bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
5908
5909EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
5910 EVT VT) const {
5911 if (!VT.isVector()) {
5912 return MVT::i1;
5913 }
5914 return EVT::getVectorVT(Context&: Ctx, VT: MVT::i1, NumElements: VT.getVectorNumElements());
5915}
5916
5917MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
5918 // TODO: Should i16 be used always if legal? For now it would force VALU
5919 // shifts.
5920 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5921}
5922
5923LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
5924 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5925 ? Ty.changeElementSize(NewEltSize: 16)
5926 : Ty.changeElementSize(NewEltSize: 32);
5927}
5928
5929 // Answering this is somewhat tricky and depends on the specific device, since
5930 // different devices have different rates for fma and for f64 operations.
5931//
5932// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5933// regardless of which device (although the number of cycles differs between
5934// devices), so it is always profitable for f64.
5935//
5936// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5937// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5938// which we can always do even without fused FP ops since it returns the same
5939// result as the separate operations and since it is always full
5940// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5941// however does not support denormals, so we do report fma as faster if we have
5942// a fast fma device and require denormals.
5943//
5944bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5945 EVT VT) const {
5946 VT = VT.getScalarType();
5947
5948 switch (VT.getSimpleVT().SimpleTy) {
5949 case MVT::f32: {
5950 // If mad is not available this depends only on if f32 fma is full rate.
5951 if (!Subtarget->hasMadMacF32Insts())
5952 return Subtarget->hasFastFMAF32();
5953
5954 // Otherwise f32 mad is always full rate and returns the same result as
5955 // the separate operations, so it should be preferred over fma.
5956 // However, it does not support denormals.
5957 if (!denormalModeIsFlushAllF32(MF))
5958 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5959
5960 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5961 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5962 }
5963 case MVT::f64:
5964 return true;
5965 case MVT::f16:
5966 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5967 default:
5968 break;
5969 }
5970
5971 return false;
5972}
5973
5974bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5975 LLT Ty) const {
5976 switch (Ty.getScalarSizeInBits()) {
5977 case 16:
5978 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f16);
5979 case 32:
5980 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f32);
5981 case 64:
5982 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f64);
5983 default:
5984 break;
5985 }
5986
5987 return false;
5988}
5989
5990bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
5991 if (!Ty.isScalar())
5992 return false;
5993
5994 if (Ty.getScalarSizeInBits() == 16)
5995 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(MF: *MI.getMF());
5996 if (Ty.getScalarSizeInBits() == 32)
5997 return Subtarget->hasMadMacF32Insts() &&
5998 denormalModeIsFlushAllF32(MF: *MI.getMF());
5999
6000 return false;
6001}
6002
6003bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
6004 const SDNode *N) const {
6005 // TODO: Check future ftz flag
6006 // v_mad_f32/v_mac_f32 do not support denormals.
6007 EVT VT = N->getValueType(ResNo: 0);
6008 if (VT == MVT::f32)
6009 return Subtarget->hasMadMacF32Insts() &&
6010 denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
6011 if (VT == MVT::f16) {
6012 return Subtarget->hasMadF16() &&
6013 denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction());
6014 }
6015
6016 return false;
6017}
6018
6019//===----------------------------------------------------------------------===//
6020// Custom DAG Lowering Operations
6021//===----------------------------------------------------------------------===//
6022
6023// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6024// wider vector type is legal.
6025SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
6026 SelectionDAG &DAG) const {
6027 unsigned Opc = Op.getOpcode();
6028 EVT VT = Op.getValueType();
6029 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6030 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6031 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6032 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6033
6034 auto [Lo, Hi] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
6035
6036 SDLoc SL(Op);
6037 SDValue OpLo = DAG.getNode(Opcode: Opc, DL: SL, VT: Lo.getValueType(), Operand: Lo, Flags: Op->getFlags());
6038 SDValue OpHi = DAG.getNode(Opcode: Opc, DL: SL, VT: Hi.getValueType(), Operand: Hi, Flags: Op->getFlags());
6039
6040 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
6041}
6042
6043// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6044// wider vector type is legal.
6045SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
6046 SelectionDAG &DAG) const {
6047 unsigned Opc = Op.getOpcode();
6048 EVT VT = Op.getValueType();
6049 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6050 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6051 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6052 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6053 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6054 VT == MVT::v32bf16);
6055
6056 auto [Lo0, Hi0] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
6057 auto [Lo1, Hi1] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 1);
6058
6059 SDLoc SL(Op);
6060
6061 SDValue OpLo =
6062 DAG.getNode(Opcode: Opc, DL: SL, VT: Lo0.getValueType(), N1: Lo0, N2: Lo1, Flags: Op->getFlags());
6063 SDValue OpHi =
6064 DAG.getNode(Opcode: Opc, DL: SL, VT: Hi0.getValueType(), N1: Hi0, N2: Hi1, Flags: Op->getFlags());
6065
6066 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
6067}
6068
6069SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6070 SelectionDAG &DAG) const {
6071 unsigned Opc = Op.getOpcode();
6072 EVT VT = Op.getValueType();
6073 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6074 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6075 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6076 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6077 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6078 VT == MVT::v32bf16);
6079
6080 SDValue Op0 = Op.getOperand(i: 0);
6081 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6082 ? DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0)
6083 : std::pair(Op0, Op0);
6084
6085 auto [Lo1, Hi1] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 1);
6086 auto [Lo2, Hi2] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 2);
6087
6088 SDLoc SL(Op);
6089 auto ResVT = DAG.GetSplitDestVTs(VT);
6090
6091 SDValue OpLo =
6092 DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.first, N1: Lo0, N2: Lo1, N3: Lo2, Flags: Op->getFlags());
6093 SDValue OpHi =
6094 DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.second, N1: Hi0, N2: Hi1, N3: Hi2, Flags: Op->getFlags());
6095
6096 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
6097}
6098
6099SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6100 switch (Op.getOpcode()) {
6101 default:
6102 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6103 case ISD::BRCOND:
6104 return LowerBRCOND(Op, DAG);
6105 case ISD::RETURNADDR:
6106 return LowerRETURNADDR(Op, DAG);
6107 case ISD::LOAD: {
6108 SDValue Result = LowerLOAD(Op, DAG);
6109 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6110 "Load should return a value and a chain");
6111 return Result;
6112 }
6113 case ISD::FSQRT: {
6114 EVT VT = Op.getValueType();
6115 if (VT == MVT::f32)
6116 return lowerFSQRTF32(Op, DAG);
6117 if (VT == MVT::f64)
6118 return lowerFSQRTF64(Op, DAG);
6119 return SDValue();
6120 }
6121 case ISD::FSIN:
6122 case ISD::FCOS:
6123 return LowerTrig(Op, DAG);
6124 case ISD::SELECT:
6125 return LowerSELECT(Op, DAG);
6126 case ISD::FDIV:
6127 return LowerFDIV(Op, DAG);
6128 case ISD::FFREXP:
6129 return LowerFFREXP(Op, DAG);
6130 case ISD::ATOMIC_CMP_SWAP:
6131 return LowerATOMIC_CMP_SWAP(Op, DAG);
6132 case ISD::STORE:
6133 return LowerSTORE(Op, DAG);
6134 case ISD::GlobalAddress: {
6135 MachineFunction &MF = DAG.getMachineFunction();
6136 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6137 return LowerGlobalAddress(MFI, Op, DAG);
6138 }
6139 case ISD::INTRINSIC_WO_CHAIN:
6140 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6141 case ISD::INTRINSIC_W_CHAIN:
6142 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6143 case ISD::INTRINSIC_VOID:
6144 return LowerINTRINSIC_VOID(Op, DAG);
6145 case ISD::ADDRSPACECAST:
6146 return lowerADDRSPACECAST(Op, DAG);
6147 case ISD::INSERT_SUBVECTOR:
6148 return lowerINSERT_SUBVECTOR(Op, DAG);
6149 case ISD::INSERT_VECTOR_ELT:
6150 return lowerINSERT_VECTOR_ELT(Op, DAG);
6151 case ISD::EXTRACT_VECTOR_ELT:
6152 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6153 case ISD::VECTOR_SHUFFLE:
6154 return lowerVECTOR_SHUFFLE(Op, DAG);
6155 case ISD::SCALAR_TO_VECTOR:
6156 return lowerSCALAR_TO_VECTOR(Op, DAG);
6157 case ISD::BUILD_VECTOR:
6158 return lowerBUILD_VECTOR(Op, DAG);
6159 case ISD::FP_ROUND:
6160 case ISD::STRICT_FP_ROUND:
6161 return lowerFP_ROUND(Op, DAG);
6162 case ISD::TRAP:
6163 return lowerTRAP(Op, DAG);
6164 case ISD::DEBUGTRAP:
6165 return lowerDEBUGTRAP(Op, DAG);
6166 case ISD::ABS:
6167 case ISD::FABS:
6168 case ISD::FNEG:
6169 case ISD::FCANONICALIZE:
6170 case ISD::BSWAP:
6171 return splitUnaryVectorOp(Op, DAG);
6172 case ISD::FMINNUM:
6173 case ISD::FMAXNUM:
6174 return lowerFMINNUM_FMAXNUM(Op, DAG);
6175 case ISD::FMINIMUMNUM:
6176 case ISD::FMAXIMUMNUM:
6177 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6178 case ISD::FMINIMUM:
6179 case ISD::FMAXIMUM:
6180 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6181 case ISD::FLDEXP:
6182 case ISD::STRICT_FLDEXP:
6183 return lowerFLDEXP(Op, DAG);
6184 case ISD::FMA:
6185 return splitTernaryVectorOp(Op, DAG);
6186 case ISD::FP_TO_SINT:
6187 case ISD::FP_TO_UINT:
6188 return LowerFP_TO_INT(Op, DAG);
6189 case ISD::SHL:
6190 case ISD::SRA:
6191 case ISD::SRL:
6192 case ISD::ADD:
6193 case ISD::SUB:
6194 case ISD::SMIN:
6195 case ISD::SMAX:
6196 case ISD::UMIN:
6197 case ISD::UMAX:
6198 case ISD::FADD:
6199 case ISD::FMUL:
6200 case ISD::FMINNUM_IEEE:
6201 case ISD::FMAXNUM_IEEE:
6202 case ISD::UADDSAT:
6203 case ISD::USUBSAT:
6204 case ISD::SADDSAT:
6205 case ISD::SSUBSAT:
6206 return splitBinaryVectorOp(Op, DAG);
6207 case ISD::FCOPYSIGN:
6208 return lowerFCOPYSIGN(Op, DAG);
6209 case ISD::MUL:
6210 return lowerMUL(Op, DAG);
6211 case ISD::SMULO:
6212 case ISD::UMULO:
6213 return lowerXMULO(Op, DAG);
6214 case ISD::SMUL_LOHI:
6215 case ISD::UMUL_LOHI:
6216 return lowerXMUL_LOHI(Op, DAG);
6217 case ISD::DYNAMIC_STACKALLOC:
6218 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6219 case ISD::STACKSAVE:
6220 return LowerSTACKSAVE(Op, DAG);
6221 case ISD::GET_ROUNDING:
6222 return lowerGET_ROUNDING(Op, DAG);
6223 case ISD::SET_ROUNDING:
6224 return lowerSET_ROUNDING(Op, DAG);
6225 case ISD::PREFETCH:
6226 return lowerPREFETCH(Op, DAG);
6227 case ISD::FP_EXTEND:
6228 case ISD::STRICT_FP_EXTEND:
6229 return lowerFP_EXTEND(Op, DAG);
6230 case ISD::GET_FPENV:
6231 return lowerGET_FPENV(Op, DAG);
6232 case ISD::SET_FPENV:
6233 return lowerSET_FPENV(Op, DAG);
6234 }
6235 return SDValue();
6236}
6237
6238 // Used for D16: Casts the result of an instruction into the right vector and
6239 // packs the values if the load returned unpacked values.
6240static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
6241 const SDLoc &DL, SelectionDAG &DAG,
6242 bool Unpacked) {
6243 if (!LoadVT.isVector())
6244 return Result;
6245
6246 // Cast back to the original packed type or to a larger type that is a
6247 // multiple of 32 bits for D16. Widening the return type is required for
6248 // legalization.
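// For example, an illegal v3f16 result is widened to v4f16 here; on unpacked
// subtargets the raw i32 elements are truncated to i16, padded, and bitcast
// back below.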
6249 EVT FittingLoadVT = LoadVT;
6250 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6251 FittingLoadVT =
6252 EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(),
6253 NumElements: LoadVT.getVectorNumElements() + 1);
6254 }
6255
6256 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6257 // Truncate to v2i16/v4i16.
6258 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6259
6260 // Work around the legalizer not scalarizing truncate after vector op
6261 // legalization by not creating an intermediate vector trunc.
6262 SmallVector<SDValue, 4> Elts;
6263 DAG.ExtractVectorElements(Op: Result, Args&: Elts);
6264 for (SDValue &Elt : Elts)
6265 Elt = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Elt);
6266
6267 // Pad illegal v1i16/v3f16 to v4i16
6268 if ((LoadVT.getVectorNumElements() % 2) == 1)
6269 Elts.push_back(Elt: DAG.getPOISON(VT: MVT::i16));
6270
6271 Result = DAG.getBuildVector(VT: IntLoadVT, DL, Ops: Elts);
6272
6273 // Bitcast to original type (v2f16/v4f16).
6274 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result);
6275 }
6276
6277 // Cast back to the original packed type.
6278 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result);
6279}
6280
6281SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6282 SelectionDAG &DAG,
6283 ArrayRef<SDValue> Ops,
6284 bool IsIntrinsic) const {
6285 SDLoc DL(M);
6286
6287 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6288 EVT LoadVT = M->getValueType(ResNo: 0);
6289
6290 EVT EquivLoadVT = LoadVT;
6291 if (LoadVT.isVector()) {
6292 if (Unpacked) {
6293 EquivLoadVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
6294 NumElements: LoadVT.getVectorNumElements());
6295 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6296 // Widen v3f16 to legal type
6297 EquivLoadVT =
6298 EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(),
6299 NumElements: LoadVT.getVectorNumElements() + 1);
6300 }
6301 }
6302
6303 // Change from v4f16/v2f16 to EquivLoadVT.
6304 SDVTList VTList = DAG.getVTList(VT1: EquivLoadVT, VT2: MVT::Other);
6305
6306 SDValue Load = DAG.getMemIntrinsicNode(
6307 Opcode: IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, dl: DL, VTList, Ops,
6308 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
6309
6310 SDValue Adjusted = adjustLoadValueTypeImpl(Result: Load, LoadVT, DL, DAG, Unpacked);
6311
6312 return DAG.getMergeValues(Ops: {Adjusted, Load.getValue(R: 1)}, dl: DL);
6313}
6314
6315SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6316 SelectionDAG &DAG,
6317 ArrayRef<SDValue> Ops) const {
6318 SDLoc DL(M);
6319 EVT LoadVT = M->getValueType(ResNo: 0);
6320 EVT EltType = LoadVT.getScalarType();
6321 EVT IntVT = LoadVT.changeTypeToInteger();
6322
6323 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6324
6325 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6326 bool IsTFE = M->getNumValues() == 3;
6327
6328 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6329 : AMDGPUISD::BUFFER_LOAD_FORMAT)
6330 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6331 : AMDGPUISD::BUFFER_LOAD;
6332
6333 if (IsD16) {
6334 return adjustLoadValueType(Opcode: AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6335 }
6336
6337 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6338 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6339 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, MMO: M->getMemOperand(),
6340 IsTFE);
6341
6342 if (isTypeLegal(VT: LoadVT)) {
6343 return getMemIntrinsicNode(Opcode: Opc, DL, VTList: M->getVTList(), Ops, MemVT: IntVT,
6344 MMO: M->getMemOperand(), DAG);
6345 }
6346
6347 EVT CastVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: LoadVT);
6348 SDVTList VTList = DAG.getVTList(VT1: CastVT, VT2: MVT::Other);
6349 SDValue MemNode = getMemIntrinsicNode(Opcode: Opc, DL, VTList, Ops, MemVT: CastVT,
6350 MMO: M->getMemOperand(), DAG);
6351 return DAG.getMergeValues(
6352 Ops: {DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: MemNode), MemNode.getValue(R: 1)},
6353 dl: DL);
6354}
6355
6356static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
6357 SelectionDAG &DAG) {
6358 EVT VT = N->getValueType(ResNo: 0);
6359 unsigned CondCode = N->getConstantOperandVal(Num: 3);
6360 if (!ICmpInst::isIntPredicate(P: static_cast<ICmpInst::Predicate>(CondCode)))
6361 return DAG.getPOISON(VT);
6362
6363 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6364
6365 SDValue LHS = N->getOperand(Num: 1);
6366 SDValue RHS = N->getOperand(Num: 2);
6367
6368 SDLoc DL(N);
6369
6370 EVT CmpVT = LHS.getValueType();
6371 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(VT: MVT::i16)) {
6372 unsigned PromoteOp =
6373 ICmpInst::isSigned(predicate: IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6374 LHS = DAG.getNode(Opcode: PromoteOp, DL, VT: MVT::i32, Operand: LHS);
6375 RHS = DAG.getNode(Opcode: PromoteOp, DL, VT: MVT::i32, Operand: RHS);
6376 }
6377
6378 ISD::CondCode CCOpcode = getICmpCondCode(Pred: IcInput);
6379
6380 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6381 EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize);
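// The compare result is a lane mask, i.e. i32 for wave32 and i64 for wave64.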
6382
6383 SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL, VT: CCVT, N1: LHS, N2: RHS,
6384 N3: DAG.getCondCode(Cond: CCOpcode));
6385 if (VT.bitsEq(VT: CCVT))
6386 return SetCC;
6387 return DAG.getZExtOrTrunc(Op: SetCC, DL, VT);
6388}
6389
6390static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
6391 SelectionDAG &DAG) {
6392 EVT VT = N->getValueType(ResNo: 0);
6393
6394 unsigned CondCode = N->getConstantOperandVal(Num: 3);
6395 if (!FCmpInst::isFPPredicate(P: static_cast<FCmpInst::Predicate>(CondCode)))
6396 return DAG.getPOISON(VT);
6397
6398 SDValue Src0 = N->getOperand(Num: 1);
6399 SDValue Src1 = N->getOperand(Num: 2);
6400 EVT CmpVT = Src0.getValueType();
6401 SDLoc SL(N);
6402
6403 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(VT: CmpVT)) {
6404 Src0 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src0);
6405 Src1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src1);
6406 }
6407
6408 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6409 ISD::CondCode CCOpcode = getFCmpCondCode(Pred: IcInput);
6410 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6411 EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize);
6412 SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT: CCVT, N1: Src0, N2: Src1,
6413 N3: DAG.getCondCode(Cond: CCOpcode));
6414 if (VT.bitsEq(VT: CCVT))
6415 return SetCC;
6416 return DAG.getZExtOrTrunc(Op: SetCC, DL: SL, VT);
6417}
6418
6419static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
6420 SelectionDAG &DAG) {
6421 EVT VT = N->getValueType(ResNo: 0);
6422 SDValue Src = N->getOperand(Num: 1);
6423 SDLoc SL(N);
6424
6425 if (Src.getOpcode() == ISD::SETCC) {
6426 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6427 return DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT, N1: Src.getOperand(i: 0),
6428 N2: Src.getOperand(i: 1), N3: Src.getOperand(i: 2));
6429 }
6430 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Val&: Src)) {
6431 // (ballot 0) -> 0
6432 if (Arg->isZero())
6433 return DAG.getConstant(Val: 0, DL: SL, VT);
6434
6435 // (ballot 1) -> EXEC/EXEC_LO
6436 if (Arg->isOne()) {
6437 Register Exec;
6438 if (VT.getScalarSizeInBits() == 32)
6439 Exec = AMDGPU::EXEC_LO;
6440 else if (VT.getScalarSizeInBits() == 64)
6441 Exec = AMDGPU::EXEC;
6442 else
6443 return SDValue();
6444
6445 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: Exec, VT);
6446 }
6447 }
6448
6449 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6450 // ISD::SETNE)
6451 return DAG.getNode(
6452 Opcode: AMDGPUISD::SETCC, DL: SL, VT, N1: DAG.getZExtOrTrunc(Op: Src, DL: SL, VT: MVT::i32),
6453 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), N3: DAG.getCondCode(Cond: ISD::SETNE));
6454}
6455
6456static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
6457 SelectionDAG &DAG) {
6458 EVT VT = N->getValueType(ResNo: 0);
6459 unsigned ValSize = VT.getSizeInBits();
6460 unsigned IID = N->getConstantOperandVal(Num: 0);
6461 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6462 IID == Intrinsic::amdgcn_permlanex16;
6463 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6464 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6465 SDLoc SL(N);
6466 MVT IntVT = MVT::getIntegerVT(BitWidth: ValSize);
6467 const GCNSubtarget *ST = TLI.getSubtarget();
6468 unsigned SplitSize = 32;
6469 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6470 ST->hasDPALU_DPP() &&
6471 AMDGPU::isLegalDPALU_DPPControl(DC: N->getConstantOperandVal(Num: 3)))
6472 SplitSize = 64;
6473
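// Lane ops are legal on 32-bit values (or 64-bit values with DPALU DPP);
// smaller types are extended to i32 and wider types are split into SplitSize
// pieces below.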
6474 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6475 SDValue Src2, MVT ValT) -> SDValue {
6476 SmallVector<SDValue, 8> Operands;
6477 switch (IID) {
6478 case Intrinsic::amdgcn_permlane16:
6479 case Intrinsic::amdgcn_permlanex16:
6480 case Intrinsic::amdgcn_update_dpp:
6481 Operands.push_back(Elt: N->getOperand(Num: 6));
6482 Operands.push_back(Elt: N->getOperand(Num: 5));
6483 Operands.push_back(Elt: N->getOperand(Num: 4));
6484 [[fallthrough]];
6485 case Intrinsic::amdgcn_writelane:
6486 Operands.push_back(Elt: Src2);
6487 [[fallthrough]];
6488 case Intrinsic::amdgcn_readlane:
6489 case Intrinsic::amdgcn_set_inactive:
6490 case Intrinsic::amdgcn_set_inactive_chain_arg:
6491 case Intrinsic::amdgcn_mov_dpp8:
6492 Operands.push_back(Elt: Src1);
6493 [[fallthrough]];
6494 case Intrinsic::amdgcn_readfirstlane:
6495 case Intrinsic::amdgcn_permlane64:
6496 Operands.push_back(Elt: Src0);
6497 break;
6498 default:
6499 llvm_unreachable("unhandled lane op");
6500 }
6501
6502 Operands.push_back(Elt: DAG.getTargetConstant(Val: IID, DL: SL, VT: MVT::i32));
6503 std::reverse(first: Operands.begin(), last: Operands.end());
6504
6505 if (SDNode *GL = N->getGluedNode()) {
6506 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6507 GL = GL->getOperand(Num: 0).getNode();
6508 Operands.push_back(Elt: DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL: SL, VT: MVT::Glue,
6509 Operand: SDValue(GL, 0)));
6510 }
6511
6512 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: ValT, Ops: Operands);
6513 };
6514
6515 SDValue Src0 = N->getOperand(Num: 1);
6516 SDValue Src1, Src2;
6517 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6518 IID == Intrinsic::amdgcn_mov_dpp8 ||
6519 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6520 Src1 = N->getOperand(Num: 2);
6521 if (IID == Intrinsic::amdgcn_writelane ||
6522 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6523 Src2 = N->getOperand(Num: 3);
6524 }
6525
6526 if (ValSize == SplitSize) {
6527 // Already legal
6528 return SDValue();
6529 }
6530
6531 if (ValSize < 32) {
6532 bool IsFloat = VT.isFloatingPoint();
6533 Src0 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src0) : Src0,
6534 DL: SL, VT: MVT::i32);
6535
6536 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6537 Src1 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src1) : Src1,
6538 DL: SL, VT: MVT::i32);
6539 }
6540
6541 if (IID == Intrinsic::amdgcn_writelane) {
6542 Src2 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src2) : Src2,
6543 DL: SL, VT: MVT::i32);
6544 }
6545
6546 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6547 SDValue Trunc = DAG.getAnyExtOrTrunc(Op: LaneOp, DL: SL, VT: IntVT);
6548 return IsFloat ? DAG.getBitcast(VT, V: Trunc) : Trunc;
6549 }
6550
6551 if (ValSize % SplitSize != 0)
6552 return SDValue();
6553
6554 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6555 EVT VT = N->getValueType(ResNo: 0);
6556 unsigned NE = VT.getVectorNumElements();
6557 EVT EltVT = VT.getVectorElementType();
6558 SmallVector<SDValue, 8> Scalars;
6559 unsigned NumOperands = N->getNumOperands();
6560 SmallVector<SDValue, 4> Operands(NumOperands);
6561 SDNode *GL = N->getGluedNode();
6562
6563 // only handle convergencectrl_glue
6564 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6565
6566 for (unsigned i = 0; i != NE; ++i) {
6567 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6568 ++j) {
6569 SDValue Operand = N->getOperand(Num: j);
6570 EVT OperandVT = Operand.getValueType();
6571 if (OperandVT.isVector()) {
6572 // A vector operand; extract a single element.
6573 EVT OperandEltVT = OperandVT.getVectorElementType();
6574 Operands[j] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: OperandEltVT,
6575 N1: Operand, N2: DAG.getVectorIdxConstant(Val: i, DL: SL));
6576 } else {
6577 // A scalar operand; just use it as is.
6578 Operands[j] = Operand;
6579 }
6580 }
6581
6582 if (GL)
6583 Operands[NumOperands - 1] =
6584 DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL: SL, VT: MVT::Glue,
6585 Operand: SDValue(GL->getOperand(Num: 0).getNode(), 0));
6586
6587 Scalars.push_back(Elt: DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: EltVT, Ops: Operands));
6588 }
6589
6590 EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NE);
6591 return DAG.getBuildVector(VT: VecVT, DL: SL, Ops: Scalars);
6592 };
6593
6594 if (VT.isVector()) {
6595 switch (MVT::SimpleValueType EltTy =
6596 VT.getVectorElementType().getSimpleVT().SimpleTy) {
6597 case MVT::i32:
6598 case MVT::f32:
6599 if (SplitSize == 32) {
6600 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6601 return unrollLaneOp(LaneOp.getNode());
6602 }
6603 [[fallthrough]];
6604 case MVT::i16:
6605 case MVT::f16:
6606 case MVT::bf16: {
6607 unsigned SubVecNumElt =
6608 SplitSize / VT.getVectorElementType().getSizeInBits();
6609 MVT SubVecVT = MVT::getVectorVT(VT: EltTy, NumElements: SubVecNumElt);
6610 SmallVector<SDValue, 4> Pieces;
6611 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6612 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
6613 Src0SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src0,
6614 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
6615
6616 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
6617 IsPermLane16)
6618 Src1SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src1,
6619 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
6620
6621 if (IID == Intrinsic::amdgcn_writelane)
6622 Src2SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src2,
6623 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
6624
6625 Pieces.push_back(
6626 Elt: IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
6627 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6628 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6629 EltIdx += SubVecNumElt;
6630 }
6631 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT, Ops: Pieces);
6632 }
6633 default:
6634 // Handle all other cases by bitcasting to i32 vectors
6635 break;
6636 }
6637 }
6638
6639 MVT VecVT =
6640 MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: SplitSize), NumElements: ValSize / SplitSize);
6641 Src0 = DAG.getBitcast(VT: VecVT, V: Src0);
6642
6643 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6644 Src1 = DAG.getBitcast(VT: VecVT, V: Src1);
6645
6646 if (IID == Intrinsic::amdgcn_writelane)
6647 Src2 = DAG.getBitcast(VT: VecVT, V: Src2);
6648
6649 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6650 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6651 return DAG.getBitcast(VT, V: UnrolledLaneOp);
6652}
6653
6654void SITargetLowering::ReplaceNodeResults(SDNode *N,
6655 SmallVectorImpl<SDValue> &Results,
6656 SelectionDAG &DAG) const {
6657 switch (N->getOpcode()) {
6658 case ISD::INSERT_VECTOR_ELT: {
6659 if (SDValue Res = lowerINSERT_VECTOR_ELT(Op: SDValue(N, 0), DAG))
6660 Results.push_back(Elt: Res);
6661 return;
6662 }
6663 case ISD::EXTRACT_VECTOR_ELT: {
6664 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(Op: SDValue(N, 0), DAG))
6665 Results.push_back(Elt: Res);
6666 return;
6667 }
6668 case ISD::INTRINSIC_WO_CHAIN: {
6669 unsigned IID = N->getConstantOperandVal(Num: 0);
6670 switch (IID) {
6671 case Intrinsic::amdgcn_make_buffer_rsrc:
6672 Results.push_back(Elt: lowerPointerAsRsrcIntrin(Op: N, DAG));
6673 return;
6674 case Intrinsic::amdgcn_cvt_pkrtz: {
6675 SDValue Src0 = N->getOperand(Num: 1);
6676 SDValue Src1 = N->getOperand(Num: 2);
6677 SDLoc SL(N);
6678 SDValue Cvt =
6679 DAG.getNode(Opcode: AMDGPUISD::CVT_PKRTZ_F16_F32, DL: SL, VT: MVT::i32, N1: Src0, N2: Src1);
6680 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Cvt));
6681 return;
6682 }
6683 case Intrinsic::amdgcn_cvt_pknorm_i16:
6684 case Intrinsic::amdgcn_cvt_pknorm_u16:
6685 case Intrinsic::amdgcn_cvt_pk_i16:
6686 case Intrinsic::amdgcn_cvt_pk_u16: {
6687 SDValue Src0 = N->getOperand(Num: 1);
6688 SDValue Src1 = N->getOperand(Num: 2);
6689 SDLoc SL(N);
6690 unsigned Opcode;
6691
6692 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6693 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
6694 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6695 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
6696 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6697 Opcode = AMDGPUISD::CVT_PK_I16_I32;
6698 else
6699 Opcode = AMDGPUISD::CVT_PK_U16_U32;
6700
6701 EVT VT = N->getValueType(ResNo: 0);
6702 if (isTypeLegal(VT))
6703 Results.push_back(Elt: DAG.getNode(Opcode, DL: SL, VT, N1: Src0, N2: Src1));
6704 else {
6705 SDValue Cvt = DAG.getNode(Opcode, DL: SL, VT: MVT::i32, N1: Src0, N2: Src1);
6706 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: Cvt));
6707 }
6708 return;
6709 }
6710 case Intrinsic::amdgcn_s_buffer_load: {
6711 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6712 // s_buffer_load_u8 for both signed and unsigned load instructions. Next, the
6713 // DAG combiner tries to merge the s_buffer_load_u8 with a sext instruction
6714 // (performSignExtendInRegCombine()) and replaces s_buffer_load_u8 with
6715 // s_buffer_load_i8.
6716 if (!Subtarget->hasScalarSubwordLoads())
6717 return;
6718 SDValue Op = SDValue(N, 0);
6719 SDValue Rsrc = Op.getOperand(i: 1);
6720 SDValue Offset = Op.getOperand(i: 2);
6721 SDValue CachePolicy = Op.getOperand(i: 3);
6722 EVT VT = Op.getValueType();
6723 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6724 SDLoc DL(Op);
6725 MachineFunction &MF = DAG.getMachineFunction();
6726 const DataLayout &DataLayout = DAG.getDataLayout();
6727 Align Alignment =
6728 DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext()));
6729 MachineMemOperand *MMO = MF.getMachineMemOperand(
6730 PtrInfo: MachinePointerInfo(),
6731 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6732 MachineMemOperand::MOInvariant,
6733 Size: VT.getStoreSize(), BaseAlignment: Alignment);
6734 SDValue LoadVal;
6735 if (!Offset->isDivergent()) {
6736 SDValue Ops[] = {Rsrc, // source register
6737 Offset, CachePolicy};
6738 SDValue BufferLoad =
6739 DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD_UBYTE, dl: DL,
6740 VTList: DAG.getVTList(VT: MVT::i32), Ops, MemVT: VT, MMO);
6741 LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
6742 } else {
6743 SDValue Ops[] = {
6744 DAG.getEntryNode(), // Chain
6745 Rsrc, // rsrc
6746 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
6747 {}, // voffset
6748 {}, // soffset
6749 {}, // offset
6750 CachePolicy, // cachepolicy
6751 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
6752 };
6753 setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], Alignment: Align(4));
6754 LoadVal = handleByteShortBufferLoads(DAG, LoadVT: VT, DL, Ops, MMO);
6755 }
6756 Results.push_back(Elt: LoadVal);
6757 return;
6758 }
6759 case Intrinsic::amdgcn_dead: {
6760 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
6761 Results.push_back(Elt: DAG.getPOISON(VT: N->getValueType(ResNo: I)));
6762 return;
6763 }
6764 }
6765 break;
6766 }
6767 case ISD::INTRINSIC_W_CHAIN: {
6768 if (SDValue Res = LowerINTRINSIC_W_CHAIN(Op: SDValue(N, 0), DAG)) {
6769 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6770 // FIXME: Hacky
6771 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6772 Results.push_back(Elt: Res.getOperand(i: I));
6773 }
6774 } else {
6775 Results.push_back(Elt: Res);
6776 Results.push_back(Elt: Res.getValue(R: 1));
6777 }
6778 return;
6779 }
6780
6781 break;
6782 }
6783 case ISD::SELECT: {
6784 SDLoc SL(N);
6785 EVT VT = N->getValueType(ResNo: 0);
6786 EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT);
6787 SDValue LHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: 1));
6788 SDValue RHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: 2));
6789
6790 EVT SelectVT = NewVT;
6791 if (NewVT.bitsLT(VT: MVT::i32)) {
6792 LHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: LHS);
6793 RHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: RHS);
6794 SelectVT = MVT::i32;
6795 }
6796
6797 SDValue NewSelect =
6798 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: SelectVT, N1: N->getOperand(Num: 0), N2: LHS, N3: RHS);
6799
6800 if (NewVT != SelectVT)
6801 NewSelect = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: NewVT, Operand: NewSelect);
6802 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewSelect));
6803 return;
6804 }
6805 case ISD::FNEG: {
6806 if (N->getValueType(ResNo: 0) != MVT::v2f16)
6807 break;
6808
6809 SDLoc SL(N);
6810 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: N->getOperand(Num: 0));
6811
6812 SDValue Op = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: BC,
6813 N2: DAG.getConstant(Val: 0x80008000, DL: SL, VT: MVT::i32));
6814 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Op));
6815 return;
6816 }
6817 case ISD::FABS: {
6818 if (N->getValueType(ResNo: 0) != MVT::v2f16)
6819 break;
6820
6821 SDLoc SL(N);
6822 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: N->getOperand(Num: 0));
6823
6824 SDValue Op = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: BC,
6825 N2: DAG.getConstant(Val: 0x7fff7fff, DL: SL, VT: MVT::i32));
6826 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Op));
6827 return;
6828 }
6829 case ISD::FSQRT: {
6830 if (N->getValueType(ResNo: 0) != MVT::f16)
6831 break;
6832 Results.push_back(Elt: lowerFSQRTF16(Op: SDValue(N, 0), DAG));
6833 break;
6834 }
6835 default:
6836 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
6837 break;
6838 }
6839}
6840
6841/// Helper function for LowerBRCOND
6842static SDNode *findUser(SDValue Value, unsigned Opcode) {
6843
6844 for (SDUse &U : Value->uses()) {
6845 if (U.get() != Value)
6846 continue;
6847
6848 if (U.getUser()->getOpcode() == Opcode)
6849 return U.getUser();
6850 }
6851 return nullptr;
6852}
6853
6854unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6855 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6856 switch (Intr->getConstantOperandVal(Num: 1)) {
6857 case Intrinsic::amdgcn_if:
6858 return AMDGPUISD::IF;
6859 case Intrinsic::amdgcn_else:
6860 return AMDGPUISD::ELSE;
6861 case Intrinsic::amdgcn_loop:
6862 return AMDGPUISD::LOOP;
6863 case Intrinsic::amdgcn_end_cf:
6864 llvm_unreachable("should not occur");
6865 default:
6866 return 0;
6867 }
6868 }
6869
6870 // break, if_break, else_break are all only used as inputs to loop, not
6871 // directly as branch conditions.
6872 return 0;
6873}
6874
6875bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
6876 const Triple &TT = getTargetMachine().getTargetTriple();
6877 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
6878 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
6879 AMDGPU::shouldEmitConstantsToTextSection(TT);
6880}
6881
6882bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
6883 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6884 return false;
6885
6886 // FIXME: Either avoid relying on address space here or change the default
6887 // address space for functions to avoid the explicit check.
6888 return (GV->getValueType()->isFunctionTy() ||
6889 !isNonGlobalAddrSpace(AS: GV->getAddressSpace())) &&
6890 !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
6891}
6892
6893bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
6894 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6895}
6896
6897bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
6898 if (!GV->hasExternalLinkage())
6899 return true;
6900
6901 const auto OS = getTargetMachine().getTargetTriple().getOS();
6902 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6903}
6904
6905/// This transforms the control flow intrinsics to get the branch destination as
6906 /// the last parameter; it also switches the branch target with BR if the need arises.
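/// For example, (brcond (setcc (llvm.amdgcn.if ...), 1, ne), %bb) becomes an
/// AMDGPUISD::IF node with %bb appended as its last operand, and uses of the
/// original intrinsic's results are rewired to the new node.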
6907SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
6908 SDLoc DL(BRCOND);
6909
6910 SDNode *Intr = BRCOND.getOperand(i: 1).getNode();
6911 SDValue Target = BRCOND.getOperand(i: 2);
6912 SDNode *BR = nullptr;
6913 SDNode *SetCC = nullptr;
6914
6915 if (Intr->getOpcode() == ISD::SETCC) {
6916     // As long as we negate the condition, everything is fine.
6917 SetCC = Intr;
6918 Intr = SetCC->getOperand(Num: 0).getNode();
6919
6920 } else {
6921 // Get the target from BR if we don't negate the condition
6922 BR = findUser(Value: BRCOND, Opcode: ISD::BR);
6923 assert(BR && "brcond missing unconditional branch user");
6924 Target = BR->getOperand(Num: 1);
6925 }
6926
6927 unsigned CFNode = isCFIntrinsic(Intr);
6928 if (CFNode == 0) {
6929 // This is a uniform branch so we don't need to legalize.
6930 return BRCOND;
6931 }
6932
6933 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6934 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6935
6936 assert(!SetCC ||
6937 (SetCC->getConstantOperandVal(1) == 1 &&
6938 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6939 ISD::SETNE));
6940
6941 // operands of the new intrinsic call
6942 SmallVector<SDValue, 4> Ops;
6943 if (HaveChain)
6944 Ops.push_back(Elt: BRCOND.getOperand(i: 0));
6945
6946 Ops.append(in_start: Intr->op_begin() + (HaveChain ? 2 : 1), in_end: Intr->op_end());
6947 Ops.push_back(Elt: Target);
6948
6949 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6950
6951 // build the new intrinsic call
6952 SDNode *Result = DAG.getNode(Opcode: CFNode, DL, VTList: DAG.getVTList(VTs: Res), Ops).getNode();
6953
6954 if (!HaveChain) {
6955 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(i: 0)};
6956
6957 Result = DAG.getMergeValues(Ops, dl: DL).getNode();
6958 }
6959
6960 if (BR) {
6961 // Give the branch instruction our target
6962 SDValue Ops[] = {BR->getOperand(Num: 0), BRCOND.getOperand(i: 2)};
6963 SDValue NewBR = DAG.getNode(Opcode: ISD::BR, DL, VTList: BR->getVTList(), Ops);
6964 DAG.ReplaceAllUsesWith(From: BR, To: NewBR.getNode());
6965 }
6966
6967 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6968
6969 // Copy the intrinsic results to registers
6970 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6971 SDNode *CopyToReg = findUser(Value: SDValue(Intr, i), Opcode: ISD::CopyToReg);
6972 if (!CopyToReg)
6973 continue;
6974
6975 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: CopyToReg->getOperand(Num: 1),
6976 N: SDValue(Result, i - 1), Glue: SDValue());
6977
6978 DAG.ReplaceAllUsesWith(From: SDValue(CopyToReg, 0), To: CopyToReg->getOperand(Num: 0));
6979 }
6980
6981 // Remove the old intrinsic from the chain
6982 DAG.ReplaceAllUsesOfValueWith(From: SDValue(Intr, Intr->getNumValues() - 1),
6983 To: Intr->getOperand(Num: 0));
6984
6985 return Chain;
6986}
6987
6988SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
6989 MVT VT = Op.getSimpleValueType();
6990 SDLoc DL(Op);
6991   // Check the depth: only depth 0 (the current function) is supported.
6992 if (Op.getConstantOperandVal(i: 0) != 0)
6993 return DAG.getConstant(Val: 0, DL, VT);
6994
6995 MachineFunction &MF = DAG.getMachineFunction();
6996 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6997 // Check for kernel and shader functions
6998 if (Info->isEntryFunction())
6999 return DAG.getConstant(Val: 0, DL, VT);
7000
7001 MachineFrameInfo &MFI = MF.getFrameInfo();
7002 // There is a call to @llvm.returnaddress in this function
7003 MFI.setReturnAddressIsTaken(true);
7004
7005 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7006 // Get the return address reg and mark it as an implicit live-in
7007 Register Reg = MF.addLiveIn(PReg: TRI->getReturnAddressReg(MF),
7008 RC: getRegClassFor(VT, isDivergent: Op.getNode()->isDivergent()));
7009
7010 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg, VT);
7011}
7012
7013SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7014 const SDLoc &DL, EVT VT) const {
7015 return Op.getValueType().bitsLE(VT)
7016 ? DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT, Operand: Op)
7017 : DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Op,
7018 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
7019}
7020
7021SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7022 SelectionDAG &DAG) const {
7023 EVT DstVT = Op.getValueType();
7024 unsigned NumElts = DstVT.getVectorNumElements();
7025 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7026
7027 auto [Lo, Hi] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
7028
7029 SDLoc DL(Op);
7030 unsigned Opc = Op.getOpcode();
7031 SDValue Flags = Op.getOperand(i: 1);
7032 EVT HalfDstVT =
7033 EVT::getVectorVT(Context&: *DAG.getContext(), VT: DstVT.getScalarType(), NumElements: NumElts / 2);
7034 SDValue OpLo = DAG.getNode(Opcode: Opc, DL, VT: HalfDstVT, N1: Lo, N2: Flags);
7035 SDValue OpHi = DAG.getNode(Opcode: Opc, DL, VT: HalfDstVT, N1: Hi, N2: Flags);
7036
7037 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: DstVT, N1: OpLo, N2: OpHi);
7038}
7039
7040SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7041 SDValue Src = Op.getOperand(i: 0);
7042 EVT SrcVT = Src.getValueType();
7043 EVT DstVT = Op.getValueType();
7044
7045 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7046 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7047 if (SrcVT.getScalarType() != MVT::f32)
7048 return SDValue();
7049 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7050 }
7051
7052 if (SrcVT.getScalarType() != MVT::f64)
7053 return Op;
7054
7055 SDLoc DL(Op);
7056 if (DstVT == MVT::f16) {
7057 // TODO: Handle strictfp
7058 if (Op.getOpcode() != ISD::FP_ROUND)
7059 return Op;
7060
7061 if (!Subtarget->has16BitInsts()) {
7062 SDValue FpToFp16 = DAG.getNode(Opcode: ISD::FP_TO_FP16, DL, VT: MVT::i32, Operand: Src);
7063 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToFp16);
7064 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f16, Operand: Trunc);
7065 }
7066 if (getTargetMachine().Options.UnsafeFPMath) {
7067 SDValue Flags = Op.getOperand(i: 1);
7068 SDValue Src32 = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f32, N1: Src, N2: Flags);
7069 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: Src32, N2: Flags);
7070 }
7071 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7072 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToFp16);
7073 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f16, Operand: Trunc);
7074 }
7075
7076 assert(DstVT.getScalarType() == MVT::bf16 &&
7077 "custom lower FP_ROUND for f16 or bf16");
7078 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7079
7080 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7081 // hardware f32 -> bf16 instruction.
7082 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(EltVT: MVT::f32) :
7083 MVT::f32;
7084 SDValue Rod = expandRoundInexactToOdd(ResultVT: F32VT, Op: Src, DL, DAG);
7085 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: DstVT, N1: Rod,
7086 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
7087}
7088
7089SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7090 SelectionDAG &DAG) const {
7091 EVT VT = Op.getValueType();
7092 const MachineFunction &MF = DAG.getMachineFunction();
7093 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7094 bool IsIEEEMode = Info->getMode().IEEE;
7095
7096 // FIXME: Assert during selection that this is only selected for
7097 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7098   // mode functions, but this happens to be OK since it's only done in cases
7099   // where it is known that there are no sNaNs.
7100 if (IsIEEEMode)
7101 return expandFMINNUM_FMAXNUM(N: Op.getNode(), DAG);
7102
7103 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7104 VT == MVT::v16bf16)
7105 return splitBinaryVectorOp(Op, DAG);
7106 return Op;
7107}
7108
7109SDValue
7110SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7111 SelectionDAG &DAG) const {
7112 EVT VT = Op.getValueType();
7113 const MachineFunction &MF = DAG.getMachineFunction();
7114 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7115 bool IsIEEEMode = Info->getMode().IEEE;
7116
7117 if (IsIEEEMode)
7118 return expandFMINIMUMNUM_FMAXIMUMNUM(N: Op.getNode(), DAG);
7119
7120 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7121 VT == MVT::v16bf16)
7122 return splitBinaryVectorOp(Op, DAG);
7123 return Op;
7124}
7125
7126SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7127 SelectionDAG &DAG) const {
7128 EVT VT = Op.getValueType();
7129 if (VT.isVector())
7130 return splitBinaryVectorOp(Op, DAG);
7131
7132 assert(!Subtarget->hasIEEEMinMax() && !Subtarget->hasMinimum3Maximum3F16() &&
7133 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7134 "should not need to widen f16 minimum/maximum to v2f16");
7135
7136 // Widen f16 operation to v2f16
7137
7138 // fminimum f16:x, f16:y ->
7139 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7140 // (v2f16 (scalar_to_vector y))), 0
7141 SDLoc SL(Op);
7142 SDValue WideSrc0 =
7143 DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: SL, VT: MVT::v2f16, Operand: Op.getOperand(i: 0));
7144 SDValue WideSrc1 =
7145 DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: SL, VT: MVT::v2f16, Operand: Op.getOperand(i: 1));
7146
7147 SDValue Widened =
7148 DAG.getNode(Opcode: Op.getOpcode(), DL: SL, VT: MVT::v2f16, N1: WideSrc0, N2: WideSrc1);
7149
7150 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::f16, N1: Widened,
7151 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
7152}
7153
7154SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7155 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7156 EVT VT = Op.getValueType();
7157 assert(VT == MVT::f16);
7158
7159 SDValue Exp = Op.getOperand(i: IsStrict ? 2 : 1);
7160 EVT ExpVT = Exp.getValueType();
7161 if (ExpVT == MVT::i16)
7162 return Op;
7163
7164 SDLoc DL(Op);
7165
7166 // Correct the exponent type for f16 to i16.
7167 // Clamp the range of the exponent to the instruction's range.
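// For example, (fldexp f16:x, i32:exp) becomes
// (fldexp x, (trunc (smin (smax exp, -32768), 32767))).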
7168
7169 // TODO: This should be a generic narrowing legalization, and can easily be
7170   // done for GlobalISel.
7171
7172 SDValue MinExp = DAG.getSignedConstant(Val: minIntN(N: 16), DL, VT: ExpVT);
7173 SDValue ClampMin = DAG.getNode(Opcode: ISD::SMAX, DL, VT: ExpVT, N1: Exp, N2: MinExp);
7174
7175 SDValue MaxExp = DAG.getSignedConstant(Val: maxIntN(N: 16), DL, VT: ExpVT);
7176 SDValue Clamp = DAG.getNode(Opcode: ISD::SMIN, DL, VT: ExpVT, N1: ClampMin, N2: MaxExp);
7177
7178 SDValue TruncExp = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Clamp);
7179
7180 if (IsStrict) {
7181 return DAG.getNode(Opcode: ISD::STRICT_FLDEXP, DL, ResultTys: {VT, MVT::Other},
7182 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1), TruncExp});
7183 }
7184
7185 return DAG.getNode(Opcode: ISD::FLDEXP, DL, VT, N1: Op.getOperand(i: 0), N2: TruncExp);
7186}
7187
7188static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
7189 switch (Op->getOpcode()) {
7190 case ISD::SRA:
7191 case ISD::SMIN:
7192 case ISD::SMAX:
7193 return ISD::SIGN_EXTEND;
7194 case ISD::SRL:
7195 case ISD::UMIN:
7196 case ISD::UMAX:
7197 return ISD::ZERO_EXTEND;
7198 case ISD::ADD:
7199 case ISD::SUB:
7200 case ISD::AND:
7201 case ISD::OR:
7202 case ISD::XOR:
7203 case ISD::SHL:
7204 case ISD::SELECT:
7205 case ISD::MUL:
7206     // The operation result won't be influenced by garbage high bits.
7207 // TODO: are all of those cases correct, and are there more?
7208 return ISD::ANY_EXTEND;
7209 case ISD::SETCC: {
7210 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
7211 return ISD::isSignedIntSetCC(Code: CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7212 }
7213 default:
7214 llvm_unreachable("unexpected opcode!");
7215 }
7216}
7217
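// Promote a uniform operation on a type narrower than i32 to i32, e.g.
// (i16 shl x, y) -> (trunc (i32 shl (any_extend x), (zero_extend y))).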
7218SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7219 DAGCombinerInfo &DCI) const {
7220 const unsigned Opc = Op.getOpcode();
7221 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7222 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7223 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7224 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7225 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7226
7227 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7228 : Op->getOperand(Num: 0).getValueType();
7229 auto ExtTy = OpTy.changeElementType(EltVT: MVT::i32);
7230
7231 if (DCI.isBeforeLegalizeOps() ||
7232 isNarrowingProfitable(N: Op.getNode(), SrcVT: ExtTy, DestVT: OpTy))
7233 return SDValue();
7234
7235 auto &DAG = DCI.DAG;
7236
7237 SDLoc DL(Op);
7238 SDValue LHS;
7239 SDValue RHS;
7240 if (Opc == ISD::SELECT) {
7241 LHS = Op->getOperand(Num: 1);
7242 RHS = Op->getOperand(Num: 2);
7243 } else {
7244 LHS = Op->getOperand(Num: 0);
7245 RHS = Op->getOperand(Num: 1);
7246 }
7247
7248 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7249 LHS = DAG.getNode(Opcode: ExtOp, DL, VT: ExtTy, Operand: {LHS});
7250
7251 // Special case: for shifts, the RHS always needs a zext.
7252 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7253 RHS = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: ExtTy, Operand: {RHS});
7254 else
7255 RHS = DAG.getNode(Opcode: ExtOp, DL, VT: ExtTy, Operand: {RHS});
7256
7257   // setcc always returns i1 or a vector of i1, so there is no need to truncate after.
7258 if (Opc == ISD::SETCC) {
7259 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
7260 return DAG.getSetCC(DL, VT: Op.getValueType(), LHS, RHS, Cond: CC);
7261 }
7262
7263 // For other ops, we extend the operation's return type as well so we need to
7264 // truncate back to the original type.
7265 SDValue NewVal;
7266 if (Opc == ISD::SELECT)
7267 NewVal = DAG.getNode(Opcode: ISD::SELECT, DL, VT: ExtTy, Ops: {Op->getOperand(Num: 0), LHS, RHS});
7268 else
7269 NewVal = DAG.getNode(Opcode: Opc, DL, VT: ExtTy, Ops: {LHS, RHS});
7270
7271 return DAG.getZExtOrTrunc(Op: NewVal, DL, VT: OpTy);
7272}
7273
7274SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7275 SDValue Mag = Op.getOperand(i: 0);
7276 EVT MagVT = Mag.getValueType();
7277
7278 if (MagVT.getVectorNumElements() > 2)
7279 return splitBinaryVectorOp(Op, DAG);
7280
7281 SDValue Sign = Op.getOperand(i: 1);
7282 EVT SignVT = Sign.getValueType();
7283
7284 if (MagVT == SignVT)
7285 return Op;
7286
7287 // fcopysign v2f16:mag, v2f32:sign ->
7288 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
7289
7290 SDLoc SL(Op);
7291 SDValue SignAsInt32 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Sign);
7292 SDValue SignAsInt16 = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::v2i16, Operand: SignAsInt32);
7293
7294 SDValue SignAsHalf16 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MagVT, Operand: SignAsInt16);
7295
7296 return DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT: MagVT, N1: Mag, N2: SignAsHalf16);
7297}
7298
7299// Custom lowering for vector multiplications and s_mul_u64.
7300SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
7301 EVT VT = Op.getValueType();
7302
7303 // Split vector operands.
7304 if (VT.isVector())
7305 return splitBinaryVectorOp(Op, DAG);
7306
7307 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
7308
7309 // There are four ways to lower s_mul_u64:
7310 //
7311 // 1. If all the operands are uniform, then we lower it as it is.
7312 //
7313   // 2. If the operands are divergent, then we have to split s_mul_u64 into 32-bit
7314   // multiplications because there is no vector equivalent of s_mul_u64.
7315 //
7316 // 3. If the cost model decides that it is more efficient to use vector
7317   // registers, then we have to split s_mul_u64 into 32-bit multiplications.
7318 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
7319 //
7320 // 4. If the cost model decides to use vector registers and both of the
7321   // operands are zero-extended/sign-extended from 32 bits, then we split the
7322   // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
7323 // possible to check if the operands are zero-extended or sign-extended in
7324 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
7325 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
7326 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
7327 // If the cost model decides that we have to use vector registers, then
7328   // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
7329   // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
7330 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
7331 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
7332 // SIInstrInfo.cpp .
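  //
  // For example, if computeKnownBits proves that both operands have at least 32
  // leading zero bits, s_mul_u64 is replaced with S_MUL_U64_U32_PSEUDO below;
  // if both operands have at least 33 sign bits, it is replaced with
  // S_MUL_I64_I32_PSEUDO.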
7333
7334 if (Op->isDivergent())
7335 return SDValue();
7336
7337 SDValue Op0 = Op.getOperand(i: 0);
7338 SDValue Op1 = Op.getOperand(i: 1);
7339   // If all the operands are zero-extended from 32 bits, then we replace
7340   // s_mul_u64 with s_mul_u64_u32_pseudo. If all the operands are sign-extended
7341   // from 32 bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7342 KnownBits Op0KnownBits = DAG.computeKnownBits(Op: Op0);
7343 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7344 KnownBits Op1KnownBits = DAG.computeKnownBits(Op: Op1);
7345 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7346 SDLoc SL(Op);
7347 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7348 return SDValue(
7349 DAG.getMachineNode(Opcode: AMDGPU::S_MUL_U64_U32_PSEUDO, dl: SL, VT, Op1: Op0, Op2: Op1), 0);
7350 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op0);
7351 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op: Op1);
7352 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7353 return SDValue(
7354 DAG.getMachineNode(Opcode: AMDGPU::S_MUL_I64_I32_PSEUDO, dl: SL, VT, Op1: Op0, Op2: Op1), 0);
7355 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7356 return Op;
7357}
7358
7359SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7360 EVT VT = Op.getValueType();
7361 SDLoc SL(Op);
7362 SDValue LHS = Op.getOperand(i: 0);
7363 SDValue RHS = Op.getOperand(i: 1);
7364 bool isSigned = Op.getOpcode() == ISD::SMULO;
7365
7366 if (ConstantSDNode *RHSC = isConstOrConstSplat(N: RHS)) {
7367 const APInt &C = RHSC->getAPIntValue();
7368 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
7369 if (C.isPowerOf2()) {
7370       // smulo(x, signed_min) is the same as umulo(x, signed_min).
7371 bool UseArithShift = isSigned && !C.isMinSignedValue();
7372 SDValue ShiftAmt = DAG.getConstant(Val: C.logBase2(), DL: SL, VT: MVT::i32);
7373 SDValue Result = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: LHS, N2: ShiftAmt);
7374 SDValue Overflow =
7375 DAG.getSetCC(DL: SL, VT: MVT::i1,
7376 LHS: DAG.getNode(Opcode: UseArithShift ? ISD::SRA : ISD::SRL, DL: SL, VT,
7377 N1: Result, N2: ShiftAmt),
7378 RHS: LHS, Cond: ISD::SETNE);
7379 return DAG.getMergeValues(Ops: {Result, Overflow}, dl: SL);
7380 }
7381 }
7382
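// General case: the multiply overflows iff the high half of the full product
// differs from the sign-extension of the low half (for smulo) or from zero
// (for umulo).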
7383 SDValue Result = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT, N1: LHS, N2: RHS);
7384 SDValue Top =
7385 DAG.getNode(Opcode: isSigned ? ISD::MULHS : ISD::MULHU, DL: SL, VT, N1: LHS, N2: RHS);
7386
7387 SDValue Sign = isSigned
7388 ? DAG.getNode(Opcode: ISD::SRA, DL: SL, VT, N1: Result,
7389 N2: DAG.getConstant(Val: VT.getScalarSizeInBits() - 1,
7390 DL: SL, VT: MVT::i32))
7391 : DAG.getConstant(Val: 0, DL: SL, VT);
7392 SDValue Overflow = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Top, RHS: Sign, Cond: ISD::SETNE);
7393
7394 return DAG.getMergeValues(Ops: {Result, Overflow}, dl: SL);
7395}
7396
7397SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
7398 if (Op->isDivergent()) {
7399 // Select to V_MAD_[IU]64_[IU]32.
7400 return Op;
7401 }
7402 if (Subtarget->hasSMulHi()) {
7403 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
7404 return SDValue();
7405 }
7406 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
7407 // calculate the high part, so we might as well do the whole thing with
7408 // V_MAD_[IU]64_[IU]32.
7409 return Op;
7410}
7411
7412SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
7413 if (!Subtarget->isTrapHandlerEnabled() ||
7414 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7415 return lowerTrapEndpgm(Op, DAG);
7416
7417 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
7418 : lowerTrapHsaQueuePtr(Op, DAG);
7419}
7420
7421SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
7422 SDLoc SL(Op);
7423 SDValue Chain = Op.getOperand(i: 0);
7424 return DAG.getNode(Opcode: AMDGPUISD::ENDPGM_TRAP, DL: SL, VT: MVT::Other, Operand: Chain);
7425}
7426
7427SDValue
7428SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
7429 const SDLoc &DL, Align Alignment,
7430 ImplicitParameter Param) const {
7431 MachineFunction &MF = DAG.getMachineFunction();
7432 uint64_t Offset = getImplicitParameterOffset(MF, Param);
7433 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain: DAG.getEntryNode(), Offset);
7434 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7435 return DAG.getLoad(VT, dl: DL, Chain: DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
7436 MMOFlags: MachineMemOperand::MODereferenceable |
7437 MachineMemOperand::MOInvariant);
7438}
7439
7440SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
7441 SelectionDAG &DAG) const {
7442 SDLoc SL(Op);
7443 SDValue Chain = Op.getOperand(i: 0);
7444
7445 SDValue QueuePtr;
7446 // For code object version 5, QueuePtr is passed through implicit kernarg.
7447 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7448 if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) {
7449 QueuePtr =
7450 loadImplicitKernelArgument(DAG, VT: MVT::i64, DL: SL, Alignment: Align(8), Param: QUEUE_PTR);
7451 } else {
7452 MachineFunction &MF = DAG.getMachineFunction();
7453 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7454 Register UserSGPR = Info->getQueuePtrUserSGPR();
7455
7456 if (UserSGPR == AMDGPU::NoRegister) {
7457 // We probably are in a function incorrectly marked with
7458 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
7459 // trap, so just use a null pointer.
7460 QueuePtr = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i64);
7461 } else {
7462 QueuePtr = CreateLiveInRegister(DAG, RC: &AMDGPU::SReg_64RegClass, Reg: UserSGPR,
7463 VT: MVT::i64);
7464 }
7465 }
7466
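// The HSA trap handler receives the queue pointer in SGPR0:1, so copy it there
// and pass it along with the trap ID.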
7467 SDValue SGPR01 = DAG.getRegister(Reg: AMDGPU::SGPR0_SGPR1, VT: MVT::i64);
7468 SDValue ToReg = DAG.getCopyToReg(Chain, dl: SL, Reg: SGPR01, N: QueuePtr, Glue: SDValue());
7469
7470 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7471 SDValue Ops[] = {ToReg, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16), SGPR01,
7472 ToReg.getValue(R: 1)};
7473 return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
7474}
7475
7476SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
7477 SDLoc SL(Op);
7478 SDValue Chain = Op.getOperand(i: 0);
7479
7480 // We need to simulate the 's_trap 2' instruction on targets that run in
7481 // PRIV=1 (where it is treated as a nop).
7482 if (Subtarget->hasPrivEnabledTrap2NopBug())
7483 return DAG.getNode(Opcode: AMDGPUISD::SIMULATED_TRAP, DL: SL, VT: MVT::Other, Operand: Chain);
7484
7485 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7486 SDValue Ops[] = {Chain, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16)};
7487 return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
7488}
7489
7490SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
7491 SDLoc SL(Op);
7492 SDValue Chain = Op.getOperand(i: 0);
7493 MachineFunction &MF = DAG.getMachineFunction();
7494
7495 if (!Subtarget->isTrapHandlerEnabled() ||
7496 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7497 LLVMContext &Ctx = MF.getFunction().getContext();
7498 Ctx.diagnose(DI: DiagnosticInfoUnsupported(MF.getFunction(),
7499 "debugtrap handler not supported",
7500 Op.getDebugLoc(), DS_Warning));
7501 return Chain;
7502 }
7503
7504 uint64_t TrapID =
7505 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
7506 SDValue Ops[] = {Chain, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16)};
7507 return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
7508}
7509
7510SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7511 SelectionDAG &DAG) const {
7512 if (Subtarget->hasApertureRegs()) {
7513 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
7514 ? AMDGPU::SRC_SHARED_BASE
7515 : AMDGPU::SRC_PRIVATE_BASE;
7516 // Note: this feature (register) is broken. When used as a 32-bit operand,
7517 // it returns a wrong value (all zeroes?). The real value is in the upper 32
7518 // bits.
7519 //
7520 // To work around the issue, directly emit a 64 bit mov from this register
7521 // then extract the high bits. Note that this shouldn't even result in a
7522 // shift being emitted and simply become a pair of registers (e.g.):
7523 // s_mov_b64 s[6:7], src_shared_base
7524 // v_mov_b32_e32 v1, s7
7525 //
7526 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
7527 // coalescing would kick in and it would think it's okay to use the "HI"
7528 // subregister directly (instead of extracting the HI 32 bits) which is an
7529 // artificial (unusable) register.
7530 // Register TableGen definitions would need an overhaul to get rid of the
7531 // artificial "HI" aperture registers and prevent this kind of issue from
7532 // happening.
7533 SDNode *Mov = DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B64, dl: DL, VT: MVT::i64,
7534 Op1: DAG.getRegister(Reg: ApertureRegNo, VT: MVT::i64));
7535 return DAG.getNode(
7536 Opcode: ISD::TRUNCATE, DL, VT: MVT::i32,
7537 Operand: DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64,
7538 Ops: {SDValue(Mov, 0), DAG.getConstant(Val: 32, DL, VT: MVT::i64)}));
7539 }
7540
7541 // For code object version 5, private_base and shared_base are passed through
7542 // implicit kernargs.
7543 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7544 if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) {
7545 ImplicitParameter Param =
7546 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
7547 return loadImplicitKernelArgument(DAG, VT: MVT::i32, DL, Alignment: Align(4), Param);
7548 }
7549
7550 MachineFunction &MF = DAG.getMachineFunction();
7551 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7552 Register UserSGPR = Info->getQueuePtrUserSGPR();
7553 if (UserSGPR == AMDGPU::NoRegister) {
7554 // We probably are in a function incorrectly marked with
7555 // amdgpu-no-queue-ptr. This is undefined.
7556 return DAG.getPOISON(VT: MVT::i32);
7557 }
7558
7559 SDValue QueuePtr =
7560 CreateLiveInRegister(DAG, RC: &AMDGPU::SReg_64RegClass, Reg: UserSGPR, VT: MVT::i64);
7561
7562 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7563 // private_segment_aperture_base_hi.
7564 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7565
7566 SDValue Ptr =
7567 DAG.getObjectPtrOffset(SL: DL, Ptr: QueuePtr, Offset: TypeSize::getFixed(ExactSize: StructOffset));
7568
7569 // TODO: Use custom target PseudoSourceValue.
7570   // TODO: We should use the value from the IR intrinsic call, but it might not
7571   // be available, and it is not clear how to get it here.
7572 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7573 return DAG.getLoad(VT: MVT::i32, dl: DL, Chain: QueuePtr.getValue(R: 1), Ptr, PtrInfo,
7574 Alignment: commonAlignment(A: Align(64), Offset: StructOffset),
7575 MMOFlags: MachineMemOperand::MODereferenceable |
7576 MachineMemOperand::MOInvariant);
7577}
7578
7579/// Return true if the value is a known valid address, such that a null check is
7580/// not necessary.
7581static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
7582 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7583 if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
7584 return true;
7585
7586 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7587 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7588
7589 // TODO: Search through arithmetic, handle arguments and loads
7590 // marked nonnull.
7591 return false;
7592}
7593
7594SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7595 SelectionDAG &DAG) const {
7596 SDLoc SL(Op);
7597
7598 const AMDGPUTargetMachine &TM =
7599 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7600
7601 unsigned DestAS, SrcAS;
7602 SDValue Src;
7603 bool IsNonNull = false;
7604 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Val&: Op)) {
7605 SrcAS = ASC->getSrcAddressSpace();
7606 Src = ASC->getOperand(Num: 0);
7607 DestAS = ASC->getDestAddressSpace();
7608 } else {
7609 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7610 Op.getConstantOperandVal(0) ==
7611 Intrinsic::amdgcn_addrspacecast_nonnull);
7612 Src = Op->getOperand(Num: 1);
7613 SrcAS = Op->getConstantOperandVal(Num: 2);
7614 DestAS = Op->getConstantOperandVal(Num: 3);
7615 IsNonNull = true;
7616 }
7617
7618 SDValue FlatNullPtr = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i64);
7619
7620 // flat -> local/private
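// A flat pointer is cast to local/private by truncating it to 32 bits. Unless
// the source is known to be non-null, compare it against the flat null value
// and select the segment null value for null inputs.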
7621 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7622 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7623 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7624 SDValue Ptr = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Src);
7625
7626 if (IsNonNull || isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS))
7627 return Ptr;
7628
7629 unsigned NullVal = TM.getNullPointerValue(AddrSpace: DestAS);
7630 SDValue SegmentNullPtr = DAG.getConstant(Val: NullVal, DL: SL, VT: MVT::i32);
7631 SDValue NonNull = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Src, RHS: FlatNullPtr, Cond: ISD::SETNE);
7632
7633 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i32, N1: NonNull, N2: Ptr,
7634 N3: SegmentNullPtr);
7635 }
7636 }
7637
7638 // local/private -> flat
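// A 32-bit segment pointer is widened to a flat pointer by pairing it with the
// aperture high half from getSegmentAperture(); a segment-null input maps to
// the flat null value.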
7639 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7640 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7641 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7642
7643 SDValue Aperture = getSegmentAperture(AS: SrcAS, DL: SL, DAG);
7644 SDValue CvtPtr =
7645 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: Aperture);
7646 CvtPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);
7647
7648 if (IsNonNull || isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS))
7649 return CvtPtr;
7650
7651 unsigned NullVal = TM.getNullPointerValue(AddrSpace: SrcAS);
7652 SDValue SegmentNullPtr = DAG.getConstant(Val: NullVal, DL: SL, VT: MVT::i32);
7653
7654 SDValue NonNull =
7655 DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Src, RHS: SegmentNullPtr, Cond: ISD::SETNE);
7656
7657 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: NonNull, N2: CvtPtr,
7658 N3: FlatNullPtr);
7659 }
7660 }
7661
7662 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7663 Op.getValueType() == MVT::i64) {
7664 const SIMachineFunctionInfo *Info =
7665 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
7666 SDValue Hi = DAG.getConstant(Val: Info->get32BitAddressHighBits(), DL: SL, VT: MVT::i32);
7667 SDValue Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: Hi);
7668 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
7669 }
7670
7671 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7672 Src.getValueType() == MVT::i64)
7673 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Src);
7674
7675 // global <-> flat are no-ops and never emitted.
7676
7677 // Invalid casts are poison.
7678 return DAG.getPOISON(VT: Op->getValueType(ResNo: 0));
7679}
7680
7681// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7682// the small vector and inserting them into the big vector. That is better than
7683// the default expansion of doing it via a stack slot. Even though the use of
7684// the stack slot would be optimized away afterwards, the stack slot itself
7685// remains.
7686SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7687 SelectionDAG &DAG) const {
7688 SDValue Vec = Op.getOperand(i: 0);
7689 SDValue Ins = Op.getOperand(i: 1);
7690 SDValue Idx = Op.getOperand(i: 2);
7691 EVT VecVT = Vec.getValueType();
7692 EVT InsVT = Ins.getValueType();
7693 EVT EltVT = VecVT.getVectorElementType();
7694 unsigned InsNumElts = InsVT.getVectorNumElements();
7695 unsigned IdxVal = Idx->getAsZExtVal();
7696 SDLoc SL(Op);
7697
7698 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7699 // Insert 32-bit registers at a time.
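// e.g. inserting a v2i16 subvector into v4i16 at index 2 becomes a single i32
// element insert into the v2i32 bitcast of the vector at index 1.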
7700 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7701
7702 unsigned VecNumElts = VecVT.getVectorNumElements();
7703 EVT NewVecVT =
7704 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: VecNumElts / 2);
7705 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7706 : EVT::getVectorVT(Context&: *DAG.getContext(),
7707 VT: MVT::i32, NumElements: InsNumElts / 2);
7708
7709 Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVecVT, Operand: Vec);
7710 Ins = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewInsVT, Operand: Ins);
7711
7712 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7713 SDValue Elt;
7714 if (InsNumElts == 2) {
7715 Elt = Ins;
7716 } else {
7717 Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Ins,
7718 N2: DAG.getConstant(Val: I, DL: SL, VT: MVT::i32));
7719 }
7720 Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: NewVecVT, N1: Vec, N2: Elt,
7721 N3: DAG.getConstant(Val: IdxVal / 2 + I, DL: SL, VT: MVT::i32));
7722 }
7723
7724 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Vec);
7725 }
7726
7727 for (unsigned I = 0; I != InsNumElts; ++I) {
7728 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Ins,
7729 N2: DAG.getConstant(Val: I, DL: SL, VT: MVT::i32));
7730 Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: VecVT, N1: Vec, N2: Elt,
7731 N3: DAG.getConstant(Val: IdxVal + I, DL: SL, VT: MVT::i32));
7732 }
7733 return Vec;
7734}
7735
7736SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7737 SelectionDAG &DAG) const {
7738 SDValue Vec = Op.getOperand(i: 0);
7739 SDValue InsVal = Op.getOperand(i: 1);
7740 SDValue Idx = Op.getOperand(i: 2);
7741 EVT VecVT = Vec.getValueType();
7742 EVT EltVT = VecVT.getVectorElementType();
7743 unsigned VecSize = VecVT.getSizeInBits();
7744 unsigned EltSize = EltVT.getSizeInBits();
7745 SDLoc SL(Op);
7746
7747 // Specially handle the case of v4i16 with static indexing.
7748 unsigned NumElts = VecVT.getVectorNumElements();
7749 auto *KIdx = dyn_cast<ConstantSDNode>(Val&: Idx);
7750 if (NumElts == 4 && EltSize == 16 && KIdx) {
7751 SDValue BCVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Vec);
7752
7753 SDValue LoHalf = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: BCVec,
7754 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
7755 SDValue HiHalf = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: BCVec,
7756 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
7757
7758 SDValue LoVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: LoHalf);
7759 SDValue HiVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: HiHalf);
7760
7761 unsigned Idx = KIdx->getZExtValue();
7762 bool InsertLo = Idx < 2;
7763 SDValue InsHalf = DAG.getNode(
7764 Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: MVT::v2i16, N1: InsertLo ? LoVec : HiVec,
7765 N2: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: InsVal),
7766 N3: DAG.getConstant(Val: InsertLo ? Idx : (Idx - 2), DL: SL, VT: MVT::i32));
7767
7768 InsHalf = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: InsHalf);
7769
7770 SDValue Concat =
7771 InsertLo ? DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {InsHalf, HiHalf})
7772 : DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {LoHalf, InsHalf});
7773
7774 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Concat);
7775 }
7776
7777   // Static indexing does not lower to stack access, and hence there is no need
7778   // for special custom lowering to avoid it.
7779 if (isa<ConstantSDNode>(Val: Idx))
7780 return SDValue();
7781
7782 // Avoid stack access for dynamic indexing by custom lowering to
7783 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
7784
7785 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7786
7787 MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize);
7788
7789 // Convert vector index to bit-index and get the required bit mask.
7790 assert(isPowerOf2_32(EltSize));
7791 const auto EltMask = maskTrailingOnes<uint64_t>(N: EltSize);
7792 SDValue ScaleFactor = DAG.getConstant(Val: Log2_32(Value: EltSize), DL: SL, VT: MVT::i32);
7793 SDValue ScaledIdx = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Idx, N2: ScaleFactor);
7794 SDValue BFM = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: IntVT,
7795 N1: DAG.getConstant(Val: EltMask, DL: SL, VT: IntVT), N2: ScaledIdx);
7796
7797 // 1. Create a congruent vector with the target value in each element.
7798 SDValue ExtVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT,
7799 Operand: DAG.getSplatBuildVector(VT: VecVT, DL: SL, Op: InsVal));
7800
7801 // 2. Mask off all other indices except the required index within (1).
7802 SDValue LHS = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT, N1: BFM, N2: ExtVal);
7803
7804 // 3. Mask off the required index within the target vector.
7805 SDValue BCVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec);
7806 SDValue RHS =
7807 DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT, N1: DAG.getNOT(DL: SL, Val: BFM, VT: IntVT), N2: BCVec);
7808
7809 // 4. Get (2) and (3) ORed into the target vector.
7810 SDValue BFI =
7811 DAG.getNode(Opcode: ISD::OR, DL: SL, VT: IntVT, N1: LHS, N2: RHS, Flags: SDNodeFlags::Disjoint);
7812
7813 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: BFI);
7814}
7815
7816SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7817 SelectionDAG &DAG) const {
7818 SDLoc SL(Op);
7819
7820 EVT ResultVT = Op.getValueType();
7821 SDValue Vec = Op.getOperand(i: 0);
7822 SDValue Idx = Op.getOperand(i: 1);
7823 EVT VecVT = Vec.getValueType();
7824 unsigned VecSize = VecVT.getSizeInBits();
7825 EVT EltVT = VecVT.getVectorElementType();
7826
7827 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7828
7829 // Make sure we do any optimizations that will make it easier to fold
7830 // source modifiers before obscuring it with bit operations.
7831
7832 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7833 if (SDValue Combined = performExtractVectorEltCombine(N: Op.getNode(), DCI))
7834 return Combined;
7835
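// For 128/256/512-bit vectors, split the source in half, select the half that
// contains the requested element based on the index, and extract from that
// half with the index masked into range.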
7836 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7837 SDValue Lo, Hi;
7838 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VT: VecVT);
7839
7840 if (VecSize == 128) {
7841 SDValue V2 = DAG.getBitcast(VT: MVT::v2i64, V: Vec);
7842 Lo = DAG.getBitcast(VT: LoVT,
7843 V: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
7844 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32)));
7845 Hi = DAG.getBitcast(VT: HiVT,
7846 V: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
7847 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32)));
7848 } else if (VecSize == 256) {
7849 SDValue V2 = DAG.getBitcast(VT: MVT::v4i64, V: Vec);
7850 SDValue Parts[4];
7851 for (unsigned P = 0; P < 4; ++P) {
7852 Parts[P] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
7853 N2: DAG.getConstant(Val: P, DL: SL, VT: MVT::i32));
7854 }
7855
7856 Lo = DAG.getBitcast(VT: LoVT, V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i64,
7857 N1: Parts[0], N2: Parts[1]));
7858 Hi = DAG.getBitcast(VT: HiVT, V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i64,
7859 N1: Parts[2], N2: Parts[3]));
7860 } else {
7861 assert(VecSize == 512);
7862
7863 SDValue V2 = DAG.getBitcast(VT: MVT::v8i64, V: Vec);
7864 SDValue Parts[8];
7865 for (unsigned P = 0; P < 8; ++P) {
7866 Parts[P] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
7867 N2: DAG.getConstant(Val: P, DL: SL, VT: MVT::i32));
7868 }
7869
7870 Lo = DAG.getBitcast(VT: LoVT,
7871 V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v4i64,
7872 N1: Parts[0], N2: Parts[1], N3: Parts[2], N4: Parts[3]));
7873 Hi = DAG.getBitcast(VT: HiVT,
7874 V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v4i64,
7875 N1: Parts[4], N2: Parts[5], N3: Parts[6], N4: Parts[7]));
7876 }
7877
7878 EVT IdxVT = Idx.getValueType();
7879 unsigned NElem = VecVT.getVectorNumElements();
7880 assert(isPowerOf2_32(NElem));
7881 SDValue IdxMask = DAG.getConstant(Val: NElem / 2 - 1, DL: SL, VT: IdxVT);
7882 SDValue NewIdx = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IdxVT, N1: Idx, N2: IdxMask);
7883 SDValue Half = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IdxMask, True: Hi, False: Lo, Cond: ISD::SETUGT);
7884 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Half, N2: NewIdx);
7885 }
7886
7887 assert(VecSize <= 64);
7888
7889 MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize);
7890
7891 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7892 SDValue VecBC = peekThroughBitcasts(V: Vec);
7893 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7894 SDValue Src = VecBC.getOperand(i: 0);
7895 Src = DAG.getBitcast(VT: Src.getValueType().changeTypeToInteger(), V: Src);
7896 Vec = DAG.getAnyExtOrTrunc(Op: Src, DL: SL, VT: IntVT);
7897 }
7898
7899 unsigned EltSize = EltVT.getSizeInBits();
7900 assert(isPowerOf2_32(EltSize));
7901
7902 SDValue ScaleFactor = DAG.getConstant(Val: Log2_32(Value: EltSize), DL: SL, VT: MVT::i32);
7903
7904 // Convert vector index to bit-index (* EltSize)
7905 SDValue ScaledIdx = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Idx, N2: ScaleFactor);
7906
7907 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec);
7908 SDValue Elt = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: IntVT, N1: BC, N2: ScaledIdx);
7909
7910 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7911 SDValue Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i16, Operand: Elt);
7912 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: ResultVT, Operand: Result);
7913 }
7914
7915 return DAG.getAnyExtOrTrunc(Op: Elt, DL: SL, VT: ResultVT);
7916}
7917
7918static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7919 assert(Elt % 2 == 0);
7920 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7921}
7922
7923static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
7924 assert(Elt % 2 == 0);
7925 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
7926 !(Mask[Elt + 1] & 1);
7927}
7928
7929SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7930 SelectionDAG &DAG) const {
7931 SDLoc SL(Op);
7932 EVT ResultVT = Op.getValueType();
7933 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val&: Op);
7934 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
7935 const int NewSrcNumElts = 2;
7936 MVT PackVT = MVT::getVectorVT(VT: EltVT, NumElements: NewSrcNumElts);
7937 int SrcNumElts = Op.getOperand(i: 0).getValueType().getVectorNumElements();
7938
7939   // Break up the shuffle into register-sized pieces.
7940 //
7941 // We're trying to form sub-shuffles that the register allocation pipeline
7942 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
7943 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
7944 // pair of copies into a consecutive register copy, so use the ordinary
7945 // extract_vector_elt lowering unless we can use the shuffle.
7946 //
7947   // TODO: This is a bit of a hack, and we should probably always use
7948   // extract_subvector for the largest possible subvector we can (or at least
7949   // use it for PackVT aligned pieces). However, we have worse support for
7950   // combines on them, and we don't directly treat extract_subvector /
7951   // insert_subvector as legal. The DAG scheduler also ends up doing a worse
7952   // job with the extract_subvectors.
7953 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
7954
7955 // vector_shuffle <0,1,6,7> lhs, rhs
7956 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7957 //
7958 // vector_shuffle <6,7,2,3> lhs, rhs
7959 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7960 //
7961 // vector_shuffle <6,7,0,1> lhs, rhs
7962 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7963
7964 // Avoid scalarizing when both halves are reading from consecutive elements.
7965
7966 // If we're treating 2 element shuffles as legal, also create odd-to-even
7967 // shuffles of neighboring pairs.
7968 //
7969 // vector_shuffle <3,2,7,6> lhs, rhs
7970 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
7971 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
7972
7973 SmallVector<SDValue, 16> Pieces;
7974 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7975 if (ShouldUseConsecutiveExtract &&
7976 elementPairIsContiguous(Mask: SVN->getMask(), Elt: I)) {
7977 const int Idx = SVN->getMaskElt(Idx: I);
7978 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7979 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7980 SDValue SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT,
7981 N1: SVN->getOperand(Num: VecIdx),
7982 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
7983 Pieces.push_back(Elt: SubVec);
7984 } else if (elementPairIsOddToEven(Mask: SVN->getMask(), Elt: I) &&
7985 isOperationLegal(Op: ISD::VECTOR_SHUFFLE, VT: PackVT)) {
7986 int Idx0 = SVN->getMaskElt(Idx: I);
7987 int Idx1 = SVN->getMaskElt(Idx: I + 1);
7988
7989 SDValue SrcOp0 = SVN->getOperand(Num: 0);
7990 SDValue SrcOp1 = SrcOp0;
7991 if (Idx0 >= SrcNumElts) {
7992 SrcOp0 = SVN->getOperand(Num: 1);
7993 Idx0 -= SrcNumElts;
7994 }
7995
7996 if (Idx1 >= SrcNumElts) {
7997 SrcOp1 = SVN->getOperand(Num: 1);
7998 Idx1 -= SrcNumElts;
7999 }
8000
8001 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8002 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8003
8004 // Extract nearest even aligned piece.
8005 SDValue SubVec0 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT, N1: SrcOp0,
8006 N2: DAG.getConstant(Val: AlignedIdx0, DL: SL, VT: MVT::i32));
8007 SDValue SubVec1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT, N1: SrcOp1,
8008 N2: DAG.getConstant(Val: AlignedIdx1, DL: SL, VT: MVT::i32));
8009
8010 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8011 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8012
8013 SDValue Result0 = SubVec0;
8014 SDValue Result1 = SubVec0;
8015
8016 if (SubVec0 != SubVec1) {
8017 NewMaskIdx1 += NewSrcNumElts;
8018 Result1 = SubVec1;
8019 } else {
8020 Result1 = DAG.getPOISON(VT: PackVT);
8021 }
8022
8023 SDValue Shuf = DAG.getVectorShuffle(VT: PackVT, dl: SL, N1: Result0, N2: Result1,
8024 Mask: {NewMaskIdx0, NewMaskIdx1});
8025 Pieces.push_back(Elt: Shuf);
8026 } else {
8027 const int Idx0 = SVN->getMaskElt(Idx: I);
8028 const int Idx1 = SVN->getMaskElt(Idx: I + 1);
8029 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8030 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8031 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8032 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8033
8034 SDValue Vec0 = SVN->getOperand(Num: VecIdx0);
8035 SDValue Elt0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec0,
8036 N2: DAG.getSignedConstant(Val: EltIdx0, DL: SL, VT: MVT::i32));
8037
8038 SDValue Vec1 = SVN->getOperand(Num: VecIdx1);
8039 SDValue Elt1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec1,
8040 N2: DAG.getSignedConstant(Val: EltIdx1, DL: SL, VT: MVT::i32));
8041 Pieces.push_back(Elt: DAG.getBuildVector(VT: PackVT, DL: SL, Ops: {Elt0, Elt1}));
8042 }
8043 }
8044
8045 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT: ResultVT, Ops: Pieces);
8046}
8047
8048SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8049 SelectionDAG &DAG) const {
8050 SDValue SVal = Op.getOperand(i: 0);
8051 EVT ResultVT = Op.getValueType();
8052 EVT SValVT = SVal.getValueType();
8053 SDValue UndefVal = DAG.getPOISON(VT: SValVT);
8054 SDLoc SL(Op);
8055
8056 SmallVector<SDValue, 8> VElts;
8057 VElts.push_back(Elt: SVal);
8058 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8059 VElts.push_back(Elt: UndefVal);
8060
8061 return DAG.getBuildVector(VT: ResultVT, DL: SL, Ops: VElts);
8062}
8063
8064SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8065 SelectionDAG &DAG) const {
8066 SDLoc SL(Op);
8067 EVT VT = Op.getValueType();
8068
8069 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8070 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8071
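// Pack the two 16-bit elements into an i32 as (lo | (hi << 16)) and bitcast
// the result to the packed vector type.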
8072 SDValue Lo = Op.getOperand(i: 0);
8073 SDValue Hi = Op.getOperand(i: 1);
8074
8075 // Avoid adding defined bits with the zero_extend.
8076 if (Hi.isUndef()) {
8077 Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Lo);
8078 SDValue ExtLo = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Lo);
8079 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ExtLo);
8080 }
8081
8082 Hi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Hi);
8083 Hi = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Hi);
8084
8085 SDValue ShlHi = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Hi,
8086 N2: DAG.getConstant(Val: 16, DL: SL, VT: MVT::i32));
8087 if (Lo.isUndef())
8088 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ShlHi);
8089
8090 Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Lo);
8091 Lo = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Lo);
8092
8093 SDValue Or =
8094 DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Lo, N2: ShlHi, Flags: SDNodeFlags::Disjoint);
8095 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Or);
8096 }
8097
8098 // Split into 2-element chunks.
8099 const unsigned NumParts = VT.getVectorNumElements() / 2;
8100 EVT PartVT = MVT::getVectorVT(VT: VT.getVectorElementType().getSimpleVT(), NumElements: 2);
8101 MVT PartIntVT = MVT::getIntegerVT(BitWidth: PartVT.getSizeInBits());
8102
8103 SmallVector<SDValue> Casts;
8104 for (unsigned P = 0; P < NumParts; ++P) {
8105 SDValue Vec = DAG.getBuildVector(
8106 VT: PartVT, DL: SL, Ops: {Op.getOperand(i: P * 2), Op.getOperand(i: P * 2 + 1)});
8107 Casts.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: PartIntVT, Operand: Vec));
8108 }
8109
8110 SDValue Blend =
8111 DAG.getBuildVector(VT: MVT::getVectorVT(VT: PartIntVT, NumElements: NumParts), DL: SL, Ops: Casts);
8112 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Blend);
8113}
8114
8115bool SITargetLowering::isOffsetFoldingLegal(
8116 const GlobalAddressSDNode *GA) const {
8117 // OSes that use ELF REL relocations (instead of RELA) can only store a
8118 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8119 // which can create arbitrary 64-bit addends. (This is only a problem for
8120 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8121 // the high 32 bits of the addend.)
8122 //
8123 // This should be kept in sync with how HasRelocationAddend is initialized in
8124 // the constructor of ELFAMDGPUAsmBackend.
8125 if (!Subtarget->isAmdHsaOS())
8126 return false;
8127
8128 // We can fold offsets for anything that doesn't require a GOT relocation.
8129 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8130 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8131 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8132 !shouldEmitGOTReloc(GV: GA->getGlobal());
8133}
8134
8135static SDValue
8136buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8137 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8138 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8139 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8140 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8141 // lowered to the following code sequence:
8142 //
8143 // For constant address space:
8144 // s_getpc_b64 s[0:1]
8145 // s_add_u32 s0, s0, $symbol
8146 // s_addc_u32 s1, s1, 0
8147 //
8148 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8149 // a fixup or relocation is emitted to replace $symbol with a literal
8150 // constant, which is a pc-relative offset from the encoding of the $symbol
8151 // operand to the global variable.
8152 //
8153 // For global address space:
8154 // s_getpc_b64 s[0:1]
8155 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8156 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8157 //
8158 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8159 // fixups or relocations are emitted to replace $symbol@*@lo and
8160 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8161 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8162 // operand to the global variable.
8163 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: Offset, TargetFlags: GAFlags);
8164 SDValue PtrHi;
8165 if (GAFlags == SIInstrInfo::MO_NONE)
8166 PtrHi = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32);
8167 else
8168 PtrHi = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: Offset, TargetFlags: GAFlags + 1);
8169 return DAG.getNode(Opcode: AMDGPUISD::PC_ADD_REL_OFFSET, DL, VT: PtrVT, N1: PtrLo, N2: PtrHi);
8170}
8171
8172SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8173 SDValue Op,
8174 SelectionDAG &DAG) const {
8175 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Val&: Op);
8176 SDLoc DL(GSD);
8177 EVT PtrVT = Op.getValueType();
8178
8179 const GlobalValue *GV = GSD->getGlobal();
8180 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8181 shouldUseLDSConstAddress(GV)) ||
8182 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
8183 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
8184 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8185 GV->hasExternalLinkage()) {
8186 Type *Ty = GV->getValueType();
8187       // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
8188       // zero-sized type in other languages) to declare dynamic shared
8189       // memory whose size is not known at compile time. Such arrays are
8190       // allocated by the runtime and placed directly after the statically
8191       // allocated ones; they all share the same offset.
8192 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8193 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8194 // Adjust alignment for that dynamic shared memory array.
8195 Function &F = DAG.getMachineFunction().getFunction();
8196 MFI->setDynLDSAlign(F, GV: *cast<GlobalVariable>(Val: GV));
8197 MFI->setUsesDynamicLDS(true);
8198 return SDValue(
8199 DAG.getMachineNode(Opcode: AMDGPU::GET_GROUPSTATICSIZE, dl: DL, VT: PtrVT), 0);
8200 }
8201 }
8202 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
8203 }
8204
8205 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8206 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: GSD->getOffset(),
8207 TargetFlags: SIInstrInfo::MO_ABS32_LO);
8208 return DAG.getNode(Opcode: AMDGPUISD::LDS, DL, VT: MVT::i32, Operand: GA);
8209 }
8210
8211 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
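// On PAL and Mesa, globals are reached through absolute 32-bit lo/hi
// relocations, materialized below into a 64-bit pair of s_mov_b32 results.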
8212 SDValue AddrLo = DAG.getTargetGlobalAddress(
8213 GV, DL, VT: MVT::i32, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS32_LO);
8214 AddrLo = {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: AddrLo), 0};
8215
8216 SDValue AddrHi = DAG.getTargetGlobalAddress(
8217 GV, DL, VT: MVT::i32, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS32_HI);
8218 AddrHi = {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: AddrHi), 0};
8219
8220 return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL, VT: MVT::i64, N1: AddrLo, N2: AddrHi);
8221 }
8222
8223 if (shouldEmitFixup(GV))
8224 return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT);
8225
8226 if (shouldEmitPCReloc(GV))
8227 return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT,
8228 GAFlags: SIInstrInfo::MO_REL32);
8229
8230 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, Offset: 0, PtrVT,
8231 GAFlags: SIInstrInfo::MO_GOTPCREL32);
8232 PointerType *PtrTy =
8233 PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::CONSTANT_ADDRESS);
8234 const DataLayout &DataLayout = DAG.getDataLayout();
8235 Align Alignment = DataLayout.getABITypeAlign(Ty: PtrTy);
8236 MachinePointerInfo PtrInfo =
8237 MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction());
8238
8239 return DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: GOTAddr, PtrInfo, Alignment,
8240 MMOFlags: MachineMemOperand::MODereferenceable |
8241 MachineMemOperand::MOInvariant);
8242}
8243
8244SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
8245 const SDLoc &DL, SDValue V) const {
8246 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
8247 // the destination register.
8248 //
8249 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
8250 // so we will end up with redundant moves to m0.
8251 //
8252 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
8253
8254 // A Null SDValue creates a glue result.
8255 SDNode *M0 = DAG.getMachineNode(Opcode: AMDGPU::SI_INIT_M0, dl: DL, VT1: MVT::Other, VT2: MVT::Glue,
8256 Op1: V, Op2: Chain);
8257 return SDValue(M0, 0);
8258}
8259
8260SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
8261 MVT VT,
8262 unsigned Offset) const {
8263 SDLoc SL(Op);
8264 SDValue Param = lowerKernargMemParameter(
8265 DAG, VT: MVT::i32, MemVT: MVT::i32, SL, Chain: DAG.getEntryNode(), Offset, Alignment: Align(4), Signed: false);
8266 // The local size values will have the hi 16-bits as zero.
8267 return DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: MVT::i32, N1: Param,
8268 N2: DAG.getValueType(VT));
8269}
8270
8271static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8272 EVT VT) {
8273 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
8274 DAG.getMachineFunction().getFunction(),
8275 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
8276 return DAG.getPOISON(VT);
8277}
8278
8279static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8280 EVT VT) {
8281 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
8282 DAG.getMachineFunction().getFunction(),
8283 "intrinsic not supported on subtarget", DL.getDebugLoc()));
8284 return DAG.getPOISON(VT);
8285}
8286
8287static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
8288 ArrayRef<SDValue> Elts) {
8289 assert(!Elts.empty());
8290 MVT Type;
8291 unsigned NumElts = Elts.size();
8292
8293 if (NumElts <= 12) {
8294 Type = MVT::getVectorVT(VT: MVT::f32, NumElements: NumElts);
8295 } else {
8296 assert(Elts.size() <= 16);
8297 Type = MVT::v16f32;
8298 NumElts = 16;
8299 }
8300
8301 SmallVector<SDValue, 16> VecElts(NumElts);
8302 for (unsigned i = 0; i < Elts.size(); ++i) {
8303 SDValue Elt = Elts[i];
8304 if (Elt.getValueType() != MVT::f32)
8305 Elt = DAG.getBitcast(VT: MVT::f32, V: Elt);
8306 VecElts[i] = Elt;
8307 }
8308 for (unsigned i = Elts.size(); i < NumElts; ++i)
8309 VecElts[i] = DAG.getPOISON(VT: MVT::f32);
8310
8311 if (NumElts == 1)
8312 return VecElts[0];
8313 return DAG.getBuildVector(VT: Type, DL, Ops: VecElts);
8314}
8315
8316static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
8317 SDValue Src, int ExtraElts) {
8318 EVT SrcVT = Src.getValueType();
8319
8320 SmallVector<SDValue, 8> Elts;
8321
8322 if (SrcVT.isVector())
8323 DAG.ExtractVectorElements(Op: Src, Args&: Elts);
8324 else
8325 Elts.push_back(Elt: Src);
8326
8327 SDValue Undef = DAG.getPOISON(VT: SrcVT.getScalarType());
8328 while (ExtraElts--)
8329 Elts.push_back(Elt: Undef);
8330
8331 return DAG.getBuildVector(VT: CastVT, DL, Ops: Elts);
8332}
8333
8334 // Re-construct the required return value for an image load intrinsic.
8335 // This is more complicated due to the optional use of TexFailCtrl, which means
8336 // the required return type is an aggregate.
8337static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
8338 ArrayRef<EVT> ResultTypes, bool IsTexFail,
8339 bool Unpacked, bool IsD16, int DMaskPop,
8340 int NumVDataDwords, bool IsAtomicPacked16Bit,
8341 const SDLoc &DL) {
8342 // Determine the required return type. This is the same regardless of the
8343 // IsTexFail flag.
8344 EVT ReqRetVT = ResultTypes[0];
8345 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
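// With packed D16 (or packed 16-bit atomics), two 16-bit elements share a
// dword, so round the element count up to a dword count.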
8346 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
8347 ? (ReqRetNumElts + 1) / 2
8348 : ReqRetNumElts;
8349
8350 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
8351
8352 MVT DataDwordVT =
8353 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(VT: MVT::i32, NumElements: NumDataDwords);
8354
8355 MVT MaskPopVT =
8356 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(VT: MVT::i32, NumElements: MaskPopDwords);
8357
8358 SDValue Data(Result, 0);
8359 SDValue TexFail;
8360
8361 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
8362 SDValue ZeroIdx = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
8363 if (MaskPopVT.isVector()) {
8364 Data = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MaskPopVT,
8365 N1: SDValue(Result, 0), N2: ZeroIdx);
8366 } else {
8367 Data = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MaskPopVT,
8368 N1: SDValue(Result, 0), N2: ZeroIdx);
8369 }
8370 }
8371
8372 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
8373 Data = padEltsToUndef(DAG, DL, CastVT: DataDwordVT, Src: Data,
8374 ExtraElts: NumDataDwords - MaskPopDwords);
8375
8376 if (IsD16)
8377 Data = adjustLoadValueTypeImpl(Result: Data, LoadVT: ReqRetVT, DL, DAG, Unpacked);
8378
8379 EVT LegalReqRetVT = ReqRetVT;
8380 if (!ReqRetVT.isVector()) {
8381 if (!Data.getValueType().isInteger())
8382 Data = DAG.getNode(Opcode: ISD::BITCAST, DL,
8383 VT: Data.getValueType().changeTypeToInteger(), Operand: Data);
8384 Data = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ReqRetVT.changeTypeToInteger(), Operand: Data);
8385 } else {
8386 // We need to widen the return vector to a legal type
8387 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
8388 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
8389 LegalReqRetVT =
8390 EVT::getVectorVT(Context&: *DAG.getContext(), VT: ReqRetVT.getVectorElementType(),
8391 NumElements: ReqRetVT.getVectorNumElements() + 1);
8392 }
8393 }
8394 Data = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LegalReqRetVT, Operand: Data);
8395
8396 if (IsTexFail) {
8397 TexFail =
8398 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: SDValue(Result, 0),
8399 N2: DAG.getConstant(Val: MaskPopDwords, DL, VT: MVT::i32));
8400
8401 return DAG.getMergeValues(Ops: {Data, TexFail, SDValue(Result, 1)}, dl: DL);
8402 }
8403
8404 if (Result->getNumValues() == 1)
8405 return Data;
8406
8407 return DAG.getMergeValues(Ops: {Data, SDValue(Result, 1)}, dl: DL);
8408}
8409
8410static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
8411 SDValue *LWE, bool &IsTexFail) {
8412 auto *TexFailCtrlConst = cast<ConstantSDNode>(Val: TexFailCtrl.getNode());
8413
8414 uint64_t Value = TexFailCtrlConst->getZExtValue();
8415 if (Value) {
8416 IsTexFail = true;
8417 }
8418
8419 SDLoc DL(TexFailCtrlConst);
8420 *TFE = DAG.getTargetConstant(Val: (Value & 0x1) ? 1 : 0, DL, VT: MVT::i32);
8421 Value &= ~(uint64_t)0x1;
8422 *LWE = DAG.getTargetConstant(Val: (Value & 0x2) ? 1 : 0, DL, VT: MVT::i32);
8423 Value &= ~(uint64_t)0x2;
8424
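// Any bits left after stripping TFE and LWE are invalid; the caller gives up
// on custom lowering in that case.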
8425 return Value == 0;
8426}
8427
8428static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
8429 MVT PackVectorVT,
8430 SmallVectorImpl<SDValue> &PackedAddrs,
8431 unsigned DimIdx, unsigned EndIdx,
8432 unsigned NumGradients) {
8433 SDLoc DL(Op);
8434 for (unsigned I = DimIdx; I < EndIdx; I++) {
8435 SDValue Addr = Op.getOperand(i: I);
8436
8437 // Gradients are packed with undef for each coordinate.
8438 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
8439 // 1D: undef,dx/dh; undef,dx/dv
8440 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
8441 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
8442 if (((I + 1) >= EndIdx) ||
8443 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
8444 I == DimIdx + NumGradients - 1))) {
8445 if (Addr.getValueType() != MVT::i16)
8446 Addr = DAG.getBitcast(VT: MVT::i16, V: Addr);
8447 Addr = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Addr);
8448 } else {
8449 Addr = DAG.getBuildVector(VT: PackVectorVT, DL, Ops: {Addr, Op.getOperand(i: I + 1)});
8450 I++;
8451 }
8452 Addr = DAG.getBitcast(VT: MVT::f32, V: Addr);
8453 PackedAddrs.push_back(Elt: Addr);
8454 }
8455}
8456
8457SDValue SITargetLowering::lowerImage(SDValue Op,
8458 const AMDGPU::ImageDimIntrinsicInfo *Intr,
8459 SelectionDAG &DAG, bool WithChain) const {
8460 SDLoc DL(Op);
8461 MachineFunction &MF = DAG.getMachineFunction();
8462 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
8463 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
8464 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
8465 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(DimEnum: Intr->Dim);
8466 unsigned IntrOpcode = Intr->BaseOpcode;
8467 bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI: *Subtarget);
8468 bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: *Subtarget);
8469 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
8470
8471 SmallVector<EVT, 3> ResultTypes(Op->values());
8472 SmallVector<EVT, 3> OrigResultTypes(Op->values());
8473 bool IsD16 = false;
8474 bool IsG16 = false;
8475 bool IsA16 = false;
8476 SDValue VData;
8477 int NumVDataDwords = 0;
8478 bool AdjustRetType = false;
8479 bool IsAtomicPacked16Bit = false;
8480
8481 // Offset of intrinsic arguments
8482 const unsigned ArgOffset = WithChain ? 2 : 1;
8483
8484 unsigned DMask;
8485 unsigned DMaskLanes = 0;
8486
8487 if (BaseOpcode->Atomic) {
8488 VData = Op.getOperand(i: 2);
8489
8490 IsAtomicPacked16Bit =
8491 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8492 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8493
8494 bool Is64Bit = VData.getValueSizeInBits() == 64;
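// AtomicX2 operations (e.g. cmpswap) carry two data values packed into
// adjacent dwords, so the dmask covers two 32-bit lanes (four for 64-bit).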
8495 if (BaseOpcode->AtomicX2) {
8496 SDValue VData2 = Op.getOperand(i: 3);
8497 VData = DAG.getBuildVector(VT: Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
8498 Ops: {VData, VData2});
8499 if (Is64Bit)
8500 VData = DAG.getBitcast(VT: MVT::v4i32, V: VData);
8501
8502 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8503 DMask = Is64Bit ? 0xf : 0x3;
8504 NumVDataDwords = Is64Bit ? 4 : 2;
8505 } else {
8506 DMask = Is64Bit ? 0x3 : 0x1;
8507 NumVDataDwords = Is64Bit ? 2 : 1;
8508 }
8509 } else {
8510 DMask = Op->getConstantOperandVal(Num: ArgOffset + Intr->DMaskIndex);
8511 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(Value: DMask);
8512
8513 if (BaseOpcode->Store) {
8514 VData = Op.getOperand(i: 2);
8515
8516 MVT StoreVT = VData.getSimpleValueType();
8517 if (StoreVT.getScalarType() == MVT::f16) {
8518 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8519 return Op; // D16 is unsupported for this instruction
8520
8521 IsD16 = true;
8522 VData = handleD16VData(VData, DAG, ImageStore: true);
8523 }
8524
8525 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
8526 } else if (!BaseOpcode->NoReturn) {
8527 // Work out the number of dwords based on the dmask popcount, the underlying
8528 // type, and whether packing is supported.
8529 MVT LoadVT = ResultTypes[0].getSimpleVT();
8530 if (LoadVT.getScalarType() == MVT::f16) {
8531 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8532 return Op; // D16 is unsupported for this instruction
8533
8534 IsD16 = true;
8535 }
8536
8537 // Confirm that the return type is large enough for the dmask specified
8538 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
8539 (!LoadVT.isVector() && DMaskLanes > 1))
8540 return Op;
8541
8542 // The SQ block of gfx8 and gfx9 does not estimate register use correctly
8543 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
8544 // instructions.
8545 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
8546 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
8547 NumVDataDwords = (DMaskLanes + 1) / 2;
8548 else
8549 NumVDataDwords = DMaskLanes;
8550
8551 AdjustRetType = true;
8552 }
8553 }
8554
8555 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
8556 SmallVector<SDValue, 4> VAddrs;
8557
8558 // Check for 16-bit addresses or derivatives and pack them if so.
8559 MVT VAddrVT =
8560 Op.getOperand(i: ArgOffset + Intr->GradientStart).getSimpleValueType();
8561 MVT VAddrScalarVT = VAddrVT.getScalarType();
8562 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8563 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8564
8565 VAddrVT = Op.getOperand(i: ArgOffset + Intr->CoordStart).getSimpleValueType();
8566 VAddrScalarVT = VAddrVT.getScalarType();
8567 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8568 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8569
8570 // Push back extra arguments.
8571 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8572 if (IsA16 && (Op.getOperand(i: ArgOffset + I).getValueType() == MVT::f16)) {
8573 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8574 // Special handling of the bias when A16 is on. The bias is of type half
8575 // but occupies a full 32-bit dword.
8576 SDValue Bias = DAG.getBuildVector(
8577 VT: MVT::v2f16, DL,
8578 Ops: {Op.getOperand(i: ArgOffset + I), DAG.getPOISON(VT: MVT::f16)});
8579 VAddrs.push_back(Elt: Bias);
8580 } else {
8581 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8582 "Bias needs to be converted to 16 bit in A16 mode");
8583 VAddrs.push_back(Elt: Op.getOperand(i: ArgOffset + I));
8584 }
8585 }
8586
8587 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8588 // 16-bit gradients are supported, but they are tied to the A16 control,
8589 // so both gradients and addresses must be 16 bit.
8590 LLVM_DEBUG(
8591 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8592 "require 16 bit args for both gradients and addresses");
8593 return Op;
8594 }
8595
8596 if (IsA16) {
8597 if (!ST->hasA16()) {
8598 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8599 "support 16 bit addresses\n");
8600 return Op;
8601 }
8602 }
8603
8604 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
8605 // is set then we have to compress/pack the operands (either addresses,
8606 // gradients, or both).
8607 // In the case where A16 and gradients are tied (no G16 support), we have
8608 // already verified that both IsA16 and IsG16 are true.
8609 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8610 // Activate g16
8611 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8612 AMDGPU::getMIMGG16MappingInfo(G: Intr->BaseOpcode);
8613 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8614 }
8615
8616 // Add gradients (packed or unpacked)
8617 if (IsG16) {
8618 // Pack the gradients
8619 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8620 packImage16bitOpsToDwords(DAG, Op, PackVectorVT: GradPackVectorVT, PackedAddrs&: VAddrs,
8621 DimIdx: ArgOffset + Intr->GradientStart,
8622 EndIdx: ArgOffset + Intr->CoordStart, NumGradients: Intr->NumGradients);
8623 } else {
8624 for (unsigned I = ArgOffset + Intr->GradientStart;
8625 I < ArgOffset + Intr->CoordStart; I++)
8626 VAddrs.push_back(Elt: Op.getOperand(i: I));
8627 }
8628
8629 // Add addresses (packed or unpacked)
8630 if (IsA16) {
8631 packImage16bitOpsToDwords(DAG, Op, PackVectorVT: AddrPackVectorVT, PackedAddrs&: VAddrs,
8632 DimIdx: ArgOffset + Intr->CoordStart, EndIdx: VAddrEnd,
8633 NumGradients: 0 /* No gradients */);
8634 } else {
8635 // Add uncompressed address
8636 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8637 VAddrs.push_back(Elt: Op.getOperand(i: I));
8638 }
8639
8640 // If the register allocator cannot place the address registers contiguously
8641 // without introducing moves, then using the non-sequential address encoding
8642 // is always preferable, since it saves VALU instructions and is usually a
8643 // wash in terms of code size or even better.
8644 //
8645 // However, we currently have no way of hinting to the register allocator that
8646 // MIMG addresses should be placed contiguously when it is possible to do so,
8647 // so force non-NSA for the common 2-address case as a heuristic.
8648 //
8649 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8650 // allocation when possible.
8651 //
8652 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8653 // set of the remaining addresses.
8654 const unsigned NSAMaxSize = ST->getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
8655 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8656 const bool UseNSA = ST->hasNSAEncoding() &&
8657 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8658 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8659 const bool UsePartialNSA =
8660 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8661
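// With partial NSA, the first NSAMaxSize - 1 addresses are passed
// individually and the remaining ones are packed into a single contiguous
// register tuple built with getBuildDwordsVector.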
8662 SDValue VAddr;
8663 if (UsePartialNSA) {
8664 VAddr = getBuildDwordsVector(DAG, DL,
8665 Elts: ArrayRef(VAddrs).drop_front(N: NSAMaxSize - 1));
8666 } else if (!UseNSA) {
8667 VAddr = getBuildDwordsVector(DAG, DL, Elts: VAddrs);
8668 }
8669
8670 SDValue True = DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1);
8671 SDValue False = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1);
8672 SDValue Unorm;
8673 if (!BaseOpcode->Sampler) {
8674 Unorm = True;
8675 } else {
8676 uint64_t UnormConst =
8677 Op.getConstantOperandVal(i: ArgOffset + Intr->UnormIndex);
8678
8679 Unorm = UnormConst ? True : False;
8680 }
8681
8682 SDValue TFE;
8683 SDValue LWE;
8684 SDValue TexFail = Op.getOperand(i: ArgOffset + Intr->TexFailCtrlIndex);
8685 bool IsTexFail = false;
8686 if (!parseTexFail(TexFailCtrl: TexFail, DAG, TFE: &TFE, LWE: &LWE, IsTexFail))
8687 return Op;
8688
8689 if (IsTexFail) {
8690 if (!DMaskLanes) {
8691 // Expecting to get an error flag since TFC is on and dmask is 0.
8692 // Force dmask to be at least 1, otherwise the instruction will fail.
8693 DMask = 0x1;
8694 DMaskLanes = 1;
8695 NumVDataDwords = 1;
8696 }
8697 NumVDataDwords += 1;
8698 AdjustRetType = true;
8699 }
8700
8701 // Check whether something earlier tagged the return type as needing
8702 // adjustment; this happens if the instruction is a load or sets TexFailCtrl flags.
8703 if (AdjustRetType) {
8704 // NumVDataDwords reflects the true number of dwords required in the return
8705 // type
8706 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8707 // This is a no-op load, so it can be eliminated.
8708 SDValue Undef = DAG.getPOISON(VT: Op.getValueType());
8709 if (isa<MemSDNode>(Val: Op))
8710 return DAG.getMergeValues(Ops: {Undef, Op.getOperand(i: 0)}, dl: DL);
8711 return Undef;
8712 }
8713
8714 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(Context&: *DAG.getContext(),
8715 VT: MVT::i32, NumElements: NumVDataDwords)
8716 : MVT::i32;
8717
8718 ResultTypes[0] = NewVT;
8719 if (ResultTypes.size() == 3) {
8720 // The original result was an aggregate type used for the TexFailCtrl
8721 // result. The actual instruction returns a vector type, which has now
8722 // been created. Remove the aggregate result.
8723 ResultTypes.erase(CI: &ResultTypes[1]);
8724 }
8725 }
8726
8727 unsigned CPol = Op.getConstantOperandVal(i: ArgOffset + Intr->CachePolicyIndex);
8728 if (BaseOpcode->Atomic)
8729 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8730 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8731 AMDGPU::CPol::VOLATILE))
8732 return Op;
8733
8734 SmallVector<SDValue, 26> Ops;
8735 if (BaseOpcode->Store || BaseOpcode->Atomic)
8736 Ops.push_back(Elt: VData); // vdata
8737 if (UsePartialNSA) {
8738 append_range(C&: Ops, R: ArrayRef(VAddrs).take_front(N: NSAMaxSize - 1));
8739 Ops.push_back(Elt: VAddr);
8740 } else if (UseNSA)
8741 append_range(C&: Ops, R&: VAddrs);
8742 else
8743 Ops.push_back(Elt: VAddr);
8744 SDValue Rsrc = Op.getOperand(i: ArgOffset + Intr->RsrcIndex);
8745 EVT RsrcVT = Rsrc.getValueType();
8746 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8747 return Op;
8748 Ops.push_back(Elt: Rsrc);
8749 if (BaseOpcode->Sampler) {
8750 SDValue Samp = Op.getOperand(i: ArgOffset + Intr->SampIndex);
8751 if (Samp.getValueType() != MVT::v4i32)
8752 return Op;
8753 Ops.push_back(Elt: Samp);
8754 }
8755 Ops.push_back(Elt: DAG.getTargetConstant(Val: DMask, DL, VT: MVT::i32));
8756 if (IsGFX10Plus)
8757 Ops.push_back(Elt: DAG.getTargetConstant(Val: DimInfo->Encoding, DL, VT: MVT::i32));
8758 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8759 Ops.push_back(Elt: Unorm);
8760 Ops.push_back(Elt: DAG.getTargetConstant(Val: CPol, DL, VT: MVT::i32));
8761 Ops.push_back(Elt: IsA16 && // r128, a16 for gfx9
8762 ST->hasFeature(Feature: AMDGPU::FeatureR128A16)
8763 ? True
8764 : False);
8765 if (IsGFX10Plus)
8766 Ops.push_back(Elt: IsA16 ? True : False);
8767
8768 if (!Subtarget->hasGFX90AInsts())
8769 Ops.push_back(Elt: TFE); // tfe
8770 else if (TFE->getAsZExtVal()) {
8771 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
8772 DAG.getMachineFunction().getFunction(),
8773 "TFE is not supported on this GPU", DL.getDebugLoc()));
8774 }
8775
8776 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8777 Ops.push_back(Elt: LWE); // lwe
8778 if (!IsGFX10Plus)
8779 Ops.push_back(Elt: DimInfo->DA ? True : False);
8780 if (BaseOpcode->HasD16)
8781 Ops.push_back(Elt: IsD16 ? True : False);
8782 if (isa<MemSDNode>(Val: Op))
8783 Ops.push_back(Elt: Op.getOperand(i: 0)); // chain
8784
8785 int NumVAddrDwords =
8786 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8787 int Opcode = -1;
8788
8789 if (IsGFX12Plus) {
8790 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx12,
8791 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
8792 } else if (IsGFX11Plus) {
8793 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
8794 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx11NSA
8795 : AMDGPU::MIMGEncGfx11Default,
8796 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
8797 } else if (IsGFX10Plus) {
8798 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
8799 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx10NSA
8800 : AMDGPU::MIMGEncGfx10Default,
8801 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
8802 } else {
8803 if (Subtarget->hasGFX90AInsts()) {
8804 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx90a,
8805 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
8806 if (Opcode == -1) {
8807 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
8808 DAG.getMachineFunction().getFunction(),
8809 "requested image instruction is not supported on this GPU",
8810 DL.getDebugLoc()));
8811
8812 unsigned Idx = 0;
8813 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
8814 for (EVT VT : OrigResultTypes) {
8815 if (VT == MVT::Other)
8816 RetValues[Idx++] = Op.getOperand(i: 0); // Chain
8817 else
8818 RetValues[Idx++] = DAG.getPOISON(VT);
8819 }
8820
8821 return DAG.getMergeValues(Ops: RetValues, dl: DL);
8822 }
8823 }
8824 if (Opcode == -1 &&
8825 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8826 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx8,
8827 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
8828 if (Opcode == -1)
8829 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx6,
8830 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
8831 }
8832 if (Opcode == -1)
8833 return Op;
8834
8835 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, dl: DL, ResultTys: ResultTypes, Ops);
8836 if (auto *MemOp = dyn_cast<MemSDNode>(Val&: Op)) {
8837 MachineMemOperand *MemRef = MemOp->getMemOperand();
8838 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
8839 }
8840
8841 if (BaseOpcode->AtomicX2) {
8842 SmallVector<SDValue, 1> Elt;
8843 DAG.ExtractVectorElements(Op: SDValue(NewNode, 0), Args&: Elt, Start: 0, Count: 1);
8844 return DAG.getMergeValues(Ops: {Elt[0], SDValue(NewNode, 1)}, dl: DL);
8845 }
8846 if (BaseOpcode->NoReturn)
8847 return SDValue(NewNode, 0);
8848 return constructRetValue(DAG, Result: NewNode, ResultTypes: OrigResultTypes, IsTexFail,
8849 Unpacked: Subtarget->hasUnpackedD16VMem(), IsD16, DMaskPop: DMaskLanes,
8850 NumVDataDwords, IsAtomicPacked16Bit, DL);
8851}
8852
8853SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8854 SDValue Offset, SDValue CachePolicy,
8855 SelectionDAG &DAG) const {
8856 MachineFunction &MF = DAG.getMachineFunction();
8857
8858 const DataLayout &DataLayout = DAG.getDataLayout();
8859 Align Alignment =
8860 DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext()));
8861
8862 MachineMemOperand *MMO = MF.getMachineMemOperand(
8863 PtrInfo: MachinePointerInfo(),
8864 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
8865 MachineMemOperand::MOInvariant,
8866 Size: VT.getStoreSize(), BaseAlignment: Alignment);
8867
8868 if (!Offset->isDivergent()) {
8869 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8870
8871 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8872 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8873 // loads. Later, the DAG combiner tries to combine s_buffer_load_u16 with
8874 // sext and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8875 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8876 SDValue BufferLoad =
8877 DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD_USHORT, dl: DL,
8878 VTList: DAG.getVTList(VT: MVT::i32), Ops, MemVT: VT, MMO);
8879 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
8880 }
8881
8882 // Widen vec3 load to vec4.
8883 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8884 !Subtarget->hasScalarDwordx3Loads()) {
8885 EVT WidenedVT =
8886 EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: 4);
8887 auto WidenedOp = DAG.getMemIntrinsicNode(
8888 Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL, VTList: DAG.getVTList(VT: WidenedVT), Ops, MemVT: WidenedVT,
8889 MMO: MF.getMachineMemOperand(MMO, Offset: 0, Size: WidenedVT.getStoreSize()));
8890 auto Subvector = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: WidenedOp,
8891 N2: DAG.getVectorIdxConstant(Val: 0, DL));
8892 return Subvector;
8893 }
8894
8895 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL,
8896 VTList: DAG.getVTList(VT), Ops, MemVT: VT, MMO);
8897 }
8898
8899 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8900 // assume that the buffer is unswizzled.
8901 SDValue Ops[] = {
8902 DAG.getEntryNode(), // Chain
8903 Rsrc, // rsrc
8904 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
8905 {}, // voffset
8906 {}, // soffset
8907 {}, // offset
8908 CachePolicy, // cachepolicy
8909 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
8910 };
8911 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8912 setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], Alignment: Align(4));
8913 return handleByteShortBufferLoads(DAG, LoadVT: VT, DL, Ops, MMO);
8914 }
8915
8916 SmallVector<SDValue, 4> Loads;
8917 unsigned NumLoads = 1;
8918 MVT LoadVT = VT.getSimpleVT();
8919 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8920 assert((LoadVT.getScalarType() == MVT::i32 ||
8921 LoadVT.getScalarType() == MVT::f32));
8922
8923 if (NumElts == 8 || NumElts == 16) {
8924 NumLoads = NumElts / 4;
8925 LoadVT = MVT::getVectorVT(VT: LoadVT.getScalarType(), NumElements: 4);
8926 }
8927
8928 SDVTList VTList = DAG.getVTList(VTs: {LoadVT, MVT::Other});
8929
8930 // Use the alignment to ensure that the required offsets will fit into the
8931 // immediate offsets.
8932 setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3],
8933 Alignment: NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8934
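// Wide results are split into 4-dword loads issued 16 bytes apart and
// concatenated back together below.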
8935 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8936 for (unsigned i = 0; i < NumLoads; ++i) {
8937 Ops[5] = DAG.getTargetConstant(Val: InstOffset + 16 * i, DL, VT: MVT::i32);
8938 Loads.push_back(Elt: getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8939 MemVT: LoadVT, MMO, DAG));
8940 }
8941
8942 if (NumElts == 8 || NumElts == 16)
8943 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, Ops: Loads);
8944
8945 return Loads[0];
8946}
8947
8948SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8949 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8950 if (!Subtarget->hasArchitectedSGPRs())
8951 return {};
8952 SDLoc SL(Op);
8953 MVT VT = MVT::i32;
8954 SDValue TTMP8 = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: AMDGPU::TTMP8, VT);
8955 return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT, N1: TTMP8,
8956 N2: DAG.getConstant(Val: 25, DL: SL, VT), N3: DAG.getConstant(Val: 5, DL: SL, VT));
8957}
8958
8959SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8960 unsigned Dim,
8961 const ArgDescriptor &Arg) const {
8962 SDLoc SL(Op);
8963 MachineFunction &MF = DAG.getMachineFunction();
8964 unsigned MaxID = Subtarget->getMaxWorkitemID(Kernel: MF.getFunction(), Dimension: Dim);
8965 if (MaxID == 0)
8966 return DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
8967
8968 // It's undefined behavior if a function marked with the amdgpu-no-*
8969 // attributes uses the corresponding intrinsic.
8970 if (!Arg)
8971 return DAG.getPOISON(VT: Op->getValueType(ResNo: 0));
8972
8973 SDValue Val = loadInputValue(DAG, RC: &AMDGPU::VGPR_32RegClass, VT: MVT::i32,
8974 SL: SDLoc(DAG.getEntryNode()), Arg);
8975
8976 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8977 // masking operations anyway.
8978 //
8979 // TODO: We could assert the top bit is 0 for the source copy.
8980 if (Arg.isMasked())
8981 return Val;
8982
8983 // Preserve the known bits after expansion to a copy.
8984 EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: llvm::bit_width(Value: MaxID));
8985 return DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: MVT::i32, N1: Val,
8986 N2: DAG.getValueType(SmallVT));
8987}
8988
8989SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8990 SelectionDAG &DAG) const {
8991 MachineFunction &MF = DAG.getMachineFunction();
8992 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
8993
8994 EVT VT = Op.getValueType();
8995 SDLoc DL(Op);
8996 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
8997
8998 // TODO: Should this propagate fast-math-flags?
8999
9000 switch (IntrinsicID) {
9001 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9002 if (getSubtarget()->isAmdHsaOrMesa(F: MF.getFunction()))
9003 return emitNonHSAIntrinsicError(DAG, DL, VT);
9004 return getPreloadedValue(DAG, MFI: *MFI, VT,
9005 PVID: AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
9006 }
9007 case Intrinsic::amdgcn_dispatch_ptr:
9008 case Intrinsic::amdgcn_queue_ptr: {
9009 if (!Subtarget->isAmdHsaOrMesa(F: MF.getFunction())) {
9010 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9011 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9012 DL.getDebugLoc()));
9013 return DAG.getPOISON(VT);
9014 }
9015
9016 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9017 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
9018 : AMDGPUFunctionArgInfo::QUEUE_PTR;
9019 return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: RegID);
9020 }
9021 case Intrinsic::amdgcn_implicitarg_ptr: {
9022 if (MFI->isEntryFunction())
9023 return getImplicitArgPtr(DAG, SL: DL);
9024 return getPreloadedValue(DAG, MFI: *MFI, VT,
9025 PVID: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
9026 }
9027 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9028 if (!AMDGPU::isKernel(CC: MF.getFunction().getCallingConv())) {
9029 // This only makes sense to call in a kernel, so just lower to null.
9030 return DAG.getConstant(Val: 0, DL, VT);
9031 }
9032
9033 return getPreloadedValue(DAG, MFI: *MFI, VT,
9034 PVID: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
9035 }
9036 case Intrinsic::amdgcn_dispatch_id: {
9037 return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: AMDGPUFunctionArgInfo::DISPATCH_ID);
9038 }
9039 case Intrinsic::amdgcn_rcp:
9040 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT, Operand: Op.getOperand(i: 1));
9041 case Intrinsic::amdgcn_rsq:
9042 return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: 1));
9043 case Intrinsic::amdgcn_rsq_legacy:
9044 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9045 return emitRemovedIntrinsicError(DAG, DL, VT);
9046 return SDValue();
9047 case Intrinsic::amdgcn_rcp_legacy:
9048 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9049 return emitRemovedIntrinsicError(DAG, DL, VT);
9050 return DAG.getNode(Opcode: AMDGPUISD::RCP_LEGACY, DL, VT, Operand: Op.getOperand(i: 1));
9051 case Intrinsic::amdgcn_rsq_clamp: {
9052 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9053 return DAG.getNode(Opcode: AMDGPUISD::RSQ_CLAMP, DL, VT, Operand: Op.getOperand(i: 1));
9054
9055 Type *Type = VT.getTypeForEVT(Context&: *DAG.getContext());
9056 APFloat Max = APFloat::getLargest(Sem: Type->getFltSemantics());
9057 APFloat Min = APFloat::getLargest(Sem: Type->getFltSemantics(), Negative: true);
9058
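// rsq_clamp is expanded as rsq clamped to the largest finite values of the
// type: min(rsq(x), +max) followed by max(..., -max).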
9059 SDValue Rsq = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: 1));
9060 SDValue Tmp =
9061 DAG.getNode(Opcode: ISD::FMINNUM, DL, VT, N1: Rsq, N2: DAG.getConstantFP(Val: Max, DL, VT));
9062 return DAG.getNode(Opcode: ISD::FMAXNUM, DL, VT, N1: Tmp,
9063 N2: DAG.getConstantFP(Val: Min, DL, VT));
9064 }
9065 case Intrinsic::r600_read_ngroups_x:
9066 if (Subtarget->isAmdHsaOS())
9067 return emitNonHSAIntrinsicError(DAG, DL, VT);
9068
9069 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
9070 Offset: SI::KernelInputOffsets::NGROUPS_X, Alignment: Align(4),
9071 Signed: false);
9072 case Intrinsic::r600_read_ngroups_y:
9073 if (Subtarget->isAmdHsaOS())
9074 return emitNonHSAIntrinsicError(DAG, DL, VT);
9075
9076 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
9077 Offset: SI::KernelInputOffsets::NGROUPS_Y, Alignment: Align(4),
9078 Signed: false);
9079 case Intrinsic::r600_read_ngroups_z:
9080 if (Subtarget->isAmdHsaOS())
9081 return emitNonHSAIntrinsicError(DAG, DL, VT);
9082
9083 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
9084 Offset: SI::KernelInputOffsets::NGROUPS_Z, Alignment: Align(4),
9085 Signed: false);
9086 case Intrinsic::r600_read_local_size_x:
9087 if (Subtarget->isAmdHsaOS())
9088 return emitNonHSAIntrinsicError(DAG, DL, VT);
9089
9090 return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
9091 Offset: SI::KernelInputOffsets::LOCAL_SIZE_X);
9092 case Intrinsic::r600_read_local_size_y:
9093 if (Subtarget->isAmdHsaOS())
9094 return emitNonHSAIntrinsicError(DAG, DL, VT);
9095
9096 return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
9097 Offset: SI::KernelInputOffsets::LOCAL_SIZE_Y);
9098 case Intrinsic::r600_read_local_size_z:
9099 if (Subtarget->isAmdHsaOS())
9100 return emitNonHSAIntrinsicError(DAG, DL, VT);
9101
9102 return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
9103 Offset: SI::KernelInputOffsets::LOCAL_SIZE_Z);
9104 case Intrinsic::amdgcn_workgroup_id_x:
9105 return getPreloadedValue(DAG, MFI: *MFI, VT,
9106 PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
9107 case Intrinsic::amdgcn_workgroup_id_y:
9108 return getPreloadedValue(DAG, MFI: *MFI, VT,
9109 PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
9110 case Intrinsic::amdgcn_workgroup_id_z:
9111 return getPreloadedValue(DAG, MFI: *MFI, VT,
9112 PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
9113 case Intrinsic::amdgcn_wave_id:
9114 return lowerWaveID(DAG, Op);
9115 case Intrinsic::amdgcn_lds_kernel_id: {
9116 if (MFI->isEntryFunction())
9117 return getLDSKernelId(DAG, SL: DL);
9118 return getPreloadedValue(DAG, MFI: *MFI, VT,
9119 PVID: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
9120 }
9121 case Intrinsic::amdgcn_workitem_id_x:
9122 return lowerWorkitemID(DAG, Op, Dim: 0, Arg: MFI->getArgInfo().WorkItemIDX);
9123 case Intrinsic::amdgcn_workitem_id_y:
9124 return lowerWorkitemID(DAG, Op, Dim: 1, Arg: MFI->getArgInfo().WorkItemIDY);
9125 case Intrinsic::amdgcn_workitem_id_z:
9126 return lowerWorkitemID(DAG, Op, Dim: 2, Arg: MFI->getArgInfo().WorkItemIDZ);
9127 case Intrinsic::amdgcn_wavefrontsize:
9128 return DAG.getConstant(Val: MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
9129 DL: SDLoc(Op), VT: MVT::i32);
9130 case Intrinsic::amdgcn_s_buffer_load: {
9131 unsigned CPol = Op.getConstantOperandVal(i: 3);
9132 // s_buffer_load, because of how it's optimized, can't be volatile,
9133 // so reject ones with the volatile bit set.
9134 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
9135 ? AMDGPU::CPol::ALL
9136 : AMDGPU::CPol::ALL_pregfx12))
9137 return Op;
9138 return lowerSBuffer(VT, DL, Rsrc: Op.getOperand(i: 1), Offset: Op.getOperand(i: 2),
9139 CachePolicy: Op.getOperand(i: 3), DAG);
9140 }
9141 case Intrinsic::amdgcn_fdiv_fast:
9142 return lowerFDIV_FAST(Op, DAG);
9143 case Intrinsic::amdgcn_sin:
9144 return DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL, VT, Operand: Op.getOperand(i: 1));
9145
9146 case Intrinsic::amdgcn_cos:
9147 return DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL, VT, Operand: Op.getOperand(i: 1));
9148
9149 case Intrinsic::amdgcn_mul_u24:
9150 return DAG.getNode(Opcode: AMDGPUISD::MUL_U24, DL, VT, N1: Op.getOperand(i: 1),
9151 N2: Op.getOperand(i: 2));
9152 case Intrinsic::amdgcn_mul_i24:
9153 return DAG.getNode(Opcode: AMDGPUISD::MUL_I24, DL, VT, N1: Op.getOperand(i: 1),
9154 N2: Op.getOperand(i: 2));
9155
9156 case Intrinsic::amdgcn_log_clamp: {
9157 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9158 return SDValue();
9159
9160 return emitRemovedIntrinsicError(DAG, DL, VT);
9161 }
9162 case Intrinsic::amdgcn_fract:
9163 return DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: Op.getOperand(i: 1));
9164
9165 case Intrinsic::amdgcn_class:
9166 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT, N1: Op.getOperand(i: 1),
9167 N2: Op.getOperand(i: 2));
9168 case Intrinsic::amdgcn_div_fmas:
9169 return DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL, VT, N1: Op.getOperand(i: 1),
9170 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3), N4: Op.getOperand(i: 4));
9171
9172 case Intrinsic::amdgcn_div_fixup:
9173 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL, VT, N1: Op.getOperand(i: 1),
9174 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
9175
9176 case Intrinsic::amdgcn_div_scale: {
9177 const ConstantSDNode *Param = cast<ConstantSDNode>(Val: Op.getOperand(i: 3));
9178
9179 // Translate to the operands expected by the machine instruction. The
9180 // first parameter must be the same as the first instruction.
9181 SDValue Numerator = Op.getOperand(i: 1);
9182 SDValue Denominator = Op.getOperand(i: 2);
9183
9184 // Note this order is the opposite of the machine instruction's operands,
9185 // which are s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
9186 // intrinsic has the numerator as the first operand to match a normal
9187 // division operation.
9188
9189 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9190
9191 return DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL, VTList: Op->getVTList(), N1: Src0,
9192 N2: Denominator, N3: Numerator);
9193 }
9194 case Intrinsic::amdgcn_icmp: {
9195 // There is a Pat that handles this variant, so return it as-is.
9196 if (Op.getOperand(i: 1).getValueType() == MVT::i1 &&
9197 Op.getConstantOperandVal(i: 2) == 0 &&
9198 Op.getConstantOperandVal(i: 3) == ICmpInst::Predicate::ICMP_NE)
9199 return Op;
9200 return lowerICMPIntrinsic(TLI: *this, N: Op.getNode(), DAG);
9201 }
9202 case Intrinsic::amdgcn_fcmp: {
9203 return lowerFCMPIntrinsic(TLI: *this, N: Op.getNode(), DAG);
9204 }
9205 case Intrinsic::amdgcn_ballot:
9206 return lowerBALLOTIntrinsic(TLI: *this, N: Op.getNode(), DAG);
9207 case Intrinsic::amdgcn_fmed3:
9208 return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL, VT, N1: Op.getOperand(i: 1),
9209 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
9210 case Intrinsic::amdgcn_fdot2:
9211 return DAG.getNode(Opcode: AMDGPUISD::FDOT2, DL, VT, N1: Op.getOperand(i: 1),
9212 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3), N4: Op.getOperand(i: 4));
9213 case Intrinsic::amdgcn_fmul_legacy:
9214 return DAG.getNode(Opcode: AMDGPUISD::FMUL_LEGACY, DL, VT, N1: Op.getOperand(i: 1),
9215 N2: Op.getOperand(i: 2));
9216 case Intrinsic::amdgcn_sffbh:
9217 return DAG.getNode(Opcode: AMDGPUISD::FFBH_I32, DL, VT, Operand: Op.getOperand(i: 1));
9218 case Intrinsic::amdgcn_sbfe:
9219 return DAG.getNode(Opcode: AMDGPUISD::BFE_I32, DL, VT, N1: Op.getOperand(i: 1),
9220 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
9221 case Intrinsic::amdgcn_ubfe:
9222 return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL, VT, N1: Op.getOperand(i: 1),
9223 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
9224 case Intrinsic::amdgcn_cvt_pkrtz:
9225 case Intrinsic::amdgcn_cvt_pknorm_i16:
9226 case Intrinsic::amdgcn_cvt_pknorm_u16:
9227 case Intrinsic::amdgcn_cvt_pk_i16:
9228 case Intrinsic::amdgcn_cvt_pk_u16: {
9229 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
9230 EVT VT = Op.getValueType();
9231 unsigned Opcode;
9232
9233 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
9234 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
9235 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
9236 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
9237 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
9238 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
9239 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
9240 Opcode = AMDGPUISD::CVT_PK_I16_I32;
9241 else
9242 Opcode = AMDGPUISD::CVT_PK_U16_U32;
9243
9244 if (isTypeLegal(VT))
9245 return DAG.getNode(Opcode, DL, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
9246
9247 SDValue Node =
9248 DAG.getNode(Opcode, DL, VT: MVT::i32, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
9249 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Node);
9250 }
9251 case Intrinsic::amdgcn_fmad_ftz:
9252 return DAG.getNode(Opcode: AMDGPUISD::FMAD_FTZ, DL, VT, N1: Op.getOperand(i: 1),
9253 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
9254
9255 case Intrinsic::amdgcn_if_break:
9256 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::SI_IF_BREAK, dl: DL, VT,
9257 Op1: Op->getOperand(Num: 1), Op2: Op->getOperand(Num: 2)),
9258 0);
9259
9260 case Intrinsic::amdgcn_groupstaticsize: {
9261 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
9262 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
9263 return Op;
9264
9265 const Module *M = MF.getFunction().getParent();
9266 const GlobalValue *GV =
9267 Intrinsic::getDeclarationIfExists(M, id: Intrinsic::amdgcn_groupstaticsize);
9268 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: 0,
9269 TargetFlags: SIInstrInfo::MO_ABS32_LO);
9270 return {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: GA), 0};
9271 }
9272 case Intrinsic::amdgcn_is_shared:
9273 case Intrinsic::amdgcn_is_private: {
9274 SDLoc SL(Op);
9275 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
9276 ? AMDGPUAS::LOCAL_ADDRESS
9277 : AMDGPUAS::PRIVATE_ADDRESS;
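// A flat pointer points into the given segment iff the high 32 bits of the
// pointer equal that segment's aperture base.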
9278 SDValue Aperture = getSegmentAperture(AS, DL: SL, DAG);
9279 SDValue SrcVec =
9280 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1));
9281
9282 SDValue SrcHi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: SrcVec,
9283 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
9284 return DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: SrcHi, RHS: Aperture, Cond: ISD::SETEQ);
9285 }
9286 case Intrinsic::amdgcn_perm:
9287 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: Op.getOperand(i: 1),
9288 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
9289 case Intrinsic::amdgcn_reloc_constant: {
9290 Module *M = const_cast<Module *>(MF.getFunction().getParent());
9291 const MDNode *Metadata = cast<MDNodeSDNode>(Val: Op.getOperand(i: 1))->getMD();
9292 auto SymbolName = cast<MDString>(Val: Metadata->getOperand(I: 0))->getString();
9293 auto *RelocSymbol = cast<GlobalVariable>(
9294 Val: M->getOrInsertGlobal(Name: SymbolName, Ty: Type::getInt32Ty(C&: M->getContext())));
9295 SDValue GA = DAG.getTargetGlobalAddress(GV: RelocSymbol, DL, VT: MVT::i32, offset: 0,
9296 TargetFlags: SIInstrInfo::MO_ABS32_LO);
9297 return {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: GA), 0};
9298 }
9299 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
9300 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
9301 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
9302 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
9303 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
9304 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
9305 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
9306 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
9307 if (Op.getOperand(i: 4).getValueType() == MVT::i32)
9308 return SDValue();
9309
9310 SDLoc SL(Op);
9311 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 4), DL: SL, VT: MVT::i32);
9312 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
9313 N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1), N3: Op.getOperand(i: 2),
9314 N4: Op.getOperand(i: 3), N5: IndexKeyi32);
9315 }
9316 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
9317 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
9318 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
9319 if (Op.getOperand(i: 6).getValueType() == MVT::i32)
9320 return SDValue();
9321
9322 SDLoc SL(Op);
9323 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 6), DL: SL, VT: MVT::i32);
9324 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
9325 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1), Op.getOperand(i: 2),
9326 Op.getOperand(i: 3), Op.getOperand(i: 4), Op.getOperand(i: 5),
9327 IndexKeyi32, Op.getOperand(i: 7)});
9328 }
9329 case Intrinsic::amdgcn_addrspacecast_nonnull:
9330 return lowerADDRSPACECAST(Op, DAG);
9331 case Intrinsic::amdgcn_readlane:
9332 case Intrinsic::amdgcn_readfirstlane:
9333 case Intrinsic::amdgcn_writelane:
9334 case Intrinsic::amdgcn_permlane16:
9335 case Intrinsic::amdgcn_permlanex16:
9336 case Intrinsic::amdgcn_permlane64:
9337 case Intrinsic::amdgcn_set_inactive:
9338 case Intrinsic::amdgcn_set_inactive_chain_arg:
9339 case Intrinsic::amdgcn_mov_dpp8:
9340 case Intrinsic::amdgcn_update_dpp:
9341 return lowerLaneOp(TLI: *this, N: Op.getNode(), DAG);
9342 case Intrinsic::amdgcn_dead: {
9343 SmallVector<SDValue, 8> Poisons;
9344 for (const EVT ValTy : Op.getNode()->values())
9345 Poisons.push_back(Elt: DAG.getPOISON(VT: ValTy));
9346 return DAG.getMergeValues(Ops: Poisons, dl: SDLoc(Op));
9347 }
9348 default:
9349 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9350 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID))
9351 return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: false);
9352
9353 return Op;
9354 }
9355}
9356
9357 // On targets that do not support a constant in the soffset field, turn a
9358 // zero soffset into SGPR_NULL to avoid generating an extra s_mov of zero.
9359static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
9360 const GCNSubtarget *Subtarget) {
9361 if (Subtarget->hasRestrictedSOffset() && isNullConstant(V: SOffset))
9362 return DAG.getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32);
9363 return SOffset;
9364}
9365
9366SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
9367 SelectionDAG &DAG,
9368 unsigned NewOpcode) const {
9369 SDLoc DL(Op);
9370
9371 SDValue VData = Op.getOperand(i: 2);
9372 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
9373 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
9374 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
9375 SDValue Ops[] = {
9376 Op.getOperand(i: 0), // Chain
9377 VData, // vdata
9378 Rsrc, // rsrc
9379 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
9380 VOffset, // voffset
9381 SOffset, // soffset
9382 Offset, // offset
9383 Op.getOperand(i: 6), // cachepolicy
9384 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
9385 };
9386
9387 auto *M = cast<MemSDNode>(Val&: Op);
9388
9389 EVT MemVT = VData.getValueType();
9390 return DAG.getMemIntrinsicNode(Opcode: NewOpcode, dl: DL, VTList: Op->getVTList(), Ops, MemVT,
9391 MMO: M->getMemOperand());
9392}
9393
9394SDValue
9395SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
9396 unsigned NewOpcode) const {
9397 SDLoc DL(Op);
9398
9399 SDValue VData = Op.getOperand(i: 2);
9400 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
9401 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
9402 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
9403 SDValue Ops[] = {
9404 Op.getOperand(i: 0), // Chain
9405 VData, // vdata
9406 Rsrc, // rsrc
9407 Op.getOperand(i: 4), // vindex
9408 VOffset, // voffset
9409 SOffset, // soffset
9410 Offset, // offset
9411 Op.getOperand(i: 7), // cachepolicy
9412 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
9413 };
9414
9415 auto *M = cast<MemSDNode>(Val&: Op);
9416
9417 EVT MemVT = VData.getValueType();
9418 return DAG.getMemIntrinsicNode(Opcode: NewOpcode, dl: DL, VTList: Op->getVTList(), Ops, MemVT,
9419 MMO: M->getMemOperand());
9420}
9421
9422SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
9423 SelectionDAG &DAG) const {
9424 unsigned IntrID = Op.getConstantOperandVal(i: 1);
9425 SDLoc DL(Op);
9426
9427 switch (IntrID) {
9428 case Intrinsic::amdgcn_ds_ordered_add:
9429 case Intrinsic::amdgcn_ds_ordered_swap: {
9430 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9431 SDValue Chain = M->getOperand(Num: 0);
9432 SDValue M0 = M->getOperand(Num: 2);
9433 SDValue Value = M->getOperand(Num: 3);
9434 unsigned IndexOperand = M->getConstantOperandVal(Num: 7);
9435 unsigned WaveRelease = M->getConstantOperandVal(Num: 8);
9436 unsigned WaveDone = M->getConstantOperandVal(Num: 9);
9437
9438 unsigned OrderedCountIndex = IndexOperand & 0x3f;
9439 IndexOperand &= ~0x3f;
9440 unsigned CountDw = 0;
9441
9442 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
9443 CountDw = (IndexOperand >> 24) & 0xf;
9444 IndexOperand &= ~(0xf << 24);
9445
9446 if (CountDw < 1 || CountDw > 4) {
9447 const Function &Fn = DAG.getMachineFunction().getFunction();
9448 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9449 Fn, "ds_ordered_count: dword count must be between 1 and 4",
9450 DL.getDebugLoc()));
9451 CountDw = 1;
9452 }
9453 }
9454
9455 if (IndexOperand) {
9456 const Function &Fn = DAG.getMachineFunction().getFunction();
9457 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9458 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
9459 }
9460
9461 if (WaveDone && !WaveRelease) {
9462 // TODO: Move this to IR verifier
9463 const Function &Fn = DAG.getMachineFunction().getFunction();
9464 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9465 Fn, "ds_ordered_count: wave_done requires wave_release",
9466 DL.getDebugLoc()));
9467 }
9468
9469 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9470 unsigned ShaderType =
9471 SIInstrInfo::getDSShaderTypeValue(MF: DAG.getMachineFunction());
9472 unsigned Offset0 = OrderedCountIndex << 2;
9473 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9474
9475 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
9476 Offset1 |= (CountDw - 1) << 6;
9477
9478 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
9479 Offset1 |= ShaderType << 2;
9480
9481 unsigned Offset = Offset0 | (Offset1 << 8);
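// As built above, the combined 16-bit offset encodes: bits [7:2] the ordered
// count index, bit 8 wave_release, bit 9 wave_done, bits [11:10] the shader
// type (pre-GFX11), bit 12 the instruction (add/swap), and bits [15:14] the
// dword count minus one (GFX10+).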
9482
9483 SDValue Ops[] = {
9484 Chain, Value, DAG.getTargetConstant(Val: Offset, DL, VT: MVT::i16),
9485 copyToM0(DAG, Chain, DL, V: M0).getValue(R: 1), // Glue
9486 };
9487 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::DS_ORDERED_COUNT, dl: DL,
9488 VTList: M->getVTList(), Ops, MemVT: M->getMemoryVT(),
9489 MMO: M->getMemOperand());
9490 }
9491 case Intrinsic::amdgcn_raw_buffer_load:
9492 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9493 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9494 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9495 case Intrinsic::amdgcn_raw_buffer_load_format:
9496 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9497 const bool IsFormat =
9498 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9499 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9500
9501 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
9502 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 3), DAG);
9503 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 4), DAG, Subtarget);
9504 SDValue Ops[] = {
9505 Op.getOperand(i: 0), // Chain
9506 Rsrc, // rsrc
9507 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
9508 VOffset, // voffset
9509 SOffset, // soffset
9510 Offset, // offset
9511 Op.getOperand(i: 5), // cachepolicy, swizzled buffer
9512 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
9513 };
9514
9515 auto *M = cast<MemSDNode>(Val&: Op);
9516 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9517 }
9518 case Intrinsic::amdgcn_struct_buffer_load:
9519 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9520 case Intrinsic::amdgcn_struct_buffer_load_format:
9521 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9522 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9523 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9524 const bool IsFormat =
9525 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9526 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9527
9528 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
9529 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
9530 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
9531 SDValue Ops[] = {
9532 Op.getOperand(i: 0), // Chain
9533 Rsrc, // rsrc
9534 Op.getOperand(i: 3), // vindex
9535 VOffset, // voffset
9536 SOffset, // soffset
9537 Offset, // offset
9538 Op.getOperand(i: 6), // cachepolicy, swizzled buffer
9539 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
9540 };
9541
9542 return lowerIntrinsicLoad(M: cast<MemSDNode>(Val&: Op), IsFormat, DAG, Ops);
9543 }
9544 case Intrinsic::amdgcn_raw_tbuffer_load:
9545 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9546 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9547 EVT LoadVT = Op.getValueType();
9548 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
9549 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 3), DAG);
9550 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 4), DAG, Subtarget);
9551
9552 SDValue Ops[] = {
9553 Op.getOperand(i: 0), // Chain
9554 Rsrc, // rsrc
9555 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
9556 VOffset, // voffset
9557 SOffset, // soffset
9558 Offset, // offset
9559 Op.getOperand(i: 5), // format
9560 Op.getOperand(i: 6), // cachepolicy, swizzled buffer
9561 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
9562 };
9563
9564 if (LoadVT.getScalarType() == MVT::f16)
9565 return adjustLoadValueType(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9566 Ops);
9567 return getMemIntrinsicNode(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9568 VTList: Op->getVTList(), Ops, MemVT: LoadVT, MMO: M->getMemOperand(),
9569 DAG);
9570 }
9571 case Intrinsic::amdgcn_struct_tbuffer_load:
9572 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9573 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9574 EVT LoadVT = Op.getValueType();
9575 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
9576 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
9577 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
9578
9579 SDValue Ops[] = {
9580 Op.getOperand(i: 0), // Chain
9581 Rsrc, // rsrc
9582 Op.getOperand(i: 3), // vindex
9583 VOffset, // voffset
9584 SOffset, // soffset
9585 Offset, // offset
9586 Op.getOperand(i: 6), // format
9587 Op.getOperand(i: 7), // cachepolicy, swizzled buffer
9588 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
9589 };
9590
9591 if (LoadVT.getScalarType() == MVT::f16)
9592 return adjustLoadValueType(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9593 Ops);
9594 return getMemIntrinsicNode(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9595 VTList: Op->getVTList(), Ops, MemVT: LoadVT, MMO: M->getMemOperand(),
9596 DAG);
9597 }
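// Buffer atomic RMW intrinsics: each raw/struct (and *_ptr) variant maps onto
// the corresponding AMDGPUISD::BUFFER_ATOMIC_* node; the shared
// lowerRaw/StructBufferAtomicIntrin helpers rebuild the operand list and
// split the buffer offsets.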
9598 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9599 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9600 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD);
9601 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9602 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9603 return lowerStructBufferAtomicIntrin(Op, DAG,
9604 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD);
9605 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9606 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9607 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN);
9608 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9609 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9610 return lowerStructBufferAtomicIntrin(Op, DAG,
9611 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN);
9612 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9613 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9614 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX);
9615 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9616 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9617 return lowerStructBufferAtomicIntrin(Op, DAG,
9618 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX);
9619 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9620 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9621 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP);
9622 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9623 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9624 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD);
9625 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9626 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9627 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB);
9628 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9629 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9630 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN);
9631 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9632 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9633 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN);
9634 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9635 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9636 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX);
9637 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9638 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9639 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX);
9640 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9641 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9642 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND);
9643 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9644 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9645 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR);
9646 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9647 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9648 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR);
9649 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9650 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9651 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC);
9652 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9653 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9654 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC);
9655 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9656 return lowerRawBufferAtomicIntrin(Op, DAG,
9657 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9658 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9659 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9660 return lowerStructBufferAtomicIntrin(Op, DAG,
9661 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP);
9662 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9663 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9664 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD);
9665 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9666 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9667 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB);
9668 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9669 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9670 return lowerStructBufferAtomicIntrin(Op, DAG,
9671 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN);
9672 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9673 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9674 return lowerStructBufferAtomicIntrin(Op, DAG,
9675 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN);
9676 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9677 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9678 return lowerStructBufferAtomicIntrin(Op, DAG,
9679 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX);
9680 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9681 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9682 return lowerStructBufferAtomicIntrin(Op, DAG,
9683 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX);
9684 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9685 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9686 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND);
9687 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9688 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9689 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR);
9690 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9691 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9692 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR);
9693 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9694 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9695 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC);
9696 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9697 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9698 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC);
9699 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9700 return lowerStructBufferAtomicIntrin(Op, DAG,
9701 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9702
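// Buffer atomic compare-and-swap additionally carries the compare value as an
// explicit operand; both the raw and struct forms lower to
// BUFFER_ATOMIC_CMPSWAP.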
9703 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9704 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9705 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 4), DAG);
9706 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
9707 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
9708 SDValue Ops[] = {
9709 Op.getOperand(i: 0), // Chain
9710 Op.getOperand(i: 2), // src
9711 Op.getOperand(i: 3), // cmp
9712 Rsrc, // rsrc
9713 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
9714 VOffset, // voffset
9715 SOffset, // soffset
9716 Offset, // offset
9717 Op.getOperand(i: 7), // cachepolicy
9718 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
9719 };
9720 EVT VT = Op.getValueType();
9721 auto *M = cast<MemSDNode>(Val&: Op);
9722
9723 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, dl: DL,
9724 VTList: Op->getVTList(), Ops, MemVT: VT,
9725 MMO: M->getMemOperand());
9726 }
9727 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9728 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9729 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op->getOperand(Num: 4), DAG);
9730 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 6), DAG);
9731 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 7), DAG, Subtarget);
9732 SDValue Ops[] = {
9733 Op.getOperand(i: 0), // Chain
9734 Op.getOperand(i: 2), // src
9735 Op.getOperand(i: 3), // cmp
9736 Rsrc, // rsrc
9737 Op.getOperand(i: 5), // vindex
9738 VOffset, // voffset
9739 SOffset, // soffset
9740 Offset, // offset
9741 Op.getOperand(i: 8), // cachepolicy
9742 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
9743 };
9744 EVT VT = Op.getValueType();
9745 auto *M = cast<MemSDNode>(Val&: Op);
9746
9747 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, dl: DL,
9748 VTList: Op->getVTList(), Ops, MemVT: VT,
9749 MMO: M->getMemOperand());
9750 }
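// The dual-ray / BVH8 intersect intrinsics require the GFX12 BVH instructions
// and are selected straight to MIMG machine nodes. The ray extent and
// instance mask are packed together into a single v2i32 vaddr operand, and
// the original memory operand is re-attached to the new node.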
9751 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
9752 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
9753 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9754 SDValue NodePtr = M->getOperand(Num: 2);
9755 SDValue RayExtent = M->getOperand(Num: 3);
9756 SDValue InstanceMask = M->getOperand(Num: 4);
9757 SDValue RayOrigin = M->getOperand(Num: 5);
9758 SDValue RayDir = M->getOperand(Num: 6);
9759 SDValue Offsets = M->getOperand(Num: 7);
9760 SDValue TDescr = M->getOperand(Num: 8);
9761
9762 assert(NodePtr.getValueType() == MVT::i64);
9763 assert(RayDir.getValueType() == MVT::v3f32);
9764
9765 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
9766 emitRemovedIntrinsicError(DAG, DL, VT: Op.getValueType());
9767 return SDValue();
9768 }
9769
9770 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
9771 const unsigned NumVDataDwords = 10;
9772 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
9773 int Opcode = AMDGPU::getMIMGOpcode(
9774 BaseOpcode: IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
9775 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
9776 MIMGEncoding: AMDGPU::MIMGEncGfx12, VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9777 assert(Opcode != -1);
9778
9779 SmallVector<SDValue, 7> Ops;
9780 Ops.push_back(Elt: NodePtr);
9781 Ops.push_back(Elt: DAG.getBuildVector(
9782 VT: MVT::v2i32, DL,
9783 Ops: {DAG.getBitcast(VT: MVT::i32, V: RayExtent),
9784 DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: InstanceMask)}));
9785 Ops.push_back(Elt: RayOrigin);
9786 Ops.push_back(Elt: RayDir);
9787 Ops.push_back(Elt: Offsets);
9788 Ops.push_back(Elt: TDescr);
9789 Ops.push_back(Elt: M->getChain());
9790
9791 auto *NewNode = DAG.getMachineNode(Opcode, dl: DL, VTs: M->getVTList(), Ops);
9792 MachineMemOperand *MemRef = M->getMemOperand();
9793 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
9794 return SDValue(NewNode, 0);
9795 }
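// The single-ray BVH intersect intrinsic comes in several flavours: 32- or
// 64-bit node pointer and full- or half-precision (A16) ray data. The code
// below picks an NSA or contiguous-vaddr MIMG encoding; for the older operand
// layouts, packLanes() packs the three-element origin/direction vectors into
// dwords (pairing f16 lanes into v2f16 when A16 is used), and in the non-NSA
// case all address operands are merged into one build_vector.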
9796 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9797 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9798 SDValue NodePtr = M->getOperand(Num: 2);
9799 SDValue RayExtent = M->getOperand(Num: 3);
9800 SDValue RayOrigin = M->getOperand(Num: 4);
9801 SDValue RayDir = M->getOperand(Num: 5);
9802 SDValue RayInvDir = M->getOperand(Num: 6);
9803 SDValue TDescr = M->getOperand(Num: 7);
9804
9805 assert(NodePtr.getValueType() == MVT::i32 ||
9806 NodePtr.getValueType() == MVT::i64);
9807 assert(RayDir.getValueType() == MVT::v3f16 ||
9808 RayDir.getValueType() == MVT::v3f32);
9809
9810 if (!Subtarget->hasGFX10_AEncoding()) {
9811 emitRemovedIntrinsicError(DAG, DL, VT: Op.getValueType());
9812 return SDValue();
9813 }
9814
9815 const bool IsGFX11 = AMDGPU::isGFX11(STI: *Subtarget);
9816 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: *Subtarget);
9817 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
9818 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9819 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9820 const unsigned NumVDataDwords = 4;
9821 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9822 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9823 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9824 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9825 IsGFX12Plus;
9826 const unsigned BaseOpcodes[2][2] = {
9827 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9828 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9829 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9830 int Opcode;
9831 if (UseNSA) {
9832 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
9833 MIMGEncoding: IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9834 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9835 : AMDGPU::MIMGEncGfx10NSA,
9836 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9837 } else {
9838 assert(!IsGFX12Plus);
9839 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
9840 MIMGEncoding: IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9841 : AMDGPU::MIMGEncGfx10Default,
9842 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9843 }
9844 assert(Opcode != -1);
9845
9846 SmallVector<SDValue, 16> Ops;
9847
9848 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
9849 SmallVector<SDValue, 3> Lanes;
9850 DAG.ExtractVectorElements(Op, Args&: Lanes, Start: 0, Count: 3);
9851 if (Lanes[0].getValueSizeInBits() == 32) {
9852 for (unsigned I = 0; I < 3; ++I)
9853 Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: Lanes[I]));
9854 } else {
9855 if (IsAligned) {
9856 Ops.push_back(Elt: DAG.getBitcast(
9857 VT: MVT::i32,
9858 V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Lanes[0], Lanes[1]})));
9859 Ops.push_back(Elt: Lanes[2]);
9860 } else {
9861 SDValue Elt0 = Ops.pop_back_val();
9862 Ops.push_back(Elt: DAG.getBitcast(
9863 VT: MVT::i32, V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Elt0, Lanes[0]})));
9864 Ops.push_back(Elt: DAG.getBitcast(
9865 VT: MVT::i32,
9866 V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Lanes[1], Lanes[2]})));
9867 }
9868 }
9869 };
9870
9871 if (UseNSA && IsGFX11Plus) {
9872 Ops.push_back(Elt: NodePtr);
9873 Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: RayExtent));
9874 Ops.push_back(Elt: RayOrigin);
9875 if (IsA16) {
9876 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9877 DAG.ExtractVectorElements(Op: RayDir, Args&: DirLanes, Start: 0, Count: 3);
9878 DAG.ExtractVectorElements(Op: RayInvDir, Args&: InvDirLanes, Start: 0, Count: 3);
9879 for (unsigned I = 0; I < 3; ++I) {
9880 MergedLanes.push_back(Elt: DAG.getBitcast(
9881 VT: MVT::i32, V: DAG.getBuildVector(VT: MVT::v2f16, DL,
9882 Ops: {DirLanes[I], InvDirLanes[I]})));
9883 }
9884 Ops.push_back(Elt: DAG.getBuildVector(VT: MVT::v3i32, DL, Ops: MergedLanes));
9885 } else {
9886 Ops.push_back(Elt: RayDir);
9887 Ops.push_back(Elt: RayInvDir);
9888 }
9889 } else {
9890 if (Is64)
9891 DAG.ExtractVectorElements(Op: DAG.getBitcast(VT: MVT::v2i32, V: NodePtr), Args&: Ops, Start: 0,
9892 Count: 2);
9893 else
9894 Ops.push_back(Elt: NodePtr);
9895
9896 Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: RayExtent));
9897 packLanes(RayOrigin, true);
9898 packLanes(RayDir, true);
9899 packLanes(RayInvDir, false);
9900 }
9901
9902 if (!UseNSA) {
9903 // Build a single vector containing all the operands so far prepared.
9904 if (NumVAddrDwords > 12) {
9905 SDValue Undef = DAG.getPOISON(VT: MVT::i32);
9906 Ops.append(NumInputs: 16 - Ops.size(), Elt: Undef);
9907 }
9908 assert(Ops.size() >= 8 && Ops.size() <= 12);
9909 SDValue MergedOps =
9910 DAG.getBuildVector(VT: MVT::getVectorVT(VT: MVT::i32, NumElements: Ops.size()), DL, Ops);
9911 Ops.clear();
9912 Ops.push_back(Elt: MergedOps);
9913 }
9914
9915 Ops.push_back(Elt: TDescr);
9916 Ops.push_back(Elt: DAG.getTargetConstant(Val: IsA16, DL, VT: MVT::i1));
9917 Ops.push_back(Elt: M->getChain());
9918
9919 auto *NewNode = DAG.getMachineNode(Opcode, dl: DL, VTs: M->getVTList(), Ops);
9920 MachineMemOperand *MemRef = M->getMemOperand();
9921 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
9922 return SDValue(NewNode, 0);
9923 }
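// Floating-point min/max atomics on global and flat pointers lower to the
// generic ISD::ATOMIC_LOAD_FMIN/FMAX nodes, reusing the intrinsic's memory
// operand.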
9924 case Intrinsic::amdgcn_global_atomic_fmin_num:
9925 case Intrinsic::amdgcn_global_atomic_fmax_num:
9926 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9927 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9928 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9929 SDValue Ops[] = {
9930 M->getOperand(Num: 0), // Chain
9931 M->getOperand(Num: 2), // Ptr
9932 M->getOperand(Num: 3) // Value
9933 };
9934 unsigned Opcode = 0;
9935 switch (IntrID) {
9936 case Intrinsic::amdgcn_global_atomic_fmin_num:
9937 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9938 Opcode = ISD::ATOMIC_LOAD_FMIN;
9939 break;
9940 }
9941 case Intrinsic::amdgcn_global_atomic_fmax_num:
9942 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9943 Opcode = ISD::ATOMIC_LOAD_FMAX;
9944 break;
9945 }
9946 default:
9947 llvm_unreachable("unhandled atomic opcode");
9948 }
9949 return DAG.getAtomic(Opcode, dl: SDLoc(Op), MemVT: M->getMemoryVT(), VTList: M->getVTList(),
9950 Ops, MMO: M->getMemOperand());
9951 }
9952 case Intrinsic::amdgcn_s_get_barrier_state:
9953 case Intrinsic::amdgcn_s_get_named_barrier_state: {
9954 SDValue Chain = Op->getOperand(Num: 0);
9955 SmallVector<SDValue, 2> Ops;
9956 unsigned Opc;
9957
9958 if (isa<ConstantSDNode>(Val: Op->getOperand(Num: 2))) {
9959 uint64_t BarID = cast<ConstantSDNode>(Val: Op->getOperand(Num: 2))->getZExtValue();
9960 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
9961 BarID = (BarID >> 4) & 0x3F;
9962 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9963 SDValue K = DAG.getTargetConstant(Val: BarID, DL, VT: MVT::i32);
9964 Ops.push_back(Elt: K);
9965 Ops.push_back(Elt: Chain);
9966 } else {
9967 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9968 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
9969 SDValue M0Val;
9970 M0Val = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: Op->getOperand(Num: 2),
9971 N2: DAG.getShiftAmountConstant(Val: 4, VT: MVT::i32, DL));
9972 M0Val = SDValue(
9973 DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: M0Val,
9974 Op2: DAG.getTargetConstant(Val: 0x3F, DL, VT: MVT::i32)),
9975 0);
9976 Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: 0));
9977 } else
9978 Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: Op->getOperand(Num: 2)).getValue(R: 0));
9979 }
9980
9981 auto *NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
9982 return SDValue(NewMI, 0);
9983 }
9984 default:
9985
9986 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9987 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID))
9988 return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true);
9989
9990 return SDValue();
9991 }
9992}
9993
9994// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9995// dwordx4 if on SI and handle TFE loads.
9996SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9997 SDVTList VTList,
9998 ArrayRef<SDValue> Ops, EVT MemVT,
9999 MachineMemOperand *MMO,
10000 SelectionDAG &DAG) const {
10001 LLVMContext &C = *DAG.getContext();
10002 MachineFunction &MF = DAG.getMachineFunction();
10003 EVT VT = VTList.VTs[0];
10004
10005 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10006 bool IsTFE = VTList.NumVTs == 3;
10007 if (IsTFE) {
10008 unsigned NumValueDWords = divideCeil(Numerator: VT.getSizeInBits(), Denominator: 32);
10009 unsigned NumOpDWords = NumValueDWords + 1;
10010 EVT OpDWordsVT = EVT::getVectorVT(Context&: C, VT: MVT::i32, NumElements: NumOpDWords);
10011 SDVTList OpDWordsVTList = DAG.getVTList(VT1: OpDWordsVT, VT2: VTList.VTs[2]);
10012 MachineMemOperand *OpDWordsMMO =
10013 MF.getMachineMemOperand(MMO, Offset: 0, Size: NumOpDWords * 4);
10014 SDValue Op = getMemIntrinsicNode(Opcode, DL, VTList: OpDWordsVTList, Ops,
10015 MemVT: OpDWordsVT, MMO: OpDWordsMMO, DAG);
10016 SDValue Status = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
10017 N2: DAG.getVectorIdxConstant(Val: NumValueDWords, DL));
10018 SDValue ZeroIdx = DAG.getVectorIdxConstant(Val: 0, DL);
10019 SDValue ValueDWords =
10020 NumValueDWords == 1
10021 ? DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op, N2: ZeroIdx)
10022 : DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL,
10023 VT: EVT::getVectorVT(Context&: C, VT: MVT::i32, NumElements: NumValueDWords), N1: Op,
10024 N2: ZeroIdx);
10025 SDValue Value = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: ValueDWords);
10026 return DAG.getMergeValues(Ops: {Value, Status, SDValue(Op.getNode(), 1)}, dl: DL);
10027 }
10028
10029 if (!Subtarget->hasDwordx3LoadStores() &&
10030 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10031 EVT WidenedVT = EVT::getVectorVT(Context&: C, VT: VT.getVectorElementType(), NumElements: 4);
10032 EVT WidenedMemVT = EVT::getVectorVT(Context&: C, VT: MemVT.getVectorElementType(), NumElements: 4);
10033 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, Offset: 0, Size: 16);
10034 SDVTList WidenedVTList = DAG.getVTList(VT1: WidenedVT, VT2: VTList.VTs[1]);
10035 SDValue Op = DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList: WidenedVTList, Ops,
10036 MemVT: WidenedMemVT, MMO: WidenedMMO);
10037 SDValue Value = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Op,
10038 N2: DAG.getVectorIdxConstant(Val: 0, DL));
10039 return DAG.getMergeValues(Ops: {Value, SDValue(Op.getNode(), 1)}, dl: DL);
10040 }
10041
10042 return DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList, Ops, MemVT, MMO);
10043}
10044
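// handleD16VData repacks 16-bit vector data into the layout expected by D16
// buffer/image stores: zero-extending each element to 32 bits on subtargets
// with unpacked D16 VMEM, regrouping into bitcast v2i16 pairs to work around
// the gfx8.1 image-store bug, or widening v3 data to the next legal size.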
10045SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10046 bool ImageStore) const {
10047 EVT StoreVT = VData.getValueType();
10048
10049 // No change for f16 and legal vector D16 types.
10050 if (!StoreVT.isVector())
10051 return VData;
10052
10053 SDLoc DL(VData);
10054 unsigned NumElements = StoreVT.getVectorNumElements();
10055
10056 if (Subtarget->hasUnpackedD16VMem()) {
10057 // We need to unpack the packed data to store.
10058 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10059 SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
10060
10061 EVT EquivStoreVT =
10062 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements);
10063 SDValue ZExt = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: EquivStoreVT, Operand: IntVData);
10064 return DAG.UnrollVectorOp(N: ZExt.getNode());
10065 }
10066
10067 // The sq block of gfx8.1 does not estimate register use correctly for d16
10068 // image store instructions. The data operand is computed as if it were not a
10069 // d16 image instruction.
10070 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10071 // Bitcast to i16
10072 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10073 SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
10074
10075 // Decompose into scalars
10076 SmallVector<SDValue, 4> Elts;
10077 DAG.ExtractVectorElements(Op: IntVData, Args&: Elts);
10078
10079 // Group pairs of i16 into v2i16 and bitcast to i32
10080 SmallVector<SDValue, 4> PackedElts;
10081 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10082 SDValue Pair =
10083 DAG.getBuildVector(VT: MVT::v2i16, DL, Ops: {Elts[I * 2], Elts[I * 2 + 1]});
10084 SDValue IntPair = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Pair);
10085 PackedElts.push_back(Elt: IntPair);
10086 }
10087 if ((NumElements % 2) == 1) {
10088 // Handle v3i16
10089 unsigned I = Elts.size() / 2;
10090 SDValue Pair = DAG.getBuildVector(VT: MVT::v2i16, DL,
10091 Ops: {Elts[I * 2], DAG.getPOISON(VT: MVT::i16)});
10092 SDValue IntPair = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Pair);
10093 PackedElts.push_back(Elt: IntPair);
10094 }
10095
10096 // Pad with poison values up to the unpacked element count
10097 PackedElts.resize(N: Elts.size(), NV: DAG.getPOISON(VT: MVT::i32));
10098
10099 // Build final vector
10100 EVT VecVT =
10101 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: PackedElts.size());
10102 return DAG.getBuildVector(VT: VecVT, DL, Ops: PackedElts);
10103 }
10104
10105 if (NumElements == 3) {
10106 EVT IntStoreVT =
10107 EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: StoreVT.getStoreSizeInBits());
10108 SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
10109
10110 EVT WidenedStoreVT = EVT::getVectorVT(
10111 Context&: *DAG.getContext(), VT: StoreVT.getVectorElementType(), NumElements: NumElements + 1);
10112 EVT WidenedIntVT = EVT::getIntegerVT(Context&: *DAG.getContext(),
10113 BitWidth: WidenedStoreVT.getStoreSizeInBits());
10114 SDValue ZExt = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: WidenedIntVT, Operand: IntVData);
10115 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: WidenedStoreVT, Operand: ZExt);
10116 }
10117
10118 assert(isTypeLegal(StoreVT));
10119 return VData;
10120}
10121
10122SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
10123 SelectionDAG &DAG) const {
10124 SDLoc DL(Op);
10125 SDValue Chain = Op.getOperand(i: 0);
10126 unsigned IntrinsicID = Op.getConstantOperandVal(i: 1);
10127 MachineFunction &MF = DAG.getMachineFunction();
10128
10129 switch (IntrinsicID) {
10130 case Intrinsic::amdgcn_exp_compr: {
10131 if (!Subtarget->hasCompressedExport()) {
10132 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
10133 DAG.getMachineFunction().getFunction(),
10134 "intrinsic not supported on subtarget", DL.getDebugLoc()));
10135 }
10136 SDValue Src0 = Op.getOperand(i: 4);
10137 SDValue Src1 = Op.getOperand(i: 5);
10138 // Hack around illegal type on SI by directly selecting it.
10139 if (isTypeLegal(VT: Src0.getValueType()))
10140 return SDValue();
10141
10142 const ConstantSDNode *Done = cast<ConstantSDNode>(Val: Op.getOperand(i: 6));
10143 SDValue Undef = DAG.getPOISON(VT: MVT::f32);
10144 const SDValue Ops[] = {
10145 Op.getOperand(i: 2), // tgt
10146 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: Src0), // src0
10147 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: Src1), // src1
10148 Undef, // src2
10149 Undef, // src3
10150 Op.getOperand(i: 7), // vm
10151 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // compr
10152 Op.getOperand(i: 3), // en
10153 Op.getOperand(i: 0) // Chain
10154 };
10155
10156 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10157 return SDValue(DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops), 0);
10158 }
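// Workgroup barriers: when optimizing and the whole workgroup fits in a
// single wave, s_barrier_signal is dropped and s_barrier/s_barrier_wait
// degrade to a WAVE_BARRIER. On subtargets with split barriers, a plain
// s_barrier expands into an S_BARRIER_SIGNAL_IMM / S_BARRIER_WAIT pair.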
10159 case Intrinsic::amdgcn_s_barrier:
10160 case Intrinsic::amdgcn_s_barrier_signal:
10161 case Intrinsic::amdgcn_s_barrier_wait: {
10162 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
10163 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
10164 unsigned WGSize = ST.getFlatWorkGroupSizes(F: MF.getFunction()).second;
10165 if (WGSize <= ST.getWavefrontSize()) {
10166 // If the workgroup fits in a wave, remove s_barrier_signal and lower
10167 // s_barrier/s_barrier_wait to wave_barrier.
10168 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
10169 return Op.getOperand(i: 0);
10170 else
10171 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::WAVE_BARRIER, dl: DL,
10172 VT: MVT::Other, Op1: Op.getOperand(i: 0)),
10173 0);
10174 }
10175 }
10176
10177 if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
10178 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
10179 SDValue K =
10180 DAG.getSignedTargetConstant(Val: AMDGPU::Barrier::WORKGROUP, DL, VT: MVT::i32);
10181 SDValue BarSignal =
10182 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_BARRIER_SIGNAL_IMM, dl: DL,
10183 VT: MVT::Other, Op1: K, Op2: Op.getOperand(i: 0)),
10184 0);
10185 SDValue BarWait =
10186 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_BARRIER_WAIT, dl: DL, VT: MVT::Other, Op1: K,
10187 Op2: BarSignal.getValue(R: 0)),
10188 0);
10189 return BarWait;
10190 }
10191
10192 return SDValue();
10193 }
10194
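// Typed buffer stores: f16 data is repacked with handleD16VData() and the
// store is emitted as a TBUFFER_STORE_FORMAT(_D16) memory intrinsic node.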
10195 case Intrinsic::amdgcn_struct_tbuffer_store:
10196 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10197 SDValue VData = Op.getOperand(i: 2);
10198 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10199 if (IsD16)
10200 VData = handleD16VData(VData, DAG);
10201 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
10202 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
10203 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
10204 SDValue Ops[] = {
10205 Chain,
10206 VData, // vdata
10207 Rsrc, // rsrc
10208 Op.getOperand(i: 4), // vindex
10209 VOffset, // voffset
10210 SOffset, // soffset
10211 Offset, // offset
10212 Op.getOperand(i: 7), // format
10213 Op.getOperand(i: 8), // cachepolicy, swizzled buffer
10214 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
10215 };
10216 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10217 : AMDGPUISD::TBUFFER_STORE_FORMAT;
10218 MemSDNode *M = cast<MemSDNode>(Val&: Op);
10219 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
10220 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
10221 }
10222
10223 case Intrinsic::amdgcn_raw_tbuffer_store:
10224 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
10225 SDValue VData = Op.getOperand(i: 2);
10226 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10227 if (IsD16)
10228 VData = handleD16VData(VData, DAG);
10229 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
10230 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
10231 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
10232 SDValue Ops[] = {
10233 Chain,
10234 VData, // vdata
10235 Rsrc, // rsrc
10236 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
10237 VOffset, // voffset
10238 SOffset, // soffset
10239 Offset, // offset
10240 Op.getOperand(i: 6), // format
10241 Op.getOperand(i: 7), // cachepolicy, swizzled buffer
10242 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
10243 };
10244 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10245 : AMDGPUISD::TBUFFER_STORE_FORMAT;
10246 MemSDNode *M = cast<MemSDNode>(Val&: Op);
10247 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
10248 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
10249 }
10250
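// Untyped buffer stores: besides the D16 repacking, illegal data types are
// bitcast to an equivalent legal memory type, and sub-dword scalar stores are
// routed to the BYTE/SHORT store opcodes via handleByteShortBufferStores().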
10251 case Intrinsic::amdgcn_raw_buffer_store:
10252 case Intrinsic::amdgcn_raw_ptr_buffer_store:
10253 case Intrinsic::amdgcn_raw_buffer_store_format:
10254 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
10255 const bool IsFormat =
10256 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
10257 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
10258
10259 SDValue VData = Op.getOperand(i: 2);
10260 EVT VDataVT = VData.getValueType();
10261 EVT EltType = VDataVT.getScalarType();
10262 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
10263 if (IsD16) {
10264 VData = handleD16VData(VData, DAG);
10265 VDataVT = VData.getValueType();
10266 }
10267
10268 if (!isTypeLegal(VT: VDataVT)) {
10269 VData =
10270 DAG.getNode(Opcode: ISD::BITCAST, DL,
10271 VT: getEquivalentMemType(Context&: *DAG.getContext(), VT: VDataVT), Operand: VData);
10272 }
10273
10274 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
10275 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
10276 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
10277 SDValue Ops[] = {
10278 Chain,
10279 VData,
10280 Rsrc,
10281 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
10282 VOffset, // voffset
10283 SOffset, // soffset
10284 Offset, // offset
10285 Op.getOperand(i: 6), // cachepolicy, swizzled buffer
10286 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
10287 };
10288 unsigned Opc =
10289 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
10290 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
10291 MemSDNode *M = cast<MemSDNode>(Val&: Op);
10292
10293 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
10294 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
10295 return handleByteShortBufferStores(DAG, VDataType: VDataVT, DL, Ops, M);
10296
10297 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
10298 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
10299 }
10300
10301 case Intrinsic::amdgcn_struct_buffer_store:
10302 case Intrinsic::amdgcn_struct_ptr_buffer_store:
10303 case Intrinsic::amdgcn_struct_buffer_store_format:
10304 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
10305 const bool IsFormat =
10306 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
10307 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
10308
10309 SDValue VData = Op.getOperand(i: 2);
10310 EVT VDataVT = VData.getValueType();
10311 EVT EltType = VDataVT.getScalarType();
10312 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
10313
10314 if (IsD16) {
10315 VData = handleD16VData(VData, DAG);
10316 VDataVT = VData.getValueType();
10317 }
10318
10319 if (!isTypeLegal(VT: VDataVT)) {
10320 VData =
10321 DAG.getNode(Opcode: ISD::BITCAST, DL,
10322 VT: getEquivalentMemType(Context&: *DAG.getContext(), VT: VDataVT), Operand: VData);
10323 }
10324
10325 auto Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
10326 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
10327 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
10328 SDValue Ops[] = {
10329 Chain,
10330 VData,
10331 Rsrc,
10332 Op.getOperand(i: 4), // vindex
10333 VOffset, // voffset
10334 SOffset, // soffset
10335 Offset, // offset
10336 Op.getOperand(i: 7), // cachepolicy, swizzled buffer
10337 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
10338 };
10339 unsigned Opc =
10340 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
10341 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
10342 MemSDNode *M = cast<MemSDNode>(Val&: Op);
10343
10344 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
10345 EVT VDataType = VData.getValueType().getScalarType();
10346 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
10347 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
10348
10349 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
10350 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
10351 }
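// Buffer-to-LDS loads: the LDS destination pointer is placed in M0 and the
// operation is selected directly to a BUFFER_LOAD_*_LDS_* machine node chosen
// by transfer size and addressing mode. Since the instruction reads the
// buffer and writes LDS, the memory operand is split into a load MMO (global
// address space) and a store MMO (local address space) on the new node.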
10352 case Intrinsic::amdgcn_raw_buffer_load_lds:
10353 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
10354 case Intrinsic::amdgcn_struct_buffer_load_lds:
10355 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
10356 if (!Subtarget->hasVMemToLDSLoad())
10357 return SDValue();
10358 unsigned Opc;
10359 bool HasVIndex =
10360 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
10361 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
10362 unsigned OpOffset = HasVIndex ? 1 : 0;
10363 SDValue VOffset = Op.getOperand(i: 5 + OpOffset);
10364 bool HasVOffset = !isNullConstant(V: VOffset);
10365 unsigned Size = Op->getConstantOperandVal(Num: 4);
10366
10367 switch (Size) {
10368 default:
10369 return SDValue();
10370 case 1:
10371 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
10372 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
10373 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
10374 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
10375 break;
10376 case 2:
10377 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
10378 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
10379 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
10380 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
10381 break;
10382 case 4:
10383 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
10384 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
10385 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
10386 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
10387 break;
10388 case 12:
10389 if (!Subtarget->hasLDSLoadB96_B128())
10390 return SDValue();
10391 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
10392 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
10393 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
10394 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
10395 break;
10396 case 16:
10397 if (!Subtarget->hasLDSLoadB96_B128())
10398 return SDValue();
10399 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
10400 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
10401 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
10402 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
10403 break;
10404 }
10405
10406 SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 3));
10407
10408 SmallVector<SDValue, 8> Ops;
10409
10410 if (HasVIndex && HasVOffset)
10411 Ops.push_back(Elt: DAG.getBuildVector(VT: MVT::v2i32, DL,
10412 Ops: {Op.getOperand(i: 5), // VIndex
10413 VOffset}));
10414 else if (HasVIndex)
10415 Ops.push_back(Elt: Op.getOperand(i: 5));
10416 else if (HasVOffset)
10417 Ops.push_back(Elt: VOffset);
10418
10419 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
10420 Ops.push_back(Elt: Rsrc);
10421 Ops.push_back(Elt: Op.getOperand(i: 6 + OpOffset)); // soffset
10422 Ops.push_back(Elt: Op.getOperand(i: 7 + OpOffset)); // imm offset
10423 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
10424 unsigned Aux = Op.getConstantOperandVal(i: 8 + OpOffset);
10425 Ops.push_back(Elt: DAG.getTargetConstant(
10426 Val: Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
10427 DL, VT: MVT::i8)); // cpol
10428 Ops.push_back(Elt: DAG.getTargetConstant(
10429 Val: Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
10430 ? 1
10431 : 0,
10432 DL, VT: MVT::i8)); // swz
10433 Ops.push_back(Elt: M0Val.getValue(R: 0)); // Chain
10434 Ops.push_back(Elt: M0Val.getValue(R: 1)); // Glue
10435
10436 auto *M = cast<MemSDNode>(Val&: Op);
10437 MachineMemOperand *LoadMMO = M->getMemOperand();
10438 // Don't set the offset value here because the pointer points to the base of
10439 // the buffer.
10440 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10441
10442 MachinePointerInfo StorePtrI = LoadPtrI;
10443 LoadPtrI.V = PoisonValue::get(
10444 T: PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
10445 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10446 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10447
10448 auto F = LoadMMO->getFlags() &
10449 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10450 LoadMMO =
10451 MF.getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad, Size,
10452 BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo());
10453
10454 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10455 PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore, Size: sizeof(int32_t),
10456 BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo());
10457
10458 auto *Load = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: M->getVTList(), Ops);
10459 DAG.setNodeMemRefs(N: Load, NewMemRefs: {LoadMMO, StoreMMO});
10460
10461 return SDValue(Load, 0);
10462 }
10463 // Buffers are handled by LowerBufferFatPointers, and we're going to go
10464 // for "trust me" that the remaining cases are global pointers until
10465 // such time as we can put two mem operands on an intrinsic.
10466 case Intrinsic::amdgcn_load_to_lds:
10467 case Intrinsic::amdgcn_global_load_lds: {
10468 if (!Subtarget->hasVMemToLDSLoad())
10469 return SDValue();
10470
10471 unsigned Opc;
10472 unsigned Size = Op->getConstantOperandVal(Num: 4);
10473 switch (Size) {
10474 default:
10475 return SDValue();
10476 case 1:
10477 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
10478 break;
10479 case 2:
10480 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
10481 break;
10482 case 4:
10483 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
10484 break;
10485 case 12:
10486 if (!Subtarget->hasLDSLoadB96_B128())
10487 return SDValue();
10488 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10489 break;
10490 case 16:
10491 if (!Subtarget->hasLDSLoadB96_B128())
10492 return SDValue();
10493 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10494 break;
10495 }
10496
10497 SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 3));
10498
10499 SmallVector<SDValue, 6> Ops;
10500
10501 SDValue Addr = Op.getOperand(i: 2); // Global ptr
10502 SDValue VOffset;
10503 // Try to split SAddr and VOffset. Global and LDS pointers share the same
10504 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
10505 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
10506 SDValue LHS = Addr.getOperand(i: 0);
10507 SDValue RHS = Addr.getOperand(i: 1);
10508
10509 if (LHS->isDivergent())
10510 std::swap(a&: LHS, b&: RHS);
10511
10512 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
10513 RHS.getOperand(i: 0).getValueType() == MVT::i32) {
10514 // add (i64 sgpr), (zero_extend (i32 vgpr))
10515 Addr = LHS;
10516 VOffset = RHS.getOperand(i: 0);
10517 }
10518 }
10519
10520 Ops.push_back(Elt: Addr);
10521 if (!Addr->isDivergent()) {
10522 Opc = AMDGPU::getGlobalSaddrOp(Opcode: Opc);
10523 if (!VOffset)
10524 VOffset =
10525 SDValue(DAG.getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32, dl: DL, VT: MVT::i32,
10526 Op1: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
10527 0);
10528 Ops.push_back(Elt: VOffset);
10529 }
10530
10531 Ops.push_back(Elt: Op.getOperand(i: 5)); // Offset
10532 Ops.push_back(Elt: Op.getOperand(i: 6)); // CPol
10533 Ops.push_back(Elt: M0Val.getValue(R: 0)); // Chain
10534 Ops.push_back(Elt: M0Val.getValue(R: 1)); // Glue
10535
10536 auto *M = cast<MemSDNode>(Val&: Op);
10537 MachineMemOperand *LoadMMO = M->getMemOperand();
10538 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10539 LoadPtrI.Offset = Op->getConstantOperandVal(Num: 5);
10540 MachinePointerInfo StorePtrI = LoadPtrI;
10541 LoadPtrI.V = PoisonValue::get(
10542 T: PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
10543 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10544 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10545 auto F = LoadMMO->getFlags() &
10546 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10547 LoadMMO =
10548 MF.getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad, Size,
10549 BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo());
10550 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10551 PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore, Size: sizeof(int32_t), BaseAlignment: Align(4),
10552 AAInfo: LoadMMO->getAAInfo());
10553
10554 auto *Load = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
10555 DAG.setNodeMemRefs(N: Load, NewMemRefs: {LoadMMO, StoreMMO});
10556
10557 return SDValue(Load, 0);
10558 }
10559 case Intrinsic::amdgcn_end_cf:
10560 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::SI_END_CF, dl: DL, VT: MVT::Other,
10561 Op1: Op->getOperand(Num: 2), Op2: Chain),
10562 0);
10563 case Intrinsic::amdgcn_s_barrier_signal_var: {
10564 // This intrinsic has two operands: the barrier pointer and the member count.
10565 SDValue Chain = Op->getOperand(Num: 0);
10566 SmallVector<SDValue, 2> Ops;
10567 SDValue BarOp = Op->getOperand(Num: 2);
10568 SDValue CntOp = Op->getOperand(Num: 3);
10569 SDValue M0Val;
10570 // Extract the barrier ID from bits 4..9 of BarOp.
10571 SDValue BarID;
10572 BarID = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: BarOp,
10573 N2: DAG.getShiftAmountConstant(Val: 4, VT: MVT::i32, DL));
10574 BarID =
10575 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: BarID,
10576 Op2: DAG.getTargetConstant(Val: 0x3F, DL, VT: MVT::i32)),
10577 0);
10578 // The member count goes into the 6 bits of M0 starting at bit ShAmt (M0[ShAmt+5:ShAmt]).
10579 // The barrier ID goes into M0[5:0].
10580 M0Val =
10581 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: CntOp,
10582 Op2: DAG.getTargetConstant(Val: 0x3F, DL, VT: MVT::i32)),
10583 0);
10584 constexpr unsigned ShAmt = 16;
10585 M0Val = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: M0Val,
10586 N2: DAG.getShiftAmountConstant(Val: ShAmt, VT: MVT::i32, DL));
10587
10588 M0Val = SDValue(
10589 DAG.getMachineNode(Opcode: AMDGPU::S_OR_B32, dl: DL, VT: MVT::i32, Op1: M0Val, Op2: BarID), 0);
10590
10591 Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: 0));
10592
10593 auto *NewMI = DAG.getMachineNode(Opcode: AMDGPU::S_BARRIER_SIGNAL_M0, dl: DL,
10594 VTs: Op->getVTList(), Ops);
10595 return SDValue(NewMI, 0);
10596 }
10597 case Intrinsic::amdgcn_s_prefetch_data: {
10598 // For a non-global address space, preserve the chain and remove the call.
10599 if (!AMDGPU::isFlatGlobalAddrSpace(AS: cast<MemSDNode>(Val&: Op)->getAddressSpace()))
10600 return Op.getOperand(i: 0);
10601 return Op;
10602 }
10603 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10604 SDValue Ops[] = {
10605 Chain, bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG),
10606 Op.getOperand(i: 3), // offset
10607 Op.getOperand(i: 4), // length
10608 };
10609
10610 MemSDNode *M = cast<MemSDNode>(Val&: Op);
10611 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_PREFETCH_DATA, dl: DL,
10612 VTList: Op->getVTList(), Ops, MemVT: M->getMemoryVT(),
10613 MMO: M->getMemOperand());
10614 }
10615 default: {
10616 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10617 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID))
10618 return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true);
10619
10620 return Op;
10621 }
10622 }
10623}
10624
10625bool SITargetLowering::shouldPreservePtrArith(const Function &F,
10626 EVT PtrVT) const {
10627 return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
10628}
10629
10630// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
10631// offset (the offset that is included in bounds checking and swizzling, to be
10632// split between the instruction's voffset and immoffset fields) and soffset
10633// (the offset that is excluded from bounds checking and swizzling, to go in
10634// the instruction's soffset field). This function takes the first kind of
10635// offset and figures out how to split it between voffset and immoffset.
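// For example, assuming a maximum immediate offset of 4095 (the pre-GFX12
// MUBUF limit), a combined constant offset of 8200 is returned as an 8192
// voffset contribution plus an immediate offset of 8.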
10636std::pair<SDValue, SDValue>
10637SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
10638 SDLoc DL(Offset);
10639 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget);
10640 SDValue N0 = Offset;
10641 ConstantSDNode *C1 = nullptr;
10642
10643 if ((C1 = dyn_cast<ConstantSDNode>(Val&: N0)))
10644 N0 = SDValue();
10645 else if (DAG.isBaseWithConstantOffset(Op: N0)) {
10646 C1 = cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
10647 N0 = N0.getOperand(i: 0);
10648 }
10649
10650 if (C1) {
10651 unsigned ImmOffset = C1->getZExtValue();
10652 // If the immediate value is too big for the immoffset field, put only bits
10653 // that would normally fit in the immoffset field. The remaining value that
10654 // is copied/added for the voffset field is a large power of 2, and it
10655 // stands more chance of being CSEd with the copy/add for another similar
10656 // load/store.
10657 // However, do not do that rounding down if the value left for the VGPR would
10658 // be negative when interpreted as a signed 32-bit offset, as it appears to be
10659 // illegal to have a negative offset in the vgpr, even if adding the immediate offset makes it positive.
10660 unsigned Overflow = ImmOffset & ~MaxImm;
10661 ImmOffset -= Overflow;
10662 if ((int32_t)Overflow < 0) {
10663 Overflow += ImmOffset;
10664 ImmOffset = 0;
10665 }
10666 C1 = cast<ConstantSDNode>(Val: DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32));
10667 if (Overflow) {
10668 auto OverflowVal = DAG.getConstant(Val: Overflow, DL, VT: MVT::i32);
10669 if (!N0)
10670 N0 = OverflowVal;
10671 else {
10672 SDValue Ops[] = {N0, OverflowVal};
10673 N0 = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, Ops);
10674 }
10675 }
10676 }
10677 if (!N0)
10678 N0 = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
10679 if (!C1)
10680 C1 = cast<ConstantSDNode>(Val: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
10681 return {N0, SDValue(C1, 0)};
10682}
10683
10684// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10685// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10686// pointed to by Offsets.
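// Three forms are tried, in order: a fully constant offset folded into
// soffset/instoffset, a base-plus-constant split, and finally everything in
// voffset with a zero soffset (SGPR_NULL on subtargets with a restricted
// soffset field).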
10687void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10688 SelectionDAG &DAG, SDValue *Offsets,
10689 Align Alignment) const {
10690 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10691 SDLoc DL(CombinedOffset);
10692 if (auto *C = dyn_cast<ConstantSDNode>(Val&: CombinedOffset)) {
10693 uint32_t Imm = C->getZExtValue();
10694 uint32_t SOffset, ImmOffset;
10695 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10696 Offsets[0] = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
10697 Offsets[1] = DAG.getConstant(Val: SOffset, DL, VT: MVT::i32);
10698 Offsets[2] = DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32);
10699 return;
10700 }
10701 }
10702 if (DAG.isBaseWithConstantOffset(Op: CombinedOffset)) {
10703 SDValue N0 = CombinedOffset.getOperand(i: 0);
10704 SDValue N1 = CombinedOffset.getOperand(i: 1);
10705 uint32_t SOffset, ImmOffset;
10706 int Offset = cast<ConstantSDNode>(Val&: N1)->getSExtValue();
10707 if (Offset >= 0 &&
10708 TII->splitMUBUFOffset(Imm: Offset, SOffset, ImmOffset, Alignment)) {
10709 Offsets[0] = N0;
10710 Offsets[1] = DAG.getConstant(Val: SOffset, DL, VT: MVT::i32);
10711 Offsets[2] = DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32);
10712 return;
10713 }
10714 }
10715
10716 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10717 ? DAG.getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32)
10718 : DAG.getConstant(Val: 0, DL, VT: MVT::i32);
10719
10720 Offsets[0] = CombinedOffset;
10721 Offsets[1] = SOffsetZero;
10722 Offsets[2] = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32);
10723}
10724
10725SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10726 SelectionDAG &DAG) const {
10727 if (!MaybePointer.getValueType().isScalarInteger())
10728 return MaybePointer;
10729
10730 SDValue Rsrc = DAG.getBitcast(VT: MVT::v4i32, V: MaybePointer);
10731 return Rsrc;
10732}
10733
10734// Wrap a global or flat pointer into a buffer intrinsic using the flags
10735// specified in the intrinsic.
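// The resulting V# is built as: word0 = pointer[31:0], word1 = pointer[47:32]
// with the 16-bit stride in its upper half, word2 = NumRecords,
// word3 = Flags.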
10736SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10737 SelectionDAG &DAG) const {
10738 SDLoc Loc(Op);
10739
10740 SDValue Pointer = Op->getOperand(Num: 1);
10741 SDValue Stride = Op->getOperand(Num: 2);
10742 SDValue NumRecords = Op->getOperand(Num: 3);
10743 SDValue Flags = Op->getOperand(Num: 4);
10744
10745 auto [LowHalf, HighHalf] = DAG.SplitScalar(N: Pointer, DL: Loc, LoVT: MVT::i32, HiVT: MVT::i32);
10746 SDValue Mask = DAG.getConstant(Val: 0x0000ffff, DL: Loc, VT: MVT::i32);
10747 SDValue Masked = DAG.getNode(Opcode: ISD::AND, DL: Loc, VT: MVT::i32, N1: HighHalf, N2: Mask);
10748 std::optional<uint32_t> ConstStride = std::nullopt;
10749 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Val&: Stride))
10750 ConstStride = ConstNode->getZExtValue();
10751
10752 SDValue NewHighHalf = Masked;
10753 if (!ConstStride || *ConstStride != 0) {
10754 SDValue ShiftedStride;
10755 if (ConstStride) {
10756 ShiftedStride = DAG.getConstant(Val: *ConstStride << 16, DL: Loc, VT: MVT::i32);
10757 } else {
10758 SDValue ExtStride = DAG.getAnyExtOrTrunc(Op: Stride, DL: Loc, VT: MVT::i32);
10759 ShiftedStride =
10760 DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i32, N1: ExtStride,
10761 N2: DAG.getShiftAmountConstant(Val: 16, VT: MVT::i32, DL: Loc));
10762 }
10763 NewHighHalf = DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i32, N1: Masked, N2: ShiftedStride);
10764 }
10765
10766 SDValue Rsrc = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v4i32, N1: LowHalf,
10767 N2: NewHighHalf, N3: NumRecords, N4: Flags);
10768 SDValue RsrcPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: Loc, VT: MVT::i128, Operand: Rsrc);
10769 return RsrcPtr;
10770}
10771
10772 // Handle 8-bit and 16-bit buffer loads
10773SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
10774 EVT LoadVT, SDLoc DL,
10775 ArrayRef<SDValue> Ops,
10776 MachineMemOperand *MMO,
10777 bool IsTFE) const {
10778 EVT IntVT = LoadVT.changeTypeToInteger();
10779
10780 if (IsTFE) {
10781 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
10782 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
10783 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
10784 MachineFunction &MF = DAG.getMachineFunction();
10785 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, Offset: 0, Size: 8);
10786 SDVTList VTs = DAG.getVTList(VT1: MVT::v2i32, VT2: MVT::Other);
10787 SDValue Op = getMemIntrinsicNode(Opcode: Opc, DL, VTList: VTs, Ops, MemVT: MVT::v2i32, MMO: OpMMO, DAG);
10788 SDValue Status = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
10789 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
10790 SDValue Data = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
10791 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i32));
10792 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: Data);
10793 SDValue Value = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: Trunc);
10794 return DAG.getMergeValues(Ops: {Value, Status, SDValue(Op.getNode(), 1)}, dl: DL);
10795 }
10796
10797 unsigned Opc = LoadVT.getScalarType() == MVT::i8
10798 ? AMDGPUISD::BUFFER_LOAD_UBYTE
10799 : AMDGPUISD::BUFFER_LOAD_USHORT;
10800
10801 SDVTList ResList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other);
10802 SDValue BufferLoad =
10803 DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: IntVT, MMO);
10804 SDValue LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: BufferLoad);
10805 LoadVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: LoadVal);
10806
10807 return DAG.getMergeValues(Ops: {LoadVal, BufferLoad.getValue(R: 1)}, dl: DL);
10808}
10809
10810 // Handle 8-bit and 16-bit buffer stores
10811SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10812 EVT VDataType, SDLoc DL,
10813 SDValue Ops[],
10814 MemSDNode *M) const {
10815 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10816 Ops[1] = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i16, Operand: Ops[1]);
10817
10818 SDValue BufferStoreExt = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Ops[1]);
10819 Ops[1] = BufferStoreExt;
10820 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
10821 : AMDGPUISD::BUFFER_STORE_SHORT;
10822 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10823 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: M->getVTList(), Ops: OpsRef, MemVT: VDataType,
10824 MMO: M->getMemOperand());
10825}
10826
10827static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
10828 SDValue Op, const SDLoc &SL, EVT VT) {
10829 if (VT.bitsLT(VT: Op.getValueType()))
10830 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Op);
10831
10832 switch (ExtType) {
10833 case ISD::SEXTLOAD:
10834 return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: SL, VT, Operand: Op);
10835 case ISD::ZEXTLOAD:
10836 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT, Operand: Op);
10837 case ISD::EXTLOAD:
10838 return DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT, Operand: Op);
10839 case ISD::NON_EXTLOAD:
10840 return Op;
10841 }
10842
10843 llvm_unreachable("invalid ext type");
10844}
10845
10846 // Try to turn 8-bit and 16-bit scalar loads into SMEM-eligible 32-bit loads.
10847// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
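// The widened form is a naturally aligned i32 load followed by a
// sign/zero-extend-in-reg that re-creates the original extload semantics,
// plus a bitcast back to the original (possibly floating-point) type.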
10848SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
10849 DAGCombinerInfo &DCI) const {
10850 SelectionDAG &DAG = DCI.DAG;
10851 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10852 return SDValue();
10853
10854 // FIXME: Constant loads should all be marked invariant.
10855 unsigned AS = Ld->getAddressSpace();
10856 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10857 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10858 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10859 return SDValue();
10860
10861 // Don't do this early, since it may interfere with adjacent load merging for
10862 // illegal types. We can avoid losing alignment information for exotic types
10863 // pre-legalize.
10864 EVT MemVT = Ld->getMemoryVT();
10865 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10866 MemVT.getSizeInBits() >= 32)
10867 return SDValue();
10868
10869 SDLoc SL(Ld);
10870
10871 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10872 "unexpected vector extload");
10873
10874 // TODO: Drop only high part of range.
10875 SDValue Ptr = Ld->getBasePtr();
10876 SDValue NewLoad = DAG.getLoad(
10877 AM: ISD::UNINDEXED, ExtType: ISD::NON_EXTLOAD, VT: MVT::i32, dl: SL, Chain: Ld->getChain(), Ptr,
10878 Offset: Ld->getOffset(), PtrInfo: Ld->getPointerInfo(), MemVT: MVT::i32, Alignment: Ld->getAlign(),
10879 MMOFlags: Ld->getMemOperand()->getFlags(), AAInfo: Ld->getAAInfo(),
10880 Ranges: nullptr); // Drop ranges
10881
10882 EVT TruncVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
10883 if (MemVT.isFloatingPoint()) {
10884 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
10885 "unexpected fp extload");
10886 TruncVT = MemVT.changeTypeToInteger();
10887 }
10888
10889 SDValue Cvt = NewLoad;
10890 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10891 Cvt = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL: SL, VT: MVT::i32, N1: NewLoad,
10892 N2: DAG.getValueType(TruncVT));
10893 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10894 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10895 Cvt = DAG.getZeroExtendInReg(Op: NewLoad, DL: SL, VT: TruncVT);
10896 } else {
10897 assert(Ld->getExtensionType() == ISD::EXTLOAD);
10898 }
10899
10900 EVT VT = Ld->getValueType(ResNo: 0);
10901 EVT IntVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: VT.getSizeInBits());
10902
10903 DCI.AddToWorklist(N: Cvt.getNode());
10904
10905 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10906 // the appropriate extension from the 32-bit load.
10907 Cvt = getLoadExtOrTrunc(DAG, ExtType: Ld->getExtensionType(), Op: Cvt, SL, VT: IntVT);
10908 DCI.AddToWorklist(N: Cvt.getNode());
10909
10910 // Handle conversion back to floating point if necessary.
10911 Cvt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Cvt);
10912
10913 return DAG.getMergeValues(Ops: {Cvt, NewLoad.getValue(R: 1)}, dl: SL);
10914}
10915
10916static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10917 const SIMachineFunctionInfo &Info) {
10918 // TODO: Should check if the address can definitely not access stack.
10919 if (Info.isEntryFunction())
10920 return Info.getUserSGPRInfo().hasFlatScratchInit();
10921 return true;
10922}
10923
10924SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10925 SDLoc DL(Op);
10926 LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
10927 ISD::LoadExtType ExtType = Load->getExtensionType();
10928 EVT MemVT = Load->getMemoryVT();
10929 MachineMemOperand *MMO = Load->getMemOperand();
10930
10931 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10932 if (MemVT == MVT::i16 && isTypeLegal(VT: MVT::i16))
10933 return SDValue();
10934
10935 // FIXME: Copied from PPC
10936 // First, load into 32 bits, then truncate to 1 bit.
10937
10938 SDValue Chain = Load->getChain();
10939 SDValue BasePtr = Load->getBasePtr();
10940
10941 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10942
10943 SDValue NewLD = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl: DL, VT: MVT::i32, Chain, Ptr: BasePtr,
10944 MemVT: RealMemVT, MMO);
10945
10946 if (!MemVT.isVector()) {
10947 SDValue Ops[] = {DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVT, Operand: NewLD),
10948 NewLD.getValue(R: 1)};
10949
10950 return DAG.getMergeValues(Ops, dl: DL);
10951 }
10952
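// For a vector of i1 (a sketch of the loop below): element I of the result
// lives in bit I of the widened 32-bit load, so each element is recovered
// with a logical shift right by I followed by a truncate to i1.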
10953 SmallVector<SDValue, 3> Elts;
10954 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10955 SDValue Elt = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: NewLD,
10956 N2: DAG.getConstant(Val: I, DL, VT: MVT::i32));
10957
10958 Elts.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Elt));
10959 }
10960
10961 SDValue Ops[] = {DAG.getBuildVector(VT: MemVT, DL, Ops: Elts), NewLD.getValue(R: 1)};
10962
10963 return DAG.getMergeValues(Ops, dl: DL);
10964 }
10965
10966 if (!MemVT.isVector())
10967 return SDValue();
10968
10969 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10970 "Custom lowering for non-i32 vectors hasn't been implemented.");
10971
10972 Align Alignment = Load->getAlign();
10973 unsigned AS = Load->getAddressSpace();
10974 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10975 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10976 return SplitVectorLoad(Op, DAG);
10977 }
10978
10979 MachineFunction &MF = DAG.getMachineFunction();
10980 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10981 // If there is a possibility that a flat instruction may access scratch memory,
10982 // then we need to use the same legalization rules we use for private accesses.
10983 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10984 !Subtarget->hasMultiDwordFlatScratchAddressing())
10985 AS = addressMayBeAccessedAsPrivate(MMO: Load->getMemOperand(), Info: *MFI)
10986 ? AMDGPUAS::PRIVATE_ADDRESS
10987 : AMDGPUAS::GLOBAL_ADDRESS;
10988
10989 unsigned NumElements = MemVT.getVectorNumElements();
10990
10991 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10992 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10993 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
10994 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
10995 isMemOpHasNoClobberedMemOperand(N: Load))) {
10996 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
10997 Alignment >= Align(4) && NumElements < 32) {
10998 if (MemVT.isPow2VectorType() ||
10999 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11000 return SDValue();
11001 return WidenOrSplitVectorLoad(Op, DAG);
11002 }
11003 // Non-uniform loads will be selected to MUBUF instructions, so they
11004 // have the same legalization requirements as global and private
11005 // loads.
11006 //
11007 }
11008 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11009 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11010 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11011 if (NumElements > 4)
11012 return SplitVectorLoad(Op, DAG);
11013 // v3 loads not supported on SI.
11014 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11015 return WidenOrSplitVectorLoad(Op, DAG);
11016
11017 // v3 and v4 loads are supported for private and global memory.
11018 return SDValue();
11019 }
11020 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11021 // Depending on the setting of the private_element_size field in the
11022 // resource descriptor, we can only make private accesses up to a certain
11023 // size.
11024 switch (Subtarget->getMaxPrivateElementSize()) {
11025 case 4: {
11026 auto [Op0, Op1] = scalarizeVectorLoad(LD: Load, DAG);
11027 return DAG.getMergeValues(Ops: {Op0, Op1}, dl: DL);
11028 }
11029 case 8:
11030 if (NumElements > 2)
11031 return SplitVectorLoad(Op, DAG);
11032 return SDValue();
11033 case 16:
11034 // Same as global/flat
11035 if (NumElements > 4)
11036 return SplitVectorLoad(Op, DAG);
11037 // v3 loads not supported on SI.
11038 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11039 return WidenOrSplitVectorLoad(Op, DAG);
11040
11041 return SDValue();
11042 default:
11043 llvm_unreachable("unsupported private_element_size");
11044 }
11045 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11046 unsigned Fast = 0;
11047 auto Flags = Load->getMemOperand()->getFlags();
11048 if (allowsMisalignedMemoryAccessesImpl(Size: MemVT.getSizeInBits(), AddrSpace: AS,
11049 Alignment: Load->getAlign(), Flags, IsFast: &Fast) &&
11050 Fast > 1)
11051 return SDValue();
11052
11053 if (MemVT.isVector())
11054 return SplitVectorLoad(Op, DAG);
11055 }
11056
11057 if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
11058 VT: MemVT, MMO: *Load->getMemOperand())) {
11059 auto [Op0, Op1] = expandUnalignedLoad(LD: Load, DAG);
11060 return DAG.getMergeValues(Ops: {Op0, Op1}, dl: DL);
11061 }
11062
11063 return SDValue();
11064}
11065
11066SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
11067 EVT VT = Op.getValueType();
11068 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
11069 VT.getSizeInBits() == 512)
11070 return splitTernaryVectorOp(Op, DAG);
11071
11072 assert(VT.getSizeInBits() == 64);
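  // A sketch of the decomposition performed below: a 64-bit select is lowered
  // as two 32-bit selects on the low and high halves,
  //   select c, x:i64, y:i64
  //     -> bitcast (build_vector (select c, x.lo, y.lo),
  //                              (select c, x.hi, y.hi)) to i64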
11073
11074 SDLoc DL(Op);
11075 SDValue Cond = Op.getOperand(i: 0);
11076
11077 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
11078 SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
11079
11080 SDValue LHS = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1));
11081 SDValue RHS = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 2));
11082
11083 SDValue Lo0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: LHS, N2: Zero);
11084 SDValue Lo1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: RHS, N2: Zero);
11085
11086 SDValue Lo = DAG.getSelect(DL, VT: MVT::i32, Cond, LHS: Lo0, RHS: Lo1);
11087
11088 SDValue Hi0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: LHS, N2: One);
11089 SDValue Hi1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: RHS, N2: One);
11090
11091 SDValue Hi = DAG.getSelect(DL, VT: MVT::i32, Cond, LHS: Hi0, RHS: Hi1);
11092
11093 SDValue Res = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Lo, Hi});
11094 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Res);
11095}
11096
11097// Catch division cases where we can use shortcuts with rcp and rsq
11098// instructions.
11099SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
11100 SelectionDAG &DAG) const {
11101 SDLoc SL(Op);
11102 SDValue LHS = Op.getOperand(i: 0);
11103 SDValue RHS = Op.getOperand(i: 1);
11104 EVT VT = Op.getValueType();
11105 const SDNodeFlags Flags = Op->getFlags();
11106
11107 bool AllowInaccurateRcp =
11108 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
11109
11110 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) {
11111 // Without !fpmath accuracy information, we can't do more because we don't
11112 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
11113 // f16 is always accurate enough
11114 if (!AllowInaccurateRcp && VT != MVT::f16)
11115 return SDValue();
11116
11117 if (CLHS->isExactlyValue(V: 1.0)) {
11118 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
11119 // the CI documentation have a worst-case error of 1 ulp.
11120 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
11121 // use them as long as we aren't trying to use denormals.
11122 //
11123 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
11124
11125 // 1.0 / sqrt(x) -> rsq(x)
11126
11127 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
11128 // error seems really high at 2^29 ULP.
11129 // 1.0 / x -> rcp(x)
11130 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS);
11131 }
11132
11133 // Same as for 1.0, but expand the sign out of the constant.
11134 if (CLHS->isExactlyValue(V: -1.0)) {
11135 // -1.0 / x -> rcp (fneg x)
11136 SDValue FNegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
11137 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: FNegRHS);
11138 }
11139 }
11140
11141 // For f16 require afn or arcp.
11142 // For f32 require afn.
11143 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
11144 return SDValue();
11145
11146 // Turn into multiply by the reciprocal.
11147 // x / y -> x * (1.0 / y)
11148 SDValue Recip = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS);
11149 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LHS, N2: Recip, Flags);
11150}
11151
11152SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
11153 SelectionDAG &DAG) const {
11154 SDLoc SL(Op);
11155 SDValue X = Op.getOperand(i: 0);
11156 SDValue Y = Op.getOperand(i: 1);
11157 EVT VT = Op.getValueType();
11158 const SDNodeFlags Flags = Op->getFlags();
11159
11160 bool AllowInaccurateDiv =
11161 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
11162 if (!AllowInaccurateDiv)
11163 return SDValue();
11164
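  // A sketch of the math implemented by the node sequence below: refine
  // r ~= 1/y with two Newton-Raphson steps, then correct the quotient once:
  //   r'  = fma(fma(-y, r, 1), r, r)     // r * (2 - y*r), applied twice
  //   q   = x * r''
  //   res = fma(fma(-y, q, x), r'', q)   // q + r'' * (x - y*q)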
11165 SDValue NegY = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Y);
11166 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
11167
11168 SDValue R = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: Y);
11169 SDValue Tmp0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One);
11170
11171 R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp0, N2: R, N3: R);
11172 SDValue Tmp1 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One);
11173 R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp1, N2: R, N3: R);
11174 SDValue Ret = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: R);
11175 SDValue Tmp2 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: Ret, N3: X);
11176 return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp2, N2: R, N3: Ret);
11177}
11178
11179static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
11180 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
11181 SDNodeFlags Flags) {
11182 if (GlueChain->getNumValues() <= 1) {
11183 return DAG.getNode(Opcode, DL: SL, VT, N1: A, N2: B, Flags);
11184 }
11185
11186 assert(GlueChain->getNumValues() == 3);
11187
11188 SDVTList VTList = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue);
11189 switch (Opcode) {
11190 default:
11191 llvm_unreachable("no chain equivalent for opcode");
11192 case ISD::FMUL:
11193 Opcode = AMDGPUISD::FMUL_W_CHAIN;
11194 break;
11195 }
11196
11197 return DAG.getNode(Opcode, DL: SL, VTList,
11198 Ops: {GlueChain.getValue(R: 1), A, B, GlueChain.getValue(R: 2)},
11199 Flags);
11200}
11201
11202static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
11203 EVT VT, SDValue A, SDValue B, SDValue C,
11204 SDValue GlueChain, SDNodeFlags Flags) {
11205 if (GlueChain->getNumValues() <= 1) {
11206 return DAG.getNode(Opcode, DL: SL, VT, Ops: {A, B, C}, Flags);
11207 }
11208
11209 assert(GlueChain->getNumValues() == 3);
11210
11211 SDVTList VTList = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue);
11212 switch (Opcode) {
11213 default:
11214 llvm_unreachable("no chain equivalent for opcode");
11215 case ISD::FMA:
11216 Opcode = AMDGPUISD::FMA_W_CHAIN;
11217 break;
11218 }
11219
11220 return DAG.getNode(Opcode, DL: SL, VTList,
11221 Ops: {GlueChain.getValue(R: 1), A, B, C, GlueChain.getValue(R: 2)},
11222 Flags);
11223}
11224
11225SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
11226 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11227 return FastLowered;
11228
11229 SDLoc SL(Op);
11230 SDValue LHS = Op.getOperand(i: 0);
11231 SDValue RHS = Op.getOperand(i: 1);
11232
11233 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
11234 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
11235 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
11236 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
11237 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
11238 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
11239 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
11240 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
11241 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
11242 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
11243 // q16.u = opx(V_CVT_F16_F32, q32.u);
11244 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
11245
11246 // We will use ISD::FMA on targets that don't support ISD::FMAD.
11247 unsigned FMADOpCode =
11248 isOperationLegal(Op: ISD::FMAD, VT: MVT::f32) ? ISD::FMAD : ISD::FMA;
11249
11250 SDValue LHSExt = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: LHS);
11251 SDValue RHSExt = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: RHS);
11252 SDValue NegRHSExt = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: RHSExt);
11253 SDValue Rcp =
11254 DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: RHSExt, Flags: Op->getFlags());
11255 SDValue Quot =
11256 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: LHSExt, N2: Rcp, Flags: Op->getFlags());
11257 SDValue Err = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: NegRHSExt, N2: Quot, N3: LHSExt,
11258 Flags: Op->getFlags());
11259 Quot = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: Err, N2: Rcp, N3: Quot, Flags: Op->getFlags());
11260 Err = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: NegRHSExt, N2: Quot, N3: LHSExt,
11261 Flags: Op->getFlags());
11262 SDValue Tmp = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: Err, N2: Rcp, Flags: Op->getFlags());
11263 SDValue TmpCast = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Tmp);
11264 TmpCast = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: TmpCast,
11265 N2: DAG.getConstant(Val: 0xff800000, DL: SL, VT: MVT::i32));
11266 Tmp = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: TmpCast);
11267 Quot = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f32, N1: Tmp, N2: Quot, Flags: Op->getFlags());
11268 SDValue RDst = DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::f16, N1: Quot,
11269 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32));
11270 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f16, N1: RDst, N2: RHS, N3: LHS,
11271 Flags: Op->getFlags());
11272}
11273
11274// Faster 2.5 ULP division that does not support denormals.
11275SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
11276 SDNodeFlags Flags = Op->getFlags();
11277 SDLoc SL(Op);
11278 SDValue LHS = Op.getOperand(i: 1);
11279 SDValue RHS = Op.getOperand(i: 2);
11280
11281 SDValue r1 = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: MVT::f32, Operand: RHS, Flags);
11282
11283 const APFloat K0Val(0x1p+96f);
11284 const SDValue K0 = DAG.getConstantFP(Val: K0Val, DL: SL, VT: MVT::f32);
11285
11286 const APFloat K1Val(0x1p-32f);
11287 const SDValue K1 = DAG.getConstantFP(Val: K1Val, DL: SL, VT: MVT::f32);
11288
11289 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f32);
11290
11291 EVT SetCCVT =
11292 getSetCCResultType(DL: DAG.getDataLayout(), Ctx&: *DAG.getContext(), VT: MVT::f32);
11293
11294 SDValue r2 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: r1, RHS: K0, Cond: ISD::SETOGT);
11295
11296 SDValue r3 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f32, N1: r2, N2: K1, N3: One, Flags);
11297
11298 r1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: RHS, N2: r3, Flags);
11299
11300 // rcp does not support denormals.
11301 SDValue r0 = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: r1, Flags);
11302
11303 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: LHS, N2: r0, Flags);
11304
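  // A note on why the scaling above is correct (sketch): r3 is 2^-32 when
  // |rhs| > 2^96 and 1.0 otherwise, so the value returned below is
  //   (lhs * rcp(rhs * r3)) * r3 == lhs / rhs,
  // with the pre-scale keeping rcp's operand in range and the post-scale
  // cancelling it.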
11305 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: r3, N2: Mul, Flags);
11306}
11307
11308// Returns immediate value for setting the F32 denorm mode when using the
11309// S_DENORM_MODE instruction.
11310static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
11311 const SIMachineFunctionInfo *Info,
11312 const GCNSubtarget *ST) {
11313 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
11314 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
11315 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
11316 return DAG.getTargetConstant(Val: Mode, DL: SDLoc(), VT: MVT::i32);
11317}
11318
11319SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
11320 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11321 return FastLowered;
11322
11323 // The selection matcher assumes anything with a chain selects to a
11324 // mayRaiseFPException machine instruction. Since we're introducing a chain
11325 // here, we need to explicitly report nofpexcept for the regular fdiv
11326 // lowering.
11327 SDNodeFlags Flags = Op->getFlags();
11328 Flags.setNoFPExcept(true);
11329
11330 SDLoc SL(Op);
11331 SDValue LHS = Op.getOperand(i: 0);
11332 SDValue RHS = Op.getOperand(i: 1);
11333
11334 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f32);
11335
11336 SDVTList ScaleVT = DAG.getVTList(VT1: MVT::f32, VT2: MVT::i1);
11337
11338 SDValue DenominatorScaled =
11339 DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, Ops: {RHS, RHS, LHS}, Flags);
11340 SDValue NumeratorScaled =
11341 DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, Ops: {LHS, RHS, LHS}, Flags);
11342
11343 // Denominator is scaled to not be denormal, so using rcp is ok.
11344 SDValue ApproxRcp =
11345 DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: DenominatorScaled, Flags);
11346 SDValue NegDivScale0 =
11347 DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: DenominatorScaled, Flags);
11348
11349 using namespace AMDGPU::Hwreg;
11350 const unsigned Denorm32Reg = HwregEncoding::encode(Values: ID_MODE, Values: 4, Values: 2);
11351 const SDValue BitField = DAG.getTargetConstant(Val: Denorm32Reg, DL: SL, VT: MVT::i32);
11352
11353 const MachineFunction &MF = DAG.getMachineFunction();
11354 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
11355 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
11356
11357 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
11358 const bool HasDynamicDenormals =
11359 (DenormMode.Input == DenormalMode::Dynamic) ||
11360 (DenormMode.Output == DenormalMode::Dynamic);
11361
11362 SDValue SavedDenormMode;
11363
11364 if (!PreservesDenormals) {
11365 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
11366 // lowering. The chain dependence is insufficient, and we need glue. We do
11367 // not need the glue variants in a strictfp function.
11368
11369 SDVTList BindParamVTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
11370
11371 SDValue Glue = DAG.getEntryNode();
11372 if (HasDynamicDenormals) {
11373 SDNode *GetReg = DAG.getMachineNode(Opcode: AMDGPU::S_GETREG_B32, dl: SL,
11374 VTs: DAG.getVTList(VT1: MVT::i32, VT2: MVT::Glue),
11375 Ops: {BitField, Glue});
11376 SavedDenormMode = SDValue(GetReg, 0);
11377
11378 Glue = DAG.getMergeValues(
11379 Ops: {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, dl: SL);
11380 }
11381
11382 SDNode *EnableDenorm;
11383 if (Subtarget->hasDenormModeInst()) {
11384 const SDValue EnableDenormValue =
11385 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, ST: Subtarget);
11386
11387 EnableDenorm = DAG.getNode(Opcode: AMDGPUISD::DENORM_MODE, DL: SL, VTList: BindParamVTs, N1: Glue,
11388 N2: EnableDenormValue)
11389 .getNode();
11390 } else {
11391 const SDValue EnableDenormValue =
11392 DAG.getConstant(FP_DENORM_FLUSH_NONE, DL: SL, VT: MVT::i32);
11393 EnableDenorm = DAG.getMachineNode(Opcode: AMDGPU::S_SETREG_B32, dl: SL, VTs: BindParamVTs,
11394 Ops: {EnableDenormValue, BitField, Glue});
11395 }
11396
11397 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
11398 SDValue(EnableDenorm, 1)};
11399
11400 NegDivScale0 = DAG.getMergeValues(Ops, dl: SL);
11401 }
11402
11403 SDValue Fma0 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0,
11404 B: ApproxRcp, C: One, GlueChain: NegDivScale0, Flags);
11405
11406 SDValue Fma1 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: Fma0, B: ApproxRcp,
11407 C: ApproxRcp, GlueChain: Fma0, Flags);
11408
11409 SDValue Mul = getFPBinOp(DAG, Opcode: ISD::FMUL, SL, VT: MVT::f32, A: NumeratorScaled, B: Fma1,
11410 GlueChain: Fma1, Flags);
11411
11412 SDValue Fma2 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0, B: Mul,
11413 C: NumeratorScaled, GlueChain: Mul, Flags);
11414
11415 SDValue Fma3 =
11416 getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: Fma2, B: Fma1, C: Mul, GlueChain: Fma2, Flags);
11417
11418 SDValue Fma4 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0, B: Fma3,
11419 C: NumeratorScaled, GlueChain: Fma3, Flags);
11420
11421 if (!PreservesDenormals) {
11422 SDNode *DisableDenorm;
11423 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
11424 const SDValue DisableDenormValue = getSPDenormModeValue(
11425 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, ST: Subtarget);
11426
11427 SDVTList BindParamVTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
11428 DisableDenorm =
11429 DAG.getNode(Opcode: AMDGPUISD::DENORM_MODE, DL: SL, VTList: BindParamVTs,
11430 N1: Fma4.getValue(R: 1), N2: DisableDenormValue, N3: Fma4.getValue(R: 2))
11431 .getNode();
11432 } else {
11433 assert(HasDynamicDenormals == (bool)SavedDenormMode);
11434 const SDValue DisableDenormValue =
11435 HasDynamicDenormals
11436 ? SavedDenormMode
11437 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, DL: SL, VT: MVT::i32);
11438
11439 DisableDenorm = DAG.getMachineNode(
11440 Opcode: AMDGPU::S_SETREG_B32, dl: SL, VT: MVT::Other,
11441 Ops: {DisableDenormValue, BitField, Fma4.getValue(R: 1), Fma4.getValue(R: 2)});
11442 }
11443
11444 SDValue OutputChain = DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other,
11445 N1: SDValue(DisableDenorm, 0), N2: DAG.getRoot());
11446 DAG.setRoot(OutputChain);
11447 }
11448
11449 SDValue Scale = NumeratorScaled.getValue(R: 1);
11450 SDValue Fmas = DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL: SL, VT: MVT::f32,
11451 Ops: {Fma4, Fma1, Fma3, Scale}, Flags);
11452
11453 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f32, N1: Fmas, N2: RHS, N3: LHS, Flags);
11454}
11455
11456SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
11457 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
11458 return FastLowered;
11459
11460 SDLoc SL(Op);
11461 SDValue X = Op.getOperand(i: 0);
11462 SDValue Y = Op.getOperand(i: 1);
11463
11464 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f64);
11465
11466 SDVTList ScaleVT = DAG.getVTList(VT1: MVT::f64, VT2: MVT::i1);
11467
11468 SDValue DivScale0 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: Y, N2: Y, N3: X);
11469
11470 SDValue NegDivScale0 = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f64, Operand: DivScale0);
11471
11472 SDValue Rcp = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f64, Operand: DivScale0);
11473
11474 SDValue Fma0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Rcp, N3: One);
11475
11476 SDValue Fma1 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: Rcp, N2: Fma0, N3: Rcp);
11477
11478 SDValue Fma2 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Fma1, N3: One);
11479
11480 SDValue DivScale1 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: X, N2: Y, N3: X);
11481
11482 SDValue Fma3 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: Fma1, N2: Fma2, N3: Fma1);
11483 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f64, N1: DivScale1, N2: Fma3);
11484
11485 SDValue Fma4 =
11486 DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Mul, N3: DivScale1);
11487
11488 SDValue Scale;
11489
11490 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
11491 // Workaround a hardware bug on SI where the condition output from div_scale
11492 // is not usable.
11493
11494 const SDValue Hi = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
11495
11496 // Figure out which scale to use for div_fmas.
11497 SDValue NumBC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: X);
11498 SDValue DenBC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Y);
11499 SDValue Scale0BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: DivScale0);
11500 SDValue Scale1BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: DivScale1);
11501
11502 SDValue NumHi =
11503 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: NumBC, N2: Hi);
11504 SDValue DenHi =
11505 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: DenBC, N2: Hi);
11506
11507 SDValue Scale0Hi =
11508 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Scale0BC, N2: Hi);
11509 SDValue Scale1Hi =
11510 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Scale1BC, N2: Hi);
11511
11512 SDValue CmpDen = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: DenHi, RHS: Scale0Hi, Cond: ISD::SETEQ);
11513 SDValue CmpNum = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: NumHi, RHS: Scale1Hi, Cond: ISD::SETEQ);
11514 Scale = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: CmpNum, N2: CmpDen);
11515 } else {
11516 Scale = DivScale1.getValue(R: 1);
11517 }
11518
11519 SDValue Fmas =
11520 DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL: SL, VT: MVT::f64, N1: Fma4, N2: Fma3, N3: Mul, N4: Scale);
11521
11522 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f64, N1: Fmas, N2: Y, N3: X);
11523}
11524
11525SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
11526 EVT VT = Op.getValueType();
11527
11528 if (VT == MVT::f32)
11529 return LowerFDIV32(Op, DAG);
11530
11531 if (VT == MVT::f64)
11532 return LowerFDIV64(Op, DAG);
11533
11534 if (VT == MVT::f16)
11535 return LowerFDIV16(Op, DAG);
11536
11537 llvm_unreachable("Unexpected type for fdiv");
11538}
11539
11540SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
11541 SDLoc dl(Op);
11542 SDValue Val = Op.getOperand(i: 0);
11543 EVT VT = Val.getValueType();
11544 EVT ResultExpVT = Op->getValueType(ResNo: 1);
11545 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11546
11547 SDValue Mant = DAG.getNode(
11548 Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT,
11549 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_frexp_mant, DL: dl, VT: MVT::i32), N2: Val);
11550
11551 SDValue Exp = DAG.getNode(
11552 Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: InstrExpVT,
11553 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_frexp_exp, DL: dl, VT: MVT::i32), N2: Val);
11554
11555 if (Subtarget->hasFractBug()) {
11556 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: dl, VT, Operand: Val);
11557 SDValue Inf =
11558 DAG.getConstantFP(Val: APFloat::getInf(Sem: VT.getFltSemantics()), DL: dl, VT);
11559
11560 SDValue IsFinite = DAG.getSetCC(DL: dl, VT: MVT::i1, LHS: Fabs, RHS: Inf, Cond: ISD::SETOLT);
11561 SDValue Zero = DAG.getConstant(Val: 0, DL: dl, VT: InstrExpVT);
11562 Exp = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: InstrExpVT, N1: IsFinite, N2: Exp, N3: Zero);
11563 Mant = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT, N1: IsFinite, N2: Mant, N3: Val);
11564 }
11565
11566 SDValue CastExp = DAG.getSExtOrTrunc(Op: Exp, DL: dl, VT: ResultExpVT);
11567 return DAG.getMergeValues(Ops: {Mant, CastExp}, dl);
11568}
11569
11570SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
11571 SDLoc DL(Op);
11572 StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
11573 EVT VT = Store->getMemoryVT();
11574
11575 if (VT == MVT::i1) {
11576 return DAG.getTruncStore(
11577 Chain: Store->getChain(), dl: DL,
11578 Val: DAG.getSExtOrTrunc(Op: Store->getValue(), DL, VT: MVT::i32),
11579 Ptr: Store->getBasePtr(), SVT: MVT::i1, MMO: Store->getMemOperand());
11580 }
11581
11582 assert(VT.isVector() &&
11583 Store->getValue().getValueType().getScalarType() == MVT::i32);
11584
11585 unsigned AS = Store->getAddressSpace();
11586 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11587 Store->getAlign().value() < VT.getStoreSize() &&
11588 VT.getSizeInBits() > 32) {
11589 return SplitVectorStore(Op, DAG);
11590 }
11591
11592 MachineFunction &MF = DAG.getMachineFunction();
11593 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11594 // If there is a possibility that a flat instruction may access scratch memory,
11595 // then we need to use the same legalization rules we use for private accesses.
11596 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11597 !Subtarget->hasMultiDwordFlatScratchAddressing())
11598 AS = addressMayBeAccessedAsPrivate(MMO: Store->getMemOperand(), Info: *MFI)
11599 ? AMDGPUAS::PRIVATE_ADDRESS
11600 : AMDGPUAS::GLOBAL_ADDRESS;
11601
11602 unsigned NumElements = VT.getVectorNumElements();
11603 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11604 if (NumElements > 4)
11605 return SplitVectorStore(Op, DAG);
11606 // v3 stores not supported on SI.
11607 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11608 return SplitVectorStore(Op, DAG);
11609
11610 if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
11611 VT, MMO: *Store->getMemOperand()))
11612 return expandUnalignedStore(ST: Store, DAG);
11613
11614 return SDValue();
11615 }
11616 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11617 switch (Subtarget->getMaxPrivateElementSize()) {
11618 case 4:
11619 return scalarizeVectorStore(ST: Store, DAG);
11620 case 8:
11621 if (NumElements > 2)
11622 return SplitVectorStore(Op, DAG);
11623 return SDValue();
11624 case 16:
11625 if (NumElements > 4 ||
11626 (NumElements == 3 && !Subtarget->enableFlatScratch()))
11627 return SplitVectorStore(Op, DAG);
11628 return SDValue();
11629 default:
11630 llvm_unreachable("unsupported private_element_size");
11631 }
11632 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11633 unsigned Fast = 0;
11634 auto Flags = Store->getMemOperand()->getFlags();
11635 if (allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace: AS,
11636 Alignment: Store->getAlign(), Flags, IsFast: &Fast) &&
11637 Fast > 1)
11638 return SDValue();
11639
11640 if (VT.isVector())
11641 return SplitVectorStore(Op, DAG);
11642
11643 return expandUnalignedStore(ST: Store, DAG);
11644 }
11645
11646 // Probably an invalid store. If so we'll end up emitting a selection error.
11647 return SDValue();
11648}
11649
11650// Avoid the full correct expansion for f32 sqrt when promoting from f16.
11651SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
11652 SDLoc SL(Op);
11653 assert(!Subtarget->has16BitInsts());
11654 SDNodeFlags Flags = Op->getFlags();
11655 SDValue Ext =
11656 DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Op.getOperand(i: 0), Flags);
11657
11658 SDValue SqrtID = DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL: SL, VT: MVT::i32);
11659 SDValue Sqrt =
11660 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::f32, N1: SqrtID, N2: Ext, Flags);
11661
11662 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::f16, N1: Sqrt,
11663 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
11664}
11665
11666SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
11667 SDLoc DL(Op);
11668 SDNodeFlags Flags = Op->getFlags();
11669 MVT VT = Op.getValueType().getSimpleVT();
11670 const SDValue X = Op.getOperand(i: 0);
11671
11672 if (allowApproxFunc(DAG, Flags)) {
11673 // The instruction is accurate to 1 ulp but ignores denormals.
11674 return DAG.getNode(
11675 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT,
11676 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL, VT: MVT::i32), N2: X, Flags);
11677 }
11678
11679 SDValue ScaleThreshold = DAG.getConstantFP(Val: 0x1.0p-96f, DL, VT);
11680 SDValue NeedScale = DAG.getSetCC(DL, VT: MVT::i1, LHS: X, RHS: ScaleThreshold, Cond: ISD::SETOLT);
11681
11682 SDValue ScaleUpFactor = DAG.getConstantFP(Val: 0x1.0p+32f, DL, VT);
11683
11684 SDValue ScaledX = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: X, N2: ScaleUpFactor, Flags);
11685
11686 SDValue SqrtX =
11687 DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledX, N3: X, Flags);
11688
11689 SDValue SqrtS;
11690 if (needsDenormHandlingF32(DAG, Src: X, Flags)) {
11691 SDValue SqrtID =
11692 DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL, VT: MVT::i32);
11693 SqrtS = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: SqrtID, N2: SqrtX, Flags);
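    // The code below nudges the 1 ulp estimate s toward the correct result (a
    // sketch): it forms the neighbouring floats s-1ulp and s+1ulp via integer
    // adds on the bit pattern, computes the residuals fma(-(s±1ulp), s, x),
    // and selects the neighbour whose residual indicates the estimate is off.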
11694
11695 SDValue SqrtSAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: SqrtS);
11696 SDValue SqrtSNextDownInt =
11697 DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SqrtSAsInt,
11698 N2: DAG.getAllOnesConstant(DL, VT: MVT::i32));
11699 SDValue SqrtSNextDown = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextDownInt);
11700
11701 SDValue NegSqrtSNextDown =
11702 DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextDown, Flags);
11703
11704 SDValue SqrtVP =
11705 DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextDown, N2: SqrtS, N3: SqrtX, Flags);
11706
11707 SDValue SqrtSNextUpInt = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SqrtSAsInt,
11708 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
11709 SDValue SqrtSNextUp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextUpInt);
11710
11711 SDValue NegSqrtSNextUp = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextUp, Flags);
11712 SDValue SqrtVS =
11713 DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextUp, N2: SqrtS, N3: SqrtX, Flags);
11714
11715 SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL, VT);
11716 SDValue SqrtVPLE0 = DAG.getSetCC(DL, VT: MVT::i1, LHS: SqrtVP, RHS: Zero, Cond: ISD::SETOLE);
11717
11718 SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPLE0, N2: SqrtSNextDown, N3: SqrtS,
11719 Flags);
11720
11721 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, VT: MVT::i1, LHS: SqrtVS, RHS: Zero, Cond: ISD::SETOGT);
11722 SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPVSGT0, N2: SqrtSNextUp, N3: SqrtS,
11723 Flags);
11724 } else {
11725 SDValue SqrtR = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: SqrtX, Flags);
11726
11727 SqrtS = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtX, N2: SqrtR, Flags);
11728
11729 SDValue Half = DAG.getConstantFP(Val: 0.5f, DL, VT);
11730 SDValue SqrtH = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtR, N2: Half, Flags);
11731 SDValue NegSqrtH = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtH, Flags);
11732
11733 SDValue SqrtE = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtH, N2: SqrtS, N3: Half, Flags);
11734 SqrtH = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtH, N2: SqrtE, N3: SqrtH, Flags);
11735 SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtS, N2: SqrtE, N3: SqrtS, Flags);
11736
11737 SDValue NegSqrtS = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtS, Flags);
11738 SDValue SqrtD =
11739 DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtS, N2: SqrtS, N3: SqrtX, Flags);
11740 SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtD, N2: SqrtH, N3: SqrtS, Flags);
11741 }
11742
11743 SDValue ScaleDownFactor = DAG.getConstantFP(Val: 0x1.0p-16f, DL, VT);
11744
11745 SDValue ScaledDown =
11746 DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtS, N2: ScaleDownFactor, Flags);
11747
11748 SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledDown, N3: SqrtS, Flags);
11749 SDValue IsZeroOrInf =
11750 DAG.getNode(Opcode: ISD::IS_FPCLASS, DL, VT: MVT::i1, N1: SqrtX,
11751 N2: DAG.getTargetConstant(Val: fcZero | fcPosInf, DL, VT: MVT::i32));
11752
11753 return DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsZeroOrInf, N2: SqrtX, N3: SqrtS, Flags);
11754}
11755
11756SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11757 // For the double type, the SQRT and RSQ instructions don't have the required
11758 // precision, so we apply Goldschmidt's algorithm to improve the result:
11759 //
11760 // y0 = rsq(x)
11761 // g0 = x * y0
11762 // h0 = 0.5 * y0
11763 //
11764 // r0 = 0.5 - h0 * g0
11765 // g1 = g0 * r0 + g0
11766 // h1 = h0 * r0 + h0
11767 //
11768 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11769 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11770 // h2 = h1 * r1 + h1
11771 //
11772 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11773 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11774 //
11775 // sqrt(x) = g3
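  //
  // A note on the scaling added below (not part of the algorithm sketch
  // above): inputs under the threshold are pre-scaled by 2^256 and the final
  // result is re-scaled by 2^-128, using sqrt(x * 2^256) == sqrt(x) * 2^128,
  // which keeps rsq away from the denormal range.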
11776
11777 SDNodeFlags Flags = Op->getFlags();
11778
11779 SDLoc DL(Op);
11780
11781 SDValue X = Op.getOperand(i: 0);
11782 SDValue ScaleConstant = DAG.getConstantFP(Val: 0x1.0p-767, DL, VT: MVT::f64);
11783
11784 SDValue Scaling = DAG.getSetCC(DL, VT: MVT::i1, LHS: X, RHS: ScaleConstant, Cond: ISD::SETOLT);
11785
11786 SDValue ZeroInt = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
11787
11788 // Scale up input if it is too small.
11789 SDValue ScaleUpFactor = DAG.getConstant(Val: 256, DL, VT: MVT::i32);
11790 SDValue ScaleUp =
11791 DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::i32, N1: Scaling, N2: ScaleUpFactor, N3: ZeroInt);
11792 SDValue SqrtX = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: X, N2: ScaleUp, Flags);
11793
11794 SDValue SqrtY = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT: MVT::f64, Operand: SqrtX);
11795
11796 SDValue SqrtS0 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: SqrtX, N2: SqrtY);
11797
11798 SDValue Half = DAG.getConstantFP(Val: 0.5, DL, VT: MVT::f64);
11799 SDValue SqrtH0 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: SqrtY, N2: Half);
11800
11801 SDValue NegSqrtH0 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtH0);
11802 SDValue SqrtR0 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtH0, N2: SqrtS0, N3: Half);
11803
11804 SDValue SqrtH1 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtH0, N2: SqrtR0, N3: SqrtH0);
11805
11806 SDValue SqrtS1 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtS0, N2: SqrtR0, N3: SqrtS0);
11807
11808 SDValue NegSqrtS1 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtS1);
11809 SDValue SqrtD0 =
11810 DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtS1, N2: SqrtS1, N3: SqrtX);
11811
11812 SDValue SqrtS2 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtD0, N2: SqrtH1, N3: SqrtS1);
11813
11814 SDValue NegSqrtS2 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtS2);
11815 SDValue SqrtD1 =
11816 DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtS2, N2: SqrtS2, N3: SqrtX);
11817
11818 SDValue SqrtRet = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtD1, N2: SqrtH1, N3: SqrtS2);
11819
11820 SDValue ScaleDownFactor = DAG.getSignedConstant(Val: -128, DL, VT: MVT::i32);
11821 SDValue ScaleDown =
11822 DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::i32, N1: Scaling, N2: ScaleDownFactor, N3: ZeroInt);
11823 SqrtRet = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: SqrtRet, N2: ScaleDown, Flags);
11824
11825 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11826 // with finite only or nsz because rsq(+/-0) = +/-inf
11827
11828 // TODO: Check for DAZ and expand to subnormals
11829 SDValue IsZeroOrInf =
11830 DAG.getNode(Opcode: ISD::IS_FPCLASS, DL, VT: MVT::i1, N1: SqrtX,
11831 N2: DAG.getTargetConstant(Val: fcZero | fcPosInf, DL, VT: MVT::i32));
11832
11833 // If x is +INF, +0, or -0, use its original value
11834 return DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::f64, N1: IsZeroOrInf, N2: SqrtX, N3: SqrtRet,
11835 Flags);
11836}
11837
11838SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11839 SDLoc DL(Op);
11840 EVT VT = Op.getValueType();
11841 SDValue Arg = Op.getOperand(i: 0);
11842 SDValue TrigVal;
11843
11844 // Propagate fast-math flags so that the multiply we introduce can be folded
11845 // if Arg is already the result of a multiply by constant.
11846 auto Flags = Op->getFlags();
11847
11848 SDValue OneOver2Pi = DAG.getConstantFP(Val: 0.5 * numbers::inv_pi, DL, VT);
11849
11850 if (Subtarget->hasTrigReducedRange()) {
11851 SDValue MulVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags);
11852 TrigVal = DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: MulVal, Flags);
11853 } else {
11854 TrigVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags);
11855 }
11856
11857 switch (Op.getOpcode()) {
11858 case ISD::FCOS:
11859 return DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL: SDLoc(Op), VT, Operand: TrigVal, Flags);
11860 case ISD::FSIN:
11861 return DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL: SDLoc(Op), VT, Operand: TrigVal, Flags);
11862 default:
11863 llvm_unreachable("Wrong trig opcode");
11864 }
11865}
11866
11867SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
11868 SelectionDAG &DAG) const {
11869 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Val&: Op);
11870 assert(AtomicNode->isCompareAndSwap());
11871 unsigned AS = AtomicNode->getAddressSpace();
11872
11873 // No custom lowering required for local address space
11874 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11875 return Op;
11876
11877 // Non-local address spaces require custom lowering for atomic compare and
11878 // swap; the cmp and swap values are packed into a v2i32, or a v2i64 for the _X2 variants.
11879 SDLoc DL(Op);
11880 SDValue ChainIn = Op.getOperand(i: 0);
11881 SDValue Addr = Op.getOperand(i: 1);
11882 SDValue Old = Op.getOperand(i: 2);
11883 SDValue New = Op.getOperand(i: 3);
11884 EVT VT = Op.getValueType();
11885 MVT SimpleVT = VT.getSimpleVT();
11886 MVT VecType = MVT::getVectorVT(VT: SimpleVT, NumElements: 2);
11887
11888 SDValue NewOld = DAG.getBuildVector(VT: VecType, DL, Ops: {New, Old});
11889 SDValue Ops[] = {ChainIn, Addr, NewOld};
11890
11891 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::ATOMIC_CMP_SWAP, dl: DL,
11892 VTList: Op->getVTList(), Ops, MemVT: VT,
11893 MMO: AtomicNode->getMemOperand());
11894}
11895
11896//===----------------------------------------------------------------------===//
11897// Custom DAG optimizations
11898//===----------------------------------------------------------------------===//
11899
11900SDValue
11901SITargetLowering::performUCharToFloatCombine(SDNode *N,
11902 DAGCombinerInfo &DCI) const {
11903 EVT VT = N->getValueType(ResNo: 0);
11904 EVT ScalarVT = VT.getScalarType();
11905 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11906 return SDValue();
11907
11908 SelectionDAG &DAG = DCI.DAG;
11909 SDLoc DL(N);
11910
11911 SDValue Src = N->getOperand(Num: 0);
11912 EVT SrcVT = Src.getValueType();
11913
11914 // TODO: We could try to match extracting the higher bytes, which would be
11915 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11916 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11917 // about in practice.
11918 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11919 if (DAG.MaskedValueIsZero(Op: Src, Mask: APInt::getHighBitsSet(numBits: 32, hiBitsSet: 24))) {
11920 SDValue Cvt = DAG.getNode(Opcode: AMDGPUISD::CVT_F32_UBYTE0, DL, VT: MVT::f32, Operand: Src);
11921 DCI.AddToWorklist(N: Cvt.getNode());
11922
11923 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11924 if (ScalarVT != MVT::f32) {
11925 Cvt = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Cvt,
11926 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
11927 }
11928 return Cvt;
11929 }
11930 }
11931
11932 return SDValue();
11933}
11934
11935SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11936 DAGCombinerInfo &DCI) const {
11937 SDValue MagnitudeOp = N->getOperand(Num: 0);
11938 SDValue SignOp = N->getOperand(Num: 1);
11939
11940 // The generic combine for fcopysign + fp cast is too conservative with
11941 // vectors, and also gets confused by the splitting we will perform here, so
11942 // peek through FP casts.
11943 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
11944 SignOp.getOpcode() == ISD::FP_ROUND)
11945 SignOp = SignOp.getOperand(i: 0);
11946
11947 SelectionDAG &DAG = DCI.DAG;
11948 SDLoc DL(N);
11949 EVT SignVT = SignOp.getValueType();
11950
11951 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11952 // lower half with a copy.
11953 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11954 EVT MagVT = MagnitudeOp.getValueType();
11955
11956 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
11957
11958 if (MagVT.getScalarType() == MVT::f64) {
11959 EVT F32VT = MagVT.isVector()
11960 ? EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::f32, NumElements: 2 * NumElts)
11961 : MVT::v2f32;
11962
11963 SDValue MagAsVector = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: F32VT, Operand: MagnitudeOp);
11964
11965 SmallVector<SDValue, 8> NewElts;
11966 for (unsigned I = 0; I != NumElts; ++I) {
11967 SDValue MagLo =
11968 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: MagAsVector,
11969 N2: DAG.getConstant(Val: 2 * I, DL, VT: MVT::i32));
11970 SDValue MagHi =
11971 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: MagAsVector,
11972 N2: DAG.getConstant(Val: 2 * I + 1, DL, VT: MVT::i32));
11973
11974 SDValue SignOpElt =
11975 MagVT.isVector()
11976 ? DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: SignVT.getScalarType(),
11977 N1: SignOp, N2: DAG.getConstant(Val: I, DL, VT: MVT::i32))
11978 : SignOp;
11979
11980 SDValue HiOp =
11981 DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: MVT::f32, N1: MagHi, N2: SignOpElt);
11982
11983 SDValue Vector =
11984 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: MVT::v2f32, N1: MagLo, N2: HiOp);
11985
11986 SDValue NewElt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f64, Operand: Vector);
11987 NewElts.push_back(Elt: NewElt);
11988 }
11989
11990 if (NewElts.size() == 1)
11991 return NewElts[0];
11992
11993 return DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: MagVT, Ops: NewElts);
11994 }
11995
11996 if (SignVT.getScalarType() != MVT::f64)
11997 return SDValue();
11998
11999 // Reduce width of sign operand, we only need the highest bit.
12000 //
12001 // fcopysign f64:x, f64:y ->
12002 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
12003 // TODO: In some cases it might make sense to go all the way to f16.
12004
12005 EVT F32VT = MagVT.isVector()
12006 ? EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::f32, NumElements: 2 * NumElts)
12007 : MVT::v2f32;
12008
12009 SDValue SignAsVector = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: F32VT, Operand: SignOp);
12010
12011 SmallVector<SDValue, 8> F32Signs;
12012 for (unsigned I = 0; I != NumElts; ++I) {
12013 // Take sign from odd elements of cast vector
12014 SDValue SignAsF32 =
12015 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: SignAsVector,
12016 N2: DAG.getConstant(Val: 2 * I + 1, DL, VT: MVT::i32));
12017 F32Signs.push_back(Elt: SignAsF32);
12018 }
12019
12020 SDValue NewSign =
12021 NumElts == 1
12022 ? F32Signs.back()
12023 : DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL,
12024 VT: EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::f32, NumElements: NumElts),
12025 Ops: F32Signs);
12026
12027 return DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 0),
12028 N2: NewSign);
12029}
12030
12031// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
12032// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
12033// bits
12034
12035// This is a variant of
12036// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
12037//
12038// The normal DAG combiner will do this, but only if the add has one use since
12039// that would increase the number of instructions.
12040//
12041// This prevents us from seeing a constant offset that can be folded into a
12042// memory instruction's addressing mode. If we know the resulting add offset of
12043 // a pointer can be folded into an addressing offset, we can replace the pointer
12044 // operand with the add of the new constant offset. This eliminates one of the uses,
12045// and may allow the remaining use to also be simplified.
12046//
12047SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12048 EVT MemVT,
12049 DAGCombinerInfo &DCI) const {
12050 SDValue N0 = N->getOperand(Num: 0);
12051 SDValue N1 = N->getOperand(Num: 1);
12052
12053 // We only do this here for the multiple-use case, where it is profitable;
12054 // with a single use of the add, defer to the standard combine.
12055 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
12056 N0->hasOneUse())
12057 return SDValue();
12058
12059 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Val&: N1);
12060 if (!CN1)
12061 return SDValue();
12062
12063 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
12064 if (!CAdd)
12065 return SDValue();
12066
12067 SelectionDAG &DAG = DCI.DAG;
12068
12069 if (N0->getOpcode() == ISD::OR &&
12070 !DAG.haveNoCommonBitsSet(A: N0.getOperand(i: 0), B: N0.getOperand(i: 1)))
12071 return SDValue();
12072
12073 // If the resulting offset is too large, we can't fold it into the
12074 // addressing mode offset.
12075 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
12076 Type *Ty = MemVT.getTypeForEVT(Context&: *DCI.DAG.getContext());
12077
12078 AddrMode AM;
12079 AM.HasBaseReg = true;
12080 AM.BaseOffs = Offset.getSExtValue();
12081 if (!isLegalAddressingMode(DL: DCI.DAG.getDataLayout(), AM, Ty, AS: AddrSpace))
12082 return SDValue();
12083
12084 SDLoc SL(N);
12085 EVT VT = N->getValueType(ResNo: 0);
12086
12087 SDValue ShlX = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: N0.getOperand(i: 0), N2: N1);
12088 SDValue COffset = DAG.getConstant(Val: Offset, DL: SL, VT);
12089
12090 SDNodeFlags Flags;
12091 Flags.setNoUnsignedWrap(
12092 N->getFlags().hasNoUnsignedWrap() &&
12093 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
12094
12095 return DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ShlX, N2: COffset, Flags);
12096}
12097
12098 /// MemSDNode::getBasePtr() does not work for intrinsics, whose base pointer
12099 /// operand is offset by the chain and intrinsic ID. Theoretically we would also
12100 /// need to check the specific intrinsic, but they all place the pointer operand first.
12101static unsigned getBasePtrIndex(const MemSDNode *N) {
12102 switch (N->getOpcode()) {
12103 case ISD::STORE:
12104 case ISD::INTRINSIC_W_CHAIN:
12105 case ISD::INTRINSIC_VOID:
12106 return 2;
12107 default:
12108 return 1;
12109 }
12110}
12111
12112SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
12113 DAGCombinerInfo &DCI) const {
12114 SelectionDAG &DAG = DCI.DAG;
12115
12116 unsigned PtrIdx = getBasePtrIndex(N);
12117 SDValue Ptr = N->getOperand(Num: PtrIdx);
12118
12119 // TODO: We could also do this for multiplies.
12120 if (Ptr.getOpcode() == ISD::SHL) {
12121 SDValue NewPtr = performSHLPtrCombine(N: Ptr.getNode(), AddrSpace: N->getAddressSpace(),
12122 MemVT: N->getMemoryVT(), DCI);
12123 if (NewPtr) {
12124 SmallVector<SDValue, 8> NewOps(N->ops());
12125
12126 NewOps[PtrIdx] = NewPtr;
12127 return SDValue(DAG.UpdateNodeOperands(N, Ops: NewOps), 0);
12128 }
12129 }
12130
12131 return SDValue();
12132}
12133
12134static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
12135 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
12136 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
12137 (Opc == ISD::XOR && Val == 0);
12138}
12139
12140 // Break up a 64-bit bitwise operation with a constant into two 32-bit and/or/xor
12141 // operations. This will typically happen anyway for a VALU 64-bit and. This exposes
12142 // other 32-bit integer combine opportunities, since most 64-bit operations are
12143 // decomposed this way. TODO: We won't want this for SALU, especially if it is an
12144 // inline immediate.
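// For example (an illustrative constant): (and i64 %x, 0x00000000ffffffff)
// splits into (and i32 %x.lo, 0xffffffff) and (and i32 %x.hi, 0x0), both of
// which are trivially reducible, which is what bitOpWithConstantIsReducible
// checks for each half below.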
12145SDValue SITargetLowering::splitBinaryBitConstantOp(
12146 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
12147 const ConstantSDNode *CRHS) const {
12148 uint64_t Val = CRHS->getZExtValue();
12149 uint32_t ValLo = Lo_32(Value: Val);
12150 uint32_t ValHi = Hi_32(Value: Val);
12151 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12152
12153 if ((bitOpWithConstantIsReducible(Opc, Val: ValLo) ||
12154 bitOpWithConstantIsReducible(Opc, Val: ValHi)) ||
12155 (CRHS->hasOneUse() && !TII->isInlineConstant(Imm: CRHS->getAPIntValue()))) {
12156 // If we need to materialize a 64-bit immediate, it will be split up later
12157 // anyway. Avoid creating the harder to understand 64-bit immediate
12158 // materialization.
12159 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
12160 }
12161
12162 return SDValue();
12163}
12164
12165bool llvm::isBoolSGPR(SDValue V) {
12166 if (V.getValueType() != MVT::i1)
12167 return false;
12168 switch (V.getOpcode()) {
12169 default:
12170 break;
12171 case ISD::SETCC:
12172 case ISD::IS_FPCLASS:
12173 case AMDGPUISD::FP_CLASS:
12174 return true;
12175 case ISD::AND:
12176 case ISD::OR:
12177 case ISD::XOR:
12178 return isBoolSGPR(V: V.getOperand(i: 0)) && isBoolSGPR(V: V.getOperand(i: 1));
12179 case ISD::SADDO:
12180 case ISD::UADDO:
12181 case ISD::SSUBO:
12182 case ISD::USUBO:
12183 case ISD::SMULO:
12184 case ISD::UMULO:
12185 return V.getResNo() == 1;
12186 case ISD::INTRINSIC_WO_CHAIN: {
12187 unsigned IntrinsicID = V.getConstantOperandVal(i: 0);
12188 switch (IntrinsicID) {
12189 case Intrinsic::amdgcn_is_shared:
12190 case Intrinsic::amdgcn_is_private:
12191 return true;
12192 default:
12193 return false;
12194 }
12195
12196 return false;
12197 }
12198 }
12199 return false;
12200}
12201
// If every byte of the constant is either all zeroes (0x00) or all ones (0xff),
// return the constant unchanged. Otherwise return 0.
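// For example (illustrative values): C = 0x00ff00ff consists only of 0x00 and
// 0xff bytes and is returned unchanged, while C = 0x00000f00 has a partially
// set byte and yields 0.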
12204static uint32_t getConstantPermuteMask(uint32_t C) {
12205 // 0xff for any zero byte in the mask
12206 uint32_t ZeroByteMask = 0;
12207 if (!(C & 0x000000ff))
12208 ZeroByteMask |= 0x000000ff;
12209 if (!(C & 0x0000ff00))
12210 ZeroByteMask |= 0x0000ff00;
12211 if (!(C & 0x00ff0000))
12212 ZeroByteMask |= 0x00ff0000;
12213 if (!(C & 0xff000000))
12214 ZeroByteMask |= 0xff000000;
12215 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
12216 if ((NonZeroByteMask & C) != NonZeroByteMask)
12217 return 0; // Partial bytes selected.
12218 return C;
12219}
12220
// Check if a node selects whole bytes from its operand 0 starting at a byte
// boundary while masking the rest. Returns the select mask as used by
// v_perm_b32, or ~0 if the node does not match.
12224// Note byte select encoding:
12225// value 0-3 selects corresponding source byte;
12226// value 0xc selects zero;
12227// value 0xff selects 0xff.
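// Worked examples (hypothetical inputs): (and x, 0x0000ffff) yields the mask
// 0x0c0c0100 (low two bytes taken from x, high two bytes zero), and
// (shl x, 16) yields 0x01000c0c (bytes 0 and 1 of x moved to bytes 2 and 3,
// zeros below).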
12228static uint32_t getPermuteMask(SDValue V) {
12229 assert(V.getValueSizeInBits() == 32);
12230
12231 if (V.getNumOperands() != 2)
12232 return ~0;
12233
12234 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(Val: V.getOperand(i: 1));
12235 if (!N1)
12236 return ~0;
12237
12238 uint32_t C = N1->getZExtValue();
12239
12240 switch (V.getOpcode()) {
12241 default:
12242 break;
12243 case ISD::AND:
12244 if (uint32_t ConstMask = getConstantPermuteMask(C))
12245 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
12246 break;
12247
12248 case ISD::OR:
12249 if (uint32_t ConstMask = getConstantPermuteMask(C))
12250 return (0x03020100 & ~ConstMask) | ConstMask;
12251 break;
12252
12253 case ISD::SHL:
12254 if (C % 8)
12255 return ~0;
12256
12257 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
12258
12259 case ISD::SRL:
12260 if (C % 8)
12261 return ~0;
12262
12263 return uint32_t(0x0c0c0c0c03020100ull >> C);
12264 }
12265
12266 return ~0;
12267}
12268
12269SDValue SITargetLowering::performAndCombine(SDNode *N,
12270 DAGCombinerInfo &DCI) const {
12271 if (DCI.isBeforeLegalize())
12272 return SDValue();
12273
12274 SelectionDAG &DAG = DCI.DAG;
12275 EVT VT = N->getValueType(ResNo: 0);
12276 SDValue LHS = N->getOperand(Num: 0);
12277 SDValue RHS = N->getOperand(Num: 1);
12278
12279 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
12280 if (VT == MVT::i64 && CRHS) {
12281 if (SDValue Split =
12282 splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::AND, LHS, CRHS))
12283 return Split;
12284 }
12285
12286 if (CRHS && VT == MVT::i32) {
12287 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
12288 // nb = number of trailing zeroes in mask
12289 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
    // given that we are selecting 8- or 16-bit fields starting at a byte
    // boundary.
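    // Illustrative case (values invented for exposition): (and (srl x, 8),
    // 0xff00) has Mask = 0xff00, Bits = 8, NB = 8 and Shift = 8, so
    // Offset = 16 and the combine emits (shl (bfe_u32 x, 16, 8), 8).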
12291 uint64_t Mask = CRHS->getZExtValue();
12292 unsigned Bits = llvm::popcount(Value: Mask);
12293 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
12294 (Bits == 8 || Bits == 16) && isShiftedMask_64(Value: Mask) && !(Mask & 1)) {
12295 if (auto *CShift = dyn_cast<ConstantSDNode>(Val: LHS->getOperand(Num: 1))) {
12296 unsigned Shift = CShift->getZExtValue();
12297 unsigned NB = CRHS->getAPIntValue().countr_zero();
12298 unsigned Offset = NB + Shift;
12299 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
12300 SDLoc SL(N);
12301 SDValue BFE =
12302 DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT: MVT::i32, N1: LHS->getOperand(Num: 0),
12303 N2: DAG.getConstant(Val: Offset, DL: SL, VT: MVT::i32),
12304 N3: DAG.getConstant(Val: Bits, DL: SL, VT: MVT::i32));
12305 EVT NarrowVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: Bits);
12306 SDValue Ext = DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT, N1: BFE,
12307 N2: DAG.getValueType(NarrowVT));
12308 SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL: SDLoc(LHS), VT, N1: Ext,
12309 N2: DAG.getConstant(Val: NB, DL: SDLoc(CRHS), VT: MVT::i32));
12310 return Shl;
12311 }
12312 }
12313 }
12314
12315 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12316 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
12317 isa<ConstantSDNode>(Val: LHS.getOperand(i: 2))) {
12318 uint32_t Sel = getConstantPermuteMask(C: Mask);
12319 if (!Sel)
12320 return SDValue();
12321
12322 // Select 0xc for all zero bytes
12323 Sel = (LHS.getConstantOperandVal(i: 2) & Sel) | (~Sel & 0x0c0c0c0c);
12324 SDLoc DL(N);
12325 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
12326 N2: LHS.getOperand(i: 1), N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
12327 }
12328 }
12329
12330 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
12331 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
12332 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
12333 ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: 2))->get();
12334 ISD::CondCode RCC = cast<CondCodeSDNode>(Val: RHS.getOperand(i: 2))->get();
12335
12336 SDValue X = LHS.getOperand(i: 0);
12337 SDValue Y = RHS.getOperand(i: 0);
12338 if (Y.getOpcode() != ISD::FABS || Y.getOperand(i: 0) != X ||
12339 !isTypeLegal(VT: X.getValueType()))
12340 return SDValue();
12341
12342 if (LCC == ISD::SETO) {
12343 if (X != LHS.getOperand(i: 1))
12344 return SDValue();
12345
12346 if (RCC == ISD::SETUNE) {
12347 const ConstantFPSDNode *C1 =
12348 dyn_cast<ConstantFPSDNode>(Val: RHS.getOperand(i: 1));
12349 if (!C1 || !C1->isInfinity() || C1->isNegative())
12350 return SDValue();
12351
12352 const uint32_t Mask = SIInstrFlags::N_NORMAL |
12353 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
12354 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
12355 SIInstrFlags::P_NORMAL;
12356
12357 static_assert(
12358 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
12359 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
12360 0x3ff) == Mask,
12361 "mask not equal");
12362
12363 SDLoc DL(N);
12364 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: X,
12365 N2: DAG.getConstant(Val: Mask, DL, VT: MVT::i32));
12366 }
12367 }
12368 }
12369
12370 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
12371 std::swap(a&: LHS, b&: RHS);
12372
12373 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12374 RHS.hasOneUse()) {
12375 ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: 2))->get();
12376 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
12377 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
12378 // | n_nan)
12379 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: 1));
12380 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
12381 (RHS.getOperand(i: 0) == LHS.getOperand(i: 0) &&
12382 LHS.getOperand(i: 0) == LHS.getOperand(i: 1))) {
12383 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
12384 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
12385 : Mask->getZExtValue() & OrdMask;
12386
12387 SDLoc DL(N);
12388 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: RHS.getOperand(i: 0),
12389 N2: DAG.getConstant(Val: NewMask, DL, VT: MVT::i32));
12390 }
12391 }
12392
12393 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
12394 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
12395 // and x, (sext cc from i1) => select cc, x, 0
12396 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
12397 std::swap(a&: LHS, b&: RHS);
12398 if (isBoolSGPR(V: RHS.getOperand(i: 0)))
12399 return DAG.getSelect(DL: SDLoc(N), VT: MVT::i32, Cond: RHS.getOperand(i: 0), LHS,
12400 RHS: DAG.getConstant(Val: 0, DL: SDLoc(N), VT: MVT::i32));
12401 }
12402
12403 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12404 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12405 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12406 N->isDivergent() && TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {
12407 uint32_t LHSMask = getPermuteMask(V: LHS);
12408 uint32_t RHSMask = getPermuteMask(V: RHS);
12409 if (LHSMask != ~0u && RHSMask != ~0u) {
12410 // Canonicalize the expression in an attempt to have fewer unique masks
12411 // and therefore fewer registers used to hold the masks.
12412 if (LHSMask > RHSMask) {
12413 std::swap(a&: LHSMask, b&: RHSMask);
12414 std::swap(a&: LHS, b&: RHS);
12415 }
12416
      // Mark with 0xc each byte lane that actually selects a source byte. Zero
      // bytes have 0xc in the mask, 0xff bytes have 0xff, and real source bytes
      // use selectors in the 0-3 range.
12419 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12420 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12421
      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we just select the high word and the low word, keep it for SDWA.
12425 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12426 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Each byte of each mask is either a selector in the 0-3 range, or has
        // higher bits set: 0xff for a 0xff byte and 0x0c for a zero byte. If
        // either mask has 0x0c for a byte, the result byte must be zero (0x0c);
        // otherwise the mask that is not 0xff wins. ANDing the two masks gives
        // the correct result, except that bytes which should be 0x0c may need
        // to be corrected back to exactly 0x0c below.
12432 uint32_t Mask = LHSMask & RHSMask;
12433 for (unsigned I = 0; I < 32; I += 8) {
12434 uint32_t ByteSel = 0xff << I;
12435 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
12436 Mask &= (0x0c << I) & 0xffffffff;
12437 }
12438
12439 // Add 4 to each active LHS lane. It will not affect any existing 0xff
12440 // or 0x0c.
12441 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
12442 SDLoc DL(N);
12443
12444 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
12445 N2: RHS.getOperand(i: 0),
12446 N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
12447 }
12448 }
12449 }
12450
12451 return SDValue();
12452}
12453
// A key component of v_perm is the mapping between the byte positions of its
// source operands and the byte positions of the destination. To build it we
// need: 1. the node that provides byte x of the dest of the OR, and 2. the byte
// of that node which provides it. calculateByteProvider finds which node
// provides a given byte of the dest of the OR, and calculateSrcByte takes that
// node and finds the ultimate source and byte position. For example, the
// supported LoadCombine pattern for vector loads is as follows:
12460// LoadCombine pattern for vector loads is as follows
12461// t1
12462// or
12463// / \
12464// t2 t3
12465// zext shl
12466// | | \
12467// t4 t5 16
12468// or anyext
12469// / \ |
12470// t6 t7 t8
12471// srl shl or
12472// / | / \ / \
12473// t9 t10 t11 t12 t13 t14
12474// trunc* 8 trunc* 8 and and
12475// | | / | | \
12476// t15 t16 t17 t18 t19 t20
12477// trunc* 255 srl -256
12478// | / \
12479// t15 t15 16
12480//
12481// *In this example, the truncs are from i32->i16
12482//
12483// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
// respectively. calculateSrcByte would find (given node) -> ultimate src &
// byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
12486// After finding the mapping, we can combine the tree into vperm t15, t16,
12487// 0x05000407
12488
12489// Find the source and byte position from a node.
12490// \p DestByte is the byte position of the dest of the or that the src
// ultimately provides. \p SrcIndex is the byte of the src that maps to this
// byte of the dest of the or. \p Depth tracks how many recursive iterations we
// have
12493// performed.
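// As a hypothetical example, calling calculateSrcByte on (srl t, 16) with
// SrcIndex = 0 adds 16 / 8 = 2 to SrcIndex and recurses into t, ultimately
// reporting byte 2 of t as the source.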
12494static const std::optional<ByteProvider<SDValue>>
12495calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
12496 unsigned Depth = 0) {
12497 // We may need to recursively traverse a series of SRLs
12498 if (Depth >= 6)
12499 return std::nullopt;
12500
12501 if (Op.getValueSizeInBits() < 8)
12502 return std::nullopt;
12503
12504 if (Op.getValueType().isVector())
12505 return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex);
12506
12507 switch (Op->getOpcode()) {
12508 case ISD::TRUNCATE: {
12509 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
12510 }
12511
12512 case ISD::SIGN_EXTEND:
12513 case ISD::ZERO_EXTEND:
12514 case ISD::SIGN_EXTEND_INREG: {
12515 SDValue NarrowOp = Op->getOperand(Num: 0);
12516 auto NarrowVT = NarrowOp.getValueType();
12517 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
12518 auto *VTSign = cast<VTSDNode>(Val: Op->getOperand(Num: 1));
12519 NarrowVT = VTSign->getVT();
12520 }
12521 if (!NarrowVT.isByteSized())
12522 return std::nullopt;
12523 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
12524
12525 if (SrcIndex >= NarrowByteWidth)
12526 return std::nullopt;
12527 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
12528 }
12529
12530 case ISD::SRA:
12531 case ISD::SRL: {
12532 auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
12533 if (!ShiftOp)
12534 return std::nullopt;
12535
12536 uint64_t BitShift = ShiftOp->getZExtValue();
12537
12538 if (BitShift % 8 != 0)
12539 return std::nullopt;
12540
12541 SrcIndex += BitShift / 8;
12542
12543 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
12544 }
12545
12546 default: {
12547 return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex);
12548 }
12549 }
12550 llvm_unreachable("fully handled switch");
12551}
12552
12553// For a byte position in the result of an Or, traverse the tree and find the
12554// node (and the byte of the node) which ultimately provides this {Or,
12555// BytePosition}. \p Op is the operand we are currently examining. \p Index is
12556// the byte position of the Op that corresponds with the originally requested
// byte of the Or. \p Depth tracks how many recursive iterations we have
// performed. \p StartingIndex is the originally requested byte of the Or.
12559static const std::optional<ByteProvider<SDValue>>
12560calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
12561 unsigned StartingIndex = 0) {
  // Finding the Src tree of the RHS of an or typically requires at least one
  // additional level of depth.
12564 if (Depth > 6)
12565 return std::nullopt;
12566
12567 unsigned BitWidth = Op.getScalarValueSizeInBits();
12568 if (BitWidth % 8 != 0)
12569 return std::nullopt;
12570 if (Index > BitWidth / 8 - 1)
12571 return std::nullopt;
12572
12573 bool IsVec = Op.getValueType().isVector();
12574 switch (Op.getOpcode()) {
12575 case ISD::OR: {
12576 if (IsVec)
12577 return std::nullopt;
12578
12579 auto RHS = calculateByteProvider(Op: Op.getOperand(i: 1), Index, Depth: Depth + 1,
12580 StartingIndex);
12581 if (!RHS)
12582 return std::nullopt;
12583 auto LHS = calculateByteProvider(Op: Op.getOperand(i: 0), Index, Depth: Depth + 1,
12584 StartingIndex);
12585 if (!LHS)
12586 return std::nullopt;
12587 // A well formed Or will have two ByteProviders for each byte, one of which
12588 // is constant zero
12589 if (!LHS->isConstantZero() && !RHS->isConstantZero())
12590 return std::nullopt;
12591 if (!LHS || LHS->isConstantZero())
12592 return RHS;
12593 if (!RHS || RHS->isConstantZero())
12594 return LHS;
12595 return std::nullopt;
12596 }
12597
12598 case ISD::AND: {
12599 if (IsVec)
12600 return std::nullopt;
12601
12602 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
12603 if (!BitMaskOp)
12604 return std::nullopt;
12605
12606 uint32_t BitMask = BitMaskOp->getZExtValue();
    // Bits we expect for our Index.
12608 uint32_t IndexMask = 0xFF << (Index * 8);
12609
12610 if ((IndexMask & BitMask) != IndexMask) {
12611 // If the result of the and partially provides the byte, then it
12612 // is not well formatted
12613 if (IndexMask & BitMask)
12614 return std::nullopt;
12615 return ByteProvider<SDValue>::getConstantZero();
12616 }
12617
12618 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte: StartingIndex, SrcIndex: Index);
12619 }
12620
12621 case ISD::FSHR: {
12622 if (IsVec)
12623 return std::nullopt;
12624
12625 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
12626 auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 2));
12627 if (!ShiftOp || Op.getValueType().isVector())
12628 return std::nullopt;
12629
12630 uint64_t BitsProvided = Op.getValueSizeInBits();
12631 if (BitsProvided % 8 != 0)
12632 return std::nullopt;
12633
12634 uint64_t BitShift = ShiftOp->getAPIntValue().urem(RHS: BitsProvided);
12635 if (BitShift % 8)
12636 return std::nullopt;
12637
12638 uint64_t ConcatSizeInBytes = BitsProvided / 4;
12639 uint64_t ByteShift = BitShift / 8;
12640
12641 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12642 uint64_t BytesProvided = BitsProvided / 8;
12643 SDValue NextOp = Op.getOperand(i: NewIndex >= BytesProvided ? 0 : 1);
12644 NewIndex %= BytesProvided;
12645 return calculateByteProvider(Op: NextOp, Index: NewIndex, Depth: Depth + 1, StartingIndex);
12646 }
12647
12648 case ISD::SRA:
12649 case ISD::SRL: {
12650 if (IsVec)
12651 return std::nullopt;
12652
12653 auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
12654 if (!ShiftOp)
12655 return std::nullopt;
12656
12657 uint64_t BitShift = ShiftOp->getZExtValue();
12658 if (BitShift % 8)
12659 return std::nullopt;
12660
12661 auto BitsProvided = Op.getScalarValueSizeInBits();
12662 if (BitsProvided % 8 != 0)
12663 return std::nullopt;
12664
12665 uint64_t BytesProvided = BitsProvided / 8;
12666 uint64_t ByteShift = BitShift / 8;
    // The dest of the shift has good bytes in positions [0, BytesProvided -
    // ByteShift). If the byte we are trying to provide (as tracked by Index)
    // falls in this range, then the SRL provides the byte. The byte of interest
    // of the src of the SRL is Index + ByteShift.
12671 return BytesProvided - ByteShift > Index
12672 ? calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte: StartingIndex,
12673 SrcIndex: Index + ByteShift)
12674 : ByteProvider<SDValue>::getConstantZero();
12675 }
12676
12677 case ISD::SHL: {
12678 if (IsVec)
12679 return std::nullopt;
12680
12681 auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
12682 if (!ShiftOp)
12683 return std::nullopt;
12684
12685 uint64_t BitShift = ShiftOp->getZExtValue();
12686 if (BitShift % 8 != 0)
12687 return std::nullopt;
12688 uint64_t ByteShift = BitShift / 8;
12689
    // If we are shifting by an amount greater than the index we are trying to
    // provide, then it provides 0s. If not, then these bytes are not
    // definitively 0s, and the corresponding byte of interest is
    // Index - ByteShift of the src.
12694 return Index < ByteShift
12695 ? ByteProvider<SDValue>::getConstantZero()
12696 : calculateByteProvider(Op: Op.getOperand(i: 0), Index: Index - ByteShift,
12697 Depth: Depth + 1, StartingIndex);
12698 }
12699 case ISD::ANY_EXTEND:
12700 case ISD::SIGN_EXTEND:
12701 case ISD::ZERO_EXTEND:
12702 case ISD::SIGN_EXTEND_INREG:
12703 case ISD::AssertZext:
12704 case ISD::AssertSext: {
12705 if (IsVec)
12706 return std::nullopt;
12707
12708 SDValue NarrowOp = Op->getOperand(Num: 0);
12709 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
12710 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
12711 Op->getOpcode() == ISD::AssertZext ||
12712 Op->getOpcode() == ISD::AssertSext) {
12713 auto *VTSign = cast<VTSDNode>(Val: Op->getOperand(Num: 1));
12714 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12715 }
12716 if (NarrowBitWidth % 8 != 0)
12717 return std::nullopt;
12718 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12719
12720 if (Index >= NarrowByteWidth)
12721 return Op.getOpcode() == ISD::ZERO_EXTEND
12722 ? std::optional<ByteProvider<SDValue>>(
12723 ByteProvider<SDValue>::getConstantZero())
12724 : std::nullopt;
12725 return calculateByteProvider(Op: NarrowOp, Index, Depth: Depth + 1, StartingIndex);
12726 }
12727
12728 case ISD::TRUNCATE: {
12729 if (IsVec)
12730 return std::nullopt;
12731
12732 uint64_t NarrowByteWidth = BitWidth / 8;
12733
12734 if (NarrowByteWidth >= Index) {
12735 return calculateByteProvider(Op: Op.getOperand(i: 0), Index, Depth: Depth + 1,
12736 StartingIndex);
12737 }
12738
12739 return std::nullopt;
12740 }
12741
12742 case ISD::CopyFromReg: {
12743 if (BitWidth / 8 > Index)
12744 return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index);
12745
12746 return std::nullopt;
12747 }
12748
12749 case ISD::LOAD: {
12750 auto *L = cast<LoadSDNode>(Val: Op.getNode());
12751
12752 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12753 if (NarrowBitWidth % 8 != 0)
12754 return std::nullopt;
12755 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12756
    // If the width of the load does not reach the byte we are trying to provide
    // and it is not a ZEXTLOAD, then the load does not provide for the byte in
    // question.
12760 if (Index >= NarrowByteWidth) {
12761 return L->getExtensionType() == ISD::ZEXTLOAD
12762 ? std::optional<ByteProvider<SDValue>>(
12763 ByteProvider<SDValue>::getConstantZero())
12764 : std::nullopt;
12765 }
12766
12767 if (NarrowByteWidth > Index) {
12768 return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index);
12769 }
12770
12771 return std::nullopt;
12772 }
12773
12774 case ISD::BSWAP: {
12775 if (IsVec)
12776 return std::nullopt;
12777
12778 return calculateByteProvider(Op: Op->getOperand(Num: 0), Index: BitWidth / 8 - Index - 1,
12779 Depth: Depth + 1, StartingIndex);
12780 }
12781
12782 case ISD::EXTRACT_VECTOR_ELT: {
12783 auto *IdxOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
12784 if (!IdxOp)
12785 return std::nullopt;
12786 auto VecIdx = IdxOp->getZExtValue();
12787 auto ScalarSize = Op.getScalarValueSizeInBits();
12788 if (ScalarSize < 32)
12789 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12790 return calculateSrcByte(Op: ScalarSize >= 32 ? Op : Op.getOperand(i: 0),
12791 DestByte: StartingIndex, SrcIndex: Index);
12792 }
12793
12794 case AMDGPUISD::PERM: {
12795 if (IsVec)
12796 return std::nullopt;
12797
12798 auto *PermMask = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 2));
12799 if (!PermMask)
12800 return std::nullopt;
12801
12802 auto IdxMask =
12803 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12804 if (IdxMask > 0x07 && IdxMask != 0x0c)
12805 return std::nullopt;
12806
12807 auto NextOp = Op.getOperand(i: IdxMask > 0x03 ? 0 : 1);
12808 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12809
12810 return IdxMask != 0x0c ? calculateSrcByte(Op: NextOp, DestByte: StartingIndex, SrcIndex: NextIndex)
12811 : ByteProvider<SDValue>(
12812 ByteProvider<SDValue>::getConstantZero());
12813 }
12814
12815 default: {
12816 return std::nullopt;
12817 }
12818 }
12819
12820 llvm_unreachable("fully handled switch");
12821}
12822
// Returns true if Operand is a scalar value that is extended or loaded from a
// 16-bit type.
12824static bool isExtendedFrom16Bits(SDValue &Operand) {
12825
12826 switch (Operand.getOpcode()) {
12827 case ISD::ANY_EXTEND:
12828 case ISD::SIGN_EXTEND:
12829 case ISD::ZERO_EXTEND: {
12830 auto OpVT = Operand.getOperand(i: 0).getValueType();
12831 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12832 }
12833 case ISD::LOAD: {
12834 LoadSDNode *L = cast<LoadSDNode>(Val: Operand.getNode());
    auto ExtType = L->getExtensionType();
12836 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12837 ExtType == ISD::EXTLOAD) {
12838 auto MemVT = L->getMemoryVT();
12839 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12840 }
12841 return L->getMemoryVT().getSizeInBits() == 16;
12842 }
12843 default:
12844 return false;
12845 }
12846}
12847
// Returns true if the mask selects consecutive bytes and the first byte begins
// at an even (16-bit aligned) offset from byte 0.
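// For instance (illustrative selectors): 0x0504 (Low8 = 4, Hi8 = 5) is
// consecutive and starts at an even byte, so it qualifies; 0x0201
// (Low8 = 1, Hi8 = 2) is consecutive but starts at an odd byte and does not.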
12850static bool addresses16Bits(int Mask) {
12851 int Low8 = Mask & 0xff;
12852 int Hi8 = (Mask & 0xff00) >> 8;
12853
12854 assert(Low8 < 8 && Hi8 < 8);
12855 // Are the bytes contiguous in the order of increasing addresses.
12856 bool IsConsecutive = (Hi8 - Low8 == 1);
  // Is the first byte at a location that is aligned for 16-bit instructions?
  // A counterexample is taking two consecutive bytes starting at byte offset 1
  // (bit 8). In that case we still need code to extract the 16-bit operand, so
  // it is better to use a byte-wise v_perm.
12861 bool Is16Aligned = !(Low8 % 2);
12862
12863 return IsConsecutive && Is16Aligned;
12864}
12865
12866// Do not lower into v_perm if the operands are actually 16 bit
12867// and the selected bits (based on PermMask) correspond with two
12868// easily addressable 16 bit operands.
12869static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12870 SDValue &OtherOp) {
12871 int Low16 = PermMask & 0xffff;
12872 int Hi16 = (PermMask & 0xffff0000) >> 16;
12873
12874 auto TempOp = peekThroughBitcasts(V: Op);
12875 auto TempOtherOp = peekThroughBitcasts(V: OtherOp);
12876
12877 auto OpIs16Bit =
12878 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(Operand&: TempOp);
12879 if (!OpIs16Bit)
12880 return true;
12881
12882 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12883 isExtendedFrom16Bits(Operand&: TempOtherOp);
12884 if (!OtherOpIs16Bit)
12885 return true;
12886
12887 // Do we cleanly address both
12888 return !addresses16Bits(Mask: Low16) || !addresses16Bits(Mask: Hi16);
12889}
12890
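// A rough sketch of getDWordFromOffset below, under assumed inputs: for a
// scalar i64 source with DWordOffset = 1 it produces (trunc (srl Src, 32)) as
// an i32, and for a v4i16 source with DWordOffset = 1 it rebuilds a v2i16 from
// elements 2 and 3 and bitcasts the result to i32.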
12891static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12892 unsigned DWordOffset) {
12893 SDValue Ret;
12894
12895 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12896 // ByteProvider must be at least 8 bits
12897 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12898
12899 if (TypeSize <= 32)
12900 return DAG.getBitcastedAnyExtOrTrunc(Op: Src, DL: SL, VT: MVT::i32);
12901
12902 if (Src.getValueType().isVector()) {
12903 auto ScalarTySize = Src.getScalarValueSizeInBits();
12904 auto ScalarTy = Src.getValueType().getScalarType();
12905 if (ScalarTySize == 32) {
12906 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Src,
12907 N2: DAG.getConstant(Val: DWordOffset, DL: SL, VT: MVT::i32));
12908 }
12909 if (ScalarTySize > 32) {
12910 Ret = DAG.getNode(
12911 Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ScalarTy, N1: Src,
12912 N2: DAG.getConstant(Val: DWordOffset / (ScalarTySize / 32), DL: SL, VT: MVT::i32));
12913 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12914 if (ShiftVal)
12915 Ret = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: Ret.getValueType(), N1: Ret,
12916 N2: DAG.getConstant(Val: ShiftVal, DL: SL, VT: MVT::i32));
12917 return DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
12918 }
12919
12920 assert(ScalarTySize < 32);
12921 auto NumElements = TypeSize / ScalarTySize;
12922 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12923 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12924 auto NumElementsIn32 = 32 / ScalarTySize;
12925 auto NumAvailElements = DWordOffset < Trunc32Elements
12926 ? NumElementsIn32
12927 : NumElements - NormalizedTrunc;
12928
12929 SmallVector<SDValue, 4> VecSrcs;
12930 DAG.ExtractVectorElements(Op: Src, Args&: VecSrcs, Start: DWordOffset * NumElementsIn32,
12931 Count: NumAvailElements);
12932
12933 Ret = DAG.getBuildVector(
12934 VT: MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: ScalarTySize), NumElements: NumAvailElements), DL: SL,
12935 Ops: VecSrcs);
12936 return Ret = DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
12937 }
12938
  // Scalar source.
12940 auto ShiftVal = 32 * DWordOffset;
12941 Ret = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: Src.getValueType(), N1: Src,
12942 N2: DAG.getConstant(Val: ShiftVal, DL: SL, VT: MVT::i32));
12943 return DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
12944}
12945
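// An illustrative mask for matchPERM below (values are hypothetical): if result
// bytes 0-1 come from bytes 0-1 of the first source and result bytes 2-3 come
// from bytes 0-1 of a second source, the per-byte selectors are 0x04, 0x05,
// 0x00, 0x01, giving PermMask = 0x01000504 under the "first source offset by 4"
// convention used here.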
12946static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12947 SelectionDAG &DAG = DCI.DAG;
12948 [[maybe_unused]] EVT VT = N->getValueType(ResNo: 0);
12949 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12950
12951 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12952 assert(VT == MVT::i32);
12953 for (int i = 0; i < 4; i++) {
12954 // Find the ByteProvider that provides the ith byte of the result of OR
12955 std::optional<ByteProvider<SDValue>> P =
12956 calculateByteProvider(Op: SDValue(N, 0), Index: i, Depth: 0, /*StartingIndex = */ i);
12957 // TODO support constantZero
12958 if (!P || P->isConstantZero())
12959 return SDValue();
12960
12961 PermNodes.push_back(Elt: *P);
12962 }
12963 if (PermNodes.size() != 4)
12964 return SDValue();
12965
12966 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12967 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12968 uint64_t PermMask = 0x00000000;
12969 for (size_t i = 0; i < PermNodes.size(); i++) {
12970 auto PermOp = PermNodes[i];
12971 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12972 // by sizeof(Src2) = 4
12973 int SrcByteAdjust = 4;
12974
    // If the Src uses a byte from a different DWORD, then it corresponds
    // with a different source.
12977 if (!PermOp.hasSameSrc(Other: PermNodes[FirstSrc.first]) ||
12978 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12979 if (SecondSrc)
12980 if (!PermOp.hasSameSrc(Other: PermNodes[SecondSrc->first]) ||
12981 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12982 return SDValue();
12983
12984 // Set the index of the second distinct Src node
12985 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12986 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12987 SrcByteAdjust = 0;
12988 }
12989 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12990 assert(!DAG.getDataLayout().isBigEndian());
12991 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12992 }
12993 SDLoc DL(N);
12994 SDValue Op = *PermNodes[FirstSrc.first].Src;
12995 Op = getDWordFromOffset(DAG, SL: DL, Src: Op, DWordOffset: FirstSrc.second);
12996 assert(Op.getValueSizeInBits() == 32);
12997
12998 // Check that we are not just extracting the bytes in order from an op
12999 if (!SecondSrc) {
13000 int Low16 = PermMask & 0xffff;
13001 int Hi16 = (PermMask & 0xffff0000) >> 16;
13002
13003 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13004 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13005
13006 // The perm op would really just produce Op. So combine into Op
13007 if (WellFormedLow && WellFormedHi)
13008 return DAG.getBitcast(VT: MVT::getIntegerVT(BitWidth: 32), V: Op);
13009 }
13010
13011 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13012
13013 if (SecondSrc) {
13014 OtherOp = getDWordFromOffset(DAG, SL: DL, Src: OtherOp, DWordOffset: SecondSrc->second);
13015 assert(OtherOp.getValueSizeInBits() == 32);
13016 }
13017
13018 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
13019
13020 assert(Op.getValueType().isByteSized() &&
13021 OtherOp.getValueType().isByteSized());
13022
    // If the ultimate src is less than 32 bits, then we will only be
    // using bytes 0 .. (size of Op in bytes) - 1 in the or.
    // calculateByteProvider would not have returned Op as a source if we
    // used a byte that is outside its ValueType. Thus, we are free to
    // ANY_EXTEND as the extended bits are don't-cares.
13028 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, VT: MVT::i32);
13029 OtherOp = DAG.getBitcastedAnyExtOrTrunc(Op: OtherOp, DL, VT: MVT::i32);
13030
13031 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: Op, N2: OtherOp,
13032 N3: DAG.getConstant(Val: PermMask, DL, VT: MVT::i32));
13033 }
13034 return SDValue();
13035}
13036
13037SDValue SITargetLowering::performOrCombine(SDNode *N,
13038 DAGCombinerInfo &DCI) const {
13039 SelectionDAG &DAG = DCI.DAG;
13040 SDValue LHS = N->getOperand(Num: 0);
13041 SDValue RHS = N->getOperand(Num: 1);
13042
13043 EVT VT = N->getValueType(ResNo: 0);
13044 if (VT == MVT::i1) {
13045 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
13046 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13047 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
13048 SDValue Src = LHS.getOperand(i: 0);
13049 if (Src != RHS.getOperand(i: 0))
13050 return SDValue();
13051
13052 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
13053 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: 1));
13054 if (!CLHS || !CRHS)
13055 return SDValue();
13056
13057 // Only 10 bits are used.
13058 static const uint32_t MaxMask = 0x3ff;
13059
13060 uint32_t NewMask =
13061 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
13062 SDLoc DL(N);
13063 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: Src,
13064 N2: DAG.getConstant(Val: NewMask, DL, VT: MVT::i32));
13065 }
13066
13067 return SDValue();
13068 }
13069
13070 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13071 if (isa<ConstantSDNode>(Val: RHS) && LHS.hasOneUse() &&
13072 LHS.getOpcode() == AMDGPUISD::PERM &&
13073 isa<ConstantSDNode>(Val: LHS.getOperand(i: 2))) {
13074 uint32_t Sel = getConstantPermuteMask(C: N->getConstantOperandVal(Num: 1));
13075 if (!Sel)
13076 return SDValue();
13077
13078 Sel |= LHS.getConstantOperandVal(i: 2);
13079 SDLoc DL(N);
13080 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
13081 N2: LHS.getOperand(i: 1), N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
13082 }
13083
13084 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13085 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13086 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13087 N->isDivergent() && TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {
13088
13089 // If all the uses of an or need to extract the individual elements, do not
13090 // attempt to lower into v_perm
13091 auto usesCombinedOperand = [](SDNode *OrUse) {
      // If the use itself is not a bitcast to a vector, it is a candidate for
      // v_perm.
13093 if (OrUse->getOpcode() != ISD::BITCAST ||
13094 !OrUse->getValueType(ResNo: 0).isVector())
13095 return true;
13096
13097 // If we have any non-vectorized use, then it is a candidate for v_perm
13098 for (auto *VUser : OrUse->users()) {
13099 if (!VUser->getValueType(ResNo: 0).isVector())
13100 return true;
13101
13102 // If the use of a vector is a store, then combining via a v_perm
13103 // is beneficial.
13104 // TODO -- whitelist more uses
13105 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
13106 if (VUser->getOpcode() == VectorwiseOp)
13107 return true;
13108 }
13109 return false;
13110 };
13111
13112 if (!any_of(Range: N->users(), P: usesCombinedOperand))
13113 return SDValue();
13114
13115 uint32_t LHSMask = getPermuteMask(V: LHS);
13116 uint32_t RHSMask = getPermuteMask(V: RHS);
13117
13118 if (LHSMask != ~0u && RHSMask != ~0u) {
13119 // Canonicalize the expression in an attempt to have fewer unique masks
13120 // and therefore fewer registers used to hold the masks.
13121 if (LHSMask > RHSMask) {
13122 std::swap(a&: LHSMask, b&: RHSMask);
13123 std::swap(a&: LHS, b&: RHS);
13124 }
13125
      // Mark with 0xc each byte lane that actually selects a source byte. Zero
      // bytes have 0xc in the mask, 0xff bytes have 0xff, and real source bytes
      // use selectors in the 0-3 range.
13128 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13129 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13130
      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we just select the high word and the low word, keep it for SDWA.
13134 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13135 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13136 // Kill zero bytes selected by other mask. Zero value is 0xc.
13137 LHSMask &= ~RHSUsedLanes;
13138 RHSMask &= ~LHSUsedLanes;
13139 // Add 4 to each active LHS lane
13140 LHSMask |= LHSUsedLanes & 0x04040404;
13141 // Combine masks
13142 uint32_t Sel = LHSMask | RHSMask;
13143 SDLoc DL(N);
13144
13145 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
13146 N2: RHS.getOperand(i: 0),
13147 N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
13148 }
13149 }
13150 if (LHSMask == ~0u || RHSMask == ~0u) {
13151 if (SDValue Perm = matchPERM(N, DCI))
13152 return Perm;
13153 }
13154 }
13155
13156 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
13157 return SDValue();
13158
13159 // TODO: This could be a generic combine with a predicate for extracting the
13160 // high half of an integer being free.
13161
13162 // (or i64:x, (zero_extend i32:y)) ->
13163 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
13164 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
13165 RHS.getOpcode() != ISD::ZERO_EXTEND)
13166 std::swap(a&: LHS, b&: RHS);
13167
13168 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
13169 SDValue ExtSrc = RHS.getOperand(i: 0);
13170 EVT SrcVT = ExtSrc.getValueType();
13171 if (SrcVT == MVT::i32) {
13172 SDLoc SL(N);
13173 auto [LowLHS, HiBits] = split64BitValue(Op: LHS, DAG);
13174 SDValue LowOr = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: LowLHS, N2: ExtSrc);
13175
13176 DCI.AddToWorklist(N: LowOr.getNode());
13177 DCI.AddToWorklist(N: HiBits.getNode());
13178
13179 SDValue Vec =
13180 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: LowOr, N2: HiBits);
13181 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
13182 }
13183 }
13184
13185 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
13186 if (CRHS) {
13187 if (SDValue Split = splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::OR,
13188 LHS: N->getOperand(Num: 0), CRHS))
13189 return Split;
13190 }
13191
13192 return SDValue();
13193}
13194
13195SDValue SITargetLowering::performXorCombine(SDNode *N,
13196 DAGCombinerInfo &DCI) const {
13197 if (SDValue RV = reassociateScalarOps(N, DAG&: DCI.DAG))
13198 return RV;
13199
13200 SDValue LHS = N->getOperand(Num: 0);
13201 SDValue RHS = N->getOperand(Num: 1);
13202
13203 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
13204 SelectionDAG &DAG = DCI.DAG;
13205
13206 EVT VT = N->getValueType(ResNo: 0);
13207 if (CRHS && VT == MVT::i64) {
13208 if (SDValue Split =
13209 splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::XOR, LHS, CRHS))
13210 return Split;
13211 }
13212
13213 // Make sure to apply the 64-bit constant splitting fold before trying to fold
13214 // fneg-like xors into 64-bit select.
13215 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
13216 // This looks like an fneg, try to fold as a source modifier.
13217 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
13218 shouldFoldFNegIntoSrc(FNeg: N, FNegSrc: LHS)) {
13219 // xor (select c, a, b), 0x80000000 ->
13220 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
13221 SDLoc DL(N);
13222 SDValue CastLHS =
13223 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: LHS->getOperand(Num: 1));
13224 SDValue CastRHS =
13225 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: LHS->getOperand(Num: 2));
13226 SDValue FNegLHS = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f32, Operand: CastLHS);
13227 SDValue FNegRHS = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f32, Operand: CastRHS);
13228 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::f32,
13229 N1: LHS->getOperand(Num: 0), N2: FNegLHS, N3: FNegRHS);
13230 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewSelect);
13231 }
13232 }
13233
13234 return SDValue();
13235}
13236
13237SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
13238 DAGCombinerInfo &DCI) const {
13239 if (!Subtarget->has16BitInsts() ||
13240 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
13241 return SDValue();
13242
13243 EVT VT = N->getValueType(ResNo: 0);
13244 if (VT != MVT::i32)
13245 return SDValue();
13246
13247 SDValue Src = N->getOperand(Num: 0);
13248 if (Src.getValueType() != MVT::i16)
13249 return SDValue();
13250
13251 return SDValue();
13252}
13253
13254SDValue
13255SITargetLowering::performSignExtendInRegCombine(SDNode *N,
13256 DAGCombinerInfo &DCI) const {
13257 SDValue Src = N->getOperand(Num: 0);
13258 auto *VTSign = cast<VTSDNode>(Val: N->getOperand(Num: 1));
13259
13260 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
13261 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
13262 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
13263 VTSign->getVT() == MVT::i8) ||
13264 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
13265 VTSign->getVT() == MVT::i16))) {
13266 assert(Subtarget->hasScalarSubwordLoads() &&
13267 "s_buffer_load_{u8, i8} are supported "
13268 "in GFX12 (or newer) architectures.");
13269 EVT VT = Src.getValueType();
13270 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
13271 ? AMDGPUISD::SBUFFER_LOAD_BYTE
13272 : AMDGPUISD::SBUFFER_LOAD_SHORT;
13273 SDLoc DL(N);
13274 SDVTList ResList = DCI.DAG.getVTList(VT: MVT::i32);
13275 SDValue Ops[] = {
13276 Src.getOperand(i: 0), // source register
13277 Src.getOperand(i: 1), // offset
13278 Src.getOperand(i: 2) // cachePolicy
13279 };
13280 auto *M = cast<MemSDNode>(Val&: Src);
13281 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
13282 Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
13283 SDValue LoadVal = DCI.DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
13284 return LoadVal;
13285 }
13286 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
13287 VTSign->getVT() == MVT::i8) ||
13288 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
13289 VTSign->getVT() == MVT::i16)) &&
13290 Src.hasOneUse()) {
13291 auto *M = cast<MemSDNode>(Val&: Src);
13292 SDValue Ops[] = {Src.getOperand(i: 0), // Chain
13293 Src.getOperand(i: 1), // rsrc
13294 Src.getOperand(i: 2), // vindex
13295 Src.getOperand(i: 3), // voffset
13296 Src.getOperand(i: 4), // soffset
13297 Src.getOperand(i: 5), // offset
13298 Src.getOperand(i: 6), Src.getOperand(i: 7)};
13299 // replace with BUFFER_LOAD_BYTE/SHORT
13300 SDVTList ResList =
13301 DCI.DAG.getVTList(VT1: MVT::i32, VT2: Src.getOperand(i: 0).getValueType());
13302 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
13303 ? AMDGPUISD::BUFFER_LOAD_BYTE
13304 : AMDGPUISD::BUFFER_LOAD_SHORT;
13305 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
13306 Opcode: Opc, dl: SDLoc(N), VTList: ResList, Ops, MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
13307 return DCI.DAG.getMergeValues(
13308 Ops: {BufferLoadSignExt, BufferLoadSignExt.getValue(R: 1)}, dl: SDLoc(N));
13309 }
13310 return SDValue();
13311}
13312
13313SDValue SITargetLowering::performClassCombine(SDNode *N,
13314 DAGCombinerInfo &DCI) const {
13315 SelectionDAG &DAG = DCI.DAG;
13316 SDValue Mask = N->getOperand(Num: 1);
13317
13318 // fp_class x, 0 -> false
13319 if (isNullConstant(V: Mask))
13320 return DAG.getConstant(Val: 0, DL: SDLoc(N), VT: MVT::i1);
13321
13322 if (N->getOperand(Num: 0).isUndef())
13323 return DAG.getUNDEF(VT: MVT::i1);
13324
13325 return SDValue();
13326}
13327
13328SDValue SITargetLowering::performRcpCombine(SDNode *N,
13329 DAGCombinerInfo &DCI) const {
13330 EVT VT = N->getValueType(ResNo: 0);
13331 SDValue N0 = N->getOperand(Num: 0);
13332
13333 if (N0.isUndef()) {
13334 return DCI.DAG.getConstantFP(Val: APFloat::getQNaN(Sem: VT.getFltSemantics()),
13335 DL: SDLoc(N), VT);
13336 }
13337
13338 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
13339 N0.getOpcode() == ISD::SINT_TO_FP)) {
13340 return DCI.DAG.getNode(Opcode: AMDGPUISD::RCP_IFLAG, DL: SDLoc(N), VT, Operand: N0,
13341 Flags: N->getFlags());
13342 }
13343
13344 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
13345 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
13346 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
13347 return DCI.DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc(N), VT, Operand: N0.getOperand(i: 0),
13348 Flags: N->getFlags());
13349 }
13350
13351 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
13352}
13353
13354bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
13355 unsigned MaxDepth) const {
13356 unsigned Opcode = Op.getOpcode();
13357 if (Opcode == ISD::FCANONICALIZE)
13358 return true;
13359
13360 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
13361 const auto &F = CFP->getValueAPF();
13362 if (F.isNaN() && F.isSignaling())
13363 return false;
13364 if (!F.isDenormal())
13365 return true;
13366
13367 DenormalMode Mode =
13368 DAG.getMachineFunction().getDenormalMode(FPType: F.getSemantics());
13369 return Mode == DenormalMode::getIEEE();
13370 }
13371
13372 // If source is a result of another standard FP operation it is already in
13373 // canonical form.
13374 if (MaxDepth == 0)
13375 return false;
13376
13377 switch (Opcode) {
13378 // These will flush denorms if required.
13379 case ISD::FADD:
13380 case ISD::FSUB:
13381 case ISD::FMUL:
13382 case ISD::FCEIL:
13383 case ISD::FFLOOR:
13384 case ISD::FMA:
13385 case ISD::FMAD:
13386 case ISD::FSQRT:
13387 case ISD::FDIV:
13388 case ISD::FREM:
13389 case ISD::FP_ROUND:
13390 case ISD::FP_EXTEND:
13391 case ISD::FP16_TO_FP:
13392 case ISD::FP_TO_FP16:
13393 case ISD::BF16_TO_FP:
13394 case ISD::FP_TO_BF16:
13395 case ISD::FLDEXP:
13396 case AMDGPUISD::FMUL_LEGACY:
13397 case AMDGPUISD::FMAD_FTZ:
13398 case AMDGPUISD::RCP:
13399 case AMDGPUISD::RSQ:
13400 case AMDGPUISD::RSQ_CLAMP:
13401 case AMDGPUISD::RCP_LEGACY:
13402 case AMDGPUISD::RCP_IFLAG:
13403 case AMDGPUISD::LOG:
13404 case AMDGPUISD::EXP:
13405 case AMDGPUISD::DIV_SCALE:
13406 case AMDGPUISD::DIV_FMAS:
13407 case AMDGPUISD::DIV_FIXUP:
13408 case AMDGPUISD::FRACT:
13409 case AMDGPUISD::CVT_PKRTZ_F16_F32:
13410 case AMDGPUISD::CVT_F32_UBYTE0:
13411 case AMDGPUISD::CVT_F32_UBYTE1:
13412 case AMDGPUISD::CVT_F32_UBYTE2:
13413 case AMDGPUISD::CVT_F32_UBYTE3:
13414 case AMDGPUISD::FP_TO_FP16:
13415 case AMDGPUISD::SIN_HW:
13416 case AMDGPUISD::COS_HW:
13417 return true;
13418
13419 // It can/will be lowered or combined as a bit operation.
13420 // Need to check their input recursively to handle.
13421 case ISD::FNEG:
13422 case ISD::FABS:
13423 case ISD::FCOPYSIGN:
13424 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
13425
13426 case ISD::AND:
13427 if (Op.getValueType() == MVT::i32) {
13428 // Be careful as we only know it is a bitcast floating point type. It
13429 // could be f32, v2f16, we have no way of knowing. Luckily the constant
13430 // value that we optimize for, which comes up in fp32 to bf16 conversions,
13431 // is valid to optimize for all types.
13432 if (auto *RHS = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
13433 if (RHS->getZExtValue() == 0xffff0000) {
13434 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
13435 }
13436 }
13437 }
13438 break;
13439
13440 case ISD::FSIN:
13441 case ISD::FCOS:
13442 case ISD::FSINCOS:
13443 return Op.getValueType().getScalarType() != MVT::f16;
13444
13445 case ISD::FMINNUM:
13446 case ISD::FMAXNUM:
13447 case ISD::FMINNUM_IEEE:
13448 case ISD::FMAXNUM_IEEE:
13449 case ISD::FMINIMUM:
13450 case ISD::FMAXIMUM:
13451 case ISD::FMINIMUMNUM:
13452 case ISD::FMAXIMUMNUM:
13453 case AMDGPUISD::CLAMP:
13454 case AMDGPUISD::FMED3:
13455 case AMDGPUISD::FMAX3:
13456 case AMDGPUISD::FMIN3:
13457 case AMDGPUISD::FMAXIMUM3:
13458 case AMDGPUISD::FMINIMUM3: {
    // FIXME: Shouldn't treat the generic operations differently based on these.
    // However, we aren't really required to flush the result from
    // minnum/maxnum.
13462
13463 // snans will be quieted, so we only need to worry about denormals.
13464 if (Subtarget->supportsMinMaxDenormModes() ||
13465 // FIXME: denormalsEnabledForType is broken for dynamic
13466 denormalsEnabledForType(DAG, VT: Op.getValueType()))
13467 return true;
13468
13469 // Flushing may be required.
    // On pre-GFX9 targets V_MIN_F32 and others do not flush denorms, so for
    // such targets we need to check their inputs recursively.
13472
13473 // FIXME: Does this apply with clamp? It's implemented with max.
13474 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
13475 if (!isCanonicalized(DAG, Op: Op.getOperand(i: I), MaxDepth: MaxDepth - 1))
13476 return false;
13477 }
13478
13479 return true;
13480 }
13481 case ISD::SELECT: {
13482 return isCanonicalized(DAG, Op: Op.getOperand(i: 1), MaxDepth: MaxDepth - 1) &&
13483 isCanonicalized(DAG, Op: Op.getOperand(i: 2), MaxDepth: MaxDepth - 1);
13484 }
13485 case ISD::BUILD_VECTOR: {
13486 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
13487 SDValue SrcOp = Op.getOperand(i);
13488 if (!isCanonicalized(DAG, Op: SrcOp, MaxDepth: MaxDepth - 1))
13489 return false;
13490 }
13491
13492 return true;
13493 }
13494 case ISD::EXTRACT_VECTOR_ELT:
13495 case ISD::EXTRACT_SUBVECTOR: {
13496 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
13497 }
13498 case ISD::INSERT_VECTOR_ELT: {
13499 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1) &&
13500 isCanonicalized(DAG, Op: Op.getOperand(i: 1), MaxDepth: MaxDepth - 1);
13501 }
13502 case ISD::UNDEF:
13503 // Could be anything.
13504 return false;
13505
13506 case ISD::BITCAST:
13507 // TODO: This is incorrect as it loses track of the operand's type. We may
13508 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
13509 // same bits that are canonicalized in one type need not be in the other.
13510 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
13511 case ISD::TRUNCATE: {
    // Hack around the mess we make when legalizing extract_vector_elt.
13513 if (Op.getValueType() == MVT::i16) {
13514 SDValue TruncSrc = Op.getOperand(i: 0);
13515 if (TruncSrc.getValueType() == MVT::i32 &&
13516 TruncSrc.getOpcode() == ISD::BITCAST &&
13517 TruncSrc.getOperand(i: 0).getValueType() == MVT::v2f16) {
13518 return isCanonicalized(DAG, Op: TruncSrc.getOperand(i: 0), MaxDepth: MaxDepth - 1);
13519 }
13520 }
13521 return false;
13522 }
13523 case ISD::INTRINSIC_WO_CHAIN: {
13524 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
13525 // TODO: Handle more intrinsics
13526 switch (IntrinsicID) {
13527 case Intrinsic::amdgcn_cvt_pkrtz:
13528 case Intrinsic::amdgcn_cubeid:
13529 case Intrinsic::amdgcn_frexp_mant:
13530 case Intrinsic::amdgcn_fdot2:
13531 case Intrinsic::amdgcn_rcp:
13532 case Intrinsic::amdgcn_rsq:
13533 case Intrinsic::amdgcn_rsq_clamp:
13534 case Intrinsic::amdgcn_rcp_legacy:
13535 case Intrinsic::amdgcn_rsq_legacy:
13536 case Intrinsic::amdgcn_trig_preop:
13537 case Intrinsic::amdgcn_log:
13538 case Intrinsic::amdgcn_exp2:
13539 case Intrinsic::amdgcn_sqrt:
13540 return true;
13541 default:
13542 break;
13543 }
13544
13545 break;
13546 }
13547 default:
13548 break;
13549 }
13550
13551 // FIXME: denormalsEnabledForType is broken for dynamic
13552 return denormalsEnabledForType(DAG, VT: Op.getValueType()) &&
13553 DAG.isKnownNeverSNaN(Op);
13554}
13555
13556bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
13557 unsigned MaxDepth) const {
13558 const MachineRegisterInfo &MRI = MF.getRegInfo();
13559 MachineInstr *MI = MRI.getVRegDef(Reg);
13560 unsigned Opcode = MI->getOpcode();
13561
13562 if (Opcode == AMDGPU::G_FCANONICALIZE)
13563 return true;
13564
13565 std::optional<FPValueAndVReg> FCR;
13566 // Constant splat (can be padded with undef) or scalar constant.
13567 if (mi_match(R: Reg, MRI, P: MIPatternMatch::m_GFCstOrSplat(FPValReg&: FCR))) {
13568 if (FCR->Value.isSignaling())
13569 return false;
13570 if (!FCR->Value.isDenormal())
13571 return true;
13572
13573 DenormalMode Mode = MF.getDenormalMode(FPType: FCR->Value.getSemantics());
13574 return Mode == DenormalMode::getIEEE();
13575 }
13576
13577 if (MaxDepth == 0)
13578 return false;
13579
13580 switch (Opcode) {
13581 case AMDGPU::G_FADD:
13582 case AMDGPU::G_FSUB:
13583 case AMDGPU::G_FMUL:
13584 case AMDGPU::G_FCEIL:
13585 case AMDGPU::G_FFLOOR:
13586 case AMDGPU::G_FRINT:
13587 case AMDGPU::G_FNEARBYINT:
13588 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13589 case AMDGPU::G_INTRINSIC_TRUNC:
13590 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13591 case AMDGPU::G_FMA:
13592 case AMDGPU::G_FMAD:
13593 case AMDGPU::G_FSQRT:
13594 case AMDGPU::G_FDIV:
13595 case AMDGPU::G_FREM:
13596 case AMDGPU::G_FPOW:
13597 case AMDGPU::G_FPEXT:
13598 case AMDGPU::G_FLOG:
13599 case AMDGPU::G_FLOG2:
13600 case AMDGPU::G_FLOG10:
13601 case AMDGPU::G_FPTRUNC:
13602 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13603 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13604 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13605 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13606 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13607 return true;
13608 case AMDGPU::G_FNEG:
13609 case AMDGPU::G_FABS:
13610 case AMDGPU::G_FCOPYSIGN:
13611 return isCanonicalized(Reg: MI->getOperand(i: 1).getReg(), MF, MaxDepth: MaxDepth - 1);
13612 case AMDGPU::G_FMINNUM:
13613 case AMDGPU::G_FMAXNUM:
13614 case AMDGPU::G_FMINNUM_IEEE:
13615 case AMDGPU::G_FMAXNUM_IEEE:
13616 case AMDGPU::G_FMINIMUM:
13617 case AMDGPU::G_FMAXIMUM:
13618 case AMDGPU::G_FMINIMUMNUM:
13619 case AMDGPU::G_FMAXIMUMNUM: {
13620 if (Subtarget->supportsMinMaxDenormModes() ||
13621 // FIXME: denormalsEnabledForType is broken for dynamic
13622 denormalsEnabledForType(Ty: MRI.getType(Reg), MF))
13623 return true;
13624
13625 [[fallthrough]];
13626 }
13627 case AMDGPU::G_BUILD_VECTOR:
13628 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI->operands()))
13629 if (!isCanonicalized(Reg: MO.getReg(), MF, MaxDepth: MaxDepth - 1))
13630 return false;
13631 return true;
13632 case AMDGPU::G_INTRINSIC:
13633 case AMDGPU::G_INTRINSIC_CONVERGENT:
13634 switch (cast<GIntrinsic>(Val: MI)->getIntrinsicID()) {
13635 case Intrinsic::amdgcn_fmul_legacy:
13636 case Intrinsic::amdgcn_fmad_ftz:
13637 case Intrinsic::amdgcn_sqrt:
13638 case Intrinsic::amdgcn_fmed3:
13639 case Intrinsic::amdgcn_sin:
13640 case Intrinsic::amdgcn_cos:
13641 case Intrinsic::amdgcn_log:
13642 case Intrinsic::amdgcn_exp2:
13643 case Intrinsic::amdgcn_log_clamp:
13644 case Intrinsic::amdgcn_rcp:
13645 case Intrinsic::amdgcn_rcp_legacy:
13646 case Intrinsic::amdgcn_rsq:
13647 case Intrinsic::amdgcn_rsq_clamp:
13648 case Intrinsic::amdgcn_rsq_legacy:
13649 case Intrinsic::amdgcn_div_scale:
13650 case Intrinsic::amdgcn_div_fmas:
13651 case Intrinsic::amdgcn_div_fixup:
13652 case Intrinsic::amdgcn_fract:
13653 case Intrinsic::amdgcn_cvt_pkrtz:
13654 case Intrinsic::amdgcn_cubeid:
13655 case Intrinsic::amdgcn_cubema:
13656 case Intrinsic::amdgcn_cubesc:
13657 case Intrinsic::amdgcn_cubetc:
13658 case Intrinsic::amdgcn_frexp_mant:
13659 case Intrinsic::amdgcn_fdot2:
13660 case Intrinsic::amdgcn_trig_preop:
13661 return true;
13662 default:
13663 break;
13664 }
13665
13666 [[fallthrough]];
13667 default:
13668 return false;
13669 }
13670
13671 llvm_unreachable("invalid operation");
13672}
13673
13674// Constant fold canonicalize.
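// For example (given the assumed denormal-mode settings): with FP32 denormals
// in preserve-sign mode an f32 denormal constant folds to +/-0.0, and a
// signaling-NaN constant folds to the canonical quiet NaN bit pattern.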
13675SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
13676 const SDLoc &SL, EVT VT,
13677 const APFloat &C) const {
13678 // Flush denormals to 0 if not enabled.
13679 if (C.isDenormal()) {
13680 DenormalMode Mode =
13681 DAG.getMachineFunction().getDenormalMode(FPType: C.getSemantics());
13682 if (Mode == DenormalMode::getPreserveSign()) {
13683 return DAG.getConstantFP(
13684 Val: APFloat::getZero(Sem: C.getSemantics(), Negative: C.isNegative()), DL: SL, VT);
13685 }
13686
13687 if (Mode != DenormalMode::getIEEE())
13688 return SDValue();
13689 }
13690
13691 if (C.isNaN()) {
13692 APFloat CanonicalQNaN = APFloat::getQNaN(Sem: C.getSemantics());
13693 if (C.isSignaling()) {
13694 // Quiet a signaling NaN.
13695 // FIXME: Is this supposed to preserve payload bits?
13696 return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT);
13697 }
13698
13699 // Make sure it is the canonical NaN bitpattern.
13700 //
13701 // TODO: Can we use -1 as the canonical NaN value since it's an inline
13702 // immediate?
13703 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
13704 return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT);
13705 }
13706
13707 // Already canonical.
13708 return DAG.getConstantFP(Val: C, DL: SL, VT);
13709}
13710
13711static bool vectorEltWillFoldAway(SDValue Op) {
13712 return Op.isUndef() || isa<ConstantFPSDNode>(Val: Op);
13713}
13714
13715SDValue
13716SITargetLowering::performFCanonicalizeCombine(SDNode *N,
13717 DAGCombinerInfo &DCI) const {
13718 SelectionDAG &DAG = DCI.DAG;
13719 SDValue N0 = N->getOperand(Num: 0);
13720 EVT VT = N->getValueType(ResNo: 0);
13721
13722 // fcanonicalize undef -> qnan
13723 if (N0.isUndef()) {
13724 APFloat QNaN = APFloat::getQNaN(Sem: VT.getFltSemantics());
13725 return DAG.getConstantFP(Val: QNaN, DL: SDLoc(N), VT);
13726 }
13727
13728 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N: N0)) {
13729 EVT VT = N->getValueType(ResNo: 0);
13730 return getCanonicalConstantFP(DAG, SL: SDLoc(N), VT, C: CFP->getValueAPF());
13731 }
13732
13733 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
13734 // (fcanonicalize k)
13735 //
13736 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
13737
13738 // TODO: This could be better with wider vectors that will be split to v2f16,
13739 // and to consider uses since there aren't that many packed operations.
13740 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
13741 isTypeLegal(VT: MVT::v2f16)) {
13742 SDLoc SL(N);
13743 SDValue NewElts[2];
13744 SDValue Lo = N0.getOperand(i: 0);
13745 SDValue Hi = N0.getOperand(i: 1);
13746 EVT EltVT = Lo.getValueType();
13747
13748 if (vectorEltWillFoldAway(Op: Lo) || vectorEltWillFoldAway(Op: Hi)) {
13749 for (unsigned I = 0; I != 2; ++I) {
13750 SDValue Op = N0.getOperand(i: I);
13751 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
13752 NewElts[I] =
13753 getCanonicalConstantFP(DAG, SL, VT: EltVT, C: CFP->getValueAPF());
13754 } else if (Op.isUndef()) {
13755 // Handled below based on what the other operand is.
13756 NewElts[I] = Op;
13757 } else {
13758 NewElts[I] = DAG.getNode(Opcode: ISD::FCANONICALIZE, DL: SL, VT: EltVT, Operand: Op);
13759 }
13760 }
13761
13762 // If one half is undef, and one is constant, prefer a splat vector rather
13763 // than the normal qNaN. If it's a register, prefer 0.0 since that's
13764 // cheaper to use and may be free with a packed operation.
      if (NewElts[0].isUndef()) {
        NewElts[0] = isa<ConstantFPSDNode>(Val: NewElts[1])
                         ? NewElts[1]
                         : DAG.getConstantFP(Val: 0.0f, DL: SL, VT: EltVT);
      }
13771
13772 if (NewElts[1].isUndef()) {
13773 NewElts[1] = isa<ConstantFPSDNode>(Val: NewElts[0])
13774 ? NewElts[0]
13775 : DAG.getConstantFP(Val: 0.0f, DL: SL, VT: EltVT);
13776 }
13777
13778 return DAG.getBuildVector(VT, DL: SL, Ops: NewElts);
13779 }
13780 }
13781
13782 return SDValue();
13783}
13784
13785static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13786 switch (Opc) {
13787 case ISD::FMAXNUM:
13788 case ISD::FMAXNUM_IEEE:
13789 case ISD::FMAXIMUMNUM:
13790 return AMDGPUISD::FMAX3;
13791 case ISD::FMAXIMUM:
13792 return AMDGPUISD::FMAXIMUM3;
13793 case ISD::SMAX:
13794 return AMDGPUISD::SMAX3;
13795 case ISD::UMAX:
13796 return AMDGPUISD::UMAX3;
13797 case ISD::FMINNUM:
13798 case ISD::FMINNUM_IEEE:
13799 case ISD::FMINIMUMNUM:
13800 return AMDGPUISD::FMIN3;
13801 case ISD::FMINIMUM:
13802 return AMDGPUISD::FMINIMUM3;
13803 case ISD::SMIN:
13804 return AMDGPUISD::SMIN3;
13805 case ISD::UMIN:
13806 return AMDGPUISD::UMIN3;
13807 default:
13808 llvm_unreachable("Not a min/max opcode");
13809 }
13810}
13811
13812SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13813 const SDLoc &SL, SDValue Src,
13814 SDValue MinVal,
13815 SDValue MaxVal,
13816 bool Signed) const {
13817
13818 // med3 comes from
13819 // min(max(x, K0), K1), K0 < K1
13820 // max(min(x, K0), K1), K1 < K0
13821 //
13822 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13823 // min/max op.
13824 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(Val&: MinVal);
13825 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(Val&: MaxVal);
13826
13827 if (!MinK || !MaxK)
13828 return SDValue();
13829
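  // Only fold when MaxVal < MinVal (recall MaxVal is K0 and MinVal is K1 in
  // the patterns above); otherwise the pattern does not describe a med3 range.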
13830 if (Signed) {
13831 if (MaxK->getAPIntValue().sge(RHS: MinK->getAPIntValue()))
13832 return SDValue();
13833 } else {
13834 if (MaxK->getAPIntValue().uge(RHS: MinK->getAPIntValue()))
13835 return SDValue();
13836 }
13837
13838 EVT VT = MinK->getValueType(ResNo: 0);
13839 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13840 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13841 return DAG.getNode(Opcode: Med3Opc, DL: SL, VT, N1: Src, N2: MaxVal, N3: MinVal);
13842
13843 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13844 // not available, but this is unlikely to be profitable as constants
13845 // will often need to be materialized & extended, especially on
13846 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13847 return SDValue();
13848}
13849
13850static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13851 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op))
13852 return C;
13853
13854 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Val&: Op)) {
13855 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13856 return C;
13857 }
13858
13859 return nullptr;
13860}
13861
13862SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13863 const SDLoc &SL, SDValue Op0,
13864 SDValue Op1) const {
13865 ConstantFPSDNode *K1 = getSplatConstantFP(Op: Op1);
13866 if (!K1)
13867 return SDValue();
13868
13869 ConstantFPSDNode *K0 = getSplatConstantFP(Op: Op0.getOperand(i: 1));
13870 if (!K0)
13871 return SDValue();
13872
13873 // Ordered >= (although NaN inputs should have folded away by now).
13874 if (K0->getValueAPF() > K1->getValueAPF())
13875 return SDValue();
13876
13877 // med3 with a nan input acts like
13878 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
13879 //
13880 // So the result depends on whether the IEEE mode bit is enabled or not with a
13881 // signaling nan input.
13882 // ieee=1
13883 // s0 snan: yields s2
13884 // s1 snan: yields s2
13885 // s2 snan: qnan
13886
13887 // s0 qnan: min(s1, s2)
13888 // s1 qnan: min(s0, s2)
13889 // s2 qnan: min(s0, s1)
13890
13891 // ieee=0
13892 // s0 snan: min(s1, s2)
13893 // s1 snan: min(s0, s2)
13894 // s2 snan: qnan
13895
13896 // s0 qnan: min(s1, s2)
13897 // s1 qnan: min(s0, s2)
13898 // s2 qnan: min(s0, s1)
13899 const MachineFunction &MF = DAG.getMachineFunction();
13900 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13901
  // TODO: Check whether the IEEE bit is enabled. With IEEE=0 we can form fmed3
  // regardless of whether the input is a signaling nan if op0 is fmaximum or
  // fmaximumnum. If op0 is fmaxnum_ieee, we can only form it when IEEE=1.
13905 EVT VT = Op0.getValueType();
13906 if (Info->getMode().DX10Clamp) {
13907 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13908 // hardware fmed3 behavior converting to a min.
13909 // FIXME: Should this be allowing -0.0?
13910 if (K1->isExactlyValue(V: 1.0) && K0->isExactlyValue(V: 0.0))
13911 return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Op0.getOperand(i: 0));
13912 }
13913
13914 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13915 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13916 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13917 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13918 // then give the other result, which is different from med3 with a NaN
13919 // input.
13920 SDValue Var = Op0.getOperand(i: 0);
13921 if (!DAG.isKnownNeverSNaN(Op: Var))
13922 return SDValue();
13923
13924 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13925
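    // Only fold if each constant is an inline immediate or already has other
    // uses; a single-use literal that is not inlineable would likely have to
    // be materialized in a register just for the med3 (VOP3 could not take
    // literal operands before GFX10).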
13926 if ((!K0->hasOneUse() || TII->isInlineConstant(Imm: K0->getValueAPF())) &&
13927 (!K1->hasOneUse() || TII->isInlineConstant(Imm: K1->getValueAPF()))) {
13928 return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT: K0->getValueType(ResNo: 0), N1: Var,
13929 N2: SDValue(K0, 0), N3: SDValue(K1, 0));
13930 }
13931 }
13932
13933 return SDValue();
13934}
13935
13936/// \return true if the subtarget supports minimum3 and maximum3 with the given
13937/// base min/max opcode \p Opc for type \p VT.
13938static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13939 EVT VT) {
13940 switch (Opc) {
13941 case ISD::FMINNUM:
13942 case ISD::FMAXNUM:
13943 case ISD::FMINNUM_IEEE:
13944 case ISD::FMAXNUM_IEEE:
13945 case ISD::FMINIMUMNUM:
13946 case ISD::FMAXIMUMNUM:
13947 case AMDGPUISD::FMIN_LEGACY:
13948 case AMDGPUISD::FMAX_LEGACY:
13949 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13950 case ISD::FMINIMUM:
13951 case ISD::FMAXIMUM:
13952 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
13953 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
13954 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
13955 case ISD::SMAX:
13956 case ISD::SMIN:
13957 case ISD::UMAX:
13958 case ISD::UMIN:
13959 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13960 default:
13961 return false;
13962 }
13963
13964 llvm_unreachable("not a min/max opcode");
13965}
13966
13967SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13968 DAGCombinerInfo &DCI) const {
13969 SelectionDAG &DAG = DCI.DAG;
13970
13971 EVT VT = N->getValueType(ResNo: 0);
13972 unsigned Opc = N->getOpcode();
13973 SDValue Op0 = N->getOperand(Num: 0);
13974 SDValue Op1 = N->getOperand(Num: 1);
13975
  // Only do this if the inner op has one use, since otherwise this just
  // increases register pressure for no benefit.
13978
13979 if (supportsMin3Max3(Subtarget: *Subtarget, Opc, VT)) {
13980 // max(max(a, b), c) -> max3(a, b, c)
13981 // min(min(a, b), c) -> min3(a, b, c)
13982 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13983 SDLoc DL(N);
13984 return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc), DL, VT: N->getValueType(ResNo: 0),
13985 N1: Op0.getOperand(i: 0), N2: Op0.getOperand(i: 1), N3: Op1);
13986 }
13987
13988 // Try commuted.
13989 // max(a, max(b, c)) -> max3(a, b, c)
13990 // min(a, min(b, c)) -> min3(a, b, c)
13991 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13992 SDLoc DL(N);
13993 return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc), DL, VT: N->getValueType(ResNo: 0),
13994 N1: Op0, N2: Op1.getOperand(i: 0), N3: Op1.getOperand(i: 1));
13995 }
13996 }
13997
13998 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13999 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
14000 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
14001 if (SDValue Med3 = performIntMed3ImmCombine(
14002 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op1, MaxVal: Op0->getOperand(Num: 1), Signed: true))
14003 return Med3;
14004 }
14005 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
14006 if (SDValue Med3 = performIntMed3ImmCombine(
14007 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op0->getOperand(Num: 1), MaxVal: Op1, Signed: true))
14008 return Med3;
14009 }
14010
14011 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
14012 if (SDValue Med3 = performIntMed3ImmCombine(
14013 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op1, MaxVal: Op0->getOperand(Num: 1), Signed: false))
14014 return Med3;
14015 }
14016 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
14017 if (SDValue Med3 = performIntMed3ImmCombine(
14018 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op0->getOperand(Num: 1), MaxVal: Op1, Signed: false))
14019 return Med3;
14020 }
14021
14022 // if !is_snan(x):
14023 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14024 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14025 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14026 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14027 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14028 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14029 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14030 (Opc == AMDGPUISD::FMIN_LEGACY &&
14031 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
14032 (VT == MVT::f32 || VT == MVT::f64 ||
14033 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14034 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14035 Op0.hasOneUse()) {
14036 if (SDValue Res = performFPMed3ImmCombine(DAG, SL: SDLoc(N), Op0, Op1))
14037 return Res;
14038 }
14039
14040 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
14041 // for some types, but at a higher cost since it's implemented with a 3
14042 // operand form.
14043 const SDNodeFlags Flags = N->getFlags();
14044 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14045 !Subtarget->hasIEEEMinMax() && Flags.hasNoNaNs()) {
14046 unsigned NewOpc =
14047 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14048 return DAG.getNode(Opcode: NewOpc, DL: SDLoc(N), VT, N1: Op0, N2: Op1, Flags);
14049 }
14050
14051 return SDValue();
14052}
14053
14054static bool isClampZeroToOne(SDValue A, SDValue B) {
14055 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(Val&: A)) {
14056 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(Val&: B)) {
14057 // FIXME: Should this be allowing -0.0?
14058 return (CA->isExactlyValue(V: 0.0) && CB->isExactlyValue(V: 1.0)) ||
14059 (CA->isExactlyValue(V: 1.0) && CB->isExactlyValue(V: 0.0));
14060 }
14061 }
14062
14063 return false;
14064}
14065
14066// FIXME: Should only worry about snans for version with chain.
14067SDValue SITargetLowering::performFMed3Combine(SDNode *N,
14068 DAGCombinerInfo &DCI) const {
14069 EVT VT = N->getValueType(ResNo: 0);
14070 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
14071 // NaNs. With a NaN input, the order of the operands may change the result.
14072
14073 SelectionDAG &DAG = DCI.DAG;
14074 SDLoc SL(N);
14075
14076 SDValue Src0 = N->getOperand(Num: 0);
14077 SDValue Src1 = N->getOperand(Num: 1);
14078 SDValue Src2 = N->getOperand(Num: 2);
14079
14080 if (isClampZeroToOne(A: Src0, B: Src1)) {
14081 // const_a, const_b, x -> clamp is safe in all cases including signaling
14082 // nans.
14083 // FIXME: Should this be allowing -0.0?
14084 return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src2);
14085 }
14086
14087 const MachineFunction &MF = DAG.getMachineFunction();
14088 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14089
14090 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
14091 // handling no dx10-clamp?
14092 if (Info->getMode().DX10Clamp) {
    // If NaNs are clamped to 0, we are free to reorder the inputs.
14094
14095 if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1))
14096 std::swap(a&: Src0, b&: Src1);
14097
14098 if (isa<ConstantFPSDNode>(Val: Src1) && !isa<ConstantFPSDNode>(Val: Src2))
14099 std::swap(a&: Src1, b&: Src2);
14100
14101 if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1))
14102 std::swap(a&: Src0, b&: Src1);
14103
14104 if (isClampZeroToOne(A: Src1, B: Src2))
14105 return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src0);
14106 }
14107
14108 return SDValue();
14109}
14110
14111SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
14112 DAGCombinerInfo &DCI) const {
14113 SDValue Src0 = N->getOperand(Num: 0);
14114 SDValue Src1 = N->getOperand(Num: 1);
14115 if (Src0.isUndef() && Src1.isUndef())
14116 return DCI.DAG.getUNDEF(VT: N->getValueType(ResNo: 0));
14117 return SDValue();
14118}
14119
14120// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
14121// expanded into a set of cmp/select instructions.
14122bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
14123 unsigned NumElem,
14124 bool IsDivergentIdx,
14125 const GCNSubtarget *Subtarget) {
14126 if (UseDivergentRegisterIndexing)
14127 return false;
14128
14129 unsigned VecSize = EltSize * NumElem;
14130
  // Sub-dword vectors no larger than two dwords have a better implementation.
14132 if (VecSize <= 64 && EltSize < 32)
14133 return false;
14134
  // Always expand the remaining sub-dword cases, otherwise they will be
  // lowered via memory.
14137 if (EltSize < 32)
14138 return true;
14139
14140 // Always do this if var-idx is divergent, otherwise it will become a loop.
14141 if (IsDivergentIdx)
14142 return true;
14143
14144 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
14145 unsigned NumInsts = NumElem /* Number of compares */ +
14146 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
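  // For example, a dynamic extract from v4i64 costs 4 compares plus 2 * 4
  // cndmasks, i.e. NumInsts == 12.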
14147
14148 // On some architectures (GFX9) movrel is not available and it's better
14149 // to expand.
14150 if (Subtarget->useVGPRIndexMode())
14151 return NumInsts <= 16;
14152
14153 // If movrel is available, use it instead of expanding for vector of 8
14154 // elements.
14155 if (Subtarget->hasMovrel())
14156 return NumInsts <= 15;
14157
14158 return true;
14159}
14160
14161bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
14162 SDValue Idx = N->getOperand(Num: N->getNumOperands() - 1);
14163 if (isa<ConstantSDNode>(Val: Idx))
14164 return false;
14165
14166 SDValue Vec = N->getOperand(Num: 0);
14167 EVT VecVT = Vec.getValueType();
14168 EVT EltVT = VecVT.getVectorElementType();
14169 unsigned EltSize = EltVT.getSizeInBits();
14170 unsigned NumElem = VecVT.getVectorNumElements();
14171
14172 return SITargetLowering::shouldExpandVectorDynExt(
14173 EltSize, NumElem, IsDivergentIdx: Idx->isDivergent(), Subtarget: getSubtarget());
14174}
14175
14176SDValue
14177SITargetLowering::performExtractVectorEltCombine(SDNode *N,
14178 DAGCombinerInfo &DCI) const {
14179 SDValue Vec = N->getOperand(Num: 0);
14180 SelectionDAG &DAG = DCI.DAG;
14181
14182 EVT VecVT = Vec.getValueType();
14183 EVT VecEltVT = VecVT.getVectorElementType();
14184 EVT ResVT = N->getValueType(ResNo: 0);
14185
14186 unsigned VecSize = VecVT.getSizeInBits();
14187 unsigned VecEltSize = VecEltVT.getSizeInBits();
14188
14189 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
14190 allUsesHaveSourceMods(N)) {
14191 SDLoc SL(N);
14192 SDValue Idx = N->getOperand(Num: 1);
14193 SDValue Elt =
14194 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec.getOperand(i: 0), N2: Idx);
14195 return DAG.getNode(Opcode: Vec.getOpcode(), DL: SL, VT: ResVT, Operand: Elt);
14196 }
14197
14198 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
14199 // =>
14200 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
14201 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
14202 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
14203 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
14204 SDLoc SL(N);
14205 SDValue Idx = N->getOperand(Num: 1);
14206 unsigned Opc = Vec.getOpcode();
14207
14208 switch (Opc) {
14209 default:
14210 break;
14211 // TODO: Support other binary operations.
14212 case ISD::FADD:
14213 case ISD::FSUB:
14214 case ISD::FMUL:
14215 case ISD::ADD:
14216 case ISD::UMIN:
14217 case ISD::UMAX:
14218 case ISD::SMIN:
14219 case ISD::SMAX:
14220 case ISD::FMAXNUM:
14221 case ISD::FMINNUM:
14222 case ISD::FMAXNUM_IEEE:
14223 case ISD::FMINNUM_IEEE:
14224 case ISD::FMAXIMUM:
14225 case ISD::FMINIMUM: {
14226 SDValue Elt0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT,
14227 N1: Vec.getOperand(i: 0), N2: Idx);
14228 SDValue Elt1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT,
14229 N1: Vec.getOperand(i: 1), N2: Idx);
14230
14231 DCI.AddToWorklist(N: Elt0.getNode());
14232 DCI.AddToWorklist(N: Elt1.getNode());
14233 return DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT, N1: Elt0, N2: Elt1, Flags: Vec->getFlags());
14234 }
14235 }
14236 }
14237
14238 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
14239 if (shouldExpandVectorDynExt(N)) {
14240 SDLoc SL(N);
14241 SDValue Idx = N->getOperand(Num: 1);
14242 SDValue V;
14243 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
14244 SDValue IC = DAG.getVectorIdxConstant(Val: I, DL: SL);
14245 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec, N2: IC);
14246 if (I == 0)
14247 V = Elt;
14248 else
14249 V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Elt, False: V, Cond: ISD::SETEQ);
14250 }
14251 return V;
14252 }
14253
14254 if (!DCI.isBeforeLegalize())
14255 return SDValue();
14256
14257 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
14258 // elements. This exposes more load reduction opportunities by replacing
14259 // multiple small extract_vector_elements with a single 32-bit extract.
14260 auto *Idx = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
14261 if (isa<MemSDNode>(Val: Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
14262 VecSize > 32 && VecSize % 32 == 0 && Idx) {
14263 EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: VecVT);
14264
14265 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
14266 unsigned EltIdx = BitIndex / 32;
14267 unsigned LeftoverBitIdx = BitIndex % 32;
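    // For example, extracting element 5 of a loaded v8i16 gives BitIndex = 80,
    // so we read dword 2 of the bitcast value and shift right by 16 bits.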
14268 SDLoc SL(N);
14269
14270 SDValue Cast = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: Vec);
14271 DCI.AddToWorklist(N: Cast.getNode());
14272
14273 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Cast,
14274 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
14275 DCI.AddToWorklist(N: Elt.getNode());
14276 SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: Elt,
14277 N2: DAG.getConstant(Val: LeftoverBitIdx, DL: SL, VT: MVT::i32));
14278 DCI.AddToWorklist(N: Srl.getNode());
14279
14280 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
14281 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: VecEltAsIntVT, Operand: Srl);
14282 DCI.AddToWorklist(N: Trunc.getNode());
14283
14284 if (VecEltVT == ResVT) {
14285 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecEltVT, Operand: Trunc);
14286 }
14287
14288 assert(ResVT.isScalarInteger());
14289 return DAG.getAnyExtOrTrunc(Op: Trunc, DL: SL, VT: ResVT);
14290 }
14291
14292 return SDValue();
14293}
14294
14295SDValue
14296SITargetLowering::performInsertVectorEltCombine(SDNode *N,
14297 DAGCombinerInfo &DCI) const {
14298 SDValue Vec = N->getOperand(Num: 0);
14299 SDValue Idx = N->getOperand(Num: 2);
14300 EVT VecVT = Vec.getValueType();
14301 EVT EltVT = VecVT.getVectorElementType();
14302
14303 // INSERT_VECTOR_ELT (<n x e>, var-idx)
14304 // => BUILD_VECTOR n x select (e, const-idx)
14305 if (!shouldExpandVectorDynExt(N))
14306 return SDValue();
14307
14308 SelectionDAG &DAG = DCI.DAG;
14309 SDLoc SL(N);
14310 SDValue Ins = N->getOperand(Num: 1);
14311 EVT IdxVT = Idx.getValueType();
14312
14313 SmallVector<SDValue, 16> Ops;
14314 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
14315 SDValue IC = DAG.getConstant(Val: I, DL: SL, VT: IdxVT);
14316 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec, N2: IC);
14317 SDValue V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Ins, False: Elt, Cond: ISD::SETEQ);
14318 Ops.push_back(Elt: V);
14319 }
14320
14321 return DAG.getBuildVector(VT: VecVT, DL: SL, Ops);
14322}
14323
14324/// Return the source of an fp_extend from f16 to f32, or a converted FP
14325/// constant.
14326static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
14327 if (Src.getOpcode() == ISD::FP_EXTEND &&
14328 Src.getOperand(i: 0).getValueType() == MVT::f16) {
14329 return Src.getOperand(i: 0);
14330 }
14331
14332 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Src)) {
14333 APFloat Val = CFP->getValueAPF();
14334 bool LosesInfo = true;
14335 Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
14336 if (!LosesInfo)
14337 return DAG.getConstantFP(Val, DL: SDLoc(Src), VT: MVT::f16);
14338 }
14339
14340 return SDValue();
14341}
14342
14343SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
14344 DAGCombinerInfo &DCI) const {
14345 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
14346 "combine only useful on gfx8");
14347
14348 SDValue TruncSrc = N->getOperand(Num: 0);
14349 EVT VT = N->getValueType(ResNo: 0);
14350 if (VT != MVT::f16)
14351 return SDValue();
14352
14353 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
14354 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
14355 return SDValue();
14356
14357 SelectionDAG &DAG = DCI.DAG;
14358 SDLoc SL(N);
14359
14360 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
14361 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
14362 // casting back.
14363
14364 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
14365 // fmin(fmax(a, b), fmax(fmin(a, b), c))
14366 SDValue A = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 0));
14367 if (!A)
14368 return SDValue();
14369
14370 SDValue B = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 1));
14371 if (!B)
14372 return SDValue();
14373
14374 SDValue C = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 2));
14375 if (!C)
14376 return SDValue();
14377
14378 // This changes signaling nan behavior. If an input is a signaling nan, it
14379 // would have been quieted by the fpext originally. We don't care because
14380 // these are unconstrained ops. If we needed to insert quieting canonicalizes
14381 // we would be worse off than just doing the promotion.
14382 SDValue A1 = DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: A, N2: B);
14383 SDValue B1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A, N2: B);
14384 SDValue C1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A1, N2: C);
14385 return DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: B1, N2: C1);
14386}
14387
14388unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
14389 const SDNode *N0,
14390 const SDNode *N1) const {
14391 EVT VT = N0->getValueType(ResNo: 0);
14392
14393 // Only do this if we are not trying to support denormals. v_mad_f32 does not
14394 // support denormals ever.
14395 if (((VT == MVT::f32 &&
14396 denormalModeIsFlushAllF32(MF: DAG.getMachineFunction())) ||
14397 (VT == MVT::f16 && Subtarget->hasMadF16() &&
14398 denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction()))) &&
14399 isOperationLegal(Op: ISD::FMAD, VT))
14400 return ISD::FMAD;
14401
14402 const TargetOptions &Options = DAG.getTarget().Options;
14403 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14404 (N0->getFlags().hasAllowContract() &&
14405 N1->getFlags().hasAllowContract())) &&
14406 isFMAFasterThanFMulAndFAdd(MF: DAG.getMachineFunction(), VT)) {
14407 return ISD::FMA;
14408 }
14409
14410 return 0;
14411}
14412
14413// For a reassociatable opcode perform:
14414// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
14415SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
14416 SelectionDAG &DAG) const {
14417 EVT VT = N->getValueType(ResNo: 0);
14418 if (VT != MVT::i32 && VT != MVT::i64)
14419 return SDValue();
14420
14421 if (DAG.isBaseWithConstantOffset(Op: SDValue(N, 0)))
14422 return SDValue();
14423
14424 unsigned Opc = N->getOpcode();
14425 SDValue Op0 = N->getOperand(Num: 0);
14426 SDValue Op1 = N->getOperand(Num: 1);
14427
14428 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
14429 return SDValue();
14430
14431 if (Op0->isDivergent())
14432 std::swap(a&: Op0, b&: Op1);
14433
14434 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
14435 return SDValue();
14436
14437 SDValue Op2 = Op1.getOperand(i: 1);
14438 Op1 = Op1.getOperand(i: 0);
14439 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
14440 return SDValue();
14441
14442 if (Op1->isDivergent())
14443 std::swap(a&: Op1, b&: Op2);
14444
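  // At this point Op0 and Op1 are the two uniform values and Op2 is the
  // divergent one; group the uniform operands first so the inner op can be
  // kept scalar.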
14445 SDLoc SL(N);
14446 SDValue Add1 = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Op0, N2: Op1);
14447 return DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Add1, N2: Op2);
14448}
14449
14450static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
14451 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
14452 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
14453 SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT::i1);
14454 SDValue Mad = DAG.getNode(Opcode: MadOpc, DL: SL, VTList: VTs, N1: N0, N2: N1, N3: N2);
14455 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Mad);
14456}
14457
14458// Fold
14459// y = lshr i64 x, 32
14460// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
14461// with Const.hi == -1
14462// To
//   res = mad_u64_u32 y.lo, Const.lo, x.lo
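//
// This holds because x == (y << 32) + x.lo and Const.hi == -1 implies
// Const == Const.lo - (1 << 32) (mod 2^64), so
//   y * Const + x == y * Const.lo - (y << 32) + (y << 32) + x.lo
//                 == y * Const.lo + x.lo.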
14464static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
14465 SDValue MulLHS, SDValue MulRHS,
14466 SDValue AddRHS) {
14467 if (MulRHS.getOpcode() == ISD::SRL)
14468 std::swap(a&: MulLHS, b&: MulRHS);
14469
14470 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
14471 return SDValue();
14472
14473 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(Val: MulLHS.getOperand(i: 1));
14474 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
14475 MulLHS.getOperand(i: 0) != AddRHS)
14476 return SDValue();
14477
14478 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Val: MulRHS.getNode());
14479 if (!Const || Hi_32(Value: Const->getZExtValue()) != uint32_t(-1))
14480 return SDValue();
14481
14482 SDValue ConstMul =
14483 DAG.getConstant(Val: Lo_32(Value: Const->getZExtValue()), DL: SL, VT: MVT::i32);
14484 return getMad64_32(DAG, SL, VT: MVT::i64,
14485 N0: DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulLHS), N1: ConstMul,
14486 N2: DAG.getZeroExtendInReg(Op: AddRHS, DL: SL, VT: MVT::i32), Signed: false);
14487}
14488
14489// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
14490// multiplies, if any.
14491//
14492// Full 64-bit multiplies that feed into an addition are lowered here instead
14493// of using the generic expansion. The generic expansion ends up with
14494// a tree of ADD nodes that prevents us from using the "add" part of the
14495// MAD instruction. The expansion produced here results in a chain of ADDs
14496// instead of a tree.
14497SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
14498 DAGCombinerInfo &DCI) const {
14499 assert(N->getOpcode() == ISD::ADD);
14500
14501 SelectionDAG &DAG = DCI.DAG;
14502 EVT VT = N->getValueType(ResNo: 0);
14503 SDLoc SL(N);
14504 SDValue LHS = N->getOperand(Num: 0);
14505 SDValue RHS = N->getOperand(Num: 1);
14506
14507 if (VT.isVector())
14508 return SDValue();
14509
14510 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
14511 // result in scalar registers for uniform values.
14512 if (!N->isDivergent() && Subtarget->hasSMulHi())
14513 return SDValue();
14514
14515 unsigned NumBits = VT.getScalarSizeInBits();
14516 if (NumBits <= 32 || NumBits > 64)
14517 return SDValue();
14518
14519 if (LHS.getOpcode() != ISD::MUL) {
14520 assert(RHS.getOpcode() == ISD::MUL);
14521 std::swap(a&: LHS, b&: RHS);
14522 }
14523
14524 // Avoid the fold if it would unduly increase the number of multiplies due to
14525 // multiple uses, except on hardware with full-rate multiply-add (which is
14526 // part of full-rate 64-bit ops).
14527 if (!Subtarget->hasFullRate64Ops()) {
14528 unsigned NumUsers = 0;
14529 for (SDNode *User : LHS->users()) {
14530 // There is a use that does not feed into addition, so the multiply can't
14531 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
14532 if (User->getOpcode() != ISD::ADD)
14533 return SDValue();
14534
14535 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
14536 // MUL + 3xADD + 3xADDC over 3xMAD.
14537 ++NumUsers;
14538 if (NumUsers >= 3)
14539 return SDValue();
14540 }
14541 }
14542
14543 SDValue MulLHS = LHS.getOperand(i: 0);
14544 SDValue MulRHS = LHS.getOperand(i: 1);
14545 SDValue AddRHS = RHS;
14546
14547 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
14548 return FoldedMAD;
14549
14550 // Always check whether operands are small unsigned values, since that
14551 // knowledge is useful in more cases. Check for small signed values only if
14552 // doing so can unlock a shorter code sequence.
14553 bool MulLHSUnsigned32 = numBitsUnsigned(Op: MulLHS, DAG) <= 32;
14554 bool MulRHSUnsigned32 = numBitsUnsigned(Op: MulRHS, DAG) <= 32;
14555
14556 bool MulSignedLo = false;
14557 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
14558 MulSignedLo =
14559 numBitsSigned(Op: MulLHS, DAG) <= 32 && numBitsSigned(Op: MulRHS, DAG) <= 32;
14560 }
14561
14562 // The operands and final result all have the same number of bits. If
14563 // operands need to be extended, they can be extended with garbage. The
14564 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
14565 // truncated away in the end.
14566 if (VT != MVT::i64) {
14567 MulLHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: MulLHS);
14568 MulRHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: MulRHS);
14569 AddRHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: AddRHS);
14570 }
14571
14572 // The basic code generated is conceptually straightforward. Pseudo code:
14573 //
14574 // accum = mad_64_32 lhs.lo, rhs.lo, accum
14575 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
14576 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
14577 //
14578 // The second and third lines are optional, depending on whether the factors
14579 // are {sign,zero}-extended or not.
14580 //
14581 // The actual DAG is noisier than the pseudo code, but only due to
14582 // instructions that disassemble values into low and high parts, and
14583 // assemble the final result.
14584 SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
14585
14586 auto MulLHSLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulLHS);
14587 auto MulRHSLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulRHS);
14588 SDValue Accum =
14589 getMad64_32(DAG, SL, VT: MVT::i64, N0: MulLHSLo, N1: MulRHSLo, N2: AddRHS, Signed: MulSignedLo);
14590
14591 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
14592 auto [AccumLo, AccumHi] = DAG.SplitScalar(N: Accum, DL: SL, LoVT: MVT::i32, HiVT: MVT::i32);
14593
14594 if (!MulLHSUnsigned32) {
14595 auto MulLHSHi =
14596 DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: SL, VT: MVT::i32, N1: MulLHS, N2: One);
14597 SDValue MulHi = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT: MVT::i32, N1: MulLHSHi, N2: MulRHSLo);
14598 AccumHi = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: MulHi, N2: AccumHi);
14599 }
14600
14601 if (!MulRHSUnsigned32) {
14602 auto MulRHSHi =
14603 DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: SL, VT: MVT::i32, N1: MulRHS, N2: One);
14604 SDValue MulHi = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT: MVT::i32, N1: MulLHSLo, N2: MulRHSHi);
14605 AccumHi = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: MulHi, N2: AccumHi);
14606 }
14607
14608 Accum = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {AccumLo, AccumHi});
14609 Accum = DAG.getBitcast(VT: MVT::i64, V: Accum);
14610 }
14611
14612 if (VT != MVT::i64)
14613 Accum = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Accum);
14614 return Accum;
14615}
14616
14617SDValue
14618SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
14619 DAGCombinerInfo &DCI) const {
14620 SDValue RHS = N->getOperand(Num: 1);
14621 auto *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
14622 if (!CRHS)
14623 return SDValue();
14624
14625 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
14626 // common.
14627 uint64_t Val = CRHS->getZExtValue();
14628 if (countr_zero(Val) >= 32) {
14629 SelectionDAG &DAG = DCI.DAG;
14630 SDLoc SL(N);
14631 SDValue LHS = N->getOperand(Num: 0);
14632
14633 // Avoid carry machinery if we know the low half of the add does not
14634 // contribute to the final result.
14635 //
14636 // add i64:x, K if computeTrailingZeros(K) >= 32
14637 // => build_pair (add x.hi, K.hi), x.lo
14638
14639 // Breaking the 64-bit add here with this strange constant is unlikely
14640 // to interfere with addressing mode patterns.
14641
14642 SDValue Hi = getHiHalf64(Op: LHS, DAG);
14643 SDValue ConstHi32 = DAG.getConstant(Val: Hi_32(Value: Val), DL: SL, VT: MVT::i32);
14644 SDValue AddHi =
14645 DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: MVT::i32, N1: Hi, N2: ConstHi32, Flags: N->getFlags());
14646
14647 SDValue Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: LHS);
14648 return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SL, VT: MVT::i64, N1: Lo, N2: AddHi);
14649 }
14650
14651 return SDValue();
14652}
14653
// Collect the ultimate src of each of the mul node's operands, and confirm
// each operand is an 8-bit value.
14656static std::optional<ByteProvider<SDValue>>
14657handleMulOperand(const SDValue &MulOperand) {
14658 auto Byte0 = calculateByteProvider(Op: MulOperand, Index: 0, Depth: 0);
14659 if (!Byte0 || Byte0->isConstantZero()) {
14660 return std::nullopt;
14661 }
14662 auto Byte1 = calculateByteProvider(Op: MulOperand, Index: 1, Depth: 0);
14663 if (Byte1 && !Byte1->isConstantZero()) {
14664 return std::nullopt;
14665 }
14666 return Byte0;
14667}
14668
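// Merge two v_perm select masks in which a byte value of 0x0c means "select a
// constant zero". Non-zero selects from either mask are kept, and a byte stays
// 0x0c only if both masks have 0x0c there; the asserts check that no byte
// position carries a real select in both masks. For example, merging
// 0x0c0c0c01 with 0x0c0c020c yields 0x0c0c0201.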
14669static unsigned addPermMasks(unsigned First, unsigned Second) {
14670 unsigned FirstCs = First & 0x0c0c0c0c;
14671 unsigned SecondCs = Second & 0x0c0c0c0c;
14672 unsigned FirstNoCs = First & ~0x0c0c0c0c;
14673 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14674
14675 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14676 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14677 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14678 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14679
14680 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
14681}
14682
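// One dword-sized piece of a dot4 operand: dword DWordOffset of SrcOp,
// together with the v_perm mask that places (or zeroes) its bytes.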
14683struct DotSrc {
14684 SDValue SrcOp;
14685 int64_t PermMask;
14686 int64_t DWordOffset;
14687};
14688
14689static void placeSources(ByteProvider<SDValue> &Src0,
14690 ByteProvider<SDValue> &Src1,
14691 SmallVectorImpl<DotSrc> &Src0s,
14692 SmallVectorImpl<DotSrc> &Src1s, int Step) {
14693
14694 assert(Src0.Src.has_value() && Src1.Src.has_value());
14695 // Src0s and Src1s are empty, just place arbitrarily.
14696 if (Step == 0) {
14697 Src0s.push_back(Elt: {.SrcOp: *Src0.Src, .PermMask: ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
14698 .DWordOffset: Src0.SrcOffset / 4});
14699 Src1s.push_back(Elt: {.SrcOp: *Src1.Src, .PermMask: ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
14700 .DWordOffset: Src1.SrcOffset / 4});
14701 return;
14702 }
14703
14704 for (int BPI = 0; BPI < 2; BPI++) {
14705 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
14706 if (BPI == 1) {
14707 BPP = {Src1, Src0};
14708 }
14709 unsigned ZeroMask = 0x0c0c0c0c;
14710 unsigned FMask = 0xFF << (8 * (3 - Step));
14711
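    // Build masks that place the selected source byte (SrcOffset % 4) into
    // byte (3 - Step) of the dword and select zero (0x0c) everywhere else.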
14712 unsigned FirstMask =
14713 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14714 unsigned SecondMask =
14715 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    // Attempt to find the source vector that already contains our SDValue; if
    // found, merge our perm mask into the existing one. If we are unable to
    // find a match for the first SDValue, attempt to find a match for the
    // second.
14719 int FirstGroup = -1;
14720 for (int I = 0; I < 2; I++) {
14721 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
14722 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
14723 return IterElt.SrcOp == *BPP.first.Src &&
14724 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14725 };
14726
14727 auto *Match = llvm::find_if(Range&: Srcs, P: MatchesFirst);
14728 if (Match != Srcs.end()) {
14729 Match->PermMask = addPermMasks(First: FirstMask, Second: Match->PermMask);
14730 FirstGroup = I;
14731 break;
14732 }
14733 }
14734 if (FirstGroup != -1) {
14735 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
14736 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
14737 return IterElt.SrcOp == *BPP.second.Src &&
14738 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14739 };
14740 auto *Match = llvm::find_if(Range&: Srcs, P: MatchesSecond);
14741 if (Match != Srcs.end()) {
14742 Match->PermMask = addPermMasks(First: SecondMask, Second: Match->PermMask);
14743 } else
14744 Srcs.push_back(Elt: {.SrcOp: *BPP.second.Src, .PermMask: SecondMask, .DWordOffset: BPP.second.SrcOffset / 4});
14745 return;
14746 }
14747 }
14748
14749 // If we have made it here, then we could not find a match in Src0s or Src1s
14750 // for either Src0 or Src1, so just place them arbitrarily.
14751
14752 unsigned ZeroMask = 0x0c0c0c0c;
14753 unsigned FMask = 0xFF << (8 * (3 - Step));
14754
14755 Src0s.push_back(
14756 Elt: {.SrcOp: *Src0.Src,
14757 .PermMask: ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14758 .DWordOffset: Src0.SrcOffset / 4});
14759 Src1s.push_back(
14760 Elt: {.SrcOp: *Src1.Src,
14761 .PermMask: ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14762 .DWordOffset: Src1.SrcOffset / 4});
14763}
14764
14765static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
14766 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
14767 bool IsAny) {
14768
14769 // If we just have one source, just permute it accordingly.
14770 if (Srcs.size() == 1) {
14771 auto *Elt = Srcs.begin();
14772 auto EltOp = getDWordFromOffset(DAG, SL, Src: Elt->SrcOp, DWordOffset: Elt->DWordOffset);
14773
14774 // v_perm will produce the original value
14775 if (Elt->PermMask == 0x3020100)
14776 return EltOp;
14777
14778 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: EltOp, N2: EltOp,
14779 N3: DAG.getConstant(Val: Elt->PermMask, DL: SL, VT: MVT::i32));
14780 }
14781
14782 auto *FirstElt = Srcs.begin();
14783 auto *SecondElt = std::next(x: FirstElt);
14784
14785 SmallVector<SDValue, 2> Perms;
14786
  // If we have multiple sources in the chain, combine them via perms (using
  // the calculated perm masks) and ORs.
14789 while (true) {
14790 auto FirstMask = FirstElt->PermMask;
14791 auto SecondMask = SecondElt->PermMask;
14792
14793 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14794 unsigned FirstPlusFour = FirstMask | 0x04040404;
    // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
    // original 0x0C.
14797 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
14798
14799 auto PermMask = addPermMasks(First: FirstMask, Second: SecondMask);
14800 auto FirstVal =
14801 getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
14802 auto SecondVal =
14803 getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp, DWordOffset: SecondElt->DWordOffset);
14804
14805 Perms.push_back(Elt: DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: FirstVal,
14806 N2: SecondVal,
14807 N3: DAG.getConstant(Val: PermMask, DL: SL, VT: MVT::i32)));
14808
14809 FirstElt = std::next(x: SecondElt);
14810 if (FirstElt == Srcs.end())
14811 break;
14812
14813 SecondElt = std::next(x: FirstElt);
14814 // If we only have a FirstElt, then just combine that into the cumulative
14815 // source node.
14816 if (SecondElt == Srcs.end()) {
14817 auto EltOp =
14818 getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
14819
14820 Perms.push_back(
14821 Elt: DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: EltOp, N2: EltOp,
14822 N3: DAG.getConstant(Val: FirstElt->PermMask, DL: SL, VT: MVT::i32)));
14823 break;
14824 }
14825 }
14826
14827 assert(Perms.size() == 1 || Perms.size() == 2);
14828 return Perms.size() == 2
14829 ? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Perms[0], N2: Perms[1])
14830 : Perms[0];
14831}
14832
14833static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
14834 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14835 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14836 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14837 EntryMask += ZeroMask;
14838 }
14839}
14840
14841static bool isMul(const SDValue Op) {
14842 auto Opcode = Op.getOpcode();
14843
14844 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
14845 Opcode == AMDGPUISD::MUL_I24);
14846}
14847
14848static std::optional<bool>
14849checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
14850 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
14851 const SDValue &S1Op, const SelectionDAG &DAG) {
  // If both ops are i8s (pre legalize-dag), then the signedness semantics
  // of the dot4 are irrelevant.
14854 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
14855 return false;
14856
14857 auto Known0 = DAG.computeKnownBits(Op: S0Op, Depth: 0);
14858 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
14859 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14860 auto Known1 = DAG.computeKnownBits(Op: S1Op, Depth: 0);
14861 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
14862 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14863
14864 assert(!(S0IsUnsigned && S0IsSigned));
14865 assert(!(S1IsUnsigned && S1IsSigned));
14866
14867 // There are 9 possible permutations of
14868 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
14869
14870 // In two permutations, the sign bits are known to be the same for both Ops,
14871 // so simply return Signed / Unsigned corresponding to the MSB
14872
14873 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14874 return S0IsSigned;
14875
14876 // In another two permutations, the sign bits are known to be opposite. In
14877 // this case return std::nullopt to indicate a bad match.
14878
14879 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14880 return std::nullopt;
14881
  // In the remaining five permutations, we don't know the value of the sign
  // bit for at least one Op. Since we have a valid ByteProvider, we know that
  // the upper bits must be extension bits. Thus, the only ways for the sign
  // bit to be unknown are if it was sign extended from an unknown value or if
  // it was any-extended. In either case, it is correct to use the signed
  // version of dot4.
14888
  // In two such permutations, we know the sign bit is set for one op and
  // unknown for the other. It is okay to use the signed version of dot4.
14892 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14893 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14894 return true;
14895
  // In one such permutation, we don't know either of the sign bits. It is okay
  // to use the signed version of dot4.
14898 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14899 return true;
14900
  // In two such permutations, we know the sign bit is unset for one op and
  // unknown for the other. Return std::nullopt to indicate a bad match.
14904 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14905 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14906 return std::nullopt;
14907
14908 llvm_unreachable("Fully covered condition");
14909}
14910
14911SDValue SITargetLowering::performAddCombine(SDNode *N,
14912 DAGCombinerInfo &DCI) const {
14913 SelectionDAG &DAG = DCI.DAG;
14914 EVT VT = N->getValueType(ResNo: 0);
14915 SDLoc SL(N);
14916 SDValue LHS = N->getOperand(Num: 0);
14917 SDValue RHS = N->getOperand(Num: 1);
14918
14919 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14920 if (Subtarget->hasMad64_32()) {
14921 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14922 return Folded;
14923 }
14924 }
14925
14926 if (SDValue V = reassociateScalarOps(N, DAG)) {
14927 return V;
14928 }
14929
14930 if (VT == MVT::i64) {
14931 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14932 return Folded;
14933 }
14934
14935 if ((isMul(Op: LHS) || isMul(Op: RHS)) && Subtarget->hasDot7Insts() &&
14936 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14937 SDValue TempNode(N, 0);
14938 std::optional<bool> IsSigned;
14939 SmallVector<DotSrc, 4> Src0s;
14940 SmallVector<DotSrc, 4> Src1s;
14941 SmallVector<SDValue, 4> Src2s;
14942
14943 // Match the v_dot4 tree, while collecting src nodes.
14944 int ChainLength = 0;
14945 for (int I = 0; I < 4; I++) {
14946 auto MulIdx = isMul(Op: LHS) ? 0 : isMul(Op: RHS) ? 1 : -1;
14947 if (MulIdx == -1)
14948 break;
14949 auto Src0 = handleMulOperand(MulOperand: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 0));
14950 if (!Src0)
14951 break;
14952 auto Src1 = handleMulOperand(MulOperand: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 1));
14953 if (!Src1)
14954 break;
14955
14956 auto IterIsSigned = checkDot4MulSignedness(
14957 N: TempNode->getOperand(Num: MulIdx), Src0&: *Src0, Src1&: *Src1,
14958 S0Op: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 0),
14959 S1Op: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 1), DAG);
14960 if (!IterIsSigned)
14961 break;
14962 if (!IsSigned)
14963 IsSigned = *IterIsSigned;
14964 if (*IterIsSigned != *IsSigned)
14965 break;
14966 placeSources(Src0&: *Src0, Src1&: *Src1, Src0s, Src1s, Step: I);
14967 auto AddIdx = 1 - MulIdx;
      // Allow the special case where add (add (mul24, 0), mul24) was folded
      // into add (mul24, mul24).
14970 if (I == 2 && isMul(Op: TempNode->getOperand(Num: AddIdx))) {
14971 Src2s.push_back(Elt: TempNode->getOperand(Num: AddIdx));
14972 auto Src0 =
14973 handleMulOperand(MulOperand: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 0));
14974 if (!Src0)
14975 break;
14976 auto Src1 =
14977 handleMulOperand(MulOperand: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 1));
14978 if (!Src1)
14979 break;
14980 auto IterIsSigned = checkDot4MulSignedness(
14981 N: TempNode->getOperand(Num: AddIdx), Src0&: *Src0, Src1&: *Src1,
14982 S0Op: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 0),
14983 S1Op: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 1), DAG);
14984 if (!IterIsSigned)
14985 break;
14986 assert(IsSigned);
14987 if (*IterIsSigned != *IsSigned)
14988 break;
14989 placeSources(Src0&: *Src0, Src1&: *Src1, Src0s, Src1s, Step: I + 1);
14990 Src2s.push_back(Elt: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
14991 ChainLength = I + 2;
14992 break;
14993 }
14994
14995 TempNode = TempNode->getOperand(Num: AddIdx);
14996 Src2s.push_back(Elt: TempNode);
14997 ChainLength = I + 1;
14998 if (TempNode->getNumOperands() < 2)
14999 break;
15000 LHS = TempNode->getOperand(Num: 0);
15001 RHS = TempNode->getOperand(Num: 1);
15002 }
15003
15004 if (ChainLength < 2)
15005 return SDValue();
15006
    // Masks were constructed with the assumption that we would find a chain of
    // length 4. If not, we need to zero out the unused high bytes (via a perm
    // select of 0x0c) so they do not affect the dot calculation.
15010 if (ChainLength < 4) {
15011 fixMasks(Srcs&: Src0s, ChainLength);
15012 fixMasks(Srcs&: Src1s, ChainLength);
15013 }
15014
15015 SDValue Src0, Src1;
15016
15017 // If we are just using a single source for both, and have permuted the
15018 // bytes consistently, we can just use the sources without permuting
15019 // (commutation).
15020 bool UseOriginalSrc = false;
15021 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
15022 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
15023 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
15024 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
15025 SmallVector<unsigned, 4> SrcBytes;
15026 auto Src0Mask = Src0s.begin()->PermMask;
15027 SrcBytes.push_back(Elt: Src0Mask & 0xFF000000);
15028 bool UniqueEntries = true;
15029 for (auto I = 1; I < 4; I++) {
15030 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
15031
15032 if (is_contained(Range&: SrcBytes, Element: NextByte)) {
15033 UniqueEntries = false;
15034 break;
15035 }
15036 SrcBytes.push_back(Elt: NextByte);
15037 }
15038
15039 if (UniqueEntries) {
15040 UseOriginalSrc = true;
15041
15042 auto *FirstElt = Src0s.begin();
15043 auto FirstEltOp =
15044 getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
15045
15046 auto *SecondElt = Src1s.begin();
15047 auto SecondEltOp = getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp,
15048 DWordOffset: SecondElt->DWordOffset);
15049
15050 Src0 = DAG.getBitcastedAnyExtOrTrunc(Op: FirstEltOp, DL: SL,
15051 VT: MVT::getIntegerVT(BitWidth: 32));
15052 Src1 = DAG.getBitcastedAnyExtOrTrunc(Op: SecondEltOp, DL: SL,
15053 VT: MVT::getIntegerVT(BitWidth: 32));
15054 }
15055 }
15056
15057 if (!UseOriginalSrc) {
15058 Src0 = resolveSources(DAG, SL, Srcs&: Src0s, IsSigned: false, IsAny: true);
15059 Src1 = resolveSources(DAG, SL, Srcs&: Src1s, IsSigned: false, IsAny: true);
15060 }
15061
15062 assert(IsSigned);
15063 SDValue Src2 =
15064 DAG.getExtOrTrunc(IsSigned: *IsSigned, Op: Src2s[ChainLength - 1], DL: SL, VT: MVT::i32);
15065
15066 SDValue IID = DAG.getTargetConstant(Val: *IsSigned ? Intrinsic::amdgcn_sdot4
15067 : Intrinsic::amdgcn_udot4,
15068 DL: SL, VT: MVT::i64);
15069
15070 assert(!VT.isVector());
15071 auto Dot = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32, N1: IID, N2: Src0,
15072 N3: Src1, N4: Src2, N5: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i1));
15073
15074 return DAG.getExtOrTrunc(IsSigned: *IsSigned, Op: Dot, DL: SL, VT);
15075 }
15076
15077 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
15078 return SDValue();
15079
15080 // add x, zext (setcc) => uaddo_carry x, 0, setcc
15081 // add x, sext (setcc) => usubo_carry x, 0, setcc
15082 unsigned Opc = LHS.getOpcode();
15083 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
15084 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
15085 std::swap(a&: RHS, b&: LHS);
15086
15087 Opc = RHS.getOpcode();
15088 switch (Opc) {
15089 default:
15090 break;
15091 case ISD::ZERO_EXTEND:
15092 case ISD::SIGN_EXTEND:
15093 case ISD::ANY_EXTEND: {
15094 auto Cond = RHS.getOperand(i: 0);
15095 // If this won't be a real VOPC output, we would still need to insert an
15096 // extra instruction anyway.
15097 if (!isBoolSGPR(V: Cond))
15098 break;
15099 SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1);
15100 SDValue Args[] = {LHS, DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), Cond};
15101 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
15102 return DAG.getNode(Opcode: Opc, DL: SL, VTList, Ops: Args);
15103 }
15104 case ISD::UADDO_CARRY: {
15105 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
15106 if (!isNullConstant(V: RHS.getOperand(i: 1)))
15107 break;
15108 SDValue Args[] = {LHS, RHS.getOperand(i: 0), RHS.getOperand(i: 2)};
15109 return DAG.getNode(Opcode: ISD::UADDO_CARRY, DL: SDLoc(N), VTList: RHS->getVTList(), Ops: Args);
15110 }
15111 }
15112 return SDValue();
15113}
15114
15115SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
15116 DAGCombinerInfo &DCI) const {
15117 SelectionDAG &DAG = DCI.DAG;
15118 SDLoc DL(N);
15119 SDValue N0 = N->getOperand(Num: 0);
15120 SDValue N1 = N->getOperand(Num: 1);
15121
15122 if (N1.getOpcode() == ISD::ADD) {
15123 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
15124 // y is not, and (add y, z) is used only once.
15125 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
15126 // z is not, and (add y, z) is used only once.
15127 // The goal is to move constant offsets to the outermost ptradd, to create
15128 // more opportunities to fold offsets into memory instructions.
15129 // Together with the generic combines in DAGCombiner.cpp, this also
15130 // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
15131 //
15132 // This transform is here instead of in the general DAGCombiner as it can
15133 // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
15134 // AArch64's CPA.
15135 SDValue X = N0;
15136 SDValue Y = N1.getOperand(i: 0);
15137 SDValue Z = N1.getOperand(i: 1);
15138 if (N1.hasOneUse()) {
15139 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(N: Y);
15140 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(N: Z);
15141 if (ZIsConstant != YIsConstant) {
15142 // If both additions in the original were NUW, the new ones are as well.
15143 SDNodeFlags Flags =
15144 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
15145 if (YIsConstant)
15146 std::swap(a&: Y, b&: Z);
15147
15148 SDValue Inner = DAG.getMemBasePlusOffset(Base: X, Offset: Y, DL, Flags);
15149 DCI.AddToWorklist(N: Inner.getNode());
15150 return DAG.getMemBasePlusOffset(Base: Inner, Offset: Z, DL, Flags);
15151 }
15152 }
15153 }
15154
15155 return SDValue();
15156}
15157
15158SDValue SITargetLowering::performSubCombine(SDNode *N,
15159 DAGCombinerInfo &DCI) const {
15160 SelectionDAG &DAG = DCI.DAG;
15161 EVT VT = N->getValueType(ResNo: 0);
15162
15163 if (VT == MVT::i64) {
15164 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15165 return Folded;
15166 }
15167
15168 if (VT != MVT::i32)
15169 return SDValue();
15170
15171 SDLoc SL(N);
15172 SDValue LHS = N->getOperand(Num: 0);
15173 SDValue RHS = N->getOperand(Num: 1);
15174
15175 // sub x, zext (setcc) => usubo_carry x, 0, setcc
15176 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
15177 unsigned Opc = RHS.getOpcode();
15178 switch (Opc) {
15179 default:
15180 break;
15181 case ISD::ZERO_EXTEND:
15182 case ISD::SIGN_EXTEND:
15183 case ISD::ANY_EXTEND: {
15184 auto Cond = RHS.getOperand(i: 0);
15185 // If this won't be a real VOPC output, we would still need to insert an
15186 // extra instruction anyway.
15187 if (!isBoolSGPR(V: Cond))
15188 break;
15189 SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1);
15190 SDValue Args[] = {LHS, DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), Cond};
15191 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
15192 return DAG.getNode(Opcode: Opc, DL: SL, VTList, Ops: Args);
15193 }
15194 }
15195
15196 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
15197 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
15198 if (!isNullConstant(V: LHS.getOperand(i: 1)))
15199 return SDValue();
15200 SDValue Args[] = {LHS.getOperand(i: 0), RHS, LHS.getOperand(i: 2)};
15201 return DAG.getNode(Opcode: ISD::USUBO_CARRY, DL: SDLoc(N), VTList: LHS->getVTList(), Ops: Args);
15202 }
15203 return SDValue();
15204}
15205
15206SDValue
15207SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
15208 DAGCombinerInfo &DCI) const {
15209
15210 if (N->getValueType(ResNo: 0) != MVT::i32)
15211 return SDValue();
15212
15213 if (!isNullConstant(V: N->getOperand(Num: 1)))
15214 return SDValue();
15215
15216 SelectionDAG &DAG = DCI.DAG;
15217 SDValue LHS = N->getOperand(Num: 0);
15218
15219 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
15220 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
15221 unsigned LHSOpc = LHS.getOpcode();
15222 unsigned Opc = N->getOpcode();
15223 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
15224 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
15225 SDValue Args[] = {LHS.getOperand(i: 0), LHS.getOperand(i: 1), N->getOperand(Num: 2)};
15226 return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VTList: N->getVTList(), Ops: Args);
15227 }
15228 return SDValue();
15229}
15230
15231SDValue SITargetLowering::performFAddCombine(SDNode *N,
15232 DAGCombinerInfo &DCI) const {
15233 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
15234 return SDValue();
15235
15236 SelectionDAG &DAG = DCI.DAG;
15237 EVT VT = N->getValueType(ResNo: 0);
15238
15239 SDLoc SL(N);
15240 SDValue LHS = N->getOperand(Num: 0);
15241 SDValue RHS = N->getOperand(Num: 1);
15242
15243 // These should really be instruction patterns, but writing patterns with
15244 // source modifiers is a pain.
15245
15246 // fadd (fadd (a, a), b) -> mad 2.0, a, b
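// Since (fadd a, a) is 2.0 * a, the whole expression is a multiply-add with
// a constant 2.0 operand; getFusedOpcode decides which fused opcode (if any)
// may be used here.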
15247 if (LHS.getOpcode() == ISD::FADD) {
15248 SDValue A = LHS.getOperand(i: 0);
15249 if (A == LHS.getOperand(i: 1)) {
15250 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode());
15251 if (FusedOp != 0) {
15252 const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
15253 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: RHS);
15254 }
15255 }
15256 }
15257
15258 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
15259 if (RHS.getOpcode() == ISD::FADD) {
15260 SDValue A = RHS.getOperand(i: 0);
15261 if (A == RHS.getOperand(i: 1)) {
15262 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode());
15263 if (FusedOp != 0) {
15264 const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
15265 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: LHS);
15266 }
15267 }
15268 }
15269
15270 return SDValue();
15271}
15272
15273SDValue SITargetLowering::performFSubCombine(SDNode *N,
15274 DAGCombinerInfo &DCI) const {
15275 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
15276 return SDValue();
15277
15278 SelectionDAG &DAG = DCI.DAG;
15279 SDLoc SL(N);
15280 EVT VT = N->getValueType(ResNo: 0);
15281 assert(!VT.isVector());
15282
15283 // Try to get the fneg to fold into the source modifier. This undoes generic
15284 // DAG combines and folds them into the mad.
15285 //
15286 // Only do this if we are not trying to support denormals. v_mad_f32 does
15287 // not support denormals ever.
15288 SDValue LHS = N->getOperand(Num: 0);
15289 SDValue RHS = N->getOperand(Num: 1);
15290 if (LHS.getOpcode() == ISD::FADD) {
15291 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
15292 SDValue A = LHS.getOperand(i: 0);
15293 if (A == LHS.getOperand(i: 1)) {
15294 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode());
15295 if (FusedOp != 0) {
15296 const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
15297 SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
15298
15299 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: NegRHS);
15300 }
15301 }
15302 }
15303
15304 if (RHS.getOpcode() == ISD::FADD) {
15305 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
15306
15307 SDValue A = RHS.getOperand(i: 0);
15308 if (A == RHS.getOperand(i: 1)) {
15309 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode());
15310 if (FusedOp != 0) {
15311 const SDValue NegTwo = DAG.getConstantFP(Val: -2.0, DL: SL, VT);
15312 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: NegTwo, N3: LHS);
15313 }
15314 }
15315 }
15316
15317 return SDValue();
15318}
15319
15320SDValue SITargetLowering::performFDivCombine(SDNode *N,
15321 DAGCombinerInfo &DCI) const {
15322 SelectionDAG &DAG = DCI.DAG;
15323 SDLoc SL(N);
15324 EVT VT = N->getValueType(ResNo: 0);
15325 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
15326 return SDValue();
15327
15328 SDValue LHS = N->getOperand(Num: 0);
15329 SDValue RHS = N->getOperand(Num: 1);
15330
15331 SDNodeFlags Flags = N->getFlags();
15332 SDNodeFlags RHSFlags = RHS->getFlags();
15333 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
15334 !RHS->hasOneUse())
15335 return SDValue();
15336
15337 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) {
15338 bool IsNegative = false;
15339 if (CLHS->isExactlyValue(V: 1.0) ||
15340 (IsNegative = CLHS->isExactlyValue(V: -1.0))) {
15341 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
15342 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
15343 if (RHS.getOpcode() == ISD::FSQRT) {
15344 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
15345 SDValue Rsq =
15346 DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SL, VT, Operand: RHS.getOperand(i: 0), Flags);
15347 return IsNegative ? DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Rsq, Flags) : Rsq;
15348 }
15349 }
15350 }
15351
15352 return SDValue();
15353}
15354
15355SDValue SITargetLowering::performFMulCombine(SDNode *N,
15356 DAGCombinerInfo &DCI) const {
15357 SelectionDAG &DAG = DCI.DAG;
15358 EVT VT = N->getValueType(ResNo: 0);
15359 EVT ScalarVT = VT.getScalarType();
15360 EVT IntVT = VT.changeElementType(EltVT: MVT::i32);
15361
15362 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
15363 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
15364 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
15365 return SDValue();
15366 }
15367
15368 SDValue LHS = N->getOperand(Num: 0);
15369 SDValue RHS = N->getOperand(Num: 1);
15370
15371 // It is cheaper to materialize i32 inline constants than to materialize
15372 // f16 or f64 (or even non-inline f32) values; this can be done via ldexp,
15373 // as shown below:
15374 //
15375 // Given: A = 2^a and B = 2^b, where a and b are integers.
15376 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
15377 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
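// For example, fmul x, (select y, 8.0, 0.5) becomes
// ldexp(x, (select i32 y, 3, -1)), since 8.0 = 2^3 and 0.5 = 2^-1.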
15378 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
15379 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
15380 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(N: RHS.getOperand(i: 1));
15381 if (!TrueNode)
15382 return SDValue();
15383 const ConstantFPSDNode *FalseNode =
15384 isConstOrConstSplatFP(N: RHS.getOperand(i: 2));
15385 if (!FalseNode)
15386 return SDValue();
15387
15388 if (TrueNode->isNegative() != FalseNode->isNegative())
15389 return SDValue();
15390
15391 // For f32, only non-inline constants should be transformed.
15392 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15393 if (ScalarVT == MVT::f32 &&
15394 TII->isInlineConstant(Imm: TrueNode->getValueAPF()) &&
15395 TII->isInlineConstant(Imm: FalseNode->getValueAPF()))
15396 return SDValue();
15397
15398 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
15399 if (TrueNodeExpVal == INT_MIN)
15400 return SDValue();
15401 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
15402 if (FalseNodeExpVal == INT_MIN)
15403 return SDValue();
15404
15405 SDLoc SL(N);
15406 SDValue SelectNode =
15407 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: IntVT, N1: RHS.getOperand(i: 0),
15408 N2: DAG.getSignedConstant(Val: TrueNodeExpVal, DL: SL, VT: IntVT),
15409 N3: DAG.getSignedConstant(Val: FalseNodeExpVal, DL: SL, VT: IntVT));
15410
15411 LHS = TrueNode->isNegative()
15412 ? DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS, Flags: LHS->getFlags())
15413 : LHS;
15414
15415 return DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT, N1: LHS, N2: SelectNode, Flags: N->getFlags());
15416 }
15417
15418 return SDValue();
15419}
15420
15421SDValue SITargetLowering::performFMACombine(SDNode *N,
15422 DAGCombinerInfo &DCI) const {
15423 SelectionDAG &DAG = DCI.DAG;
15424 EVT VT = N->getValueType(ResNo: 0);
15425 SDLoc SL(N);
15426
15427 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
15428 return SDValue();
15429
15430 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
15431 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
15432 SDValue Op1 = N->getOperand(Num: 0);
15433 SDValue Op2 = N->getOperand(Num: 1);
15434 SDValue FMA = N->getOperand(Num: 2);
15435
15436 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
15437 Op2.getOpcode() != ISD::FP_EXTEND)
15438 return SDValue();
15439
15440 // fdot2_f32_f16 always flushes fp32 denormal operands and the output to
15441 // zero, regardless of the denorm mode setting. Therefore,
15442 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
15443 const TargetOptions &Options = DAG.getTarget().Options;
15444 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
15445 (N->getFlags().hasAllowContract() &&
15446 FMA->getFlags().hasAllowContract())) {
15447 Op1 = Op1.getOperand(i: 0);
15448 Op2 = Op2.getOperand(i: 0);
15449 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15450 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15451 return SDValue();
15452
15453 SDValue Vec1 = Op1.getOperand(i: 0);
15454 SDValue Idx1 = Op1.getOperand(i: 1);
15455 SDValue Vec2 = Op2.getOperand(i: 0);
15456
15457 SDValue FMAOp1 = FMA.getOperand(i: 0);
15458 SDValue FMAOp2 = FMA.getOperand(i: 1);
15459 SDValue FMAAcc = FMA.getOperand(i: 2);
15460
15461 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
15462 FMAOp2.getOpcode() != ISD::FP_EXTEND)
15463 return SDValue();
15464
15465 FMAOp1 = FMAOp1.getOperand(i: 0);
15466 FMAOp2 = FMAOp2.getOperand(i: 0);
15467 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15468 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15469 return SDValue();
15470
15471 SDValue Vec3 = FMAOp1.getOperand(i: 0);
15472 SDValue Vec4 = FMAOp2.getOperand(i: 0);
15473 SDValue Idx2 = FMAOp1.getOperand(i: 1);
15474
15475 if (Idx1 != Op2.getOperand(i: 1) || Idx2 != FMAOp2.getOperand(i: 1) ||
15476 // Idx1 and Idx2 cannot be the same.
15477 Idx1 == Idx2)
15478 return SDValue();
15479
15480 if (Vec1 == Vec2 || Vec3 == Vec4)
15481 return SDValue();
15482
15483 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
15484 return SDValue();
15485
15486 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
15487 return DAG.getNode(Opcode: AMDGPUISD::FDOT2, DL: SL, VT: MVT::f32, N1: Vec1, N2: Vec2, N3: FMAAcc,
15488 N4: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i1));
15489 }
15490 }
15491 return SDValue();
15492}
15493
15494SDValue SITargetLowering::performSetCCCombine(SDNode *N,
15495 DAGCombinerInfo &DCI) const {
15496 SelectionDAG &DAG = DCI.DAG;
15497 SDLoc SL(N);
15498
15499 SDValue LHS = N->getOperand(Num: 0);
15500 SDValue RHS = N->getOperand(Num: 1);
15501 EVT VT = LHS.getValueType();
15502 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
15503
15504 auto *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
15505 if (!CRHS) {
15506 CRHS = dyn_cast<ConstantSDNode>(Val&: LHS);
15507 if (CRHS) {
15508 std::swap(a&: LHS, b&: RHS);
15509 CC = getSetCCSwappedOperands(Operation: CC);
15510 }
15511 }
15512
15513 if (CRHS) {
15514 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
15515 isBoolSGPR(V: LHS.getOperand(i: 0))) {
15516 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
15517 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
15518 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
15519 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
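// Since (sext from i1 cc) is 0 or -1, comparing it against -1 or 0 reduces
// to cc itself or to its negation, depending on the condition code.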
15520 if ((CRHS->isAllOnes() &&
15521 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
15522 (CRHS->isZero() &&
15523 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
15524 return DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: 0),
15525 N2: DAG.getAllOnesConstant(DL: SL, VT: MVT::i1));
15526 if ((CRHS->isAllOnes() &&
15527 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
15528 (CRHS->isZero() &&
15529 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
15530 return LHS.getOperand(i: 0);
15531 }
15532
15533 const APInt &CRHSVal = CRHS->getAPIntValue();
15534 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
15535 LHS.getOpcode() == ISD::SELECT &&
15536 isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) &&
15537 isa<ConstantSDNode>(Val: LHS.getOperand(i: 2)) &&
15538 LHS.getConstantOperandVal(i: 1) != LHS.getConstantOperandVal(i: 2) &&
15539 isBoolSGPR(V: LHS.getOperand(i: 0))) {
15540 // Given CT != FT:
15541 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
15542 // setcc (select cc, CT, CF), CF, ne => cc
15543 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
15544 // setcc (select cc, CT, CF), CT, eq => cc
15545 const APInt &CT = LHS.getConstantOperandAPInt(i: 1);
15546 const APInt &CF = LHS.getConstantOperandAPInt(i: 2);
15547
15548 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
15549 (CT == CRHSVal && CC == ISD::SETNE))
15550 return DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: 0),
15551 N2: DAG.getAllOnesConstant(DL: SL, VT: MVT::i1));
15552 if ((CF == CRHSVal && CC == ISD::SETNE) ||
15553 (CT == CRHSVal && CC == ISD::SETEQ))
15554 return LHS.getOperand(i: 0);
15555 }
15556 }
15557
15558 if (VT != MVT::f32 && VT != MVT::f64 &&
15559 (!Subtarget->has16BitInsts() || VT != MVT::f16))
15560 return SDValue();
15561
15562 // Match isinf/isfinite pattern
15563 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
15564 // (fcmp one (fabs x), inf) -> (fp_class x,
15565 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
15566 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
15567 LHS.getOpcode() == ISD::FABS) {
15568 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
15569 if (!CRHS)
15570 return SDValue();
15571
15572 const APFloat &APF = CRHS->getValueAPF();
15573 if (APF.isInfinity() && !APF.isNegative()) {
15574 const unsigned IsInfMask =
15575 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
15576 const unsigned IsFiniteMask =
15577 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
15578 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
15579 SIInstrFlags::P_SUBNORMAL;
15580 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
15581 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: 0),
15582 N2: DAG.getConstant(Val: Mask, DL: SL, VT: MVT::i32));
15583 }
15584 }
15585
15586 return SDValue();
15587}
15588
15589SDValue
15590SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
15591 DAGCombinerInfo &DCI) const {
15592 SelectionDAG &DAG = DCI.DAG;
15593 SDLoc SL(N);
15594 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
15595
15596 SDValue Src = N->getOperand(Num: 0);
15597 SDValue Shift = N->getOperand(Num: 0);
15598
15599 // TODO: Extend type shouldn't matter (assuming legal types).
15600 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
15601 Shift = Shift.getOperand(i: 0);
15602
15603 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
15604 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
15605 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
15606 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
15607 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
15608 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
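// In general, the selected byte becomes Offset - (shift amount / 8) for shl
// and Offset + (shift amount / 8) for srl; e.g. for cvt_f32_ubyte0 (srl x, 8),
// ShiftOffset = 8 * 0 + 8 = 8, so we emit cvt_f32_ubyte1 of the unshifted
// value.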
15609 if (auto *C = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1))) {
15610 SDValue Shifted = DAG.getZExtOrTrunc(
15611 Op: Shift.getOperand(i: 0), DL: SDLoc(Shift.getOperand(i: 0)), VT: MVT::i32);
15612
15613 unsigned ShiftOffset = 8 * Offset;
15614 if (Shift.getOpcode() == ISD::SHL)
15615 ShiftOffset -= C->getZExtValue();
15616 else
15617 ShiftOffset += C->getZExtValue();
15618
15619 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
15620 return DAG.getNode(Opcode: AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, DL: SL,
15621 VT: MVT::f32, Operand: Shifted);
15622 }
15623 }
15624 }
15625
15626 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15627 APInt DemandedBits = APInt::getBitsSet(numBits: 32, loBit: 8 * Offset, hiBit: 8 * Offset + 8);
15628 if (TLI.SimplifyDemandedBits(Op: Src, DemandedBits, DCI)) {
15629 // We simplified Src. If this node is not dead, visit it again so it is
15630 // folded properly.
15631 if (N->getOpcode() != ISD::DELETED_NODE)
15632 DCI.AddToWorklist(N);
15633 return SDValue(N, 0);
15634 }
15635
15636 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
15637 if (SDValue DemandedSrc =
15638 TLI.SimplifyMultipleUseDemandedBits(Op: Src, DemandedBits, DAG))
15639 return DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: MVT::f32, Operand: DemandedSrc);
15640
15641 return SDValue();
15642}
15643
15644SDValue SITargetLowering::performClampCombine(SDNode *N,
15645 DAGCombinerInfo &DCI) const {
15646 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: 0));
15647 if (!CSrc)
15648 return SDValue();
15649
15650 const MachineFunction &MF = DCI.DAG.getMachineFunction();
15651 const APFloat &F = CSrc->getValueAPF();
15652 APFloat Zero = APFloat::getZero(Sem: F.getSemantics());
15653 if (F < Zero ||
15654 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
15655 return DCI.DAG.getConstantFP(Val: Zero, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
15656 }
15657
15658 APFloat One(F.getSemantics(), "1.0");
15659 if (F > One)
15660 return DCI.DAG.getConstantFP(Val: One, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
15661
15662 return SDValue(CSrc, 0);
15663}
15664
15665SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
15666 DAGCombinerInfo &DCI) const {
15667 switch (N->getOpcode()) {
15668 case ISD::ADD:
15669 case ISD::SUB:
15670 case ISD::SHL:
15671 case ISD::SRL:
15672 case ISD::SRA:
15673 case ISD::AND:
15674 case ISD::OR:
15675 case ISD::XOR:
15676 case ISD::MUL:
15677 case ISD::SETCC:
15678 case ISD::SELECT:
15679 case ISD::SMIN:
15680 case ISD::SMAX:
15681 case ISD::UMIN:
15682 case ISD::UMAX:
15683 if (auto Res = promoteUniformOpToI32(Op: SDValue(N, 0), DCI))
15684 return Res;
15685 break;
15686 default:
15687 break;
15688 }
15689
15690 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
15691 return SDValue();
15692
15693 switch (N->getOpcode()) {
15694 case ISD::ADD:
15695 return performAddCombine(N, DCI);
15696 case ISD::PTRADD:
15697 return performPtrAddCombine(N, DCI);
15698 case ISD::SUB:
15699 return performSubCombine(N, DCI);
15700 case ISD::UADDO_CARRY:
15701 case ISD::USUBO_CARRY:
15702 return performAddCarrySubCarryCombine(N, DCI);
15703 case ISD::FADD:
15704 return performFAddCombine(N, DCI);
15705 case ISD::FSUB:
15706 return performFSubCombine(N, DCI);
15707 case ISD::FDIV:
15708 return performFDivCombine(N, DCI);
15709 case ISD::FMUL:
15710 return performFMulCombine(N, DCI);
15711 case ISD::SETCC:
15712 return performSetCCCombine(N, DCI);
15713 case ISD::FMAXNUM:
15714 case ISD::FMINNUM:
15715 case ISD::FMAXNUM_IEEE:
15716 case ISD::FMINNUM_IEEE:
15717 case ISD::FMAXIMUM:
15718 case ISD::FMINIMUM:
15719 case ISD::FMAXIMUMNUM:
15720 case ISD::FMINIMUMNUM:
15721 case ISD::SMAX:
15722 case ISD::SMIN:
15723 case ISD::UMAX:
15724 case ISD::UMIN:
15725 case AMDGPUISD::FMIN_LEGACY:
15726 case AMDGPUISD::FMAX_LEGACY:
15727 return performMinMaxCombine(N, DCI);
15728 case ISD::FMA:
15729 return performFMACombine(N, DCI);
15730 case ISD::AND:
15731 return performAndCombine(N, DCI);
15732 case ISD::OR:
15733 return performOrCombine(N, DCI);
15734 case ISD::FSHR: {
15735 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15736 if (N->getValueType(ResNo: 0) == MVT::i32 && N->isDivergent() &&
15737 TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {
15738 return matchPERM(N, DCI);
15739 }
15740 break;
15741 }
15742 case ISD::XOR:
15743 return performXorCombine(N, DCI);
15744 case ISD::ZERO_EXTEND:
15745 return performZeroExtendCombine(N, DCI);
15746 case ISD::SIGN_EXTEND_INREG:
15747 return performSignExtendInRegCombine(N, DCI);
15748 case AMDGPUISD::FP_CLASS:
15749 return performClassCombine(N, DCI);
15750 case ISD::FCANONICALIZE:
15751 return performFCanonicalizeCombine(N, DCI);
15752 case AMDGPUISD::RCP:
15753 return performRcpCombine(N, DCI);
15754 case ISD::FLDEXP:
15755 case AMDGPUISD::FRACT:
15756 case AMDGPUISD::RSQ:
15757 case AMDGPUISD::RCP_LEGACY:
15758 case AMDGPUISD::RCP_IFLAG:
15759 case AMDGPUISD::RSQ_CLAMP: {
15760 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
15761 SDValue Src = N->getOperand(Num: 0);
15762 if (Src.isUndef())
15763 return Src;
15764 break;
15765 }
15766 case ISD::SINT_TO_FP:
15767 case ISD::UINT_TO_FP:
15768 return performUCharToFloatCombine(N, DCI);
15769 case ISD::FCOPYSIGN:
15770 return performFCopySignCombine(N, DCI);
15771 case AMDGPUISD::CVT_F32_UBYTE0:
15772 case AMDGPUISD::CVT_F32_UBYTE1:
15773 case AMDGPUISD::CVT_F32_UBYTE2:
15774 case AMDGPUISD::CVT_F32_UBYTE3:
15775 return performCvtF32UByteNCombine(N, DCI);
15776 case AMDGPUISD::FMED3:
15777 return performFMed3Combine(N, DCI);
15778 case AMDGPUISD::CVT_PKRTZ_F16_F32:
15779 return performCvtPkRTZCombine(N, DCI);
15780 case AMDGPUISD::CLAMP:
15781 return performClampCombine(N, DCI);
15782 case ISD::SCALAR_TO_VECTOR: {
15783 SelectionDAG &DAG = DCI.DAG;
15784 EVT VT = N->getValueType(ResNo: 0);
15785
15786 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
15787 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
15788 SDLoc SL(N);
15789 SDValue Src = N->getOperand(Num: 0);
15790 EVT EltVT = Src.getValueType();
15791 if (EltVT != MVT::i16)
15792 Src = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Src);
15793
15794 SDValue Ext = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Src);
15795 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Ext);
15796 }
15797
15798 break;
15799 }
15800 case ISD::EXTRACT_VECTOR_ELT:
15801 return performExtractVectorEltCombine(N, DCI);
15802 case ISD::INSERT_VECTOR_ELT:
15803 return performInsertVectorEltCombine(N, DCI);
15804 case ISD::FP_ROUND:
15805 return performFPRoundCombine(N, DCI);
15806 case ISD::LOAD: {
15807 if (SDValue Widened = widenLoad(Ld: cast<LoadSDNode>(Val: N), DCI))
15808 return Widened;
15809 [[fallthrough]];
15810 }
15811 default: {
15812 if (!DCI.isBeforeLegalize()) {
15813 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(Val: N))
15814 return performMemSDNodeCombine(N: MemNode, DCI);
15815 }
15816
15817 break;
15818 }
15819 }
15820
15821 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
15822}
15823
15824/// Helper function for adjustWritemask
15825static unsigned SubIdx2Lane(unsigned Idx) {
15826 switch (Idx) {
15827 default:
15828 return ~0u;
15829 case AMDGPU::sub0:
15830 return 0;
15831 case AMDGPU::sub1:
15832 return 1;
15833 case AMDGPU::sub2:
15834 return 2;
15835 case AMDGPU::sub3:
15836 return 3;
15837 case AMDGPU::sub4:
15838 return 4; // Possible with TFE/LWE
15839 }
15840}
15841
15842/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
15843SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
15844 SelectionDAG &DAG) const {
15845 unsigned Opcode = Node->getMachineOpcode();
15846
15847 // Subtract 1 because the vdata output is not a MachineSDNode operand.
15848 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::d16) - 1;
15849 if (D16Idx >= 0 && Node->getConstantOperandVal(Num: D16Idx))
15850 return Node; // not implemented for D16
15851
15852 SDNode *Users[5] = {nullptr};
15853 unsigned Lane = 0;
15854 unsigned DmaskIdx =
15855 AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::dmask) - 1;
15856 unsigned OldDmask = Node->getConstantOperandVal(Num: DmaskIdx);
15857 unsigned NewDmask = 0;
15858 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::tfe) - 1;
15859 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::lwe) - 1;
15860 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(Num: TFEIdx)) ||
15861 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(Num: LWEIdx));
15862 unsigned TFCLane = 0;
15863 bool HasChain = Node->getNumValues() > 1;
15864
15865 if (OldDmask == 0) {
15866 // These are folded out, but in case it happens, don't assert.
15867 return Node;
15868 }
15869
15870 unsigned OldBitsSet = llvm::popcount(Value: OldDmask);
15871 // Work out which is the TFE/LWE lane if that is enabled.
15872 if (UsesTFC) {
15873 TFCLane = OldBitsSet;
15874 }
15875
15876 // Try to figure out the used register components
15877 for (SDUse &Use : Node->uses()) {
15878
15879 // Don't look at users of the chain.
15880 if (Use.getResNo() != 0)
15881 continue;
15882
15883 SDNode *User = Use.getUser();
15884
15885 // Abort if we can't understand the usage
15886 if (!User->isMachineOpcode() ||
15887 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
15888 return Node;
15889
15890 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
15891 // Note that subregs are packed, i.e. Lane==0 is the first bit set
15892 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
15893 // set, etc.
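// For example, with OldDmask = 0b1010 (components Y and W enabled), Lane 0
// (sub0) corresponds to component Y and Lane 1 (sub1) to component W.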
15894 Lane = SubIdx2Lane(Idx: User->getConstantOperandVal(Num: 1));
15895 if (Lane == ~0u)
15896 return Node;
15897
15898 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
15899 if (UsesTFC && Lane == TFCLane) {
15900 Users[Lane] = User;
15901 } else {
15902 // Set which texture component corresponds to the lane.
15903 unsigned Comp;
15904 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
15905 Comp = llvm::countr_zero(Val: Dmask);
15906 Dmask &= ~(1 << Comp);
15907 }
15908
15909 // Abort if we have more than one user per component.
15910 if (Users[Lane])
15911 return Node;
15912
15913 Users[Lane] = User;
15914 NewDmask |= 1 << Comp;
15915 }
15916 }
15917
15918 // Don't allow 0 dmask, as hardware assumes one channel enabled.
15919 bool NoChannels = !NewDmask;
15920 if (NoChannels) {
15921 if (!UsesTFC) {
15922 // No uses of the result and not using TFC. Then do nothing.
15923 return Node;
15924 }
15925 // If the original dmask has only one channel, there is nothing to do.
15926 if (OldBitsSet == 1)
15927 return Node;
15928 // Use an arbitrary dmask - required for the instruction to work
15929 NewDmask = 1;
15930 }
15931 // Abort if there's no change
15932 if (NewDmask == OldDmask)
15933 return Node;
15934
15935 unsigned BitsSet = llvm::popcount(Value: NewDmask);
15936
15937 // Check for TFE or LWE - increase the number of channels by one to account
15938 // for the extra return value
15939 // This will need adjustment for D16 if D16 is also handled in
15940 // adjustWritemask (this function), but at present D16 is excluded.
15941 unsigned NewChannels = BitsSet + UsesTFC;
15942
15943 int NewOpcode =
15944 AMDGPU::getMaskedMIMGOp(Opc: Node->getMachineOpcode(), NewChannels);
15945 assert(NewOpcode != -1 &&
15946 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
15947 "failed to find equivalent MIMG op");
15948
15949 // Adjust the writemask in the node
15950 SmallVector<SDValue, 12> Ops;
15951 llvm::append_range(C&: Ops, R: Node->ops().take_front(N: DmaskIdx));
15952 Ops.push_back(Elt: DAG.getTargetConstant(Val: NewDmask, DL: SDLoc(Node), VT: MVT::i32));
15953 llvm::append_range(C&: Ops, R: Node->ops().drop_front(N: DmaskIdx + 1));
15954
15955 MVT SVT = Node->getValueType(ResNo: 0).getVectorElementType().getSimpleVT();
15956
15957 MVT ResultVT = NewChannels == 1
15958 ? SVT
15959 : MVT::getVectorVT(VT: SVT, NumElements: NewChannels == 3 ? 4
15960 : NewChannels == 5 ? 8
15961 : NewChannels);
15962 SDVTList NewVTList =
15963 HasChain ? DAG.getVTList(VT1: ResultVT, VT2: MVT::Other) : DAG.getVTList(VT: ResultVT);
15964
15965 MachineSDNode *NewNode =
15966 DAG.getMachineNode(Opcode: NewOpcode, dl: SDLoc(Node), VTs: NewVTList, Ops);
15967
15968 if (HasChain) {
15969 // Update chain.
15970 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: Node->memoperands());
15971 DAG.ReplaceAllUsesOfValueWith(From: SDValue(Node, 1), To: SDValue(NewNode, 1));
15972 }
15973
15974 if (NewChannels == 1) {
15975 assert(Node->hasNUsesOfValue(1, 0));
15976 SDNode *Copy =
15977 DAG.getMachineNode(Opcode: TargetOpcode::COPY, dl: SDLoc(Node),
15978 VT: Users[Lane]->getValueType(ResNo: 0), Op1: SDValue(NewNode, 0));
15979 DAG.ReplaceAllUsesWith(From: Users[Lane], To: Copy);
15980 return nullptr;
15981 }
15982
15983 // Update the users of the node with the new indices
15984 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
15985 SDNode *User = Users[i];
15986 if (!User) {
15987 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
15988 // Users[0] is still nullptr because channel 0 doesn't really have a use.
15989 if (i || !NoChannels)
15990 continue;
15991 } else {
15992 SDValue Op = DAG.getTargetConstant(Val: Idx, DL: SDLoc(User), VT: MVT::i32);
15993 SDNode *NewUser = DAG.UpdateNodeOperands(N: User, Op1: SDValue(NewNode, 0), Op2: Op);
15994 if (NewUser != User) {
15995 DAG.ReplaceAllUsesWith(From: SDValue(User, 0), To: SDValue(NewUser, 0));
15996 DAG.RemoveDeadNode(N: User);
15997 }
15998 }
15999
16000 switch (Idx) {
16001 default:
16002 break;
16003 case AMDGPU::sub0:
16004 Idx = AMDGPU::sub1;
16005 break;
16006 case AMDGPU::sub1:
16007 Idx = AMDGPU::sub2;
16008 break;
16009 case AMDGPU::sub2:
16010 Idx = AMDGPU::sub3;
16011 break;
16012 case AMDGPU::sub3:
16013 Idx = AMDGPU::sub4;
16014 break;
16015 }
16016 }
16017
16018 DAG.RemoveDeadNode(N: Node);
16019 return nullptr;
16020}
16021
16022static bool isFrameIndexOp(SDValue Op) {
16023 if (Op.getOpcode() == ISD::AssertZext)
16024 Op = Op.getOperand(i: 0);
16025
16026 return isa<FrameIndexSDNode>(Val: Op);
16027}
16028
16029/// Legalize target independent instructions (e.g. INSERT_SUBREG)
16030/// with frame index operands.
16031 /// LLVM assumes that inputs to these instructions are registers.
16032SDNode *
16033SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
16034 SelectionDAG &DAG) const {
16035 if (Node->getOpcode() == ISD::CopyToReg) {
16036 RegisterSDNode *DestReg = cast<RegisterSDNode>(Val: Node->getOperand(Num: 1));
16037 SDValue SrcVal = Node->getOperand(Num: 2);
16038
16039 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
16040 // to try understanding copies to physical registers.
16041 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
16042 SDLoc SL(Node);
16043 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
16044 SDValue VReg = DAG.getRegister(
16045 Reg: MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_1RegClass), VT: MVT::i1);
16046
16047 SDNode *Glued = Node->getGluedNode();
16048 SDValue ToVReg = DAG.getCopyToReg(
16049 Chain: Node->getOperand(Num: 0), dl: SL, Reg: VReg, N: SrcVal,
16050 Glue: SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
16051 SDValue ToResultReg = DAG.getCopyToReg(Chain: ToVReg, dl: SL, Reg: SDValue(DestReg, 0),
16052 N: VReg, Glue: ToVReg.getValue(R: 1));
16053 DAG.ReplaceAllUsesWith(From: Node, To: ToResultReg.getNode());
16054 DAG.RemoveDeadNode(N: Node);
16055 return ToResultReg.getNode();
16056 }
16057 }
16058
16059 SmallVector<SDValue, 8> Ops;
16060 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
16061 if (!isFrameIndexOp(Op: Node->getOperand(Num: i))) {
16062 Ops.push_back(Elt: Node->getOperand(Num: i));
16063 continue;
16064 }
16065
16066 SDLoc DL(Node);
16067 Ops.push_back(Elt: SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL,
16068 VT: Node->getOperand(Num: i).getValueType(),
16069 Op1: Node->getOperand(Num: i)),
16070 0));
16071 }
16072
16073 return DAG.UpdateNodeOperands(N: Node, Ops);
16074}
16075
16076/// Fold the instructions after selecting them.
16077/// Returns null if users were already updated.
16078SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
16079 SelectionDAG &DAG) const {
16080 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16081 unsigned Opcode = Node->getMachineOpcode();
16082
16083 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
16084 !TII->isGather4(Opcode) &&
16085 AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::dmask)) {
16086 return adjustWritemask(Node, DAG);
16087 }
16088
16089 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
16090 legalizeTargetIndependentNode(Node, DAG);
16091 return Node;
16092 }
16093
16094 switch (Opcode) {
16095 case AMDGPU::V_DIV_SCALE_F32_e64:
16096 case AMDGPU::V_DIV_SCALE_F64_e64: {
16097 // Satisfy the operand register constraint when one of the inputs is
16098 // undefined. Ordinarily each undef value will have its own implicit_def of
16099 // a vreg, so force these to use a single register.
16100 SDValue Src0 = Node->getOperand(Num: 1);
16101 SDValue Src1 = Node->getOperand(Num: 3);
16102 SDValue Src2 = Node->getOperand(Num: 5);
16103
16104 if ((Src0.isMachineOpcode() &&
16105 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
16106 (Src0 == Src1 || Src0 == Src2))
16107 break;
16108
16109 MVT VT = Src0.getValueType().getSimpleVT();
16110 const TargetRegisterClass *RC =
16111 getRegClassFor(VT, isDivergent: Src0.getNode()->isDivergent());
16112
16113 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
16114 SDValue UndefReg = DAG.getRegister(Reg: MRI.createVirtualRegister(RegClass: RC), VT);
16115
16116 SDValue ImpDef = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: SDLoc(Node), Reg: UndefReg,
16117 N: Src0, Glue: SDValue());
16118
16119 // src0 must be the same register as src1 or src2, even if the value is
16120 // undefined, so make sure we don't violate this constraint.
16121 if (Src0.isMachineOpcode() &&
16122 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
16123 if (Src1.isMachineOpcode() &&
16124 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
16125 Src0 = Src1;
16126 else if (Src2.isMachineOpcode() &&
16127 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
16128 Src0 = Src2;
16129 else {
16130 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
16131 Src0 = UndefReg;
16132 Src1 = UndefReg;
16133 }
16134 } else
16135 break;
16136
16137 SmallVector<SDValue, 9> Ops(Node->ops());
16138 Ops[1] = Src0;
16139 Ops[3] = Src1;
16140 Ops[5] = Src2;
16141 Ops.push_back(Elt: ImpDef.getValue(R: 1));
16142 return DAG.getMachineNode(Opcode, dl: SDLoc(Node), VTs: Node->getVTList(), Ops);
16143 }
16144 default:
16145 break;
16146 }
16147
16148 return Node;
16149}
16150
16151// Any MIMG instructions that use tfe or lwe require an initialization of the
16152// result register that will be written in the case of a memory access failure.
16153 // The required code is also added to tie this initialization code to the
16154 // result of the image instruction.
16155void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
16156 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16157 const SIRegisterInfo &TRI = TII->getRegisterInfo();
16158 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
16159 MachineBasicBlock &MBB = *MI.getParent();
16160
16161 int DstIdx =
16162 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::vdata);
16163 unsigned InitIdx = 0;
16164
16165 if (TII->isImage(MI)) {
16166 MachineOperand *TFE = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe);
16167 MachineOperand *LWE = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::lwe);
16168 MachineOperand *D16 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::d16);
16169
16170 if (!TFE && !LWE) // intersect_ray
16171 return;
16172
16173 unsigned TFEVal = TFE ? TFE->getImm() : 0;
16174 unsigned LWEVal = LWE ? LWE->getImm() : 0;
16175 unsigned D16Val = D16 ? D16->getImm() : 0;
16176
16177 if (!TFEVal && !LWEVal)
16178 return;
16179
16180 // At least one of TFE or LWE is non-zero.
16181 // We have to insert a suitable initialization of the result value and
16182 // tie this to the dest of the image instruction.
16183
16184 // Calculate which dword we have to initialize to 0.
16185 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dmask);
16186
16187 // Check that the dmask operand is found.
16188 assert(MO_Dmask && "Expected dmask operand in instruction");
16189
16190 unsigned dmask = MO_Dmask->getImm();
16191 // Determine the number of active lanes taking into account the
16192 // Gather4 special case
16193 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(Value: dmask);
16194
16195 bool Packed = !Subtarget->hasUnpackedD16VMem();
16196
16197 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
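// For example, dmask = 0xf with packed D16 needs ceil(4 / 2) = 2 data dwords
// plus one more for the TFE/LWE status, so InitIdx = 3; without D16 it would
// be 4 + 1 = 5.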
16198
16199 // Abandon the attempt if the dst size isn't large enough. This is in fact
16200 // an error, but it is picked up elsewhere and reported
16201 // correctly.
16202 uint32_t DstSize =
16203 TRI.getRegSizeInBits(RC: *TII->getOpRegClass(MI, OpNo: DstIdx)) / 32;
16204 if (DstSize < InitIdx)
16205 return;
16206 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(Opc: MI.getOpcode())) {
16207 InitIdx = TRI.getRegSizeInBits(RC: *TII->getOpRegClass(MI, OpNo: DstIdx)) / 32;
16208 } else {
16209 return;
16210 }
16211
16212 const DebugLoc &DL = MI.getDebugLoc();
16213
16214 // Create a register for the initialization value.
16215 Register PrevDst = MRI.cloneVirtualRegister(VReg: MI.getOperand(i: DstIdx).getReg());
16216 unsigned NewDst = 0; // Final initialized value will be in here
16217
16218 // If PRTStrictNull feature is enabled (the default) then initialize
16219 // all the result registers to 0, otherwise just the error indication
16220 // register (VGPRn+1)
16221 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
16222 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
16223
16224 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: PrevDst);
16225 for (; SizeLeft; SizeLeft--, CurrIdx++) {
16226 NewDst = MRI.createVirtualRegister(RegClass: TII->getOpRegClass(MI, OpNo: DstIdx));
16227 // Initialize dword
16228 Register SubReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
16229 // clang-format off
16230 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: SubReg)
16231 .addImm(Val: 0);
16232 // clang-format on
16233 // Insert into the super-reg
16234 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: NewDst)
16235 .addReg(RegNo: PrevDst)
16236 .addReg(RegNo: SubReg)
16237 .addImm(Val: SIRegisterInfo::getSubRegFromChannel(Channel: CurrIdx));
16238
16239 PrevDst = NewDst;
16240 }
16241
16242 // Add as an implicit operand
16243 MI.addOperand(Op: MachineOperand::CreateReg(Reg: NewDst, isDef: false, isImp: true));
16244
16245 // Tie the just added implicit operand to the dst
16246 MI.tieOperands(DefIdx: DstIdx, UseIdx: MI.getNumOperands() - 1);
16247}
16248
16249/// Assign the register class depending on the number of
16250/// bits set in the writemask
16251void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
16252 SDNode *Node) const {
16253 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16254
16255 MachineFunction *MF = MI.getParent()->getParent();
16256 MachineRegisterInfo &MRI = MF->getRegInfo();
16257 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
16258
16259 if (TII->isVOP3(Opcode: MI.getOpcode())) {
16260 // Make sure constant bus requirements are respected.
16261 TII->legalizeOperandsVOP3(MRI, MI);
16262
16263 // Prefer VGPRs over AGPRs in mAI instructions where possible.
16264 // This saves a chain-copy of registers and better balances register
16265 // use between VGPRs and AGPRs, as AGPR tuples tend to be big.
16266 if (!MI.getDesc().operands().empty()) {
16267 unsigned Opc = MI.getOpcode();
16268 bool HasAGPRs = Info->mayNeedAGPRs();
16269 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16270 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2);
16271 for (auto I :
16272 {AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0),
16273 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src1), Src2Idx}) {
16274 if (I == -1)
16275 break;
16276 if ((I == Src2Idx) && (HasAGPRs))
16277 break;
16278 MachineOperand &Op = MI.getOperand(i: I);
16279 if (!Op.isReg() || !Op.getReg().isVirtual())
16280 continue;
16281 auto *RC = TRI->getRegClassForReg(MRI, Reg: Op.getReg());
16282 if (!TRI->hasAGPRs(RC))
16283 continue;
16284 auto *Src = MRI.getUniqueVRegDef(Reg: Op.getReg());
16285 if (!Src || !Src->isCopy() ||
16286 !TRI->isSGPRReg(MRI, Reg: Src->getOperand(i: 1).getReg()))
16287 continue;
16288 auto *NewRC = TRI->getEquivalentVGPRClass(SRC: RC);
16289 // All uses of agpr64 and agpr32 can also accept vgpr except for
16290 // v_accvgpr_read, but we do not produce agpr reads during selection,
16291 // so no use checks are needed.
16292 MRI.setRegClass(Reg: Op.getReg(), RC: NewRC);
16293 }
16294
16295 if (TII->isMAI(MI)) {
16296 // The ordinary src0, src1, src2 were legalized above.
16297 //
16298 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
16299 // as a separate instruction.
16300 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
16301 Name: AMDGPU::OpName::scale_src0);
16302 if (Src0Idx != -1) {
16303 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
16304 Name: AMDGPU::OpName::scale_src1);
16305 if (TII->usesConstantBus(MRI, MI, OpIdx: Src0Idx) &&
16306 TII->usesConstantBus(MRI, MI, OpIdx: Src1Idx))
16307 TII->legalizeOpWithMove(MI, OpIdx: Src1Idx);
16308 }
16309 }
16310
16311 if (!HasAGPRs)
16312 return;
16313
16314 // Resolve the rest of AV operands to AGPRs.
16315 if (auto *Src2 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src2)) {
16316 if (Src2->isReg() && Src2->getReg().isVirtual()) {
16317 auto *RC = TRI->getRegClassForReg(MRI, Reg: Src2->getReg());
16318 if (TRI->isVectorSuperClass(RC)) {
16319 auto *NewRC = TRI->getEquivalentAGPRClass(SRC: RC);
16320 MRI.setRegClass(Reg: Src2->getReg(), RC: NewRC);
16321 if (Src2->isTied())
16322 MRI.setRegClass(Reg: MI.getOperand(i: 0).getReg(), RC: NewRC);
16323 }
16324 }
16325 }
16326 }
16327
16328 return;
16329 }
16330
16331 if (TII->isImage(MI))
16332 TII->enforceOperandRCAlignment(MI, OpName: AMDGPU::OpName::vaddr);
16333}
16334
16335static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
16336 uint64_t Val) {
16337 SDValue K = DAG.getTargetConstant(Val, DL, VT: MVT::i32);
16338 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: K), 0);
16339}
16340
16341MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
16342 const SDLoc &DL,
16343 SDValue Ptr) const {
16344 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16345
16346 // Build the half of the subregister with the constants before building the
16347 // full 128-bit register. If we are building multiple resource descriptors,
16348 // this will allow CSEing of the 2-component register.
16349 const SDValue Ops0[] = {
16350 DAG.getTargetConstant(Val: AMDGPU::SGPR_64RegClassID, DL, VT: MVT::i32),
16351 buildSMovImm32(DAG, DL, Val: 0),
16352 DAG.getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
16353 buildSMovImm32(DAG, DL, Val: TII->getDefaultRsrcDataFormat() >> 32),
16354 DAG.getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32)};
16355
16356 SDValue SubRegHi = SDValue(
16357 DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v2i32, Ops: Ops0), 0);
16358
16359 // Combine the constants and the pointer.
16360 const SDValue Ops1[] = {
16361 DAG.getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32), Ptr,
16362 DAG.getTargetConstant(Val: AMDGPU::sub0_sub1, DL, VT: MVT::i32), SubRegHi,
16363 DAG.getTargetConstant(Val: AMDGPU::sub2_sub3, DL, VT: MVT::i32)};
16364
16365 return DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v4i32, Ops: Ops1);
16366}
16367
16368/// Return a resource descriptor with the 'Add TID' bit enabled
16369/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
16370/// of the resource descriptor) to create an offset, which is added to
16371/// the resource pointer.
16372MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
16373 SDValue Ptr, uint32_t RsrcDword1,
16374 uint64_t RsrcDword2And3) const {
16375 SDValue PtrLo = DAG.getTargetExtractSubreg(SRIdx: AMDGPU::sub0, DL, VT: MVT::i32, Operand: Ptr);
16376 SDValue PtrHi = DAG.getTargetExtractSubreg(SRIdx: AMDGPU::sub1, DL, VT: MVT::i32, Operand: Ptr);
16377 if (RsrcDword1) {
16378 PtrHi =
16379 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_OR_B32, dl: DL, VT: MVT::i32, Op1: PtrHi,
16380 Op2: DAG.getConstant(Val: RsrcDword1, DL, VT: MVT::i32)),
16381 0);
16382 }
16383
16384 SDValue DataLo =
16385 buildSMovImm32(DAG, DL, Val: RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
16386 SDValue DataHi = buildSMovImm32(DAG, DL, Val: RsrcDword2And3 >> 32);
16387
16388 const SDValue Ops[] = {
16389 DAG.getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32),
16390 PtrLo,
16391 DAG.getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
16392 PtrHi,
16393 DAG.getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32),
16394 DataLo,
16395 DAG.getTargetConstant(Val: AMDGPU::sub2, DL, VT: MVT::i32),
16396 DataHi,
16397 DAG.getTargetConstant(Val: AMDGPU::sub3, DL, VT: MVT::i32)};
16398
16399 return DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v4i32, Ops);
16400}
16401
16402//===----------------------------------------------------------------------===//
16403// SI Inline Assembly Support
16404//===----------------------------------------------------------------------===//
16405
16406std::pair<unsigned, const TargetRegisterClass *>
16407SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
16408 StringRef Constraint,
16409 MVT VT) const {
16410 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
16411
16412 const TargetRegisterClass *RC = nullptr;
16413 if (Constraint.size() == 1) {
16414 const unsigned BitWidth = VT.getSizeInBits();
16415 switch (Constraint[0]) {
16416 default:
16417 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
16418 case 's':
16419 case 'r':
16420 switch (BitWidth) {
16421 case 16:
16422 RC = &AMDGPU::SReg_32RegClass;
16423 break;
16424 case 64:
16425 RC = &AMDGPU::SGPR_64RegClass;
16426 break;
16427 default:
16428 RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth);
16429 if (!RC)
16430 return std::pair(0U, nullptr);
16431 break;
16432 }
16433 break;
16434 case 'v':
16435 switch (BitWidth) {
16436 case 16:
16437 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
16438 : &AMDGPU::VGPR_32RegClass;
16439 break;
16440 default:
16441 RC = TRI->getVGPRClassForBitWidth(BitWidth);
16442 if (!RC)
16443 return std::pair(0U, nullptr);
16444 break;
16445 }
16446 break;
16447 case 'a':
16448 if (!Subtarget->hasMAIInsts())
16449 break;
16450 switch (BitWidth) {
16451 case 16:
16452 RC = &AMDGPU::AGPR_32RegClass;
16453 break;
16454 default:
16455 RC = TRI->getAGPRClassForBitWidth(BitWidth);
16456 if (!RC)
16457 return std::pair(0U, nullptr);
16458 break;
16459 }
16460 break;
16461 }
16462 // We actually support i128, i16 and f16 as inline parameters
16463 // even if they are not reported as legal
16464 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
16465 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
16466 return std::pair(0U, RC);
16467 }
16468
16469 if (Constraint.starts_with(Prefix: "{") && Constraint.ends_with(Suffix: "}")) {
16470 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
16471 if (RegName.consume_front(Prefix: "v")) {
16472 RC = &AMDGPU::VGPR_32RegClass;
16473 } else if (RegName.consume_front(Prefix: "s")) {
16474 RC = &AMDGPU::SGPR_32RegClass;
16475 } else if (RegName.consume_front(Prefix: "a")) {
16476 RC = &AMDGPU::AGPR_32RegClass;
16477 }
16478
16479 if (RC) {
16480 uint32_t Idx;
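// Handle explicit register ranges such as "{v[8:11]}"; with a 128-bit
// operand type this resolves to the 128-bit VGPR tuple starting at v8.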
16481 if (RegName.consume_front(Prefix: "[")) {
16482 uint32_t End;
16483 bool Failed = RegName.consumeInteger(Radix: 10, Result&: Idx);
16484 Failed |= !RegName.consume_front(Prefix: ":");
16485 Failed |= RegName.consumeInteger(Radix: 10, Result&: End);
16486 Failed |= !RegName.consume_back(Suffix: "]");
16487 if (!Failed) {
16488 uint32_t Width = (End - Idx + 1) * 32;
16489 // Prohibit constraints for register ranges with a width that does not
16490 // match the required type.
16491 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
16492 return std::pair(0U, nullptr);
16493 MCRegister Reg = RC->getRegister(i: Idx);
16494 if (SIRegisterInfo::isVGPRClass(RC))
16495 RC = TRI->getVGPRClassForBitWidth(BitWidth: Width);
16496 else if (SIRegisterInfo::isSGPRClass(RC))
16497 RC = TRI->getSGPRClassForBitWidth(BitWidth: Width);
16498 else if (SIRegisterInfo::isAGPRClass(RC))
16499 RC = TRI->getAGPRClassForBitWidth(BitWidth: Width);
16500 if (RC) {
16501 Reg = TRI->getMatchingSuperReg(Reg, SubIdx: AMDGPU::sub0, RC);
16502 if (!Reg) {
16503 // The register class does not contain the requested register,
16504 // e.g., because it is an SGPR pair that would violate alignment
16505 // requirements.
16506 return std::pair(0U, nullptr);
16507 }
16508 return std::pair(Reg, RC);
16509 }
16510 }
16511 } else {
16512 // Check for lossy scalar/vector conversions.
16513 if (VT.isVector() && VT.getSizeInBits() != 32)
16514 return std::pair(0U, nullptr);
16515 bool Failed = RegName.getAsInteger(Radix: 10, Result&: Idx);
16516 if (!Failed && Idx < RC->getNumRegs())
16517 return std::pair(RC->getRegister(i: Idx), RC);
16518 }
16519 }
16520 }
16521
16522 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
16523 if (Ret.first)
16524 Ret.second = TRI->getPhysRegBaseClass(Reg: Ret.first);
16525
16526 return Ret;
16527}
16528
16529static bool isImmConstraint(StringRef Constraint) {
16530 if (Constraint.size() == 1) {
16531 switch (Constraint[0]) {
16532 default:
16533 break;
16534 case 'I':
16535 case 'J':
16536 case 'A':
16537 case 'B':
16538 case 'C':
16539 return true;
16540 }
16541 } else if (Constraint == "DA" || Constraint == "DB") {
16542 return true;
16543 }
16544 return false;
16545}
16546
16547SITargetLowering::ConstraintType
16548SITargetLowering::getConstraintType(StringRef Constraint) const {
16549 if (Constraint.size() == 1) {
16550 switch (Constraint[0]) {
16551 default:
16552 break;
16553 case 's':
16554 case 'v':
16555 case 'a':
16556 return C_RegisterClass;
16557 }
16558 }
16559 if (isImmConstraint(Constraint)) {
16560 return C_Other;
16561 }
16562 return TargetLowering::getConstraintType(Constraint);
16563}
16564
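// Truncate the constant to the bit width of the asm operand, unless it is an
// inlinable integer literal (e.g. -1), which must keep its sign-extended form
// so the inline encoding can still be used.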
16565static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
16566 if (!AMDGPU::isInlinableIntLiteral(Literal: Val)) {
16567 Val = Val & maskTrailingOnes<uint64_t>(N: Size);
16568 }
16569 return Val;
16570}
16571
16572void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
16573 StringRef Constraint,
16574 std::vector<SDValue> &Ops,
16575 SelectionDAG &DAG) const {
16576 if (isImmConstraint(Constraint)) {
16577 uint64_t Val;
16578 if (getAsmOperandConstVal(Op, Val) &&
16579 checkAsmConstraintVal(Op, Constraint, Val)) {
16580 Val = clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits());
16581 Ops.push_back(x: DAG.getTargetConstant(Val, DL: SDLoc(Op), VT: MVT::i64));
16582 }
16583 } else {
16584 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
16585 }
16586}
16587
16588bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
16589 unsigned Size = Op.getScalarValueSizeInBits();
16590 if (Size > 64)
16591 return false;
16592
16593 if (Size == 16 && !Subtarget->has16BitInsts())
16594 return false;
16595
16596 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op)) {
16597 Val = C->getSExtValue();
16598 return true;
16599 }
16600 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
16601 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
16602 return true;
16603 }
16604 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Val&: Op)) {
16605 if (Size != 16 || Op.getNumOperands() != 2)
16606 return false;
16607 if (Op.getOperand(i: 0).isUndef() || Op.getOperand(i: 1).isUndef())
16608 return false;
16609 if (ConstantSDNode *C = V->getConstantSplatNode()) {
16610 Val = C->getSExtValue();
16611 return true;
16612 }
16613 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
16614 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
16615 return true;
16616 }
16617 }
16618
16619 return false;
16620}
16621
16622bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
16623 uint64_t Val) const {
16624 if (Constraint.size() == 1) {
16625 switch (Constraint[0]) {
16626 case 'I':
16627 return AMDGPU::isInlinableIntLiteral(Literal: Val);
16628 case 'J':
16629 return isInt<16>(x: Val);
16630 case 'A':
16631 return checkAsmConstraintValA(Op, Val);
16632 case 'B':
16633 return isInt<32>(x: Val);
16634 case 'C':
16635 return isUInt<32>(x: clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits())) ||
16636 AMDGPU::isInlinableIntLiteral(Literal: Val);
16637 default:
16638 break;
16639 }
16640 } else if (Constraint.size() == 2) {
16641 if (Constraint == "DA") {
16642 int64_t HiBits = static_cast<int32_t>(Val >> 32);
16643 int64_t LoBits = static_cast<int32_t>(Val);
16644 return checkAsmConstraintValA(Op, Val: HiBits, MaxSize: 32) &&
16645 checkAsmConstraintValA(Op, Val: LoBits, MaxSize: 32);
16646 }
16647 if (Constraint == "DB") {
16648 return true;
16649 }
16650 }
16651 llvm_unreachable("Invalid asm constraint");
16652}
16653
16654bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
16655 unsigned MaxSize) const {
16656 unsigned Size = std::min<unsigned>(a: Op.getScalarValueSizeInBits(), b: MaxSize);
16657 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
16658 if (Size == 16) {
16659 MVT VT = Op.getSimpleValueType();
16660 switch (VT.SimpleTy) {
16661 default:
16662 return false;
16663 case MVT::i16:
16664 return AMDGPU::isInlinableLiteralI16(Literal: Val, HasInv2Pi);
16665 case MVT::f16:
16666 return AMDGPU::isInlinableLiteralFP16(Literal: Val, HasInv2Pi);
16667 case MVT::bf16:
16668 return AMDGPU::isInlinableLiteralBF16(Literal: Val, HasInv2Pi);
16669 case MVT::v2i16:
16670 return AMDGPU::getInlineEncodingV2I16(Literal: Val).has_value();
16671 case MVT::v2f16:
16672 return AMDGPU::getInlineEncodingV2F16(Literal: Val).has_value();
16673 case MVT::v2bf16:
16674 return AMDGPU::getInlineEncodingV2BF16(Literal: Val).has_value();
16675 }
16676 }
16677 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Literal: Val, HasInv2Pi)) ||
16678 (Size == 64 && AMDGPU::isInlinableLiteral64(Literal: Val, HasInv2Pi)))
16679 return true;
16680 return false;
16681}
16682
16683static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
16684 switch (UnalignedClassID) {
16685 case AMDGPU::VReg_64RegClassID:
16686 return AMDGPU::VReg_64_Align2RegClassID;
16687 case AMDGPU::VReg_96RegClassID:
16688 return AMDGPU::VReg_96_Align2RegClassID;
16689 case AMDGPU::VReg_128RegClassID:
16690 return AMDGPU::VReg_128_Align2RegClassID;
16691 case AMDGPU::VReg_160RegClassID:
16692 return AMDGPU::VReg_160_Align2RegClassID;
16693 case AMDGPU::VReg_192RegClassID:
16694 return AMDGPU::VReg_192_Align2RegClassID;
16695 case AMDGPU::VReg_224RegClassID:
16696 return AMDGPU::VReg_224_Align2RegClassID;
16697 case AMDGPU::VReg_256RegClassID:
16698 return AMDGPU::VReg_256_Align2RegClassID;
16699 case AMDGPU::VReg_288RegClassID:
16700 return AMDGPU::VReg_288_Align2RegClassID;
16701 case AMDGPU::VReg_320RegClassID:
16702 return AMDGPU::VReg_320_Align2RegClassID;
16703 case AMDGPU::VReg_352RegClassID:
16704 return AMDGPU::VReg_352_Align2RegClassID;
16705 case AMDGPU::VReg_384RegClassID:
16706 return AMDGPU::VReg_384_Align2RegClassID;
16707 case AMDGPU::VReg_512RegClassID:
16708 return AMDGPU::VReg_512_Align2RegClassID;
16709 case AMDGPU::VReg_1024RegClassID:
16710 return AMDGPU::VReg_1024_Align2RegClassID;
16711 case AMDGPU::AReg_64RegClassID:
16712 return AMDGPU::AReg_64_Align2RegClassID;
16713 case AMDGPU::AReg_96RegClassID:
16714 return AMDGPU::AReg_96_Align2RegClassID;
16715 case AMDGPU::AReg_128RegClassID:
16716 return AMDGPU::AReg_128_Align2RegClassID;
16717 case AMDGPU::AReg_160RegClassID:
16718 return AMDGPU::AReg_160_Align2RegClassID;
16719 case AMDGPU::AReg_192RegClassID:
16720 return AMDGPU::AReg_192_Align2RegClassID;
16721 case AMDGPU::AReg_256RegClassID:
16722 return AMDGPU::AReg_256_Align2RegClassID;
16723 case AMDGPU::AReg_512RegClassID:
16724 return AMDGPU::AReg_512_Align2RegClassID;
16725 case AMDGPU::AReg_1024RegClassID:
16726 return AMDGPU::AReg_1024_Align2RegClassID;
16727 default:
16728 return -1;
16729 }
16730}
16731
16732// Figure out which registers should be reserved for stack access. Only after
16733// the function is legalized do we know all of the non-spill stack objects or if
16734// calls are present.
16735void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
16736 MachineRegisterInfo &MRI = MF.getRegInfo();
16737 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16738 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
16739 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16740 const SIInstrInfo *TII = ST.getInstrInfo();
16741
16742 if (Info->isEntryFunction()) {
16743 // Callable functions have fixed registers used for stack access.
16744 reservePrivateMemoryRegs(TM: getTargetMachine(), MF, TRI: *TRI, Info&: *Info);
16745 }
16746
16747 // TODO: Move this logic to getReservedRegs()
16748 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
16749 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
16750 Register SReg = ST.isWave32()
16751 ? AMDGPU::SGPR_32RegClass.getRegister(i: MaxNumSGPRs - 1)
16752 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
16753 RC: &AMDGPU::SGPR_64RegClass);
16754 Info->setSGPRForEXECCopy(SReg);
16755
16756 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
16757 Info->getStackPtrOffsetReg()));
16758 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
16759 MRI.replaceRegWith(FromReg: AMDGPU::SP_REG, ToReg: Info->getStackPtrOffsetReg());
16760
16761  // Guard against replacing the default register with itself, which can happen
16762  // for MIR testcases that are missing the MFI.
16763 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
16764 MRI.replaceRegWith(FromReg: AMDGPU::PRIVATE_RSRC_REG, ToReg: Info->getScratchRSrcReg());
16765
16766 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
16767 MRI.replaceRegWith(FromReg: AMDGPU::FP_REG, ToReg: Info->getFrameOffsetReg());
16768
16769 Info->limitOccupancy(MF);
16770
16771 if (ST.isWave32() && !MF.empty()) {
16772 for (auto &MBB : MF) {
16773 for (auto &MI : MBB) {
16774 TII->fixImplicitOperands(MI);
16775 }
16776 }
16777 }
16778
16779  // FIXME: This is a hack to fix up AGPR classes to use the properly aligned
16780 // classes if required. Ideally the register class constraints would differ
16781 // per-subtarget, but there's no easy way to achieve that right now. This is
16782 // not a problem for VGPRs because the correctly aligned VGPR class is implied
16783 // from using them as the register class for legal types.
16784 if (ST.needsAlignedVGPRs()) {
16785 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
16786 const Register Reg = Register::index2VirtReg(Index: I);
16787 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
16788 if (!RC)
16789 continue;
16790 int NewClassID = getAlignedAGPRClassID(UnalignedClassID: RC->getID());
16791 if (NewClassID != -1)
16792 MRI.setRegClass(Reg, RC: TRI->getRegClass(RCID: NewClassID));
16793 }
16794 }
16795
16796 TargetLoweringBase::finalizeLowering(MF);
16797}
16798
16799void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
16800 KnownBits &Known,
16801 const APInt &DemandedElts,
16802 const SelectionDAG &DAG,
16803 unsigned Depth) const {
16804 Known.resetAll();
16805 unsigned Opc = Op.getOpcode();
16806 switch (Opc) {
16807 case ISD::INTRINSIC_WO_CHAIN: {
16808 unsigned IID = Op.getConstantOperandVal(i: 0);
16809 switch (IID) {
16810 case Intrinsic::amdgcn_mbcnt_lo:
16811 case Intrinsic::amdgcn_mbcnt_hi: {
16812 const GCNSubtarget &ST =
16813 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
16814 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16815 // most 31 + src1.
16816 Known.Zero.setBitsFrom(
16817 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
16818 KnownBits Known2 = DAG.computeKnownBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
16819 Known = KnownBits::add(LHS: Known, RHS: Known2);
16820 return;
16821 }
16822 }
16823 break;
16824 }
16825 }
16826 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
16827 Op, Known, DemandedElts, DAG, Depth);
16828}
16829
16830void SITargetLowering::computeKnownBitsForFrameIndex(
16831 const int FI, KnownBits &Known, const MachineFunction &MF) const {
16832 TargetLowering::computeKnownBitsForFrameIndex(FIOp: FI, Known, MF);
16833
16834 // Set the high bits to zero based on the maximum allowed scratch size per
16835 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
16836 // calculation won't overflow, so assume the sign bit is never set.
16837 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
16838}
16839
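// Worked example for knownBitsForWorkitemID below (the bound is assumed, not
// taken from a real target): if getMaxWorkitemID returns 1023 for a dimension,
// countl_zero(1023) on the 32-bit value is 22, so the top 22 bits of the
// workitem id are reported as known zero.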
16840static void knownBitsForWorkitemID(const GCNSubtarget &ST,
16841 GISelValueTracking &VT, KnownBits &Known,
16842 unsigned Dim) {
16843 unsigned MaxValue =
16844 ST.getMaxWorkitemID(Kernel: VT.getMachineFunction().getFunction(), Dimension: Dim);
16845 Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue));
16846}
16847
16848void SITargetLowering::computeKnownBitsForTargetInstr(
16849 GISelValueTracking &VT, Register R, KnownBits &Known,
16850 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
16851 unsigned Depth) const {
16852 const MachineInstr *MI = MRI.getVRegDef(Reg: R);
16853 switch (MI->getOpcode()) {
16854 case AMDGPU::G_INTRINSIC:
16855 case AMDGPU::G_INTRINSIC_CONVERGENT: {
16856 Intrinsic::ID IID = cast<GIntrinsic>(Val: MI)->getIntrinsicID();
16857 switch (IID) {
16858 case Intrinsic::amdgcn_workitem_id_x:
16859 knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: 0);
16860 break;
16861 case Intrinsic::amdgcn_workitem_id_y:
16862 knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: 1);
16863 break;
16864 case Intrinsic::amdgcn_workitem_id_z:
16865 knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: 2);
16866 break;
16867 case Intrinsic::amdgcn_mbcnt_lo:
16868 case Intrinsic::amdgcn_mbcnt_hi: {
16869 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16870 // most 31 + src1.
16871 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
16872 ? getSubtarget()->getWavefrontSizeLog2()
16873 : 5);
16874 KnownBits Known2;
16875 VT.computeKnownBitsImpl(R: MI->getOperand(i: 3).getReg(), Known&: Known2, DemandedElts,
16876 Depth: Depth + 1);
16877 Known = KnownBits::add(LHS: Known, RHS: Known2);
16878 break;
16879 }
16880 case Intrinsic::amdgcn_groupstaticsize: {
16881 // We can report everything over the maximum size as 0. We can't report
16882 // based on the actual size because we don't know if it's accurate or not
16883 // at any given point.
16884 Known.Zero.setHighBits(
16885 llvm::countl_zero(Val: getSubtarget()->getAddressableLocalMemorySize()));
16886 break;
16887 }
16888 }
16889 break;
16890 }
16891 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16892 Known.Zero.setHighBits(24);
16893 break;
16894 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16895 Known.Zero.setHighBits(16);
16896 break;
16897 case AMDGPU::G_AMDGPU_SMED3:
16898 case AMDGPU::G_AMDGPU_UMED3: {
16899 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
16900
16901 KnownBits Known2;
16902 VT.computeKnownBitsImpl(R: Src2, Known&: Known2, DemandedElts, Depth: Depth + 1);
16903 if (Known2.isUnknown())
16904 break;
16905
16906 KnownBits Known1;
16907 VT.computeKnownBitsImpl(R: Src1, Known&: Known1, DemandedElts, Depth: Depth + 1);
16908 if (Known1.isUnknown())
16909 break;
16910
16911 KnownBits Known0;
16912 VT.computeKnownBitsImpl(R: Src0, Known&: Known0, DemandedElts, Depth: Depth + 1);
16913 if (Known0.isUnknown())
16914 break;
16915
16916 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
16917 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
16918 Known.One = Known0.One & Known1.One & Known2.One;
16919 break;
16920 }
16921 }
16922}
16923
16924Align SITargetLowering::computeKnownAlignForTargetInstr(
16925 GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
16926 unsigned Depth) const {
16927 const MachineInstr *MI = MRI.getVRegDef(Reg: R);
16928 if (auto *GI = dyn_cast<GIntrinsic>(Val: MI)) {
16929 // FIXME: Can this move to generic code? What about the case where the call
16930 // site specifies a lower alignment?
16931 Intrinsic::ID IID = GI->getIntrinsicID();
16932 LLVMContext &Ctx = VT.getMachineFunction().getFunction().getContext();
16933 AttributeList Attrs =
16934 Intrinsic::getAttributes(C&: Ctx, id: IID, FT: Intrinsic::getType(Context&: Ctx, id: IID));
16935 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
16936 return *RetAlign;
16937 }
16938 return Align(1);
16939}
16940
16941Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
16942 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
16943 const Align CacheLineAlign = Align(64);
16944
16945  // Pre-GFX10 targets did not benefit from loop alignment
16946 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
16947 getSubtarget()->hasInstFwdPrefetchBug())
16948 return PrefAlign;
16949
16950  // On GFX10 the I$ consists of 4 x 64-byte cache lines.
16951  // By default the prefetcher keeps one cache line behind and reads two ahead.
16952  // We can modify it with S_INST_PREFETCH for larger loops to keep two lines
16953  // behind and one ahead.
16954  // Therefore we can benefit from aligning loop headers if the loop fits in
16955  // 192 bytes. If the loop fits in 64 bytes it always spans no more than two
16956  // cache lines and does not need an alignment.
16957  // Otherwise, if the loop is at most 128 bytes we do not need to modify the
16958  // prefetch; if it is at most 192 bytes we need two cache lines behind.
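  //
  // For example (illustrative sizes only): a 100-byte loop body would get the
  // 64-byte cache-line alignment with the default prefetch settings, while a
  // 160-byte loop would additionally get S_INST_PREFETCH instructions inserted
  // into its preheader and exit block by the code below.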
16959
16960 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16961 const MachineBasicBlock *Header = ML->getHeader();
16962 if (Header->getAlignment() != PrefAlign)
16963 return Header->getAlignment(); // Already processed.
16964
16965 unsigned LoopSize = 0;
16966 for (const MachineBasicBlock *MBB : ML->blocks()) {
16967    // If an inner loop block is aligned, assume on average half of the alignment
16968    // size is added as nops.
16969 if (MBB != Header)
16970 LoopSize += MBB->getAlignment().value() / 2;
16971
16972 for (const MachineInstr &MI : *MBB) {
16973 LoopSize += TII->getInstSizeInBytes(MI);
16974 if (LoopSize > 192)
16975 return PrefAlign;
16976 }
16977 }
16978
16979 if (LoopSize <= 64)
16980 return PrefAlign;
16981
16982 if (LoopSize <= 128)
16983 return CacheLineAlign;
16984
16985  // If any of the parent loops is surrounded by prefetch instructions, do not
16986  // insert new ones for the inner loop; that would reset the parent's settings.
16987 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
16988 if (MachineBasicBlock *Exit = P->getExitBlock()) {
16989 auto I = Exit->getFirstNonDebugInstr();
16990 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
16991 return CacheLineAlign;
16992 }
16993 }
16994
16995 MachineBasicBlock *Pre = ML->getLoopPreheader();
16996 MachineBasicBlock *Exit = ML->getExitBlock();
16997
16998 if (Pre && Exit) {
16999 auto PreTerm = Pre->getFirstTerminator();
17000 if (PreTerm == Pre->begin() ||
17001 std::prev(x: PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
17002 BuildMI(BB&: *Pre, I: PreTerm, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_INST_PREFETCH))
17003 .addImm(Val: 1); // prefetch 2 lines behind PC
17004
17005 auto ExitHead = Exit->getFirstNonDebugInstr();
17006 if (ExitHead == Exit->end() ||
17007 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
17008 BuildMI(BB&: *Exit, I: ExitHead, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_INST_PREFETCH))
17009 .addImm(Val: 2); // prefetch 1 line behind PC
17010 }
17011
17012 return CacheLineAlign;
17013}
17014
17015LLVM_ATTRIBUTE_UNUSED
17016static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
17017 assert(N->getOpcode() == ISD::CopyFromReg);
17018 do {
17019 // Follow the chain until we find an INLINEASM node.
17020 N = N->getOperand(Num: 0).getNode();
17021 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
17022 return true;
17023 } while (N->getOpcode() == ISD::CopyFromReg);
17024 return false;
17025}
17026
17027bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
17028 FunctionLoweringInfo *FLI,
17029 UniformityInfo *UA) const {
17030 switch (N->getOpcode()) {
17031 case ISD::CopyFromReg: {
17032 const RegisterSDNode *R = cast<RegisterSDNode>(Val: N->getOperand(Num: 1));
17033 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
17034 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17035 Register Reg = R->getReg();
17036
17037 // FIXME: Why does this need to consider isLiveIn?
17038 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
17039 return !TRI->isSGPRReg(MRI, Reg);
17040
17041 if (const Value *V = FLI->getValueFromVirtualReg(Vreg: R->getReg()))
17042 return UA->isDivergent(V);
17043
17044 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
17045 return !TRI->isSGPRReg(MRI, Reg);
17046 }
17047 case ISD::LOAD: {
17048 const LoadSDNode *L = cast<LoadSDNode>(Val: N);
17049 unsigned AS = L->getAddressSpace();
17050 // A flat load may access private memory.
17051 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
17052 }
17053 case ISD::CALLSEQ_END:
17054 return true;
17055 case ISD::INTRINSIC_WO_CHAIN:
17056 return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: 0));
17057 case ISD::INTRINSIC_W_CHAIN:
17058 return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: 1));
17059 case AMDGPUISD::ATOMIC_CMP_SWAP:
17060 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
17061 case AMDGPUISD::BUFFER_ATOMIC_ADD:
17062 case AMDGPUISD::BUFFER_ATOMIC_SUB:
17063 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
17064 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
17065 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
17066 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
17067 case AMDGPUISD::BUFFER_ATOMIC_AND:
17068 case AMDGPUISD::BUFFER_ATOMIC_OR:
17069 case AMDGPUISD::BUFFER_ATOMIC_XOR:
17070 case AMDGPUISD::BUFFER_ATOMIC_INC:
17071 case AMDGPUISD::BUFFER_ATOMIC_DEC:
17072 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
17073 case AMDGPUISD::BUFFER_ATOMIC_CSUB:
17074 case AMDGPUISD::BUFFER_ATOMIC_FADD:
17075 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
17076 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
17077 // Target-specific read-modify-write atomics are sources of divergence.
17078 return true;
17079 default:
17080 if (auto *A = dyn_cast<AtomicSDNode>(Val: N)) {
17081 // Generic read-modify-write atomics are sources of divergence.
17082 return A->readMem() && A->writeMem();
17083 }
17084 return false;
17085 }
17086}
17087
17088bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
17089 EVT VT) const {
17090 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
17091 case MVT::f32:
17092 return !denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
17093 case MVT::f64:
17094 case MVT::f16:
17095 return !denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction());
17096 default:
17097 return false;
17098 }
17099}
17100
17101bool SITargetLowering::denormalsEnabledForType(
17102 LLT Ty, const MachineFunction &MF) const {
17103 switch (Ty.getScalarSizeInBits()) {
17104 case 32:
17105 return !denormalModeIsFlushAllF32(MF);
17106 case 64:
17107 case 16:
17108 return !denormalModeIsFlushAllF64F16(MF);
17109 default:
17110 return false;
17111 }
17112}
17113
17114bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
17115 const APInt &DemandedElts,
17116 const SelectionDAG &DAG,
17117 bool SNaN,
17118 unsigned Depth) const {
17119 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
17120 const MachineFunction &MF = DAG.getMachineFunction();
17121 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
17122
17123 if (Info->getMode().DX10Clamp)
17124 return true; // Clamped to 0.
17125 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1);
17126 }
17127
17128 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
17129 DAG, SNaN, Depth);
17130}
17131
17132// On older subtargets, global FP atomic instructions have a hardcoded FP mode:
17133// they do not support FP32 denormals and only support v2f16/f64 denormals.
17134static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
17135 if (RMW->hasMetadata(Kind: "amdgpu.ignore.denormal.mode"))
17136 return true;
17137
17138 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
17139 auto DenormMode = RMW->getFunction()->getDenormalMode(FPType: Flt);
17140 if (DenormMode == DenormalMode::getPreserveSign())
17141 return true;
17142
17143 // TODO: Remove this.
17144 return RMW->getFunction()
17145 ->getFnAttribute(Kind: "amdgpu-unsafe-fp-atomics")
17146 .getValueAsBool();
17147}
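// Illustrative cases for atomicIgnoresDenormalModeOrFPModeIsFTZ above (the
// metadata and attribute spellings are shown only as examples):
//   - an atomicrmw carrying !amdgpu.ignore.denormal.mode !{} trivially
//     satisfies the check;
//   - a float atomicrmw fadd in a function compiled with
//     "denormal-fp-math-f32"="preserve-sign,preserve-sign" also satisfies it,
//     since the resolved denormal mode is preserve-sign (flush-to-zero).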
17148
17149static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
17150 LLVMContext &Ctx = RMW->getContext();
17151 StringRef MemScope =
17152 Ctx.getSyncScopeName(Id: RMW->getSyncScopeID()).value_or(u: "system");
17153
17154 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
17155 << "Hardware instruction generated for atomic "
17156 << RMW->getOperationName(Op: RMW->getOperation())
17157 << " operation at memory scope " << MemScope;
17158}
17159
17160static bool isV2F16OrV2BF16(Type *Ty) {
17161 if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
17162 Type *EltTy = VT->getElementType();
17163 return VT->getNumElements() == 2 &&
17164 (EltTy->isHalfTy() || EltTy->isBFloatTy());
17165 }
17166
17167 return false;
17168}
17169
17170static bool isV2F16(Type *Ty) {
17171 FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty);
17172 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
17173}
17174
17175static bool isV2BF16(Type *Ty) {
17176 FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty);
17177 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
17178}
17179
17180/// \return true if atomicrmw integer ops work for the type.
17181static bool isAtomicRMWLegalIntTy(Type *Ty) {
17182 if (auto *IT = dyn_cast<IntegerType>(Val: Ty)) {
17183 unsigned BW = IT->getBitWidth();
17184 return BW == 32 || BW == 64;
17185 }
17186
17187 return false;
17188}
17189
17190/// \return true if this atomicrmw xchg type can be selected.
17191static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
17192 Type *Ty = RMW->getType();
17193 if (isAtomicRMWLegalIntTy(Ty))
17194 return true;
17195
17196 if (PointerType *PT = dyn_cast<PointerType>(Val: Ty)) {
17197 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
17198 unsigned BW = DL.getPointerSizeInBits(AS: PT->getAddressSpace());
17199 return BW == 32 || BW == 64;
17200 }
17201
17202 if (Ty->isFloatTy() || Ty->isDoubleTy())
17203 return true;
17204
17205 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
17206 return VT->getNumElements() == 2 &&
17207 VT->getElementType()->getPrimitiveSizeInBits() == 16;
17208 }
17209
17210 return false;
17211}
17212
17213/// \returns true if it's valid to emit a native instruction for \p RMW, based
17214/// on the properties of the target memory.
17215static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
17216 const AtomicRMWInst *RMW,
17217 bool HasSystemScope) {
17218 // The remote/fine-grained access logic is different from the integer
17219 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
17220 // fine-grained access does not work, even for a device local allocation.
17221 //
17222 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
17223 // allocations work.
17224 if (HasSystemScope) {
17225 if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
17226 RMW->hasMetadata(Kind: "amdgpu.no.remote.memory"))
17227 return true;
17228 } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
17229 return true;
17230
17231 return RMW->hasMetadata(Kind: "amdgpu.no.fine.grained.memory");
17232}
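// Illustrative example (not from the upstream tests): on a subtarget without
// agent-scope fine-grained remote memory atomics, an agent-scope
//   %r = atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent")
//            monotonic, align 4, !amdgpu.no.fine.grained.memory !0
// (with !0 = !{}) passes this check, while the same operation without the
// metadata does not, and the caller falls back to a cmpxchg expansion.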
17233
17234/// \return Action to perform on AtomicRMWInsts for integer operations.
17235static TargetLowering::AtomicExpansionKind
17236atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
17237 return isAtomicRMWLegalIntTy(Ty: RMW->getType())
17238 ? TargetLowering::AtomicExpansionKind::None
17239 : TargetLowering::AtomicExpansionKind::CmpXChg;
17240}
17241
17242/// Return true if a flat address space atomicrmw may access private memory.
17243static bool flatInstrMayAccessPrivate(const Instruction *I) {
17244 const MDNode *NoaliasAddrSpaceMD =
17245 I->getMetadata(KindID: LLVMContext::MD_noalias_addrspace);
17246 if (!NoaliasAddrSpaceMD)
17247 return true;
17248
17249 for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
17250 ++I) {
17251 auto *Low = mdconst::extract<ConstantInt>(
17252 MD: NoaliasAddrSpaceMD->getOperand(I: 2 * I + 0));
17253 if (Low->getValue().uge(RHS: AMDGPUAS::PRIVATE_ADDRESS)) {
17254 auto *High = mdconst::extract<ConstantInt>(
17255 MD: NoaliasAddrSpaceMD->getOperand(I: 2 * I + 1));
17256 return High->getValue().ule(RHS: AMDGPUAS::PRIVATE_ADDRESS);
17257 }
17258 }
17259
17260 return true;
17261}
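// Illustrative example: a flat atomic annotated with !noalias.addrspace !0,
// where !0 = !{i32 5, i32 6}, excludes the private address space (5), so
// flatInstrMayAccessPrivate returns false; with no such metadata it
// conservatively returns true. This is the same [5, 6) range that the
// expansion code below attaches after inserting its runtime check.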
17262
17263TargetLowering::AtomicExpansionKind
17264SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
17265 unsigned AS = RMW->getPointerAddressSpace();
17266 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
17267 return AtomicExpansionKind::NotAtomic;
17268
17269 // 64-bit flat atomics that dynamically reside in private memory will silently
17270 // be dropped.
17271 //
17272 // Note that we will emit a new copy of the original atomic in the expansion,
17273 // which will be incrementally relegalized.
17274 const DataLayout &DL = RMW->getFunction()->getDataLayout();
17275 if (AS == AMDGPUAS::FLAT_ADDRESS &&
17276 DL.getTypeSizeInBits(Ty: RMW->getType()) == 64 &&
17277 flatInstrMayAccessPrivate(I: RMW))
17278 return AtomicExpansionKind::Expand;
17279
17280 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
17281 OptimizationRemarkEmitter ORE(RMW->getFunction());
17282 ORE.emit(RemarkBuilder: [=]() {
17283 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
17284 });
17285 return Kind;
17286 };
17287
17288 auto SSID = RMW->getSyncScopeID();
17289 bool HasSystemScope =
17290 SSID == SyncScope::System ||
17291 SSID == RMW->getContext().getOrInsertSyncScopeID(SSN: "one-as");
17292
17293 auto Op = RMW->getOperation();
17294 switch (Op) {
17295 case AtomicRMWInst::Xchg: {
17296 // PCIe supports add and xchg for system atomics.
17297 return isAtomicRMWLegalXChgTy(RMW)
17298 ? TargetLowering::AtomicExpansionKind::None
17299 : TargetLowering::AtomicExpansionKind::CmpXChg;
17300 }
17301 case AtomicRMWInst::Add:
17302 case AtomicRMWInst::And:
17303 case AtomicRMWInst::UIncWrap:
17304 case AtomicRMWInst::UDecWrap:
17305 return atomicSupportedIfLegalIntType(RMW);
17306 case AtomicRMWInst::Sub:
17307 case AtomicRMWInst::Or:
17308 case AtomicRMWInst::Xor: {
17309 // Atomic sub/or/xor do not work over PCI express, but atomic add
17310 // does. InstCombine transforms these with 0 to or, so undo that.
17311 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
17312 if (Constant *ConstVal = dyn_cast<Constant>(Val: RMW->getValOperand());
17313 ConstVal && ConstVal->isNullValue())
17314 return AtomicExpansionKind::Expand;
17315 }
17316
17317 return atomicSupportedIfLegalIntType(RMW);
17318 }
17319 case AtomicRMWInst::FAdd: {
17320 Type *Ty = RMW->getType();
17321
17322 // TODO: Handle REGION_ADDRESS
17323 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
17324 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
17325 // is fixed to round-to-nearest-even.
17326 //
17327 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
17328 // round-to-nearest-even.
17329 //
17330 // We ignore the rounding mode problem, even in strictfp. The C++ standard
17331 // suggests it is OK if the floating-point mode may not match the calling
17332 // thread.
17333 if (Ty->isFloatTy()) {
17334 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
17335 : AtomicExpansionKind::CmpXChg;
17336 }
17337
17338 if (Ty->isDoubleTy()) {
17339 // Ignores denormal mode, but we don't consider flushing mandatory.
17340 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
17341 : AtomicExpansionKind::CmpXChg;
17342 }
17343
17344 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
17345 return AtomicExpansionKind::None;
17346
17347 return AtomicExpansionKind::CmpXChg;
17348 }
17349
17350 // LDS atomics respect the denormal mode from the mode register.
17351 //
17352 // Traditionally f32 global/buffer memory atomics would unconditionally
17353 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
17354 // flush.
17355 //
17356 // On targets with flat atomic fadd, denormals would flush depending on
17357 // whether the target address resides in LDS or global memory. We consider
17358 // this flat-maybe-flush as will-flush.
17359 if (Ty->isFloatTy() &&
17360 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
17361 !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
17362 return AtomicExpansionKind::CmpXChg;
17363
17364 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
17365 // safe. The message phrasing also should be better.
17366 if (globalMemoryFPAtomicIsLegal(Subtarget: *Subtarget, RMW, HasSystemScope)) {
17367 if (AS == AMDGPUAS::FLAT_ADDRESS) {
17368 // gfx942, gfx12
17369 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
17370 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17371 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
17372 // gfx90a, gfx942, gfx12
17373 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
17374 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17375
17376 // gfx942, gfx12
17377 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
17378 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17379 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
17380 // gfx90a, gfx942, gfx12
17381 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
17382 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17383
17384 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
17385 // buffer. gfx12 does have the buffer version.
17386 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
17387 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17388 }
17389
17390 // global and flat atomic fadd f64: gfx90a, gfx942.
17391 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
17392 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17393
17394 if (AS != AMDGPUAS::FLAT_ADDRESS) {
17395 if (Ty->isFloatTy()) {
17396 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
17397 // gfx11+.
17398 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
17399 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17400 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
17401 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
17402 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17403 } else {
17404 // gfx908
17405 if (RMW->use_empty() &&
17406 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
17407 isV2F16(Ty))
17408 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17409 }
17410 }
17411
17412 // flat atomic fadd f32: gfx942, gfx11+.
17413 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
17414 if (Subtarget->hasFlatAtomicFaddF32Inst())
17415 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17416
17417      // If the address is in the flat address space and the type is float, we
17418      // will try to expand the operation when the target supports both global
17419      // and LDS atomic fadd. The reason is that the expansion emits a runtime
17420      // check of the address space: if the address is in global memory, we
17421      // emit the global atomic fadd; if it is in shared memory, we emit the
17422      // LDS atomic fadd.
17423 if (Subtarget->hasLDSFPAtomicAddF32()) {
17424 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
17425 return AtomicExpansionKind::Expand;
17426 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
17427 return AtomicExpansionKind::Expand;
17428 }
17429 }
17430 }
17431
17432 return AtomicExpansionKind::CmpXChg;
17433 }
17434 case AtomicRMWInst::FMin:
17435 case AtomicRMWInst::FMax: {
17436 Type *Ty = RMW->getType();
17437
17438 // LDS float and double fmin/fmax were always supported.
17439 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
17440 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
17441 : AtomicExpansionKind::CmpXChg;
17442 }
17443
17444 if (globalMemoryFPAtomicIsLegal(Subtarget: *Subtarget, RMW, HasSystemScope)) {
17445 // For flat and global cases:
17446 // float, double in gfx7. Manual claims denormal support.
17447 // Removed in gfx8.
17448 // float, double restored in gfx10.
17449 // double removed again in gfx11, so only f32 for gfx11/gfx12.
17450 //
17451      // gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
17452      // not f32.
17453 if (AS == AMDGPUAS::FLAT_ADDRESS) {
17454 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
17455 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17456 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
17457 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17458 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
17459 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
17460 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
17461 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17462 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
17463 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17464 }
17465 }
17466
17467 return AtomicExpansionKind::CmpXChg;
17468 }
17469 case AtomicRMWInst::Min:
17470 case AtomicRMWInst::Max:
17471 case AtomicRMWInst::UMin:
17472 case AtomicRMWInst::UMax: {
17473 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
17474 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
17475 // Always expand system scope min/max atomics.
17476 if (HasSystemScope)
17477 return AtomicExpansionKind::CmpXChg;
17478 }
17479
17480 return atomicSupportedIfLegalIntType(RMW);
17481 }
17482 case AtomicRMWInst::Nand:
17483 case AtomicRMWInst::FSub:
17484 default:
17485 return AtomicExpansionKind::CmpXChg;
17486 }
17487
17488 llvm_unreachable("covered atomicrmw op switch");
17489}
17490
17491TargetLowering::AtomicExpansionKind
17492SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
17493 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
17494 ? AtomicExpansionKind::NotAtomic
17495 : AtomicExpansionKind::None;
17496}
17497
17498TargetLowering::AtomicExpansionKind
17499SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
17500 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
17501 ? AtomicExpansionKind::NotAtomic
17502 : AtomicExpansionKind::None;
17503}
17504
17505TargetLowering::AtomicExpansionKind
17506SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
17507 unsigned AddrSpace = CmpX->getPointerAddressSpace();
17508 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
17509 return AtomicExpansionKind::NotAtomic;
17510
17511 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(I: CmpX))
17512 return AtomicExpansionKind::None;
17513
17514 const DataLayout &DL = CmpX->getDataLayout();
17515
17516 Type *ValTy = CmpX->getNewValOperand()->getType();
17517
17518 // If a 64-bit flat atomic may alias private, we need to avoid using the
17519 // atomic in the private case.
17520 return DL.getTypeSizeInBits(Ty: ValTy) == 64 ? AtomicExpansionKind::Expand
17521 : AtomicExpansionKind::None;
17522}
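// Illustrative example: a hypothetical
//   %res = cmpxchg ptr %flat.p, i64 %old, i64 %new seq_cst seq_cst
// with no !noalias.addrspace metadata excluding the private address space
// returns Expand, while the i32 form, or an i64 form whose metadata carries
// the [5, 6) range, returns None.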
17523
17524const TargetRegisterClass *
17525SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
17526 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, isDivergent: false);
17527 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17528 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
17529 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
17530 : &AMDGPU::SReg_32RegClass;
17531 if (!TRI->isSGPRClass(RC) && !isDivergent)
17532 return TRI->getEquivalentSGPRClass(VRC: RC);
17533 if (TRI->isSGPRClass(RC) && isDivergent)
17534 return TRI->getEquivalentVGPRClass(SRC: RC);
17535
17536 return RC;
17537}
17538
17539// FIXME: This is a workaround for DivergenceAnalysis not understanding always
17540// uniform values (as produced by the mask results of control flow intrinsics)
17541// used outside of divergent blocks. The phi users need to also be treated as
17542// always uniform.
17543//
17544// FIXME: DA is no longer in use. Does this still apply to UniformityAnalysis?
17545static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
17546 unsigned WaveSize) {
17547 // FIXME: We assume we never cast the mask results of a control flow
17548 // intrinsic.
17549  // Exit early, as a compile-time hack, if the type won't be consistent.
17550 IntegerType *IT = dyn_cast<IntegerType>(Val: V->getType());
17551 if (!IT || IT->getBitWidth() != WaveSize)
17552 return false;
17553
17554 if (!isa<Instruction>(Val: V))
17555 return false;
17556 if (!Visited.insert(Ptr: V).second)
17557 return false;
17558 bool Result = false;
17559 for (const auto *U : V->users()) {
17560 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: U)) {
17561 if (V == U->getOperand(i: 1)) {
17562 switch (Intrinsic->getIntrinsicID()) {
17563 default:
17564 Result = false;
17565 break;
17566 case Intrinsic::amdgcn_if_break:
17567 case Intrinsic::amdgcn_if:
17568 case Intrinsic::amdgcn_else:
17569 Result = true;
17570 break;
17571 }
17572 }
17573 if (V == U->getOperand(i: 0)) {
17574 switch (Intrinsic->getIntrinsicID()) {
17575 default:
17576 Result = false;
17577 break;
17578 case Intrinsic::amdgcn_end_cf:
17579 case Intrinsic::amdgcn_loop:
17580 Result = true;
17581 break;
17582 }
17583 }
17584 } else {
17585 Result = hasCFUser(V: U, Visited, WaveSize);
17586 }
17587 if (Result)
17588 break;
17589 }
17590 return Result;
17591}
17592
17593bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
17594 const Value *V) const {
17595 if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
17596 if (CI->isInlineAsm()) {
17597 // FIXME: This cannot give a correct answer. This should only trigger in
17598 // the case where inline asm returns mixed SGPR and VGPR results, used
17599 // outside the defining block. We don't have a specific result to
17600 // consider, so this assumes if any value is SGPR, the overall register
17601 // also needs to be SGPR.
17602 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
17603 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
17604 DL: MF.getDataLayout(), TRI: Subtarget->getRegisterInfo(), Call: *CI);
17605 for (auto &TC : TargetConstraints) {
17606 if (TC.Type == InlineAsm::isOutput) {
17607 ComputeConstraintToUse(OpInfo&: TC, Op: SDValue());
17608 const TargetRegisterClass *RC =
17609 getRegForInlineAsmConstraint(TRI_: SIRI, Constraint: TC.ConstraintCode,
17610 VT: TC.ConstraintVT)
17611 .second;
17612 if (RC && SIRI->isSGPRClass(RC))
17613 return true;
17614 }
17615 }
17616 }
17617 }
17618 SmallPtrSet<const Value *, 16> Visited;
17619 return hasCFUser(V, Visited, WaveSize: Subtarget->getWavefrontSize());
17620}
17621
17622bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
17623 for (SDUse &Use : N->uses()) {
17624 if (MemSDNode *M = dyn_cast<MemSDNode>(Val: Use.getUser())) {
17625 if (getBasePtrIndex(N: M) == Use.getOperandNo())
17626 return true;
17627 }
17628 }
17629 return false;
17630}
17631
17632bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
17633 SDValue N1) const {
17634 if (!N0.hasOneUse())
17635 return false;
17636  // Prefer to keep N0 uniform when possible.
17637 if (N0->isDivergent() || !N1->isDivergent())
17638 return true;
17639  // Check if we have a good chance of forming a memory access pattern from
17640  // this base and constant offset.
17641 return (DAG.isBaseWithConstantOffset(Op: N0) &&
17642 hasMemSDNodeUser(N: *N0->user_begin()));
17643}
17644
17645bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
17646 Register N0, Register N1) const {
17647 return MRI.hasOneNonDBGUse(RegNo: N0); // FIXME: handle regbanks
17648}
17649
17650MachineMemOperand::Flags
17651SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
17652 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
17653 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
17654 if (I.getMetadata(Kind: "amdgpu.noclobber"))
17655 Flags |= MONoClobber;
17656 if (I.getMetadata(Kind: "amdgpu.last.use"))
17657 Flags |= MOLastUse;
17658 return Flags;
17659}
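// Illustrative example: a load annotated as
//   %v = load i32, ptr addrspace(1) %p, align 4, !amdgpu.noclobber !0
// (with !0 = !{}) gets MONoClobber added to its MachineMemOperand flags, and
// !amdgpu.last.use similarly maps to MOLastUse; later code can consult these
// target-specific MMO flags when lowering the access.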
17660
17661bool SITargetLowering::checkForPhysRegDependency(
17662 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
17663 const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const {
17664 if (User->getOpcode() != ISD::CopyToReg)
17665 return false;
17666 if (!Def->isMachineOpcode())
17667 return false;
17668 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Val: Def);
17669 if (!MDef)
17670 return false;
17671
17672 unsigned ResNo = User->getOperand(Num: Op).getResNo();
17673 if (User->getOperand(Num: Op)->getValueType(ResNo) != MVT::i1)
17674 return false;
17675 const MCInstrDesc &II = TII->get(Opcode: MDef->getMachineOpcode());
17676 if (II.isCompare() && II.hasImplicitDefOfPhysReg(Reg: AMDGPU::SCC)) {
17677 PhysReg = AMDGPU::SCC;
17678 const TargetRegisterClass *RC =
17679 TRI->getMinimalPhysRegClass(Reg: PhysReg, VT: Def->getSimpleValueType(ResNo));
17680 Cost = RC->getCopyCost();
17681 return true;
17682 }
17683 return false;
17684}
17685
17686void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
17687 Instruction *AI) const {
17688 // Given: atomicrmw fadd ptr %addr, float %val ordering
17689 //
17690 // With this expansion we produce the following code:
17691 // [...]
17692 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
17693 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
17694 //
17695 // atomicrmw.shared:
17696 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
17697 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
17698 // float %val ordering
17699 // br label %atomicrmw.phi
17700 //
17701 // atomicrmw.check.private:
17702 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
17703 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
17704 //
17705 // atomicrmw.private:
17706 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
17707 // %loaded.private = load float, ptr addrspace(5) %cast.private
17708 // %val.new = fadd float %loaded.private, %val
17709 // store float %val.new, ptr addrspace(5) %cast.private
17710 // br label %atomicrmw.phi
17711 //
17712 // atomicrmw.global:
17713 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
17714 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
17715 // float %val ordering
17716 // br label %atomicrmw.phi
17717 //
17718 // atomicrmw.phi:
17719 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
17720 // [ %loaded.private, %atomicrmw.private ],
17721 // [ %loaded.global, %atomicrmw.global ]
17722 // br label %atomicrmw.end
17723 //
17724 // atomicrmw.end:
17725 // [...]
17726 //
17727 //
17728 // For 64-bit atomics which may reside in private memory, we perform a simpler
17729 // version that only inserts the private check, and uses the flat operation.
17730
17731 IRBuilder<> Builder(AI);
17732 LLVMContext &Ctx = Builder.getContext();
17733
17734 auto *RMW = dyn_cast<AtomicRMWInst>(Val: AI);
17735 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
17736 : AtomicCmpXchgInst::getPointerOperandIndex();
17737 Value *Addr = AI->getOperand(i: PtrOpIdx);
17738
17739 /// TODO: Only need to check private, then emit flat-known-not private (no
17740 /// need for shared block, or cast to global).
17741 AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(Val: AI);
17742
17743 Align Alignment;
17744 if (RMW)
17745 Alignment = RMW->getAlign();
17746 else if (CX)
17747 Alignment = CX->getAlign();
17748 else
17749 llvm_unreachable("unhandled atomic operation");
17750
17751 // FullFlatEmulation is true if we need to issue the private, shared, and
17752 // global cases.
17753 //
17754 // If this is false, we are only dealing with the flat-targeting-private case,
17755 // where we only insert a check for private and still use the flat instruction
17756 // for global and shared.
17757
17758 bool FullFlatEmulation =
17759 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
17760 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
17761 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
17762 RMW->getType()->isDoubleTy()));
17763
17764 // If the return value isn't used, do not introduce a false use in the phi.
17765 bool ReturnValueIsUsed = !AI->use_empty();
17766
17767 BasicBlock *BB = Builder.GetInsertBlock();
17768 Function *F = BB->getParent();
17769 BasicBlock *ExitBB =
17770 BB->splitBasicBlock(I: Builder.GetInsertPoint(), BBName: "atomicrmw.end");
17771 BasicBlock *SharedBB = nullptr;
17772
17773 BasicBlock *CheckPrivateBB = BB;
17774 if (FullFlatEmulation) {
17775 SharedBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.shared", Parent: F, InsertBefore: ExitBB);
17776 CheckPrivateBB =
17777 BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.check.private", Parent: F, InsertBefore: ExitBB);
17778 }
17779
17780 BasicBlock *PrivateBB =
17781 BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.private", Parent: F, InsertBefore: ExitBB);
17782 BasicBlock *GlobalBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.global", Parent: F, InsertBefore: ExitBB);
17783 BasicBlock *PhiBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.phi", Parent: F, InsertBefore: ExitBB);
17784
17785 std::prev(x: BB->end())->eraseFromParent();
17786 Builder.SetInsertPoint(BB);
17787
17788 Value *LoadedShared = nullptr;
17789 if (FullFlatEmulation) {
17790 CallInst *IsShared = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_is_shared,
17791 Args: {Addr}, FMFSource: nullptr, Name: "is.shared");
17792 Builder.CreateCondBr(Cond: IsShared, True: SharedBB, False: CheckPrivateBB);
17793 Builder.SetInsertPoint(SharedBB);
17794 Value *CastToLocal = Builder.CreateAddrSpaceCast(
17795 V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS));
17796
17797 Instruction *Clone = AI->clone();
17798 Clone->insertInto(ParentBB: SharedBB, It: SharedBB->end());
17799 Clone->getOperandUse(i: PtrOpIdx).set(CastToLocal);
17800 LoadedShared = Clone;
17801
17802 Builder.CreateBr(Dest: PhiBB);
17803 Builder.SetInsertPoint(CheckPrivateBB);
17804 }
17805
17806 CallInst *IsPrivate = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_is_private,
17807 Args: {Addr}, FMFSource: nullptr, Name: "is.private");
17808 Builder.CreateCondBr(Cond: IsPrivate, True: PrivateBB, False: GlobalBB);
17809
17810 Builder.SetInsertPoint(PrivateBB);
17811
17812 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
17813 V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::PRIVATE_ADDRESS));
17814
17815 Value *LoadedPrivate;
17816 if (RMW) {
17817 LoadedPrivate = Builder.CreateAlignedLoad(
17818 Ty: RMW->getType(), Ptr: CastToPrivate, Align: RMW->getAlign(), Name: "loaded.private");
17819
17820 Value *NewVal = buildAtomicRMWValue(Op: RMW->getOperation(), Builder,
17821 Loaded: LoadedPrivate, Val: RMW->getValOperand());
17822
17823 Builder.CreateAlignedStore(Val: NewVal, Ptr: CastToPrivate, Align: RMW->getAlign());
17824 } else {
17825 auto [ResultLoad, Equal] =
17826 buildCmpXchgValue(Builder, Ptr: CastToPrivate, Cmp: CX->getCompareOperand(),
17827 Val: CX->getNewValOperand(), Alignment: CX->getAlign());
17828
17829 Value *Insert = Builder.CreateInsertValue(Agg: PoisonValue::get(T: CX->getType()),
17830 Val: ResultLoad, Idxs: 0);
17831 LoadedPrivate = Builder.CreateInsertValue(Agg: Insert, Val: Equal, Idxs: 1);
17832 }
17833
17834 Builder.CreateBr(Dest: PhiBB);
17835
17836 Builder.SetInsertPoint(GlobalBB);
17837
17838 // Continue using a flat instruction if we only emitted the check for private.
17839 Instruction *LoadedGlobal = AI;
17840 if (FullFlatEmulation) {
17841 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
17842 V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
17843 AI->getOperandUse(i: PtrOpIdx).set(CastToGlobal);
17844 }
17845
17846 AI->removeFromParent();
17847 AI->insertInto(ParentBB: GlobalBB, It: GlobalBB->end());
17848
17849 // The new atomicrmw may go through another round of legalization later.
17850 if (!FullFlatEmulation) {
17851    // We already inserted the runtime check; make sure we do not try to
17852    // re-expand this.
17853 // TODO: Should union with any existing metadata.
17854 MDBuilder MDB(F->getContext());
17855 MDNode *RangeNotPrivate =
17856 MDB.createRange(Lo: APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
17857 Hi: APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
17858 LoadedGlobal->setMetadata(KindID: LLVMContext::MD_noalias_addrspace,
17859 Node: RangeNotPrivate);
17860 }
17861
17862 Builder.CreateBr(Dest: PhiBB);
17863
17864 Builder.SetInsertPoint(PhiBB);
17865
17866 if (ReturnValueIsUsed) {
17867 PHINode *Loaded = Builder.CreatePHI(Ty: AI->getType(), NumReservedValues: 3);
17868 AI->replaceAllUsesWith(V: Loaded);
17869 if (FullFlatEmulation)
17870 Loaded->addIncoming(V: LoadedShared, BB: SharedBB);
17871 Loaded->addIncoming(V: LoadedPrivate, BB: PrivateBB);
17872 Loaded->addIncoming(V: LoadedGlobal, BB: GlobalBB);
17873 Loaded->takeName(V: AI);
17874 }
17875
17876 Builder.CreateBr(Dest: ExitBB);
17877}
17878
17879void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
17880 AtomicRMWInst::BinOp Op = AI->getOperation();
17881
17882 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
17883 Op == AtomicRMWInst::Xor) {
17884 if (const auto *ConstVal = dyn_cast<Constant>(Val: AI->getValOperand());
17885 ConstVal && ConstVal->isNullValue()) {
17886 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
17887 AI->setOperation(AtomicRMWInst::Add);
17888
17889 // We may still need the private-alias-flat handling below.
17890
17891 // TODO: Skip this for cases where we cannot access remote memory.
17892 }
17893 }
17894
17895 // The non-flat expansions should only perform the de-canonicalization of
17896 // identity values.
17897 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
17898 return;
17899
17900 emitExpandAtomicAddrSpacePredicate(AI);
17901}
17902
17903void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
17904 emitExpandAtomicAddrSpacePredicate(AI: CI);
17905}
17906
17907LoadInst *
17908SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
17909 IRBuilder<> Builder(AI);
17910 auto Order = AI->getOrdering();
17911
17912  // The optimization removes the store aspect of the atomicrmw. Therefore, the
17913  // cache must be flushed if the atomic ordering has release semantics. This
17914  // does not strictly require a fence; a release fence just happens to perform
17915  // that flush. Avoid replacing an atomicrmw that has release semantics.
17916 if (isReleaseOrStronger(AO: Order))
17917 return nullptr;
17918
17919 LoadInst *LI = Builder.CreateAlignedLoad(
17920 Ty: AI->getType(), Ptr: AI->getPointerOperand(), Align: AI->getAlign());
17921 LI->setAtomic(Ordering: Order, SSID: AI->getSyncScopeID());
17922 LI->copyMetadata(SrcInst: *AI);
17923 LI->takeName(V: AI);
17924 AI->replaceAllUsesWith(V: LI);
17925 AI->eraseFromParent();
17926 return LI;
17927}
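// Illustrative example: an idempotent RMW such as
//   %old = atomicrmw or ptr addrspace(1) %p, i32 0 syncscope("agent") acquire
// is rewritten by the function above into
//   %old = load atomic i32, ptr addrspace(1) %p syncscope("agent") acquire,
//          align 4
// whereas a release or acq_rel ordering returns nullptr and the atomicrmw is
// kept.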
17928