1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUMachineFunction.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUSelectionDAGInfo.h"
21#include "SIMachineFunctionInfo.h"
22#include "llvm/CodeGen/Analysis.h"
23#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
24#include "llvm/CodeGen/MachineFrameInfo.h"
25#include "llvm/IR/DiagnosticInfo.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27#include "llvm/Support/CommandLine.h"
28#include "llvm/Support/KnownBits.h"
29#include "llvm/Target/TargetMachine.h"
30
31using namespace llvm;
32
33#include "AMDGPUGenCallingConv.inc"
34
35static cl::opt<bool> AMDGPUBypassSlowDiv(
36 "amdgpu-bypass-slow-div",
37 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
38 cl::init(Val: true));
39
40// Find a larger type to do a load / store of a vector with.
41EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
42 unsigned StoreSize = VT.getStoreSizeInBits();
43 if (StoreSize <= 32)
44 return EVT::getIntegerVT(Context&: Ctx, BitWidth: StoreSize);
45
46 if (StoreSize % 32 == 0)
47 return EVT::getVectorVT(Context&: Ctx, VT: MVT::i32, NumElements: StoreSize / 32);
48
49 return VT;
50}
51
52unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
53 return DAG.computeKnownBits(Op).countMaxActiveBits();
54}
55
unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // Minimum number of bits needed to represent Op as a sign-extended value.
  // E.g. for Op to be usable as a signed 24-bit value, bit 23 must act as a
  // sign bit, i.e. there are at most 24 significant bits.
  return DAG.ComputeMaxSignificantBits(Op);
}
61
62AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
63 const TargetSubtargetInfo &STI,
64 const AMDGPUSubtarget &AMDGPUSTI)
65 : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
66 // Always lower memset, memcpy, and memmove intrinsics to load/store
67 // instructions, rather then generating calls to memset, mempcy or memmove.
68 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
69 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
70 MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;
71
72 // Enable ganging up loads and stores in the memcpy DAG lowering.
73 MaxGluedStoresPerMemcpy = 16;
74
75 // Lower floating point store/load to integer store/load to reduce the number
76 // of patterns in tablegen.
77 setOperationAction(Op: ISD::LOAD, VT: MVT::f32, Action: Promote);
78 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f32, DestVT: MVT::i32);
79
80 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f32, Action: Promote);
81 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
82
83 setOperationAction(Op: ISD::LOAD, VT: MVT::v3f32, Action: Promote);
84 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
85
86 setOperationAction(Op: ISD::LOAD, VT: MVT::v4f32, Action: Promote);
87 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
88
89 setOperationAction(Op: ISD::LOAD, VT: MVT::v5f32, Action: Promote);
90 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
91
92 setOperationAction(Op: ISD::LOAD, VT: MVT::v6f32, Action: Promote);
93 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
94
95 setOperationAction(Op: ISD::LOAD, VT: MVT::v7f32, Action: Promote);
96 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
97
98 setOperationAction(Op: ISD::LOAD, VT: MVT::v8f32, Action: Promote);
99 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f32, DestVT: MVT::v8i32);
100
101 setOperationAction(Op: ISD::LOAD, VT: MVT::v9f32, Action: Promote);
102 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
103
104 setOperationAction(Op: ISD::LOAD, VT: MVT::v10f32, Action: Promote);
105 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
106
107 setOperationAction(Op: ISD::LOAD, VT: MVT::v11f32, Action: Promote);
108 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
109
110 setOperationAction(Op: ISD::LOAD, VT: MVT::v12f32, Action: Promote);
111 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
112
113 setOperationAction(Op: ISD::LOAD, VT: MVT::v16f32, Action: Promote);
114 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f32, DestVT: MVT::v16i32);
115
116 setOperationAction(Op: ISD::LOAD, VT: MVT::v32f32, Action: Promote);
117 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32f32, DestVT: MVT::v32i32);
118
119 setOperationAction(Op: ISD::LOAD, VT: MVT::i64, Action: Promote);
120 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::i64, DestVT: MVT::v2i32);
121
122 setOperationAction(Op: ISD::LOAD, VT: MVT::v2i64, Action: Promote);
123 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2i64, DestVT: MVT::v4i32);
124
125 setOperationAction(Op: ISD::LOAD, VT: MVT::f64, Action: Promote);
126 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f64, DestVT: MVT::v2i32);
127
128 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f64, Action: Promote);
129 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f64, DestVT: MVT::v4i32);
130
131 setOperationAction(Op: ISD::LOAD, VT: MVT::v3i64, Action: Promote);
132 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3i64, DestVT: MVT::v6i32);
133
134 setOperationAction(Op: ISD::LOAD, VT: MVT::v4i64, Action: Promote);
135 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4i64, DestVT: MVT::v8i32);
136
137 setOperationAction(Op: ISD::LOAD, VT: MVT::v3f64, Action: Promote);
138 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3f64, DestVT: MVT::v6i32);
139
140 setOperationAction(Op: ISD::LOAD, VT: MVT::v4f64, Action: Promote);
141 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f64, DestVT: MVT::v8i32);
142
143 setOperationAction(Op: ISD::LOAD, VT: MVT::v8i64, Action: Promote);
144 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8i64, DestVT: MVT::v16i32);
145
146 setOperationAction(Op: ISD::LOAD, VT: MVT::v8f64, Action: Promote);
147 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f64, DestVT: MVT::v16i32);
148
149 setOperationAction(Op: ISD::LOAD, VT: MVT::v16i64, Action: Promote);
150 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16i64, DestVT: MVT::v32i32);
151
152 setOperationAction(Op: ISD::LOAD, VT: MVT::v16f64, Action: Promote);
153 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f64, DestVT: MVT::v32i32);
154
155 setOperationAction(Op: ISD::LOAD, VT: MVT::i128, Action: Promote);
156 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::i128, DestVT: MVT::v4i32);
157
158 // TODO: Would be better to consume as directly legal
159 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f32, Action: Promote);
160 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f32, DestVT: MVT::i32);
161
162 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f64, Action: Promote);
163 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f64, DestVT: MVT::i64);
164
165 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f16, Action: Promote);
166 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f16, DestVT: MVT::i16);
167
168 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::bf16, Action: Promote);
169 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::bf16, DestVT: MVT::i16);
170
171 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f32, Action: Promote);
172 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f32, DestVT: MVT::i32);
173
174 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f64, Action: Promote);
175 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f64, DestVT: MVT::i64);
176
177 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f16, Action: Promote);
178 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f16, DestVT: MVT::i16);
179
180 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::bf16, Action: Promote);
181 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::bf16, DestVT: MVT::i16);
182
183 // There are no 64-bit extloads. These should be done as a 32-bit extload and
184 // an extension to 64-bit.
185 for (MVT VT : MVT::integer_valuetypes())
186 setLoadExtAction(ExtTypes: {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, ValVT: MVT::i64, MemVT: VT,
187 Action: Expand);
188
189 for (MVT VT : MVT::integer_valuetypes()) {
190 if (VT == MVT::i64)
191 continue;
192
193 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
194 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i1, Action: Promote);
195 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i8, Action: Legal);
196 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i16, Action: Legal);
197 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i32, Action: Expand);
198 }
199 }
200
201 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
202 for (auto MemVT :
203 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
204 setLoadExtAction(ExtTypes: {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, ValVT: VT, MemVT,
205 Action: Expand);
206
207 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f32, MemVT: MVT::f16, Action: Expand);
208 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f32, MemVT: MVT::bf16, Action: Expand);
209 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f32, MemVT: MVT::v2f16, Action: Expand);
210 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f32, MemVT: MVT::v2bf16, Action: Expand);
211 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f32, MemVT: MVT::v3f16, Action: Expand);
212 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f32, MemVT: MVT::v3bf16, Action: Expand);
213 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f32, MemVT: MVT::v4f16, Action: Expand);
214 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f32, MemVT: MVT::v4bf16, Action: Expand);
215 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f32, MemVT: MVT::v8f16, Action: Expand);
216 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f32, MemVT: MVT::v8bf16, Action: Expand);
217 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f32, MemVT: MVT::v16f16, Action: Expand);
218 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f32, MemVT: MVT::v16bf16, Action: Expand);
219 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v32f32, MemVT: MVT::v32f16, Action: Expand);
220 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v32f32, MemVT: MVT::v32bf16, Action: Expand);
221
222 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f32, Action: Expand);
223 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2f32, Action: Expand);
224 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3f32, Action: Expand);
225 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4f32, Action: Expand);
226 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8f32, Action: Expand);
227 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16f32, Action: Expand);
228
229 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f16, Action: Expand);
230 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::bf16, Action: Expand);
231 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2f16, Action: Expand);
232 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2bf16, Action: Expand);
233 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3f16, Action: Expand);
234 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3bf16, Action: Expand);
235 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4f16, Action: Expand);
236 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4bf16, Action: Expand);
237 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8f16, Action: Expand);
238 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8bf16, Action: Expand);
239 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16f16, Action: Expand);
240 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16bf16, Action: Expand);
241
242 setOperationAction(Op: ISD::STORE, VT: MVT::f32, Action: Promote);
243 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f32, DestVT: MVT::i32);
244
245 setOperationAction(Op: ISD::STORE, VT: MVT::v2f32, Action: Promote);
246 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
247
248 setOperationAction(Op: ISD::STORE, VT: MVT::v3f32, Action: Promote);
249 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
250
251 setOperationAction(Op: ISD::STORE, VT: MVT::v4f32, Action: Promote);
252 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
253
254 setOperationAction(Op: ISD::STORE, VT: MVT::v5f32, Action: Promote);
255 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
256
257 setOperationAction(Op: ISD::STORE, VT: MVT::v6f32, Action: Promote);
258 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
259
260 setOperationAction(Op: ISD::STORE, VT: MVT::v7f32, Action: Promote);
261 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
262
263 setOperationAction(Op: ISD::STORE, VT: MVT::v8f32, Action: Promote);
264 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f32, DestVT: MVT::v8i32);
265
266 setOperationAction(Op: ISD::STORE, VT: MVT::v9f32, Action: Promote);
267 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
268
269 setOperationAction(Op: ISD::STORE, VT: MVT::v10f32, Action: Promote);
270 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
271
272 setOperationAction(Op: ISD::STORE, VT: MVT::v11f32, Action: Promote);
273 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
274
275 setOperationAction(Op: ISD::STORE, VT: MVT::v12f32, Action: Promote);
276 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
277
278 setOperationAction(Op: ISD::STORE, VT: MVT::v16f32, Action: Promote);
279 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f32, DestVT: MVT::v16i32);
280
281 setOperationAction(Op: ISD::STORE, VT: MVT::v32f32, Action: Promote);
282 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32f32, DestVT: MVT::v32i32);
283
284 setOperationAction(Op: ISD::STORE, VT: MVT::i64, Action: Promote);
285 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::i64, DestVT: MVT::v2i32);
286
287 setOperationAction(Op: ISD::STORE, VT: MVT::v2i64, Action: Promote);
288 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2i64, DestVT: MVT::v4i32);
289
290 setOperationAction(Op: ISD::STORE, VT: MVT::f64, Action: Promote);
291 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f64, DestVT: MVT::v2i32);
292
293 setOperationAction(Op: ISD::STORE, VT: MVT::v2f64, Action: Promote);
294 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f64, DestVT: MVT::v4i32);
295
296 setOperationAction(Op: ISD::STORE, VT: MVT::v3i64, Action: Promote);
297 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3i64, DestVT: MVT::v6i32);
298
299 setOperationAction(Op: ISD::STORE, VT: MVT::v3f64, Action: Promote);
300 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3f64, DestVT: MVT::v6i32);
301
302 setOperationAction(Op: ISD::STORE, VT: MVT::v4i64, Action: Promote);
303 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i64, DestVT: MVT::v8i32);
304
305 setOperationAction(Op: ISD::STORE, VT: MVT::v4f64, Action: Promote);
306 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f64, DestVT: MVT::v8i32);
307
308 setOperationAction(Op: ISD::STORE, VT: MVT::v8i64, Action: Promote);
309 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8i64, DestVT: MVT::v16i32);
310
311 setOperationAction(Op: ISD::STORE, VT: MVT::v8f64, Action: Promote);
312 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f64, DestVT: MVT::v16i32);
313
314 setOperationAction(Op: ISD::STORE, VT: MVT::v16i64, Action: Promote);
315 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16i64, DestVT: MVT::v32i32);
316
317 setOperationAction(Op: ISD::STORE, VT: MVT::v16f64, Action: Promote);
318 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f64, DestVT: MVT::v32i32);
319
320 setOperationAction(Op: ISD::STORE, VT: MVT::i128, Action: Promote);
321 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::i128, DestVT: MVT::v4i32);
322
323 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i1, Action: Expand);
324 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i8, Action: Expand);
325 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i16, Action: Expand);
326 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i32, Action: Expand);
327
328 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i1, Action: Expand);
329 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i8, Action: Expand);
330 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i16, Action: Expand);
331 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i32, Action: Expand);
332
333 setTruncStoreAction(ValVT: MVT::f32, MemVT: MVT::bf16, Action: Expand);
334 setTruncStoreAction(ValVT: MVT::f32, MemVT: MVT::f16, Action: Expand);
335 setTruncStoreAction(ValVT: MVT::v2f32, MemVT: MVT::v2bf16, Action: Expand);
336 setTruncStoreAction(ValVT: MVT::v2f32, MemVT: MVT::v2f16, Action: Expand);
337 setTruncStoreAction(ValVT: MVT::v3f32, MemVT: MVT::v3bf16, Action: Expand);
338 setTruncStoreAction(ValVT: MVT::v3f32, MemVT: MVT::v3f16, Action: Expand);
339 setTruncStoreAction(ValVT: MVT::v4f32, MemVT: MVT::v4bf16, Action: Expand);
340 setTruncStoreAction(ValVT: MVT::v4f32, MemVT: MVT::v4f16, Action: Expand);
341 setTruncStoreAction(ValVT: MVT::v6f32, MemVT: MVT::v6f16, Action: Expand);
342 setTruncStoreAction(ValVT: MVT::v8f32, MemVT: MVT::v8bf16, Action: Expand);
343 setTruncStoreAction(ValVT: MVT::v8f32, MemVT: MVT::v8f16, Action: Expand);
344 setTruncStoreAction(ValVT: MVT::v16f32, MemVT: MVT::v16bf16, Action: Expand);
345 setTruncStoreAction(ValVT: MVT::v16f32, MemVT: MVT::v16f16, Action: Expand);
346 setTruncStoreAction(ValVT: MVT::v32f32, MemVT: MVT::v32bf16, Action: Expand);
347 setTruncStoreAction(ValVT: MVT::v32f32, MemVT: MVT::v32f16, Action: Expand);
348
349 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::bf16, Action: Expand);
350 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f16, Action: Expand);
351 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f32, Action: Expand);
352
353 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2f32, Action: Expand);
354 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2bf16, Action: Expand);
355 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2f16, Action: Expand);
356
357 setTruncStoreAction(ValVT: MVT::v3i32, MemVT: MVT::v3i8, Action: Expand);
358
359 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i32, Action: Expand);
360 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i16, Action: Expand);
361 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i8, Action: Expand);
362 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i1, Action: Expand);
363 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3f32, Action: Expand);
364 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3bf16, Action: Expand);
365 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3f16, Action: Expand);
366
367 setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i32, Action: Expand);
368 setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i16, Action: Expand);
369 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4f32, Action: Expand);
370 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4bf16, Action: Expand);
371 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4f16, Action: Expand);
372
373 setTruncStoreAction(ValVT: MVT::v5i32, MemVT: MVT::v5i1, Action: Expand);
374 setTruncStoreAction(ValVT: MVT::v5i32, MemVT: MVT::v5i8, Action: Expand);
375 setTruncStoreAction(ValVT: MVT::v5i32, MemVT: MVT::v5i16, Action: Expand);
376
377 setTruncStoreAction(ValVT: MVT::v6i32, MemVT: MVT::v6i1, Action: Expand);
378 setTruncStoreAction(ValVT: MVT::v6i32, MemVT: MVT::v6i8, Action: Expand);
379 setTruncStoreAction(ValVT: MVT::v6i32, MemVT: MVT::v6i16, Action: Expand);
380
381 setTruncStoreAction(ValVT: MVT::v7i32, MemVT: MVT::v7i1, Action: Expand);
382 setTruncStoreAction(ValVT: MVT::v7i32, MemVT: MVT::v7i8, Action: Expand);
383 setTruncStoreAction(ValVT: MVT::v7i32, MemVT: MVT::v7i16, Action: Expand);
384
385 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8f32, Action: Expand);
386 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8bf16, Action: Expand);
387 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8f16, Action: Expand);
388
389 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16f32, Action: Expand);
390 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16bf16, Action: Expand);
391 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16f16, Action: Expand);
392 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i16, Action: Expand);
393 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i8, Action: Expand);
394 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i8, Action: Expand);
395 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i1, Action: Expand);
396
397 setOperationAction(Ops: ISD::Constant, VTs: {MVT::i32, MVT::i64}, Action: Legal);
398 setOperationAction(Ops: ISD::ConstantFP, VTs: {MVT::f32, MVT::f64}, Action: Legal);
399
400 setOperationAction(Ops: {ISD::BR_JT, ISD::BRIND}, VT: MVT::Other, Action: Expand);
401
402 // For R600, this is totally unsupported, just custom lower to produce an
403 // error.
404 setOperationAction(Op: ISD::DYNAMIC_STACKALLOC, VT: MVT::i32, Action: Custom);
405
406 // Library functions. These default to Expand, but we have instructions
407 // for them.
408 setOperationAction(Ops: {ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
409 ISD::FROUNDEVEN, ISD::FTRUNC},
410 VTs: {MVT::f16, MVT::f32}, Action: Legal);
411 setOperationAction(Ops: {ISD::FMINNUM, ISD::FMAXNUM}, VT: MVT::f32, Action: Legal);
412
413 setOperationAction(Op: ISD::FLOG2, VT: MVT::f32, Action: Custom);
414 setOperationAction(Ops: ISD::FROUND, VTs: {MVT::f32, MVT::f64}, Action: Custom);
415 setOperationAction(Ops: {ISD::LROUND, ISD::LLROUND},
416 VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Expand);
417
418 setOperationAction(
419 Ops: {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, VT: MVT::f32,
420 Action: Custom);
421
422 setOperationAction(Ops: ISD::FNEARBYINT, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Custom);
423
424 setOperationAction(Ops: ISD::FRINT, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Custom);
425
426 setOperationAction(Ops: {ISD::LRINT, ISD::LLRINT}, VTs: {MVT::f16, MVT::f32, MVT::f64},
427 Action: Expand);
428
429 setOperationAction(Ops: ISD::FREM, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Expand);
430 setOperationAction(Ops: ISD::IS_FPCLASS, VTs: {MVT::f32, MVT::f64}, Action: Legal);
431 setOperationAction(Ops: {ISD::FLOG2, ISD::FEXP2}, VT: MVT::f16, Action: Custom);
432
433 setOperationAction(Ops: {ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, VT: MVT::f16,
434 Action: Custom);
435
436 setOperationAction(Ops: ISD::FCANONICALIZE, VTs: {MVT::f32, MVT::f64}, Action: Legal);
437
438 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
439 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
440 // default unless marked custom/legal.
441 setOperationAction(Ops: ISD::IS_FPCLASS,
442 VTs: {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
443 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
444 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
445 MVT::v16f64},
446 Action: Custom);
447
448 // Expand to fneg + fadd.
449 setOperationAction(Op: ISD::FSUB, VT: MVT::f64, Action: Expand);
450
451 setOperationAction(Ops: ISD::CONCAT_VECTORS,
452 VTs: {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
453 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
454 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
455 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
456 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
457 Action: Custom);
458
459 setOperationAction(
460 Ops: ISD::EXTRACT_SUBVECTOR,
461 VTs: {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
462 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
463 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
464 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
465 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
466 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
467 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
468 Action: Custom);
469
470 setOperationAction(Ops: {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP}, VT: MVT::f64,
471 Action: Expand);
472 setOperationAction(Ops: ISD::FP_TO_FP16, VTs: {MVT::f64, MVT::f32}, Action: Custom);
473
474 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
475 for (MVT VT : ScalarIntVTs) {
476 // These should use [SU]DIVREM, so set them to expand
477 setOperationAction(Ops: {ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
478 Action: Expand);
479
480 // GPU does not have divrem function for signed or unsigned.
481 setOperationAction(Ops: {ISD::SDIVREM, ISD::UDIVREM}, VT, Action: Custom);
482
483 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
484 setOperationAction(Ops: {ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Action: Expand);
485
486 setOperationAction(Ops: {ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Action: Expand);
487
488 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
489 setOperationAction(Ops: {ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Action: Legal);
490 }
491
492 // The hardware supports 32-bit FSHR, but not FSHL.
493 setOperationAction(Op: ISD::FSHR, VT: MVT::i32, Action: Legal);
494
495 setOperationAction(Ops: {ISD::ROTL, ISD::ROTR}, VTs: {MVT::i32, MVT::i64}, Action: Expand);
496
497 setOperationAction(Ops: {ISD::MULHU, ISD::MULHS}, VT: MVT::i16, Action: Expand);
498
499 setOperationAction(Ops: {ISD::MUL, ISD::MULHU, ISD::MULHS}, VT: MVT::i64, Action: Expand);
500 setOperationAction(Ops: {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT,
501 ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
502 ISD::FP_TO_UINT_SAT},
503 VT: MVT::i64, Action: Custom);
504 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::i64, Action: Expand);
505
506 setOperationAction(Ops: {ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, VT: MVT::i32,
507 Action: Legal);
508
509 setOperationAction(
510 Ops: {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
511 VT: MVT::i64, Action: Custom);
512
513 for (auto VT : {MVT::i8, MVT::i16})
514 setOperationAction(Ops: {ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Action: Custom);
515
516 static const MVT::SimpleValueType VectorIntTypes[] = {
517 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
518 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
519
520 for (MVT VT : VectorIntTypes) {
521 // Expand the following operations for the current type by default.
522 // clang-format off
523 setOperationAction(Ops: {ISD::ADD, ISD::AND,
524 ISD::FP_TO_SINT, ISD::FP_TO_UINT,
525 ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT,
526 ISD::MUL, ISD::MULHU,
527 ISD::MULHS, ISD::OR,
528 ISD::SHL, ISD::SRA,
529 ISD::SRL, ISD::ROTL,
530 ISD::ROTR, ISD::SUB,
531 ISD::SINT_TO_FP, ISD::UINT_TO_FP,
532 ISD::SDIV, ISD::UDIV,
533 ISD::SREM, ISD::UREM,
534 ISD::SMUL_LOHI, ISD::UMUL_LOHI,
535 ISD::SDIVREM, ISD::UDIVREM,
536 ISD::SELECT, ISD::VSELECT,
537 ISD::SELECT_CC, ISD::XOR,
538 ISD::BSWAP, ISD::CTPOP,
539 ISD::CTTZ, ISD::CTLZ,
540 ISD::VECTOR_SHUFFLE, ISD::SETCC,
541 ISD::ADDRSPACECAST},
542 VT, Action: Expand);
543 // clang-format on
544 }
545
546 static const MVT::SimpleValueType FloatVectorTypes[] = {
547 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
548 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
549
550 for (MVT VT : FloatVectorTypes) {
551 setOperationAction(
552 Ops: {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
553 ISD::FADD, ISD::FCEIL, ISD::FCOS,
554 ISD::FDIV, ISD::FEXP2, ISD::FEXP,
555 ISD::FEXP10, ISD::FLOG2, ISD::FREM,
556 ISD::FLOG, ISD::FLOG10, ISD::FPOW,
557 ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
558 ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
559 ISD::FSQRT, ISD::FSIN, ISD::FSUB,
560 ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
561 ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC,
562 ISD::FCANONICALIZE, ISD::FROUNDEVEN},
563 VT, Action: Expand);
564 }
565
566 // This causes using an unrolled select operation rather than expansion with
567 // bit operations. This is in general better, but the alternative using BFI
568 // instructions may be better if the select sources are SGPRs.
569 setOperationAction(Op: ISD::SELECT, VT: MVT::v2f32, Action: Promote);
570 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
571
572 setOperationAction(Op: ISD::SELECT, VT: MVT::v3f32, Action: Promote);
573 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
574
575 setOperationAction(Op: ISD::SELECT, VT: MVT::v4f32, Action: Promote);
576 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
577
578 setOperationAction(Op: ISD::SELECT, VT: MVT::v5f32, Action: Promote);
579 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
580
581 setOperationAction(Op: ISD::SELECT, VT: MVT::v6f32, Action: Promote);
582 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
583
584 setOperationAction(Op: ISD::SELECT, VT: MVT::v7f32, Action: Promote);
585 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
586
587 setOperationAction(Op: ISD::SELECT, VT: MVT::v9f32, Action: Promote);
588 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
589
590 setOperationAction(Op: ISD::SELECT, VT: MVT::v10f32, Action: Promote);
591 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
592
593 setOperationAction(Op: ISD::SELECT, VT: MVT::v11f32, Action: Promote);
594 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
595
596 setOperationAction(Op: ISD::SELECT, VT: MVT::v12f32, Action: Promote);
597 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
598
599 setSchedulingPreference(Sched::RegPressure);
600 setJumpIsExpensive(true);
601
602 setMinCmpXchgSizeInBits(32);
603 setSupportsUnalignedAtomics(false);
604
605 PredictableSelectIsExpensive = false;
606
607 // We want to find all load dependencies for long chains of stores to enable
608 // merging into very wide vectors. The problem is with vectors with > 4
609 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
610 // vectors are a legal type, even though we have to split the loads
611 // usually. When we can more precisely specify load legality per address
612 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
613 // smarter so that they can figure out what to do in 2 iterations without all
614 // N > 4 stores on the same chain.
615 GatherAllAliasesMaxDepth = 16;
616
617 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
618 // about these during lowering.
619 MaxStoresPerMemcpy = 0xffffffff;
620 MaxStoresPerMemmove = 0xffffffff;
621 MaxStoresPerMemset = 0xffffffff;
622
623 // The expansion for 64-bit division is enormous.
624 if (AMDGPUBypassSlowDiv)
625 addBypassSlowDiv(SlowBitWidth: 64, FastBitWidth: 32);
626
627 setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
628 ISD::SRA, ISD::SRL,
629 ISD::TRUNCATE, ISD::MUL,
630 ISD::SMUL_LOHI, ISD::UMUL_LOHI,
631 ISD::MULHU, ISD::MULHS,
632 ISD::SELECT, ISD::SELECT_CC,
633 ISD::STORE, ISD::FADD,
634 ISD::FSUB, ISD::FNEG,
635 ISD::FABS, ISD::AssertZext,
636 ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
637
638 setMaxAtomicSizeInBitsSupported(64);
639 setMaxDivRemBitWidthSupported(64);
640 setMaxLargeFPConvertBitWidthSupported(64);
641}
642
643bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
644 const auto Flags = Op.getNode()->getFlags();
645 if (Flags.hasNoSignedZeros())
646 return true;
647
648 return false;
649}
650
651//===----------------------------------------------------------------------===//
652// Target Information
653//===----------------------------------------------------------------------===//
654
LLVM_READNONE
/// Return true if an fneg of a value produced by opcode \p Opc can be folded
/// into the producing instruction for free (as a source modifier on the
/// result, or by negating the operation itself).
static bool fnegFoldsIntoOpcode(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
  case ISD::SELECT:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
  // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  case ISD::BITCAST:
    // Bitcast has its own operand-dependent handling in fnegFoldsIntoOp.
    llvm_unreachable("bitcast is special cased");
  default:
    return false;
  }
}
694
695static bool fnegFoldsIntoOp(const SDNode *N) {
696 unsigned Opc = N->getOpcode();
697 if (Opc == ISD::BITCAST) {
698 // TODO: Is there a benefit to checking the conditions performFNegCombine
699 // does? We don't for the other cases.
700 SDValue BCSrc = N->getOperand(Num: 0);
701 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
702 return BCSrc.getNumOperands() == 2 &&
703 BCSrc.getOperand(i: 1).getValueSizeInBits() == 32;
704 }
705
706 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
707 }
708
709 return fnegFoldsIntoOpcode(Opc);
710}
711
712/// \p returns true if the operation will definitely need to use a 64-bit
713/// encoding, and thus will use a VOP3 encoding regardless of the source
714/// modifiers.
715LLVM_READONLY
716static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
717 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
718 VT == MVT::f64;
719}
720
721/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
722/// type for ISD::SELECT.
723LLVM_READONLY
724static bool selectSupportsSourceMods(const SDNode *N) {
725 // TODO: Only applies if select will be vector
726 return N->getValueType(ResNo: 0) == MVT::f32;
727}
728
// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READONLY
static bool hasSourceMods(const SDNode *N) {
  // Memory operations never take source modifiers.
  if (isa<MemSDNode>(Val: N))
    return false;

  switch (N->getOpcode()) {
  // Opcodes with no modifier-capable encoding.
  case ISD::CopyToReg:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
  case ISD::INTRINSIC_W_CHAIN:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (N->getConstantOperandVal(Num: 0)) {
    // The interpolation intrinsics are treated as not accepting modifiers.
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  case ISD::SELECT:
    // Select support depends on the result type; see helper.
    return selectSupportsSourceMods(N);
  default:
    return true;
  }
}
768
769bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
770 unsigned CostThreshold) {
771 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
772 // it is truly free to use a source modifier in all cases. If there are
773 // multiple users but for each one will necessitate using VOP3, there will be
774 // a code size increase. Try to avoid increasing code size unless we know it
775 // will save on the instruction count.
776 unsigned NumMayIncreaseSize = 0;
777 MVT VT = N->getValueType(ResNo: 0).getScalarType().getSimpleVT();
778
779 assert(!N->use_empty());
780
781 // XXX - Should this limit number of uses to check?
782 for (const SDNode *U : N->users()) {
783 if (!hasSourceMods(N: U))
784 return false;
785
786 if (!opMustUseVOP3Encoding(N: U, VT)) {
787 if (++NumMayIncreaseSize > CostThreshold)
788 return false;
789 }
790 }
791
792 return true;
793}
794
795EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
796 ISD::NodeType ExtendKind) const {
797 assert(!VT.isVector() && "only scalar expected");
798
799 // Round to the next multiple of 32-bits.
800 unsigned Size = VT.getSizeInBits();
801 if (Size <= 32)
802 return MVT::i32;
803 return EVT::getIntegerVT(Context, BitWidth: 32 * ((Size + 31) / 32));
804}
805
// Vector element indices are always 32 bits wide on AMDGPU, independent of
// the pointer width in the data layout.
unsigned AMDGPUTargetLowering::getVectorIdxWidth(const DataLayout &) const {
  return 32;
}
809
// All select flavors are supported; no kind needs to be expanded.
bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}
813
// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  // Legality is decided purely by the scalar element type, ignoring the
  // immediate's actual value.
  return isTypeLegal(VT: VT.getScalarType());
}
820
821// We don't want to shrink f64 / f32 constants.
822bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
823 EVT ScalarVT = VT.getScalarType();
824 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
825}
826
/// Decide whether it is profitable to narrow the load \p N to \p NewVT.
bool AMDGPUTargetLowering::shouldReduceLoadWidth(
    SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
    std::optional<unsigned> ByteOffset) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(Load: N, ExtTy, NewVT, ByteOffset))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(ResNo: 0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(Val: N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  // TODO: Update this for GFX12 which does have scalar sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(Val: N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
        MN->isInvariant())) &&
      AMDGPU::isUniformMMO(MMO: MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in continuing
  // to reduce the width.
  return (OldSize < 32);
}
866
/// Return true if loading as \p CastTy and bitcasting back beats loading
/// \p LoadTy directly.
bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
                                                   const SelectionDAG &DAG,
                                                   const MachineMemOperand &MMO) const {

  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  // i32-element loads are already in the preferred form.
  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  // Don't cast down to sub-dword elements.
  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  // Only beneficial if the cast type is a fast, legal memory access here.
  unsigned Fast = 0;
  return allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
                                        VT: CastTy, MMO, Fast: &Fast) &&
         Fast;
}
887
// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
// profitable with the expansion for 64-bit since it's generally good to
// speculate things.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  // Unconditionally cheap for every type.
  return true;
}
894
// ctlz is as cheap to speculate as cttz; see the rationale above.
bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return true;
}
898
899bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
900 switch (N->getOpcode()) {
901 case ISD::EntryToken:
902 case ISD::TokenFactor:
903 return true;
904 case ISD::INTRINSIC_WO_CHAIN: {
905 unsigned IntrID = N->getConstantOperandVal(Num: 0);
906 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
907 }
908 case ISD::INTRINSIC_W_CHAIN: {
909 unsigned IntrID = N->getConstantOperandVal(Num: 1);
910 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
911 }
912 case ISD::LOAD:
913 if (cast<LoadSDNode>(Val: N)->getMemOperand()->getAddrSpace() ==
914 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
915 return true;
916 return false;
917 case AMDGPUISD::SETCC: // ballot-style instruction
918 return true;
919 }
920 return false;
921}
922
/// Try to produce a cheaply-negated form of \p Op; falls back to the generic
/// TargetLowering implementation for opcodes without special handling.
SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {

  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(N: Op.getNode()))
      return SDValue();
    break;
  }
  case AMDGPUISD::RCP: {
    SDValue Src = Op.getOperand(i: 0);
    EVT VT = Op.getValueType();
    SDLoc SL(Op);

    // Push the negation into the RCP's operand: rebuild rcp(neg(src)).
    SDValue NegSrc = getNegatedExpression(Op: Src, DAG, LegalOperations,
                                          ForCodeSize, Cost, Depth: Depth + 1);
    if (NegSrc)
      return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: NegSrc, Flags: Op->getFlags());
    return SDValue();
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOps: LegalOperations,
                                              OptForSize: ForCodeSize, Cost, Depth);
}
953
954//===---------------------------------------------------------------------===//
955// Target Properties
956//===---------------------------------------------------------------------===//
957
bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  // Report this based on the end legalized type.
  // NOTE: unlike isFNegFree below, this deliberately checks the full type
  // (not the scalar type), so packed vectors like v2f16 report false.
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
}
965
966bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
967 assert(VT.isFloatingPoint());
968 // Report this based on the end legalized type.
969 VT = VT.getScalarType();
970 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
971}
972
// Storing a vector constant is always considered cheap, regardless of the
// constant's value, element count, or address space.
bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
                                                          unsigned NumElem,
                                                          unsigned AS) const {
  return true;
}
978
bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any vector
  // operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}
990
991bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
992 // Truncate is just accessing a subregister.
993
994 unsigned SrcSize = Source.getSizeInBits();
995 unsigned DestSize = Dest.getSizeInBits();
996
997 return DestSize < SrcSize && DestSize % 32 == 0 ;
998}
999
1000bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
1001 // Truncate is just accessing a subregister.
1002
1003 unsigned SrcSize = Source->getScalarSizeInBits();
1004 unsigned DestSize = Dest->getScalarSizeInBits();
1005
1006 if (DestSize== 16 && Subtarget->has16BitInsts())
1007 return SrcSize >= 32;
1008
1009 return DestSize < SrcSize && DestSize % 32 == 0;
1010}
1011
1012bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
1013 unsigned SrcSize = Src->getScalarSizeInBits();
1014 unsigned DestSize = Dest->getScalarSizeInBits();
1015
1016 if (SrcSize == 16 && Subtarget->has16BitInsts())
1017 return DestSize >= 32;
1018
1019 return SrcSize == 32 && DestSize == 64;
1020}
1021
1022bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
1023 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1024 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1025 // this will enable reducing 64-bit operations the 32-bit, which is always
1026 // good.
1027
1028 if (Src == MVT::i16)
1029 return Dest == MVT::i32 ||Dest == MVT::i64 ;
1030
1031 return Src == MVT::i32 && Dest == MVT::i64;
1032}
1033
/// Decide whether narrowing node \p N from \p SrcVT to \p DestVT is likely to
/// be profitable.
bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
                                                 EVT DestVT) const {
  switch (N->getOpcode()) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::MUL:
  case ISD::SETCC:
  case ISD::SELECT:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:
    if (isTypeLegal(VT: MVT::i16) &&
        (!DestVT.isVector() ||
         !isOperationLegal(Op: ISD::ADD, VT: MVT::v2i16))) { // Check if VOP3P
      // Don't narrow back down to i16 if promoted to i32 already.
      if (!N->isDivergent() && DestVT.isInteger() &&
          DestVT.getScalarSizeInBits() > 1 &&
          DestVT.getScalarSizeInBits() <= 16 &&
          SrcVT.getScalarSizeInBits() > 16) {
        return false;
      }
    }
    return true;
  default:
    break;
  }

  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  if (isa<LoadSDNode>(Val: N))
    return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;

  return true;
}
1079
/// Decide whether to commute a shift with its (binary-op) operand, taking care
/// not to break BFE and shifted-zextload-merge patterns.
bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
    const SDNode* N, CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");

  // Don't duplicate work for an operand with other users.
  SDValue ShiftLHS = N->getOperand(Num: 0);
  if (!ShiftLHS->hasOneUse())
    return false;

  if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
      !ShiftLHS.getOperand(i: 0)->hasOneUse())
    return false;

  // Always commute pre-type legalization and right shifts.
  // We're looking for shl(or(x,y),z) patterns.
  if (Level < CombineLevel::AfterLegalizeTypes ||
      N->getOpcode() != ISD::SHL || N->getOperand(Num: 0).getOpcode() != ISD::OR)
    return true;

  // If only user is a i32 right-shift, then don't destroy a BFE pattern.
  if (N->getValueType(ResNo: 0) == MVT::i32 && N->hasOneUse() &&
      (N->user_begin()->getOpcode() == ISD::SRA ||
       N->user_begin()->getOpcode() == ISD::SRL))
    return false;

  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
    if (LHS.getOpcode() != ISD::SHL)
      return false;
    auto *RHSLd = dyn_cast<LoadSDNode>(Val&: RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(Val: LHS.getOperand(i: 0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
    // Match zextload shifted fully above a second zextload's bits.
    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
           RHSLd->getExtensionType() == ISD::ZEXTLOAD;
  };
  SDValue LHS = N->getOperand(Num: 0).getOperand(i: 0);
  SDValue RHS = N->getOperand(Num: 0).getOperand(i: 1);
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
}
1121
1122//===---------------------------------------------------------------------===//
1123// TargetLowering Callbacks
1124//===---------------------------------------------------------------------===//
1125
/// Select the calling-convention assignment function used for the incoming
/// arguments of a call with convention \p CC. Kernel conventions are rejected
/// because kernels cannot be called.
CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                   bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CC_AMDGPU_CS_CHAIN;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
  case CallingConv::AMDGPU_Gfx_WholeWave:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    reportFatalUsageError(reason: "unsupported calling convention for call");
  }
}
1153
/// Select the calling-convention assignment function used for return values
/// under convention \p CC.
CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                     bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
  case CallingConv::AMDGPU_Gfx_WholeWave:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    reportFatalUsageError(reason: "unsupported calling convention");
  }
}
1181
1182/// The SelectionDAGBuilder will automatically promote function arguments
1183/// with illegal types. However, this does not work for the AMDGPU targets
1184/// since the function arguments are stored in memory as these illegal types.
1185/// In order to handle this properly we need to get the original types sizes
/// from the LLVM IR Function and fix up the ISD::InputArg values before
1187/// passing them to AnalyzeFormalArguments()
1188
1189/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1190/// input values across multiple registers. Each item in the Ins array
1191/// represents a single value that will be stored in registers. Ins[x].VT is
1192/// the value type of the value that will be stored in the register, so
1193/// whatever SDNode we lower the argument to needs to be this type.
1194///
1195/// In order to correctly lower the arguments we need to know the size of each
1196/// argument. Since Ins[x].VT gives us the size of the register that will
1197/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1198/// for the original function argument so that we can deduce the correct memory
1199/// type to use for Ins[x]. In most cases the correct memory type will be
1200/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1201/// we have a kernel argument of type v8i8, this argument will be split into
1202/// 8 parts and each part will be represented by its own item in the Ins array.
1203/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1204/// the argument before it was split. From this, we deduce that the memory type
1205/// for each individual part is i8. We pass the memory type as LocVT to the
1206/// calling convention analysis function and the register type (Ins[x].VT) as
1207/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State,
    const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getContext();
  const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset();
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getDataLayout();

  // Running index into Ins; one CCValAssign is added per register part.
  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    // byref arguments are laid out using the pointee type.
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        Alignment: IsByRef ? Arg.getParamAlign() : std::nullopt, Ty: MemArgTy);
    MaxAlign = std::max(a: Alignment, b: MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(Ty: MemArgTy);

    // Align the running offset for this argument, then advance it past the
    // argument's allocation.
    uint64_t ArgOffset = alignTo(Size: ExplicitArgOffset, A: Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(Size: ExplicitArgOffset, A: Alignment) + AllocSize;

    // We're basically throwing away everything passed into us and starting over
    // to get accurate in-memory offsets. The "PartOffset" is completely useless
    // to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(TLI: *this, DL, Ty: BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
                    FixedOffsets: &Offsets, StartingOffset: ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Context&: Ctx, CC, VT: ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Context&: Ctx, CC, VT: ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        // The remaining case: deduce the per-register memory type by evenly
        // dividing the argument's store size among the registers.
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(Context&: State.getContext(), BitWidth: MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different elements size.
          EVT ScalarVT = EVT::getIntegerVT(Context&: State.getContext(),
                                           BitWidth: MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(Context&: State.getContext(), VT: ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 argument.
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        MemVT = MemVT.getPow2VectorType(Context&: State.getContext());
      } else if (!MemVT.isSimple() && !MemVT.isVector()) {
        MemVT = MemVT.getRoundIntegerType(Context&: State.getContext());
      }

      // Emit one location per register part, stepping through memory by the
      // part's store size.
      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(V: CCValAssign::getCustomMem(ValNo: InIndex++, ValVT: RegisterVT,
                                                 Offset: BasePartOffset + PartOffset,
                                                 LocVT: MemVT.getSimpleVT(),
                                                 HTP: CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}
1321
/// Lower a return by emitting an end-of-program node; outgoing values are
/// ignored at this level.
SDValue AMDGPUTargetLowering::LowerReturn(
  SDValue Chain, CallingConv::ID CallConv,
  bool isVarArg,
  const SmallVectorImpl<ISD::OutputArg> &Outs,
  const SmallVectorImpl<SDValue> &OutVals,
  const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  // "wave terminate should not have return values");
  return DAG.getNode(Opcode: AMDGPUISD::ENDPGM, DL, VT: MVT::Other, Operand: Chain);
}
1333
1334//===---------------------------------------------------------------------===//
1335// Target specific lowering
1336//===---------------------------------------------------------------------===//
1337
/// Selects the correct CCAssignFn for a given CallingConvention value.
// Thin forwarder to the shared table in AMDGPUCallLowering.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}
1343
/// Selects the correct CCAssignFn for return values; forwards to the shared
/// table in AMDGPUCallLowering.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}
1348
/// Build a TokenFactor that orders \p Chain after any loads from the stack
/// slot \p ClobberedFI, so an outgoing-argument store cannot clobber an
/// incoming argument that is still being read.
SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ObjectIdx: ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ObjectIdx: ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Elt: Chain);

  // Add a chain value for each stack argument corresponding
  for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(Val: U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: L->getBasePtr())) {
        // Negative frame indices are fixed (incoming-argument) objects.
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(ObjectIdx: FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(ObjectIdx: FI->getIndex()) - 1;

          // Collect the load's chain if its byte range overlaps the
          // clobbered slot.
          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(Elt: SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ArgChains);
}
1382
/// Diagnose an unsupported call, then produce poison return values and a
/// trivial call sequence so lowering can continue.
SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  // Best-effort callee name for the diagnostic.
  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Val&: Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee))
    FuncName = G->getGlobal()->getName();

  DAG.getContext()->diagnose(
      DI: DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));

  // Supply poison for each expected return value so users stay well-formed.
  if (!CLI.IsTailCall) {
    for (ISD::InputArg &Arg : CLI.Ins)
      InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
  }

  // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
  if (getTargetMachine().getTargetTriple().getArch() == Triple::r600)
    return CLI.Chain;

  // Emit an empty call sequence to keep the chain structure valid.
  SDValue Chain = DAG.getCALLSEQ_START(Chain: CLI.Chain, InSize: 0, OutSize: 0, DL: CLI.DL);
  return DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, /*InGlue=*/Glue: SDValue(), DL: CLI.DL);
}
1413
/// Calls are unsupported at this level; diagnose and recover via
/// lowerUnhandledCall. Subclasses that support calls override this.
SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, Reason: "unsupported call to function ");
}
1418
/// Dynamic allocas are unsupported here: diagnose, then return a zero pointer
/// plus the incoming chain so lowering can proceed.
SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
      Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
  // Results: {pointer = 0, chain = original chain (operand 0)}.
  auto Ops = {DAG.getConstant(Val: 0, DL: SDLoc(), VT: Op.getValueType()), Op.getOperand(i: 0)};
  return DAG.getMergeValues(Ops, dl: SDLoc());
}
1428
/// Dispatch custom lowering for the operations this target marked Custom.
SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(OS&: errs(), G: &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM:
    return LowerSDIVREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUNDEVEN:
    return LowerFROUNDEVEN(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG2:
    return LowerFLOG2(Op, DAG);
  case ISD::FLOG:
  case ISD::FLOG10:
    return LowerFLOGCommon(Op, DAG);
  case ISD::FEXP:
  case ISD::FEXP10:
    return lowerFEXP(Op, DAG);
  case ISD::FEXP2:
    return lowerFEXP2(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::FP_TO_SINT_SAT:
  case ISD::FP_TO_UINT_SAT:
    return LowerFP_TO_INT_SAT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}
1479
/// Produce legalized replacement results for nodes with illegal result types;
/// leaving Results empty lets the default legalization proceed.
void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  case ISD::FLOG2:
    if (SDValue Lowered = LowerFLOG2(Op: SDValue(N, 0), DAG))
      Results.push_back(Elt: Lowered);
    return;
  case ISD::FLOG:
  case ISD::FLOG10:
    if (SDValue Lowered = LowerFLOGCommon(Op: SDValue(N, 0), DAG))
      Results.push_back(Elt: Lowered);
    return;
  case ISD::FEXP2:
    if (SDValue Lowered = lowerFEXP2(Op: SDValue(N, 0), DAG))
      Results.push_back(Elt: Lowered);
    return;
  case ISD::FEXP:
  case ISD::FEXP10:
    if (SDValue Lowered = lowerFEXP(Op: SDValue(N, 0), DAG))
      Results.push_back(Elt: Lowered);
    return;
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    if (auto Lowered = lowerCTLZResults(Op: SDValue(N, 0u), DAG))
      Results.push_back(Elt: Lowered);
    return;
  default:
    return;
  }
}
1519
/// Custom lowering for global addresses in the LDS (local/region) address
/// spaces. Absolute-addressed LDS globals (including named barriers) fold to
/// a constant address; other LDS globals get an offset allocated in this
/// function's LDS frame, which is only valid inside kernels. Returns an
/// empty SDValue for globals not handled here.
SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Val&: Op);
  const GlobalValue *GV = G->getGlobal();

  if (!MFI->isModuleEntryFunction()) {
    // NOTE(review): this cast assumes GV is always a GlobalVariable when the
    // current function is not a module entry point — confirm callers only
    // reach here for variables.
    auto IsNamedBarrier = AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV));
    if (std::optional<uint32_t> Address =
            AMDGPUMachineFunction::getLDSAbsoluteAddress(GV: *GV)) {
      if (IsNamedBarrier) {
        // Barrier count is derived from the variable's size in 16-byte units.
        unsigned BarCnt = cast<GlobalVariable>(Val: GV)->getGlobalSize(DL) / 16;
        MFI->recordNumNamedBarriers(GVAddr: Address.value(), BarCnt);
      }
      // The global has a known absolute address; fold it to a constant.
      return DAG.getConstant(Val: *Address, DL: SDLoc(Op), VT: Op.getValueType());
    } else if (IsNamedBarrier) {
      llvm_unreachable("named barrier should have an assigned address");
    }
  }

  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
      G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        GV->getName() != "llvm.amdgcn.module.lds" &&
        !AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV))) {
      SDLoc DL(Op);
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
          Fn, "local memory global used by non-kernel function",
          DL.getDebugLoc(), DS_Warning));

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      SDValue Trap = DAG.getNode(Opcode: ISD::TRAP, DL, VT: MVT::Other, Operand: DAG.getEntryNode());
      SDValue OutputChain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other,
                                        N1: Trap, N2: DAG.getRoot());
      DAG.setRoot(OutputChain);
      // The address itself is unusable past this point.
      return DAG.getPOISON(VT: Op.getValueType());
    }

    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with an non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    unsigned Offset = MFI->allocateLDSGlobal(DL, GV: *cast<GlobalVariable>(Val: GV));
    return DAG.getConstant(Val: Offset, DL: SDLoc(Op), VT: Op.getValueType());
  }
  // Not an LDS/region global: defer to the default lowering path.
  return SDValue();
}
1577
1578SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1579 SelectionDAG &DAG) const {
1580 SmallVector<SDValue, 8> Args;
1581 SDLoc SL(Op);
1582
1583 EVT VT = Op.getValueType();
1584 if (VT.getVectorElementType().getSizeInBits() < 32) {
1585 unsigned OpBitSize = Op.getOperand(i: 0).getValueType().getSizeInBits();
1586 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1587 unsigned NewNumElt = OpBitSize / 32;
1588 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1589 : EVT::getVectorVT(Context&: *DAG.getContext(),
1590 VT: MVT::i32, NumElements: NewNumElt);
1591 for (const SDUse &U : Op->ops()) {
1592 SDValue In = U.get();
1593 SDValue NewIn = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewEltVT, Operand: In);
1594 if (NewNumElt > 1)
1595 DAG.ExtractVectorElements(Op: NewIn, Args);
1596 else
1597 Args.push_back(Elt: NewIn);
1598 }
1599
1600 EVT NewVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
1601 NumElements: NewNumElt * Op.getNumOperands());
1602 SDValue BV = DAG.getBuildVector(VT: NewVT, DL: SL, Ops: Args);
1603 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: BV);
1604 }
1605 }
1606
1607 for (const SDUse &U : Op->ops())
1608 DAG.ExtractVectorElements(Op: U.get(), Args);
1609
1610 return DAG.getBuildVector(VT: Op.getValueType(), DL: SL, Ops: Args);
1611}
1612
1613SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1614 SelectionDAG &DAG) const {
1615 SDLoc SL(Op);
1616 SmallVector<SDValue, 8> Args;
1617 unsigned Start = Op.getConstantOperandVal(i: 1);
1618 EVT VT = Op.getValueType();
1619 EVT SrcVT = Op.getOperand(i: 0).getValueType();
1620
1621 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1622 unsigned NumElt = VT.getVectorNumElements();
1623 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1624 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1625
1626 // Extract 32-bit registers at a time.
1627 EVT NewSrcVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: NumSrcElt / 2);
1628 EVT NewVT = NumElt == 2
1629 ? MVT::i32
1630 : EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: NumElt / 2);
1631 SDValue Tmp = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewSrcVT, Operand: Op.getOperand(i: 0));
1632
1633 DAG.ExtractVectorElements(Op: Tmp, Args, Start: Start / 2, Count: NumElt / 2);
1634 if (NumElt == 2)
1635 Tmp = Args[0];
1636 else
1637 Tmp = DAG.getBuildVector(VT: NewVT, DL: SL, Ops: Args);
1638
1639 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Tmp);
1640 }
1641
1642 DAG.ExtractVectorElements(Op: Op.getOperand(i: 0), Args, Start,
1643 Count: VT.getVectorNumElements());
1644
1645 return DAG.getBuildVector(VT: Op.getValueType(), DL: SL, Ops: Args);
1646}
1647
1648// TODO: Handle fabs too
1649static SDValue peekFNeg(SDValue Val) {
1650 if (Val.getOpcode() == ISD::FNEG)
1651 return Val.getOperand(i: 0);
1652
1653 return Val;
1654}
1655
1656static SDValue peekFPSignOps(SDValue Val) {
1657 if (Val.getOpcode() == ISD::FNEG)
1658 Val = Val.getOperand(i: 0);
1659 if (Val.getOpcode() == ISD::FABS)
1660 Val = Val.getOperand(i: 0);
1661 if (Val.getOpcode() == ISD::FCOPYSIGN)
1662 Val = Val.getOperand(i: 0);
1663 return Val;
1664}
1665
/// Map a select+setcc pair (whose select operands equal the compare
/// operands — the caller guarantees this) onto an FMIN_LEGACY or
/// FMAX_LEGACY node, choosing the operand order that reproduces the
/// compare's NaN behavior. Returns an empty SDValue for condition codes
/// with no legacy min/max equivalent, or when the combine must wait until
/// after legalization.
SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
    const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
    SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val&: CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    // Equality / orderedness-style predicates have no min/max equivalent.
    break;
  case ISD::SETULE:
  case ISD::SETULT: {
    // Unordered less-than: permute operands based on which side is selected
    // so the NaN outcome matches the original select.
    if (LHS == True)
      return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: RHS, N2: LHS);
    return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: LHS, N2: RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: LHS, N2: RHS);
    return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: RHS, N2: LHS);
  }
  case ISD::SETUGE:
  case ISD::SETUGT: {
    // Unordered greater-than: mirror image of the SETULE/SETULT case.
    if (LHS == True)
      return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: RHS, N2: LHS);
    return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: LHS, N2: RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    // Ordered greater-than: mirror image of the ordered less-than case,
    // with the same post-legalization restriction.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: LHS, N2: RHS);
    return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: RHS, N2: LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}
1733
/// Generate Min/Max node
///
/// Combine a select+compare into FMIN_LEGACY/FMAX_LEGACY. The fast path
/// requires the select's operands to be exactly the compare's operands;
/// otherwise, attempt to peel an fneg off the true value so the pattern
/// matches, re-applying the fneg on the combined result.
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  // Direct match: selected values are exactly the compared values.
  if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
    return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);

  SelectionDAG &DAG = DCI.DAG;

  // If we can't directly match this, try to see if we can fold an fneg to
  // match.

  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
  ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(Val&: False);
  SDValue NegTrue = peekFNeg(Val: True);

  // Undo the combine foldFreeOpFromSelect does if it helps us match the
  // fmin/fmax.
  //
  // select (fcmp olt (lhs, K)), (fneg lhs), -K
  // -> fneg (fmin_legacy lhs, K)
  //
  // TODO: Use getNegatedExpression
  if (LHS == NegTrue && CFalse && CRHS) {
    APFloat NegRHS = neg(X: CRHS->getValueAPF());
    // Only fires when the false constant is exactly the negated RHS constant.
    if (NegRHS == CFalse->getValueAPF()) {
      // Combine on the un-negated value, then re-apply the fneg on top.
      SDValue Combined =
          combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True: NegTrue, False, CC, DCI);
      if (Combined)
        return DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: Combined);
      return SDValue();
    }
  }

  return SDValue();
}
1772
1773std::pair<SDValue, SDValue>
1774AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1775 SDLoc SL(Op);
1776
1777 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1778
1779 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
1780 const SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
1781
1782 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: Zero);
1783 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: One);
1784
1785 return std::pair(Lo, Hi);
1786}
1787
1788SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1789 SDLoc SL(Op);
1790
1791 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1792 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
1793 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: Zero);
1794}
1795
1796SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1797 SDLoc SL(Op);
1798
1799 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1800 const SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
1801 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: One);
1802}
1803
1804// Split a vector type into two parts. The first part is a power of two vector.
1805// The second part is whatever is left over, and is a scalar if it would
1806// otherwise be a 1-vector.
1807std::pair<EVT, EVT>
1808AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1809 EVT LoVT, HiVT;
1810 EVT EltVT = VT.getVectorElementType();
1811 unsigned NumElts = VT.getVectorNumElements();
1812 unsigned LoNumElts = PowerOf2Ceil(A: (NumElts + 1) / 2);
1813 LoVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: LoNumElts);
1814 HiVT = NumElts - LoNumElts == 1
1815 ? EltVT
1816 : EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NumElts - LoNumElts);
1817 return std::pair(LoVT, HiVT);
1818}
1819
1820// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1821// scalar.
1822std::pair<SDValue, SDValue>
1823AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1824 const EVT &LoVT, const EVT &HiVT,
1825 SelectionDAG &DAG) const {
1826 EVT VT = N.getValueType();
1827 assert(LoVT.getVectorNumElements() +
1828 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1829 VT.getVectorNumElements() &&
1830 "More vector elements requested than available!");
1831 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: LoVT, N1: N,
1832 N2: DAG.getVectorIdxConstant(Val: 0, DL));
1833
1834 unsigned LoNumElts = LoVT.getVectorNumElements();
1835
1836 if (HiVT.isVector()) {
1837 unsigned HiNumElts = HiVT.getVectorNumElements();
1838 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1839 // Avoid creating an extract_subvector with an index that isn't a multiple
1840 // of the result type.
1841 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HiVT, N1: N,
1842 N2: DAG.getConstant(Val: LoNumElts, DL, VT: MVT::i32));
1843 return {Lo, Hi};
1844 }
1845
1846 SmallVector<SDValue, 8> Elts;
1847 DAG.ExtractVectorElements(Op: N, Args&: Elts, /*Start=*/LoNumElts,
1848 /*Count=*/HiNumElts);
1849 SDValue Hi = DAG.getBuildVector(VT: HiVT, DL, Ops: Elts);
1850 return {Lo, Hi};
1851 }
1852
1853 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: HiVT, N1: N,
1854 N2: DAG.getVectorIdxConstant(Val: LoNumElts, DL));
1855 return {Lo, Hi};
1856}
1857
/// Break a vector load into two half-width loads and reassemble the result,
/// merging the two load chains with a TokenFactor.
SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Val: Op);
  EVT VT = Op.getValueType();
  SDLoc SL(Op);


  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2) {
    SDValue Ops[2];
    std::tie(args&: Ops[0], args&: Ops[1]) = scalarizeVectorLoad(LD: Load, DAG);
    return DAG.getMergeValues(Ops, dl: SL);
  }

  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  // Split both the register and memory types; the high part may be a scalar
  // (see getSplitDestVTs).
  std::tie(args&: LoVT, args&: HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(args&: LoMemVT, args&: HiMemVT) = getSplitDestVTs(VT: MemVT, DAG);
  std::tie(args&: Lo, args&: Hi) = splitVector(N: Op, DL: SL, LoVT, HiVT, DAG);

  // The high half's alignment is whatever the base alignment still
  // guarantees at the low half's byte size.
  unsigned Size = LoMemVT.getStoreSize();
  Align BaseAlign = Load->getAlign();
  Align HiAlign = commonAlignment(A: BaseAlign, Offset: Size);

  SDValue LoLoad = DAG.getExtLoad(
      ExtType: Load->getExtensionType(), dl: SL, VT: LoVT, Chain: Load->getChain(), Ptr: BasePtr, PtrInfo: SrcValue,
      MemVT: LoMemVT, Alignment: BaseAlign, MMOFlags: Load->getMemOperand()->getFlags(), AAInfo: Load->getAAInfo());
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Size));
  SDValue HiLoad = DAG.getExtLoad(
      ExtType: Load->getExtensionType(), dl: SL, VT: HiVT, Chain: Load->getChain(), Ptr: HiPtr,
      PtrInfo: SrcValue.getWithOffset(O: LoMemVT.getStoreSize()), MemVT: HiMemVT, Alignment: HiAlign,
      MMOFlags: Load->getMemOperand()->getFlags(), AAInfo: Load->getAAInfo());

  SDValue Join;
  if (LoVT == HiVT) {
    // This is the case that the vector is power of two so was evenly split.
    Join = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT, N1: LoLoad, N2: HiLoad);
  } else {
    // Uneven split: insert the low subvector, then the high subvector or
    // scalar element, into a poison value of the full result type.
    Join = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: SL, VT, N1: DAG.getPOISON(VT), N2: LoLoad,
                       N3: DAG.getVectorIdxConstant(Val: 0, DL: SL));
    Join = DAG.getNode(
        Opcode: HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, DL: SL,
        VT, N1: Join, N2: HiLoad,
        N3: DAG.getVectorIdxConstant(Val: LoVT.getVectorNumElements(), DL: SL));
  }

  // Result 0 is the joined value; result 1 merges both load chains.
  SDValue Ops[] = {Join, DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other,
                                     N1: LoLoad.getValue(R: 1), N2: HiLoad.getValue(R: 1))};

  return DAG.getMergeValues(Ops, dl: SL);
}
1917
/// Widen a 3-element vector load to 4 elements when it is sufficiently
/// aligned or known dereferenceable, otherwise fall back to splitting it.
SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
                                                     SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
  EVT VT = Op.getValueType();
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);
  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
  Align BaseAlign = Load->getAlign();
  unsigned NumElements = MemVT.getVectorNumElements();

  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
  if (NumElements != 3 ||
      (BaseAlign < Align(8) &&
       !SrcValue.isDereferenceable(Size: 16, C&: *DAG.getContext(), DL: DAG.getDataLayout())))
    return SplitVectorLoad(Op, DAG);

  assert(NumElements == 3);

  // Load 4 elements, then trim the result back to the original 3-element
  // type with an extract_subvector starting at index 0.
  EVT WideVT =
      EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: 4);
  EVT WideMemVT =
      EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(), NumElements: 4);
  // NOTE(review): unlike SplitVectorLoad, AAInfo is not propagated to the
  // widened load here — confirm this is intentional.
  SDValue WideLoad = DAG.getExtLoad(
      ExtType: Load->getExtensionType(), dl: SL, VT: WideVT, Chain: Load->getChain(), Ptr: BasePtr, PtrInfo: SrcValue,
      MemVT: WideMemVT, Alignment: BaseAlign, MMOFlags: Load->getMemOperand()->getFlags());
  return DAG.getMergeValues(
      Ops: {DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT, N1: WideLoad,
                   N2: DAG.getVectorIdxConstant(Val: 0, DL: SL)),
       WideLoad.getValue(R: 1)},
      dl: SL);
}
1951
/// Break a vector store into two half-width truncating stores, returning a
/// TokenFactor of both store chains.
SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
  SDValue Val = Store->getValue();
  EVT VT = Val.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return scalarizeVectorStore(ST: Store, DAG);

  EVT MemVT = Store->getMemoryVT();
  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  SDLoc SL(Op);

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  // Split the register type, memory type, and stored value; the high part
  // may be a scalar (see getSplitDestVTs).
  std::tie(args&: LoVT, args&: HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(args&: LoMemVT, args&: HiMemVT) = getSplitDestVTs(VT: MemVT, DAG);
  std::tie(args&: Lo, args&: Hi) = splitVector(N: Val, DL: SL, LoVT, HiVT, DAG);

  // The high half starts at the byte offset of the low half's store size.
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: LoMemVT.getStoreSize());

  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
  Align BaseAlign = Store->getAlign();
  unsigned Size = LoMemVT.getStoreSize();
  // Alignment the base alignment still guarantees at the high-half offset.
  Align HiAlign = commonAlignment(A: BaseAlign, Offset: Size);

  SDValue LoStore =
      DAG.getTruncStore(Chain, dl: SL, Val: Lo, Ptr: BasePtr, PtrInfo: SrcValue, SVT: LoMemVT, Alignment: BaseAlign,
                        MMOFlags: Store->getMemOperand()->getFlags(), AAInfo: Store->getAAInfo());
  SDValue HiStore = DAG.getTruncStore(
      Chain, dl: SL, Val: Hi, Ptr: HiPtr, PtrInfo: SrcValue.getWithOffset(O: Size), SVT: HiMemVT, Alignment: HiAlign,
      MMOFlags: Store->getMemOperand()->getFlags(), AAInfo: Store->getAAInfo());

  // Chain both stores together for the caller.
  return DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: LoStore, N2: HiStore);
}
1992
// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The fractional part of a
// float is enough to accurately represent up to a 24-bit signed integer.
//
// Returns merged {quotient, remainder}, or an empty SDValue when either
// operand may not fit in 24 bits (fewer than 9 known sign bits).
SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
                                            bool Sign) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(i: 0);
  SDValue RHS = Op.getOperand(i: 1);
  MVT IntVT = MVT::i32;
  MVT FltVT = MVT::f32;

  // Require at least 9 known-redundant top bits in both operands so the
  // significant part fits within f32's 24-bit mantissa precision.
  unsigned LHSSignBits = DAG.ComputeNumSignBits(Op: LHS);
  if (LHSSignBits < 9)
    return SDValue();

  unsigned RHSSignBits = DAG.ComputeNumSignBits(Op: RHS);
  if (RHSSignBits < 9)
    return SDValue();

  // Number of significant result bits; signed division needs one extra.
  unsigned BitSize = VT.getSizeInBits();
  unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
  unsigned DivBits = BitSize - SignBits;
  if (Sign)
    ++DivBits;

  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;

  // jq is the +/-1 correction term added when the rounded quotient is off.
  SDValue jq = DAG.getConstant(Val: 1, DL, VT: IntVT);

  if (Sign) {
    // char|short jq = ia ^ ib;
    jq = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHS, N2: RHS);

    // jq = jq >> (bitsize - 2)
    jq = DAG.getNode(Opcode: ISD::SRA, DL, VT, N1: jq,
                     N2: DAG.getConstant(Val: BitSize - 2, DL, VT));

    // jq = jq | 0x1
    jq = DAG.getNode(Opcode: ISD::OR, DL, VT, N1: jq, N2: DAG.getConstant(Val: 1, DL, VT));
  }

  // int ia = (int)LHS;
  SDValue ia = LHS;

  // int ib, (int)RHS;
  SDValue ib = RHS;

  // float fa = (float)ia;
  SDValue fa = DAG.getNode(Opcode: ToFp, DL, VT: FltVT, Operand: ia);

  // float fb = (float)ib;
  SDValue fb = DAG.getNode(Opcode: ToFp, DL, VT: FltVT, Operand: ib);

  // Approximate quotient: fa * rcp(fb).
  SDValue fq = DAG.getNode(Opcode: ISD::FMUL, DL, VT: FltVT,
                           N1: fa, N2: DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT: FltVT, Operand: fb));

  // fq = trunc(fq);
  fq = DAG.getNode(Opcode: ISD::FTRUNC, DL, VT: FltVT, Operand: fq);

  // float fqneg = -fq;
  SDValue fqneg = DAG.getNode(Opcode: ISD::FNEG, DL, VT: FltVT, Operand: fq);

  MachineFunction &MF = DAG.getMachineFunction();

  // Select the fused-multiply-add flavor based on whether f32 denormals are
  // flushed in this function's mode.
  bool UseFmadFtz = false;
  if (Subtarget->isGCN()) {
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    UseFmadFtz =
        MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
  }

  // float fr = mad(fqneg, fb, fa);
  unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
                    : UseFmadFtz                    ? (unsigned)AMDGPUISD::FMAD_FTZ
                                                    : (unsigned)ISD::FMAD;
  SDValue fr = DAG.getNode(Opcode: OpCode, DL, VT: FltVT, N1: fqneg, N2: fb, N3: fa);

  // int iq = (int)fq;
  SDValue iq = DAG.getNode(Opcode: ToInt, DL, VT: IntVT, Operand: fq);

  // fr = fabs(fr);
  fr = DAG.getNode(Opcode: ISD::FABS, DL, VT: FltVT, Operand: fr);

  // fb = fabs(fb);
  fb = DAG.getNode(Opcode: ISD::FABS, DL, VT: FltVT, Operand: fb);

  EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);

  // int cv = fr >= fb;
  SDValue cv = DAG.getSetCC(DL, VT: SetCCVT, LHS: fr, RHS: fb, Cond: ISD::SETOGE);

  // jq = (cv ? jq : 0);
  jq = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: cv, N2: jq, N3: DAG.getConstant(Val: 0, DL, VT));

  // dst = iq + jq;
  SDValue Div = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: iq, N2: jq);

  // Rem needs compensation, it's easier to recompute it
  SDValue Rem = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Div, N2: RHS);
  Rem = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: LHS, N2: Rem);

  // Truncate to number of bits this divide really is.
  if (Sign) {
    SDValue InRegSize
      = DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: DivBits));
    Div = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Div, N2: InRegSize);
    Rem = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Rem, N2: InRegSize);
  } else {
    SDValue TruncMask = DAG.getConstant(Val: (UINT64_C(1) << DivBits) - 1, DL, VT);
    Div = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Div, N2: TruncMask);
    Rem = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Rem, N2: TruncMask);
  }

  // Result 0 is the quotient, result 1 the remainder.
  return DAG.getMergeValues(Ops: { Div, Rem }, dl: DL);
}
2110
2111void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2112 SelectionDAG &DAG,
2113 SmallVectorImpl<SDValue> &Results) const {
2114 SDLoc DL(Op);
2115 EVT VT = Op.getValueType();
2116
2117 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2118
2119 EVT HalfVT = VT.getHalfSizedIntegerVT(Context&: *DAG.getContext());
2120
2121 SDValue One = DAG.getConstant(Val: 1, DL, VT: HalfVT);
2122 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: HalfVT);
2123
2124 //HiLo split
2125 SDValue LHS_Lo, LHS_Hi;
2126 SDValue LHS = Op.getOperand(i: 0);
2127 std::tie(args&: LHS_Lo, args&: LHS_Hi) = DAG.SplitScalar(N: LHS, DL, LoVT: HalfVT, HiVT: HalfVT);
2128
2129 SDValue RHS_Lo, RHS_Hi;
2130 SDValue RHS = Op.getOperand(i: 1);
2131 std::tie(args&: RHS_Lo, args&: RHS_Hi) = DAG.SplitScalar(N: RHS, DL, LoVT: HalfVT, HiVT: HalfVT);
2132
2133 if (DAG.MaskedValueIsZero(Op: RHS, Mask: APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32)) &&
2134 DAG.MaskedValueIsZero(Op: LHS, Mask: APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32))) {
2135
2136 SDValue Res = DAG.getNode(Opcode: ISD::UDIVREM, DL, VTList: DAG.getVTList(VT1: HalfVT, VT2: HalfVT),
2137 N1: LHS_Lo, N2: RHS_Lo);
2138
2139 SDValue DIV = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Res.getValue(R: 0), Zero});
2140 SDValue REM = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Res.getValue(R: 1), Zero});
2141
2142 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: DIV));
2143 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: REM));
2144 return;
2145 }
2146
2147 if (isTypeLegal(VT: MVT::i64)) {
2148 // The algorithm here is based on ideas from "Software Integer Division",
2149 // Tom Rodeheffer, August 2008.
2150
2151 MachineFunction &MF = DAG.getMachineFunction();
2152 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2153
2154 // Compute denominator reciprocal.
2155 unsigned FMAD =
2156 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2157 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2158 ? (unsigned)ISD::FMAD
2159 : (unsigned)AMDGPUISD::FMAD_FTZ;
2160
2161 SDValue Cvt_Lo = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: MVT::f32, Operand: RHS_Lo);
2162 SDValue Cvt_Hi = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: MVT::f32, Operand: RHS_Hi);
2163 SDValue Mad1 = DAG.getNode(Opcode: FMAD, DL, VT: MVT::f32, N1: Cvt_Hi,
2164 N2: DAG.getConstantFP(Val: APInt(32, 0x4f800000).bitsToFloat(), DL, VT: MVT::f32),
2165 N3: Cvt_Lo);
2166 SDValue Rcp = DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT: MVT::f32, Operand: Mad1);
2167 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f32, N1: Rcp,
2168 N2: DAG.getConstantFP(Val: APInt(32, 0x5f7ffffc).bitsToFloat(), DL, VT: MVT::f32));
2169 SDValue Mul2 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f32, N1: Mul1,
2170 N2: DAG.getConstantFP(Val: APInt(32, 0x2f800000).bitsToFloat(), DL, VT: MVT::f32));
2171 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL, VT: MVT::f32, Operand: Mul2);
2172 SDValue Mad2 = DAG.getNode(Opcode: FMAD, DL, VT: MVT::f32, N1: Trunc,
2173 N2: DAG.getConstantFP(Val: APInt(32, 0xcf800000).bitsToFloat(), DL, VT: MVT::f32),
2174 N3: Mul1);
2175 SDValue Rcp_Lo = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL, VT: HalfVT, Operand: Mad2);
2176 SDValue Rcp_Hi = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL, VT: HalfVT, Operand: Trunc);
2177 SDValue Rcp64 = DAG.getBitcast(VT,
2178 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Rcp_Lo, Rcp_Hi}));
2179
2180 SDValue Zero64 = DAG.getConstant(Val: 0, DL, VT);
2181 SDValue One64 = DAG.getConstant(Val: 1, DL, VT);
2182 SDValue Zero1 = DAG.getConstant(Val: 0, DL, VT: MVT::i1);
2183 SDVTList HalfCarryVT = DAG.getVTList(VT1: HalfVT, VT2: MVT::i1);
2184
2185 // First round of UNR (Unsigned integer Newton-Raphson).
2186 SDValue Neg_RHS = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero64, N2: RHS);
2187 SDValue Mullo1 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Neg_RHS, N2: Rcp64);
2188 SDValue Mulhi1 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Rcp64, N2: Mullo1);
2189 SDValue Mulhi1_Lo, Mulhi1_Hi;
2190 std::tie(args&: Mulhi1_Lo, args&: Mulhi1_Hi) =
2191 DAG.SplitScalar(N: Mulhi1, DL, LoVT: HalfVT, HiVT: HalfVT);
2192 SDValue Add1_Lo = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Rcp_Lo,
2193 N2: Mulhi1_Lo, N3: Zero1);
2194 SDValue Add1_Hi = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Rcp_Hi,
2195 N2: Mulhi1_Hi, N3: Add1_Lo.getValue(R: 1));
2196 SDValue Add1 = DAG.getBitcast(VT,
2197 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Add1_Lo, Add1_Hi}));
2198
2199 // Second round of UNR.
2200 SDValue Mullo2 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Neg_RHS, N2: Add1);
2201 SDValue Mulhi2 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Add1, N2: Mullo2);
2202 SDValue Mulhi2_Lo, Mulhi2_Hi;
2203 std::tie(args&: Mulhi2_Lo, args&: Mulhi2_Hi) =
2204 DAG.SplitScalar(N: Mulhi2, DL, LoVT: HalfVT, HiVT: HalfVT);
2205 SDValue Add2_Lo = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Add1_Lo,
2206 N2: Mulhi2_Lo, N3: Zero1);
2207 SDValue Add2_Hi = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Add1_Hi,
2208 N2: Mulhi2_Hi, N3: Add2_Lo.getValue(R: 1));
2209 SDValue Add2 = DAG.getBitcast(VT,
2210 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Add2_Lo, Add2_Hi}));
2211
2212 SDValue Mulhi3 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: LHS, N2: Add2);
2213
2214 SDValue Mul3 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: RHS, N2: Mulhi3);
2215
2216 SDValue Mul3_Lo, Mul3_Hi;
2217 std::tie(args&: Mul3_Lo, args&: Mul3_Hi) = DAG.SplitScalar(N: Mul3, DL, LoVT: HalfVT, HiVT: HalfVT);
2218 SDValue Sub1_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: LHS_Lo,
2219 N2: Mul3_Lo, N3: Zero1);
2220 SDValue Sub1_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: LHS_Hi,
2221 N2: Mul3_Hi, N3: Sub1_Lo.getValue(R: 1));
2222 SDValue Sub1_Mi = DAG.getNode(Opcode: ISD::SUB, DL, VT: HalfVT, N1: LHS_Hi, N2: Mul3_Hi);
2223 SDValue Sub1 = DAG.getBitcast(VT,
2224 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub1_Lo, Sub1_Hi}));
2225
2226 SDValue MinusOne = DAG.getConstant(Val: 0xffffffffu, DL, VT: HalfVT);
2227 SDValue C1 = DAG.getSelectCC(DL, LHS: Sub1_Hi, RHS: RHS_Hi, True: MinusOne, False: Zero,
2228 Cond: ISD::SETUGE);
2229 SDValue C2 = DAG.getSelectCC(DL, LHS: Sub1_Lo, RHS: RHS_Lo, True: MinusOne, False: Zero,
2230 Cond: ISD::SETUGE);
2231 SDValue C3 = DAG.getSelectCC(DL, LHS: Sub1_Hi, RHS: RHS_Hi, True: C2, False: C1, Cond: ISD::SETEQ);
2232
2233 // TODO: Here and below portions of the code can be enclosed into if/endif.
2234 // Currently control flow is unconditional and we have 4 selects after
2235 // potential endif to substitute PHIs.
2236
2237 // if C3 != 0 ...
2238 SDValue Sub2_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub1_Lo,
2239 N2: RHS_Lo, N3: Zero1);
2240 SDValue Sub2_Mi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub1_Mi,
2241 N2: RHS_Hi, N3: Sub1_Lo.getValue(R: 1));
2242 SDValue Sub2_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Mi,
2243 N2: Zero, N3: Sub2_Lo.getValue(R: 1));
2244 SDValue Sub2 = DAG.getBitcast(VT,
2245 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub2_Lo, Sub2_Hi}));
2246
2247 SDValue Add3 = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Mulhi3, N2: One64);
2248
2249 SDValue C4 = DAG.getSelectCC(DL, LHS: Sub2_Hi, RHS: RHS_Hi, True: MinusOne, False: Zero,
2250 Cond: ISD::SETUGE);
2251 SDValue C5 = DAG.getSelectCC(DL, LHS: Sub2_Lo, RHS: RHS_Lo, True: MinusOne, False: Zero,
2252 Cond: ISD::SETUGE);
2253 SDValue C6 = DAG.getSelectCC(DL, LHS: Sub2_Hi, RHS: RHS_Hi, True: C5, False: C4, Cond: ISD::SETEQ);
2254
2255 // if (C6 != 0)
2256 SDValue Add4 = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Add3, N2: One64);
2257
2258 SDValue Sub3_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Lo,
2259 N2: RHS_Lo, N3: Zero1);
2260 SDValue Sub3_Mi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Mi,
2261 N2: RHS_Hi, N3: Sub2_Lo.getValue(R: 1));
2262 SDValue Sub3_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub3_Mi,
2263 N2: Zero, N3: Sub3_Lo.getValue(R: 1));
2264 SDValue Sub3 = DAG.getBitcast(VT,
2265 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub3_Lo, Sub3_Hi}));
2266
2267 // endif C6
2268 // endif C3
2269
2270 SDValue Sel1 = DAG.getSelectCC(DL, LHS: C6, RHS: Zero, True: Add4, False: Add3, Cond: ISD::SETNE);
2271 SDValue Div = DAG.getSelectCC(DL, LHS: C3, RHS: Zero, True: Sel1, False: Mulhi3, Cond: ISD::SETNE);
2272
2273 SDValue Sel2 = DAG.getSelectCC(DL, LHS: C6, RHS: Zero, True: Sub3, False: Sub2, Cond: ISD::SETNE);
2274 SDValue Rem = DAG.getSelectCC(DL, LHS: C3, RHS: Zero, True: Sel2, False: Sub1, Cond: ISD::SETNE);
2275
2276 Results.push_back(Elt: Div);
2277 Results.push_back(Elt: Rem);
2278
2279 return;
2280 }
2281
// r600 expansion.
2283 // Get Speculative values
2284 SDValue DIV_Part = DAG.getNode(Opcode: ISD::UDIV, DL, VT: HalfVT, N1: LHS_Hi, N2: RHS_Lo);
2285 SDValue REM_Part = DAG.getNode(Opcode: ISD::UREM, DL, VT: HalfVT, N1: LHS_Hi, N2: RHS_Lo);
2286
2287 SDValue REM_Lo = DAG.getSelectCC(DL, LHS: RHS_Hi, RHS: Zero, True: REM_Part, False: LHS_Hi, Cond: ISD::SETEQ);
2288 SDValue REM = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {REM_Lo, Zero});
2289 REM = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: REM);
2290
2291 SDValue DIV_Hi = DAG.getSelectCC(DL, LHS: RHS_Hi, RHS: Zero, True: DIV_Part, False: Zero, Cond: ISD::SETEQ);
2292 SDValue DIV_Lo = Zero;
2293
2294 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2295
2296 for (unsigned i = 0; i < halfBitWidth; ++i) {
2297 const unsigned bitPos = halfBitWidth - i - 1;
2298 SDValue POS = DAG.getConstant(Val: bitPos, DL, VT: HalfVT);
2299 // Get value of high bit
2300 SDValue HBit = DAG.getNode(Opcode: ISD::SRL, DL, VT: HalfVT, N1: LHS_Lo, N2: POS);
2301 HBit = DAG.getNode(Opcode: ISD::AND, DL, VT: HalfVT, N1: HBit, N2: One);
2302 HBit = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: HBit);
2303
2304 // Shift
2305 REM = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: REM, N2: DAG.getConstant(Val: 1, DL, VT));
2306 // Add LHS high bit
2307 REM = DAG.getNode(Opcode: ISD::OR, DL, VT, N1: REM, N2: HBit);
2308
2309 SDValue BIT = DAG.getConstant(Val: 1ULL << bitPos, DL, VT: HalfVT);
2310 SDValue realBIT = DAG.getSelectCC(DL, LHS: REM, RHS, True: BIT, False: Zero, Cond: ISD::SETUGE);
2311
2312 DIV_Lo = DAG.getNode(Opcode: ISD::OR, DL, VT: HalfVT, N1: DIV_Lo, N2: realBIT);
2313
2314 // Update REM
2315 SDValue REM_sub = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: REM, N2: RHS);
2316 REM = DAG.getSelectCC(DL, LHS: REM, RHS, True: REM_sub, False: REM, Cond: ISD::SETUGE);
2317 }
2318
2319 SDValue DIV = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {DIV_Lo, DIV_Hi});
2320 DIV = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: DIV);
2321 Results.push_back(Elt: DIV);
2322 Results.push_back(Elt: REM);
2323}
2324
2325SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2326 SelectionDAG &DAG) const {
2327 SDLoc DL(Op);
2328 EVT VT = Op.getValueType();
2329
2330 if (VT == MVT::i64) {
2331 SmallVector<SDValue, 2> Results;
2332 LowerUDIVREM64(Op, DAG, Results);
2333 return DAG.getMergeValues(Ops: Results, dl: DL);
2334 }
2335
2336 if (VT == MVT::i32) {
2337 if (SDValue Res = LowerDIVREM24(Op, DAG, Sign: false))
2338 return Res;
2339 }
2340
2341 SDValue X = Op.getOperand(i: 0);
2342 SDValue Y = Op.getOperand(i: 1);
2343
2344 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2345 // algorithm used here.
2346
2347 // Initial estimate of inv(y).
2348 SDValue Z = DAG.getNode(Opcode: AMDGPUISD::URECIP, DL, VT, Operand: Y);
2349
2350 // One round of UNR.
2351 SDValue NegY = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), N2: Y);
2352 SDValue NegYZ = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: NegY, N2: Z);
2353 Z = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Z,
2354 N2: DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Z, N2: NegYZ));
2355
2356 // Quotient/remainder estimate.
2357 SDValue Q = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: X, N2: Z);
2358 SDValue R =
2359 DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: X, N2: DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Q, N2: Y));
2360
2361 // First quotient/remainder refinement.
2362 EVT CCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2363 SDValue One = DAG.getConstant(Val: 1, DL, VT);
2364 SDValue Cond = DAG.getSetCC(DL, VT: CCVT, LHS: R, RHS: Y, Cond: ISD::SETUGE);
2365 Q = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2366 N2: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Q, N2: One), N3: Q);
2367 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2368 N2: DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: R, N2: Y), N3: R);
2369
2370 // Second quotient/remainder refinement.
2371 Cond = DAG.getSetCC(DL, VT: CCVT, LHS: R, RHS: Y, Cond: ISD::SETUGE);
2372 Q = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2373 N2: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Q, N2: One), N3: Q);
2374 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2375 N2: DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: R, N2: Y), N3: R);
2376
2377 return DAG.getMergeValues(Ops: {Q, R}, dl: DL);
2378}
2379
// Lower ISD::SDIVREM by reducing to an unsigned UDIVREM on the magnitudes and
// fixing up the result signs afterwards.
SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(i: 0);
  SDValue RHS = Op.getOperand(i: 1);

  SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
  SDValue NegOne = DAG.getAllOnesConstant(DL, VT);

  // Narrow (<= 24 significant bit) i32 values can use the fast f32-based path.
  if (VT == MVT::i32) {
    if (SDValue Res = LowerDIVREM24(Op, DAG, Sign: true))
      return Res;
  }

  // If both 64-bit operands are known to fit in 32 signed bits, do a 32-bit
  // SDIVREM and sign-extend the results.
  if (VT == MVT::i64 &&
      DAG.ComputeNumSignBits(Op: LHS) > 32 &&
      DAG.ComputeNumSignBits(Op: RHS) > 32) {
    EVT HalfVT = VT.getHalfSizedIntegerVT(Context&: *DAG.getContext());

    //HiLo split
    SDValue LHS_Lo = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL, VT: HalfVT, N1: LHS, N2: Zero);
    SDValue RHS_Lo = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL, VT: HalfVT, N1: RHS, N2: Zero);
    SDValue DIVREM = DAG.getNode(Opcode: ISD::SDIVREM, DL, VTList: DAG.getVTList(VT1: HalfVT, VT2: HalfVT),
                                 N1: LHS_Lo, N2: RHS_Lo);
    SDValue Res[2] = {
      DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: DIVREM.getValue(R: 0)),
      DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: DIVREM.getValue(R: 1))
    };
    return DAG.getMergeValues(Ops: Res, dl: DL);
  }

  // Sign masks: all-ones when the operand is negative, zero otherwise.
  SDValue LHSign = DAG.getSelectCC(DL, LHS, RHS: Zero, True: NegOne, False: Zero, Cond: ISD::SETLT);
  SDValue RHSign = DAG.getSelectCC(DL, LHS: RHS, RHS: Zero, True: NegOne, False: Zero, Cond: ISD::SETLT);
  SDValue DSign = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHSign, N2: RHSign);
  SDValue RSign = LHSign; // Remainder sign is the same as LHS

  // Branchless abs(x) as (x + mask) ^ mask (mask is 0 or all-ones).
  LHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: LHS, N2: LHSign);
  RHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: RHSign);

  LHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHS, N2: LHSign);
  RHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: RHS, N2: RHSign);

  SDValue Div = DAG.getNode(Opcode: ISD::UDIVREM, DL, VTList: DAG.getVTList(VT1: VT, VT2: VT), N1: LHS, N2: RHS);
  SDValue Rem = Div.getValue(R: 1);

  // Conditionally negate the results: (x ^ mask) - mask.
  Div = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: Div, N2: DSign);
  Rem = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: Rem, N2: RSign);

  Div = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Div, N2: DSign);
  Rem = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Rem, N2: RSign);

  SDValue Res[2] = {
    Div,
    Rem
  };
  return DAG.getMergeValues(Ops: Res, dl: DL);
}
2439
2440SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2441 SDLoc SL(Op);
2442 SDValue Src = Op.getOperand(i: 0);
2443
2444 // result = trunc(src)
2445 // if (src > 0.0 && src != result)
2446 // result += 1.0
2447
2448 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: MVT::f64, Operand: Src);
2449
2450 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT: MVT::f64);
2451 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f64);
2452
2453 EVT SetCCVT =
2454 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2455
2456 SDValue Lt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Zero, Cond: ISD::SETOGT);
2457 SDValue NeTrunc = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Trunc, Cond: ISD::SETONE);
2458 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SetCCVT, N1: Lt0, N2: NeTrunc);
2459
2460 SDValue Add = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f64, N1: And, N2: One, N3: Zero);
2461 // TODO: Should this propagate fast-math-flags?
2462 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Trunc, N2: Add);
2463}
2464
2465static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2466 SelectionDAG &DAG) {
2467 const unsigned FractBits = 52;
2468 const unsigned ExpBits = 11;
2469
2470 SDValue ExpPart = DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT: MVT::i32,
2471 N1: Hi,
2472 N2: DAG.getConstant(Val: FractBits - 32, DL: SL, VT: MVT::i32),
2473 N3: DAG.getConstant(Val: ExpBits, DL: SL, VT: MVT::i32));
2474 SDValue Exp = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: ExpPart,
2475 N2: DAG.getConstant(Val: 1023, DL: SL, VT: MVT::i32));
2476
2477 return Exp;
2478}
2479
// Lower f64 FTRUNC by clearing the fraction bits below the binary point with
// integer masking on the bit pattern.
SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(i: 0);

  assert(Op.getValueType() == MVT::f64);

  const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  SDValue Hi = getHiHalf64(Op: Src, DAG);

  SDValue Exp = extractF64Exponent(Hi, SL, DAG);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, DL: SL, VT: MVT::i32);
  SDValue SignBit = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: Hi, N2: SignBitMask);

  // Extend back to 64-bits.
  SDValue SignBit64 = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Zero, SignBit});
  SignBit64 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: SignBit64);

  SDValue BcInt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Src);
  const SDValue FractMask
    = DAG.getConstant(Val: (UINT64_C(1) << FractBits) - 1, DL: SL, VT: MVT::i64);

  // Arithmetic-shift the fraction mask right by the unbiased exponent: the
  // bits still set are the fractional bits of the value. Clearing them in the
  // bit pattern truncates toward zero.
  SDValue Shr = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i64, N1: FractMask, N2: Exp);
  SDValue Not = DAG.getNOT(DL: SL, Val: Shr, VT: MVT::i64);
  SDValue Tmp0 = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i64, N1: BcInt, N2: Not);

  EVT SetCCVT =
      getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::i32);

  const SDValue FiftyOne = DAG.getConstant(Val: FractBits - 1, DL: SL, VT: MVT::i32);

  SDValue ExpLt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Exp, RHS: Zero, Cond: ISD::SETLT);
  SDValue ExpGt51 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Exp, RHS: FiftyOne, Cond: ISD::SETGT);

  // Exponent < 0: |x| < 1.0, so the result is a signed zero. Exponent > 51:
  // the value is already integral, so pass it through unchanged.
  SDValue Tmp1 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: ExpLt0, N2: SignBit64, N3: Tmp0);
  SDValue Tmp2 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: ExpGt51, N2: BcInt, N3: Tmp1);

  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f64, Operand: Tmp2);
}
2525
2526SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2527 SelectionDAG &DAG) const {
2528 SDLoc SL(Op);
2529 SDValue Src = Op.getOperand(i: 0);
2530
2531 assert(Op.getValueType() == MVT::f64);
2532
2533 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2534 SDValue C1 = DAG.getConstantFP(Val: C1Val, DL: SL, VT: MVT::f64);
2535 SDValue CopySign = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT: MVT::f64, N1: C1, N2: Src);
2536
2537 // TODO: Should this propagate fast-math-flags?
2538
2539 SDValue Tmp1 = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Src, N2: CopySign);
2540 SDValue Tmp2 = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT: MVT::f64, N1: Tmp1, N2: CopySign);
2541
2542 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: MVT::f64, Operand: Src);
2543
2544 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2545 SDValue C2 = DAG.getConstantFP(Val: C2Val, DL: SL, VT: MVT::f64);
2546
2547 EVT SetCCVT =
2548 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2549 SDValue Cond = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Fabs, RHS: C2, Cond: ISD::SETOGT);
2550
2551 return DAG.getSelect(DL: SL, VT: MVT::f64, Cond, LHS: Src, RHS: Tmp2);
2552}
2553
2554SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2555 SelectionDAG &DAG) const {
2556 // FNEARBYINT and FRINT are the same, except in their handling of FP
2557 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2558 // rint, so just treat them as equivalent.
2559 return DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SDLoc(Op), VT: Op.getValueType(),
2560 Operand: Op.getOperand(i: 0));
2561}
2562
2563SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2564 auto VT = Op.getValueType();
2565 auto Arg = Op.getOperand(i: 0u);
2566 return DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SDLoc(Op), VT, Operand: Arg);
2567}
2568
2569// XXX - May require not supporting f32 denormals?
2570
2571// Don't handle v2f16. The extra instructions to scalarize and repack around the
2572// compare and vselect end up producing worse code than scalarizing the whole
2573// operation.
2574SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2575 SDLoc SL(Op);
2576 SDValue X = Op.getOperand(i: 0);
2577 EVT VT = Op.getValueType();
2578
2579 SDValue T = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: X);
2580
2581 // TODO: Should this propagate fast-math-flags?
2582
2583 SDValue Diff = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: X, N2: T);
2584
2585 SDValue AbsDiff = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: Diff);
2586
2587 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2588 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2589
2590 EVT SetCCVT =
2591 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2592
2593 const SDValue Half = DAG.getConstantFP(Val: 0.5, DL: SL, VT);
2594 SDValue Cmp = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsDiff, RHS: Half, Cond: ISD::SETOGE);
2595 SDValue OneOrZeroFP = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Cmp, N2: One, N3: Zero);
2596
2597 SDValue SignedOffset = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT, N1: OneOrZeroFP, N2: X);
2598 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: T, N2: SignedOffset);
2599}
2600
2601SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2602 SDLoc SL(Op);
2603 SDValue Src = Op.getOperand(i: 0);
2604
2605 // result = trunc(src);
2606 // if (src < 0.0 && src != result)
2607 // result += -1.0.
2608
2609 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: MVT::f64, Operand: Src);
2610
2611 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT: MVT::f64);
2612 const SDValue NegOne = DAG.getConstantFP(Val: -1.0, DL: SL, VT: MVT::f64);
2613
2614 EVT SetCCVT =
2615 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2616
2617 SDValue Lt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Zero, Cond: ISD::SETOLT);
2618 SDValue NeTrunc = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Trunc, Cond: ISD::SETONE);
2619 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SetCCVT, N1: Lt0, N2: NeTrunc);
2620
2621 SDValue Add = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f64, N1: And, N2: NegOne, N3: Zero);
2622 // TODO: Should this propagate fast-math-flags?
2623 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Trunc, N2: Add);
2624}
2625
2626/// Return true if it's known that \p Src can never be an f32 denormal value.
2627static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2628 switch (Src.getOpcode()) {
2629 case ISD::FP_EXTEND:
2630 return Src.getOperand(i: 0).getValueType() == MVT::f16;
2631 case ISD::FP16_TO_FP:
2632 case ISD::FFREXP:
2633 case ISD::FSQRT:
2634 case AMDGPUISD::LOG:
2635 case AMDGPUISD::EXP:
2636 return true;
2637 case ISD::INTRINSIC_WO_CHAIN: {
2638 unsigned IntrinsicID = Src.getConstantOperandVal(i: 0);
2639 switch (IntrinsicID) {
2640 case Intrinsic::amdgcn_frexp_mant:
2641 case Intrinsic::amdgcn_log:
2642 case Intrinsic::amdgcn_log_clamp:
2643 case Intrinsic::amdgcn_exp2:
2644 case Intrinsic::amdgcn_sqrt:
2645 return true;
2646 default:
2647 return false;
2648 }
2649 }
2650 default:
2651 return false;
2652 }
2653
2654 llvm_unreachable("covered opcode switch");
2655}
2656
// Whether approximate-function (afn) expansions may be used for a node.
// Decided purely from the node's fast-math flags; the DAG parameter is
// currently unused and kept for interface consistency.
bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
                                           SDNodeFlags Flags) {
  return Flags.hasApproximateFuncs();
}
2661
2662bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
2663 SDValue Src,
2664 SDNodeFlags Flags) {
2665 return !valueIsKnownNeverF32Denorm(Src) &&
2666 DAG.getMachineFunction()
2667 .getDenormalMode(FPType: APFloat::IEEEsingle())
2668 .Input != DenormalMode::PreserveSign;
2669}
2670
2671SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2672 SDValue Src,
2673 SDNodeFlags Flags) const {
2674 SDLoc SL(Src);
2675 EVT VT = Src.getValueType();
2676 const fltSemantics &Semantics = VT.getFltSemantics();
2677 SDValue SmallestNormal =
2678 DAG.getConstantFP(Val: APFloat::getSmallestNormalized(Sem: Semantics), DL: SL, VT);
2679
2680 // Want to scale denormals up, but negatives and 0 work just as well on the
2681 // scaled path.
2682 SDValue IsLtSmallestNormal = DAG.getSetCC(
2683 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Src,
2684 RHS: SmallestNormal, Cond: ISD::SETOLT);
2685
2686 return IsLtSmallestNormal;
2687}
2688
2689SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2690 SDNodeFlags Flags) const {
2691 SDLoc SL(Src);
2692 EVT VT = Src.getValueType();
2693 const fltSemantics &Semantics = VT.getFltSemantics();
2694 SDValue Inf = DAG.getConstantFP(Val: APFloat::getInf(Sem: Semantics), DL: SL, VT);
2695
2696 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: Src, Flags);
2697 SDValue IsFinite = DAG.getSetCC(
2698 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Fabs,
2699 RHS: Inf, Cond: ISD::SETOLT);
2700 return IsFinite;
2701}
2702
2703/// If denormal handling is required return the scaled input to FLOG2, and the
2704/// check for denormal range. Otherwise, return null values.
2705std::pair<SDValue, SDValue>
2706AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
2707 SDValue Src, SDNodeFlags Flags) const {
2708 if (!needsDenormHandlingF32(DAG, Src, Flags))
2709 return {};
2710
2711 MVT VT = MVT::f32;
2712 const fltSemantics &Semantics = APFloat::IEEEsingle();
2713 SDValue SmallestNormal =
2714 DAG.getConstantFP(Val: APFloat::getSmallestNormalized(Sem: Semantics), DL: SL, VT);
2715
2716 SDValue IsLtSmallestNormal = DAG.getSetCC(
2717 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Src,
2718 RHS: SmallestNormal, Cond: ISD::SETOLT);
2719
2720 SDValue Scale32 = DAG.getConstantFP(Val: 0x1.0p+32, DL: SL, VT);
2721 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2722 SDValue ScaleFactor =
2723 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLtSmallestNormal, N2: Scale32, N3: One, Flags);
2724
2725 SDValue ScaledInput = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Src, N2: ScaleFactor, Flags);
2726 return {ScaledInput, IsLtSmallestNormal};
2727}
2728
2729SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2730 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2731 // If we have to handle denormals, scale up the input and adjust the result.
2732
2733 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2734 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2735
2736 SDLoc SL(Op);
2737 EVT VT = Op.getValueType();
2738 SDValue Src = Op.getOperand(i: 0);
2739 SDNodeFlags Flags = Op->getFlags();
2740
2741 if (VT == MVT::f16) {
2742 // Nothing in half is a denormal when promoted to f32.
2743 assert(!isTypeLegal(VT));
2744 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src, Flags);
2745 SDValue Log = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT: MVT::f32, Operand: Ext, Flags);
2746 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Log,
2747 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
2748 }
2749
2750 auto [ScaledInput, IsLtSmallestNormal] =
2751 getScaledLogInput(DAG, SL, Src, Flags);
2752 if (!ScaledInput)
2753 return DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: Src, Flags);
2754
2755 SDValue Log2 = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: ScaledInput, Flags);
2756
2757 SDValue ThirtyTwo = DAG.getConstantFP(Val: 32.0, DL: SL, VT);
2758 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2759 SDValue ResultOffset =
2760 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLtSmallestNormal, N2: ThirtyTwo, N3: Zero);
2761 return DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: Log2, N2: ResultOffset, Flags);
2762}
2763
2764static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2765 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2766 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: Y, Flags);
2767 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Mul, N2: C, Flags);
2768}
2769
// Shared lowering for ISD::FLOG and ISD::FLOG10: compute log2(x) with the
// hardware instruction, then convert to the requested base with an
// extended-precision multiply by ln(2) (or ln(2)/ln(10)).
SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDValue X = Op.getOperand(i: 0);
  EVT VT = Op.getValueType();
  SDNodeFlags Flags = Op->getFlags();
  SDLoc DL(Op);
  const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
  assert(IsLog10 || Op.getOpcode() == ISD::FLOG);

  // Fast path: f16, or afn allows the cheap single-multiply expansion.
  if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
    // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
    // depending on !fpmath metadata.

    bool PromoteToF32 = VT == MVT::f16 && (!Flags.hasApproximateFuncs() ||
                                           !isTypeLegal(VT: MVT::f16));

    if (PromoteToF32) {
      // Log and multiply in f32 is always good enough for f16.
      X = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: X, Flags);
    }

    SDValue Lowered = LowerFLOGUnsafe(Op: X, SL: DL, DAG, IsLog10, Flags);
    if (PromoteToF32) {
      return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Lowered,
                         N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32), Flags);
    }

    return Lowered;
  }

  // Pre-scale denormal inputs into the normal range if needed; the scale is
  // compensated for at the end via IsScaled.
  SDValue ScaledInput, IsScaled;
  if (VT == MVT::f16)
    X = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: X, Flags);
  else {
    std::tie(args&: ScaledInput, args&: IsScaled) = getScaledLogInput(DAG, SL: DL, Src: X, Flags);
    if (ScaledInput)
      X = ScaledInput;
  }

  SDValue Y = DAG.getNode(Opcode: AMDGPUISD::LOG, DL, VT, Operand: X, Flags);

  SDValue R;
  if (Subtarget->hasFastFMAF32()) {
    // c+cc are ln(2)/ln(10) to more than 49 bits
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    // c + cc is ln(2) to more than 49 bits
    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    SDValue C = DAG.getConstantFP(Val: IsLog10 ? c_log10 : c_log, DL, VT);
    SDValue CC = DAG.getConstantFP(Val: IsLog10 ? cc_log10 : cc_log, DL, VT);
    // This adds correction terms for which contraction may lead to an increase
    // in the error of the approximation, so disable it.
    Flags.setAllowContract(false);
    // R = Y*C, then recover the rounding error of that product with FMA and
    // fold in the low part of the constant.
    R = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Y, N2: C, Flags);
    SDValue NegR = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: R, Flags);
    SDValue FMA0 = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: Y, N2: C, N3: NegR, Flags);
    SDValue FMA1 = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: Y, N2: CC, N3: FMA0, Flags);
    R = DAG.getNode(Opcode: ISD::FADD, DL, VT, N1: R, N2: FMA1, Flags);
  } else {
    // ch+ct is ln(2)/ln(10) to more than 36 bits
    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    // ch + ct is ln(2) to more than 36 bits
    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    SDValue CH = DAG.getConstantFP(Val: IsLog10 ? ch_log10 : ch_log, DL, VT);
    SDValue CT = DAG.getConstantFP(Val: IsLog10 ? ct_log10 : ct_log, DL, VT);

    // Split Y into a high part with the low 12 mantissa bits cleared plus a
    // low-order remainder, so partial products with the split constant stay
    // accurate without FMA.
    SDValue YAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Y);
    SDValue MaskConst = DAG.getConstant(Val: 0xfffff000, DL, VT: MVT::i32);
    SDValue YHInt = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: YAsInt, N2: MaskConst);
    SDValue YH = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: YHInt);
    SDValue YT = DAG.getNode(Opcode: ISD::FSUB, DL, VT, N1: Y, N2: YH, Flags);
    // This adds correction terms for which contraction may lead to an increase
    // in the error of the approximation, so disable it.
    Flags.setAllowContract(false);
    SDValue YTCT = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: YT, N2: CT, Flags);
    SDValue Mad0 = getMad(DAG, SL: DL, VT, X: YH, Y: CT, C: YTCT, Flags);
    SDValue Mad1 = getMad(DAG, SL: DL, VT, X: YT, Y: CH, C: Mad0, Flags);
    R = getMad(DAG, SL: DL, VT, X: YH, Y: CH, C: Mad1);
  }

  const bool IsFiniteOnly = Flags.hasNoNaNs() && Flags.hasNoInfs();

  // TODO: Check if known finite from source value.
  if (!IsFiniteOnly) {
    // For non-finite Y pass the raw hardware log result through instead of
    // the corrected value.
    SDValue IsFinite = getIsFinite(DAG, Src: Y, Flags);
    R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsFinite, N2: R, N3: Y, Flags);
  }

  if (IsScaled) {
    // Undo the 2^32 input scaling by subtracting 32*log(2) in the target
    // base (32*log10(2) or 32*ln(2)).
    SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL, VT);
    SDValue ShiftK =
        DAG.getConstantFP(Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
    SDValue Shift =
        DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsScaled, N2: ShiftK, N3: Zero, Flags);
    R = DAG.getNode(Opcode: ISD::FSUB, DL, VT, N1: R, N2: Shift, Flags);
  }

  return R;
}
2876
// log10 shares the generic log expansion; LowerFLOGCommon selects the base
// from the node's opcode.
SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
  return LowerFLOGCommon(Op, DAG);
}
2880
// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
// promote f16 operation.
SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
                                              SelectionDAG &DAG, bool IsLog10,
                                              SDNodeFlags Flags) const {
  EVT VT = Src.getValueType();
  // f32 can use the hardware log2 directly; other types fall back to the
  // generic FLOG2 node.
  unsigned LogOp =
      VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;

  // log(x) = log2(x) * ln(2); log10(x) = log2(x) * (ln(2)/ln(10)).
  double Log2BaseInverted =
      IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;

  if (VT == MVT::f32) {
    auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
    if (ScaledInput) {
      // The input was pre-scaled by 2^32 for denormal handling; compensate by
      // subtracting 32 * log_b(2) from the converted result.
      SDValue LogSrc = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: ScaledInput, Flags);
      SDValue ScaledResultOffset =
          DAG.getConstantFP(Val: -32.0 * Log2BaseInverted, DL: SL, VT);

      SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL: SL, VT);

      SDValue ResultOffset = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsScaled,
                                         N2: ScaledResultOffset, N3: Zero, Flags);

      SDValue Log2Inv = DAG.getConstantFP(Val: Log2BaseInverted, DL: SL, VT);

      // Fold the base conversion and offset into one FMA when it is fast.
      if (Subtarget->hasFastFMAF32())
        return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: LogSrc, N2: Log2Inv, N3: ResultOffset,
                           Flags);
      SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LogSrc, N2: Log2Inv, Flags);
      return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Mul, N2: ResultOffset);
    }
  }

  SDValue Log2Operand = DAG.getNode(Opcode: LogOp, DL: SL, VT, Operand: Src, Flags);
  SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Val: Log2BaseInverted, DL: SL, VT);

  return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Log2Operand, N2: Log2BaseInvertedOperand,
                     Flags);
}
2921
2922SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2923 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2924 // If we have to handle denormals, scale up the input and adjust the result.
2925
2926 SDLoc SL(Op);
2927 EVT VT = Op.getValueType();
2928 SDValue Src = Op.getOperand(i: 0);
2929 SDNodeFlags Flags = Op->getFlags();
2930
2931 if (VT == MVT::f16) {
2932 // Nothing in half is a denormal when promoted to f32.
2933 assert(!isTypeLegal(MVT::f16));
2934 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src, Flags);
2935 SDValue Log = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT: MVT::f32, Operand: Ext, Flags);
2936 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Log,
2937 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
2938 }
2939
2940 assert(VT == MVT::f32);
2941
2942 if (!needsDenormHandlingF32(DAG, Src, Flags))
2943 return DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT: MVT::f32, Operand: Src, Flags);
2944
2945 // bool needs_scaling = x < -0x1.f80000p+6f;
2946 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2947
2948 // -nextafter(128.0, -1)
2949 SDValue RangeCheckConst = DAG.getConstantFP(Val: -0x1.f80000p+6f, DL: SL, VT);
2950
2951 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2952
2953 SDValue NeedsScaling =
2954 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: RangeCheckConst, Cond: ISD::SETOLT);
2955
2956 SDValue SixtyFour = DAG.getConstantFP(Val: 0x1.0p+6f, DL: SL, VT);
2957 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2958
2959 SDValue AddOffset =
2960 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: SixtyFour, N3: Zero);
2961
2962 SDValue AddInput = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Src, N2: AddOffset, Flags);
2963 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: AddInput, Flags);
2964
2965 SDValue TwoExpNeg64 = DAG.getConstantFP(Val: 0x1.0p-64f, DL: SL, VT);
2966 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2967 SDValue ResultScale =
2968 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: TwoExpNeg64, N3: One);
2969
2970 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2, N2: ResultScale, Flags);
2971}
2972
2973SDValue AMDGPUTargetLowering::lowerFEXPUnsafeImpl(SDValue X, const SDLoc &SL,
2974 SelectionDAG &DAG,
2975 SDNodeFlags Flags,
2976 bool IsExp10) const {
2977 // exp(x) -> exp2(M_LOG2E_F * x);
2978 // exp10(x) -> exp2(log2(10) * x);
2979 EVT VT = X.getValueType();
2980 SDValue Const =
2981 DAG.getConstantFP(Val: IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, DL: SL, VT);
2982
2983 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: Const, Flags);
2984 return DAG.getNode(Opcode: VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2985 : (unsigned)ISD::FEXP2,
2986 DL: SL, VT, Operand: Mul, Flags);
2987}
2988
2989SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2990 SelectionDAG &DAG,
2991 SDNodeFlags Flags) const {
2992 EVT VT = X.getValueType();
2993 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, Src: X, Flags))
2994 return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);
2995
2996 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2997
2998 SDValue Threshold = DAG.getConstantFP(Val: -0x1.5d58a0p+6f, DL: SL, VT);
2999 SDValue NeedsScaling = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: Threshold, Cond: ISD::SETOLT);
3000
3001 SDValue ScaleOffset = DAG.getConstantFP(Val: 0x1.0p+6f, DL: SL, VT);
3002
3003 SDValue ScaledX = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: X, N2: ScaleOffset, Flags);
3004
3005 SDValue AdjustedX =
3006 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: ScaledX, N3: X);
3007
3008 const SDValue Log2E = DAG.getConstantFP(Val: numbers::log2e, DL: SL, VT);
3009 SDValue ExpInput = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: Log2E, Flags);
3010
3011 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: ExpInput, Flags);
3012
3013 SDValue ResultScaleFactor = DAG.getConstantFP(Val: 0x1.969d48p-93f, DL: SL, VT);
3014 SDValue AdjustedResult =
3015 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2, N2: ResultScaleFactor, Flags);
3016
3017 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: AdjustedResult, N3: Exp2,
3018 Flags);
3019}
3020
/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
/// handled correctly.
///
/// There is no hardware exp10, so 10^x is decomposed as
///   10^x = 2^(x * log2(10)) = exp2(x * K0) * exp2(x * K1)
/// where K0 + K1 is a two-term split of log2(10): K0 carries the high bits
/// and K1 the low-order correction, so the product keeps more precision than
/// a single f32 multiply by log2(10) would.
SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
                                                SelectionDAG &DAG,
                                                SDNodeFlags Flags) const {
  const EVT VT = X.getValueType();

  // Hardware exp2 for f32, generic FEXP2 node for every other type.
  const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
                                         : static_cast<unsigned>(ISD::FEXP2);

  // Fast path: no f32 denormal results need to be produced.
  if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, Src: X, Flags)) {
    // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
    SDValue K0 = DAG.getConstantFP(Val: 0x1.a92000p+1f, DL: SL, VT);
    SDValue K1 = DAG.getConstantFP(Val: 0x1.4f0978p-11f, DL: SL, VT);

    SDValue Mul0 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: K0, Flags);
    SDValue Exp2_0 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul0, Flags);
    SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: K1, Flags);
    SDValue Exp2_1 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul1, Flags);
    return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2_0, N2: Exp2_1);
  }

  // Denormal-producing range: shift small inputs up by 32 before the exp2,
  // then scale the result back down by 10^-32.
  //
  // bool s = x < -0x1.2f7030p+5f;
  // x += s ? 0x1.0p+5f : 0.0f;
  // exp10 = exp2(x * 0x1.a92000p+1f) *
  //         exp2(x * 0x1.4f0978p-11f) *
  //         (s ? 0x1.9f623ep-107f : 1.0f);

  EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);

  SDValue Threshold = DAG.getConstantFP(Val: -0x1.2f7030p+5f, DL: SL, VT);
  SDValue NeedsScaling = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: Threshold, Cond: ISD::SETOLT);

  SDValue ScaleOffset = DAG.getConstantFP(Val: 0x1.0p+5f, DL: SL, VT);
  SDValue ScaledX = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: X, N2: ScaleOffset, Flags);
  SDValue AdjustedX =
      DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: ScaledX, N3: X);

  SDValue K0 = DAG.getConstantFP(Val: 0x1.a92000p+1f, DL: SL, VT);
  SDValue K1 = DAG.getConstantFP(Val: 0x1.4f0978p-11f, DL: SL, VT);

  SDValue Mul0 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: K0, Flags);
  SDValue Exp2_0 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul0, Flags);
  SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: K1, Flags);
  SDValue Exp2_1 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul1, Flags);

  SDValue MulExps = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2_0, N2: Exp2_1, Flags);

  // 0x1.9f623ep-107f = 10^-32, compensating for the +32 input shift.
  SDValue ResultScaleFactor = DAG.getConstantFP(Val: 0x1.9f623ep-107f, DL: SL, VT);
  SDValue AdjustedResult =
      DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: MulExps, N2: ResultScaleFactor, Flags);

  return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: AdjustedResult, N3: MulExps,
                     Flags);
}
3076
/// Lower ISD::FEXP / ISD::FEXP10 for f16 and f32.
///
/// Without approximate-function fast math, f32 uses an extended-precision
/// argument reduction (described in the block comment below) followed by the
/// hardware exp2, plus explicit underflow-to-zero and (unless nnan/ninf-style
/// flags allow otherwise) overflow-to-inf selects.
SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc SL(Op);
  SDValue X = Op.getOperand(i: 0);
  SDNodeFlags Flags = Op->getFlags();
  const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;

  // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
  // library behavior. Also, is known-not-daz source sufficient?
  if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
    return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
                   : lowerFEXPUnsafe(X, SL, DAG, Flags);
  }

  if (VT.getScalarType() == MVT::f16) {
    // Vector f16 is left for generic handling (scalarization).
    if (VT.isVector())
      return SDValue();

    // Nothing in half is a denormal when promoted to f32.
    //
    // exp(f16 x) ->
    //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))
    //
    // exp10(f16 x) ->
    //   fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
    SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: X, Flags);
    SDValue Lowered = lowerFEXPUnsafeImpl(X: Ext, SL, DAG, Flags, IsExp10);
    return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Lowered,
                       N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
  }

  assert(VT == MVT::f32);

  // Algorithm:
  //
  // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
  //
  // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
  // n = 64*m + j,   0 <= j < 64
  //
  // e^x = 2^((64*m + j + f)/64)
  //     = (2^m) * (2^(j/64)) * 2^(f/64)
  //     = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
  //
  // f = x*(64/ln(2)) - n
  // r = f*(ln(2)/64) = x - n*(ln(2)/64)
  //
  // e^x = (2^m) * (2^(j/64)) * e^r
  //
  // (2^(j/64)) is precomputed
  //
  // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  // e^r = 1 + q
  //
  // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  //
  // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
  SDNodeFlags FlagsNoContract = Flags;
  FlagsNoContract.setAllowContract(false);

  // PH + PL is an extended-precision approximation of x * log2(base):
  // PH holds the high part of the product, PL the low-order correction.
  SDValue PH, PL;
  if (Subtarget->hasFastFMAF32()) {
    // FMA path: the constant is split as c + cc, and the product error of
    // x*c is recovered exactly with fma(x, c, -PH).
    const float c_exp = numbers::log2ef;
    const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    SDValue C = DAG.getConstantFP(Val: IsExp10 ? c_exp10 : c_exp, DL: SL, VT);
    SDValue CC = DAG.getConstantFP(Val: IsExp10 ? cc_exp10 : cc_exp, DL: SL, VT);

    PH = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: C, Flags);
    SDValue NegPH = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: PH, Flags);
    SDValue FMA0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: X, N2: C, N3: NegPH, Flags);
    PL = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: X, N2: CC, N3: FMA0, Flags);
  } else {
    // No fast FMA: split both x (by masking off the low 12 mantissa bits)
    // and the constant (ch + cl) so all partial products are exact or small.
    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    SDValue CH = DAG.getConstantFP(Val: IsExp10 ? ch_exp10 : ch_exp, DL: SL, VT);
    SDValue CL = DAG.getConstantFP(Val: IsExp10 ? cl_exp10 : cl_exp, DL: SL, VT);

    SDValue XAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: X);
    SDValue MaskConst = DAG.getConstant(Val: 0xfffff000, DL: SL, VT: MVT::i32);
    SDValue XHAsInt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: XAsInt, N2: MaskConst);
    SDValue XH = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: XHAsInt);
    SDValue XL = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: X, N2: XH, Flags);

    PH = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: XH, N2: CH, Flags);

    SDValue XLCL = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: XL, N2: CL, Flags);
    SDValue Mad0 = getMad(DAG, SL, VT, X: XL, Y: CH, C: XLCL, Flags);
    PL = getMad(DAG, SL, VT, X: XH, Y: CL, C: Mad0, Flags);
  }

  // Integer part of the exponent (round to nearest even).
  SDValue E = DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SL, VT, Operand: PH, Flags);

  // It is unsafe to contract this fsub into the PH multiply.
  SDValue PHSubE = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: PH, N2: E, Flags: FlagsNoContract);

  // Fractional remainder A = (PH - E) + PL, then 2^A * 2^E.
  SDValue A = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: PHSubE, N2: PL, Flags);
  SDValue IntE = DAG.getNode(Opcode: ISD::FP_TO_SINT, DL: SL, VT: MVT::i32, Operand: E);
  SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: A, Flags);

  SDValue R = DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT, N1: Exp2, N2: IntE, Flags);

  // Inputs below this bound underflow to +0 (exp10/exp thresholds differ).
  SDValue UnderflowCheckConst =
      DAG.getConstantFP(Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, DL: SL, VT);

  EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
  SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
  SDValue Underflow =
      DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: UnderflowCheckConst, Cond: ISD::SETOLT);

  R = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Underflow, N2: Zero, N3: R);

  // With ninf, the overflow-to-inf clamp can be skipped entirely.
  if (!Flags.hasNoInfs()) {
    SDValue OverflowCheckConst =
        DAG.getConstantFP(Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, DL: SL, VT);
    SDValue Overflow =
        DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: OverflowCheckConst, Cond: ISD::SETOGT);
    SDValue Inf =
        DAG.getConstantFP(Val: APFloat::getInf(Sem: APFloat::IEEEsingle()), DL: SL, VT);
    R = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Overflow, N2: Inf, N3: R);
  }

  return R;
}
3207
3208static bool isCtlzOpc(unsigned Opc) {
3209 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3210}
3211
3212static bool isCttzOpc(unsigned Opc) {
3213 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3214}
3215
3216SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3217 SelectionDAG &DAG) const {
3218 auto SL = SDLoc(Op);
3219 auto Opc = Op.getOpcode();
3220 auto Arg = Op.getOperand(i: 0u);
3221 auto ResultVT = Op.getValueType();
3222
3223 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3224 return {};
3225
3226 assert(isCtlzOpc(Opc));
3227 assert(ResultVT == Arg.getValueType());
3228
3229 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3230 SDValue NumExtBits = DAG.getConstant(Val: 32u - NumBits, DL: SL, VT: MVT::i32);
3231 SDValue NewOp;
3232
3233 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3234 NewOp = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Arg);
3235 NewOp = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: NewOp, N2: NumExtBits);
3236 NewOp = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, Operand: NewOp);
3237 } else {
3238 NewOp = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Arg);
3239 NewOp = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, Operand: NewOp);
3240 NewOp = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: NewOp, N2: NumExtBits);
3241 }
3242
3243 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ResultVT, Operand: NewOp);
3244}
3245
/// Lower i32/i64 ctlz/cttz (and their _ZERO_UNDEF variants) to the AMDGPU
/// find-first-bit nodes (FFBH_U32 / FFBL_B32). For divergent i64 inputs the
/// value is split into halves, each counted in 32 bits, and the counts are
/// combined with umin; uniform i64 uses the scalar 64-bit instructions
/// directly.
SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(i: 0);

  assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
  bool Ctlz = isCtlzOpc(Opc: Op.getOpcode());
  unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;

  // _ZERO_UNDEF variants don't need the clamp on an all-zero input.
  bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
                   Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
  // Uniform i64 can use the scalar 64-bit find-first-bit instructions.
  bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;

  if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
    // (ctlz hi:lo) -> (umin (ffbh src), 32)
    // (cttz hi:lo) -> (umin (ffbl src), 32)
    // (ctlz_zero_undef src) -> (ffbh src)
    // (cttz_zero_undef src) -> (ffbl src)

    // 64-bit scalar version produce 32-bit result
    // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
    // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
    // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
    // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
    SDValue NewOpr = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Src);
    if (!ZeroUndef) {
      // ffbh/ffbl return -1 (all ones) on a zero input; clamp that to the
      // bit width with umin.
      const SDValue ConstVal = DAG.getConstant(
          Val: Op.getValueType().getScalarSizeInBits(), DL: SL, VT: MVT::i32);
      NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewOpr, N2: ConstVal);
    }
    return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: Src.getValueType(), Operand: NewOpr);
  }

  // Divergent i64: count each 32-bit half separately.
  SDValue Lo, Hi;
  std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);

  SDValue OprLo = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Lo);
  SDValue OprHi = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Hi);

  // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
  // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
  // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
  // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))

  // uaddsat keeps the -1 (all-ones) "half was zero" result saturated instead
  // of wrapping; the _ZERO_UNDEF variants can use a plain add.
  unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
  const SDValue Const32 = DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32);
  if (Ctlz)
    OprLo = DAG.getNode(Opcode: AddOpc, DL: SL, VT: MVT::i32, N1: OprLo, N2: Const32);
  else
    OprHi = DAG.getNode(Opcode: AddOpc, DL: SL, VT: MVT::i32, N1: OprHi, N2: Const32);

  SDValue NewOpr;
  NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: OprLo, N2: OprHi);
  if (!ZeroUndef) {
    const SDValue Const64 = DAG.getConstant(Val: 64, DL: SL, VT: MVT::i32);
    NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewOpr, N2: Const64);
  }

  return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i64, Operand: NewOpr);
}
3305
/// Lower i64 -> f32 conversion (\p Signed selects the signed variant) by
/// normalizing the 64-bit value so the significant bits fit a native 32-bit
/// conversion, then scaling the result back by the shift amount.
SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  // The regular method converting a 64-bit integer to float roughly consists of
  // 2 steps: normalization and rounding. In fact, after normalization, the
  // conversion from a 64-bit integer to a float is essentially the same as the
  // one from a 32-bit integer. The only difference is that it has more
  // trailing bits to be rounded. To leverage the native 32-bit conversion, a
  // 64-bit integer could be preprocessed and fit into a 32-bit integer then
  // converted into the correct float number. The basic steps for the unsigned
  // conversion are illustrated in the following pseudo code:
  //
  // f32 uitofp(i64 u) {
  //   i32 hi, lo = split(u);
  //   // Only count the leading zeros in hi as we have native support of the
  //   // conversion from i32 to f32. If hi is all 0s, the conversion is
  //   // reduced to a 32-bit one automatically.
  //   i32 shamt = clz(hi); // Return 32 if hi is all 0s.
  //   u <<= shamt;
  //   hi, lo = split(u);
  //   hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
  //   // convert it as a 32-bit integer and scale the result back.
  //   return uitofp(hi) * 2^(32 - shamt);
  // }
  //
  // The signed one follows the same principle but uses 'ffbh_i32' to count its
  // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
  // converted instead followed by negation based its sign bit.

  SDLoc SL(Op);
  SDValue Src = Op.getOperand(i: 0);

  SDValue Lo, Hi;
  std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
  SDValue Sign;
  SDValue ShAmt;
  if (Signed && Subtarget->isGCN()) {
    // We also need to consider the sign bit in Lo if Hi has just sign bits,
    // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
    // account. That is, the maximal shift is
    // - 32 if Lo and Hi have opposite signs;
    // - 33 if Lo and Hi have the same sign.
    //
    // Or, MaxShAmt = 33 + OppositeSign, where
    //
    // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
    // - -1 if Lo and Hi have opposite signs; and
    // -  0 otherwise.
    //
    // All in all, ShAmt is calculated as
    //
    //  umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
    //
    // or
    //
    //  umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
    //
    // to reduce the critical path.
    SDValue OppositeSign = DAG.getNode(
        Opcode: ISD::SRA, DL: SL, VT: MVT::i32, N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: Lo, N2: Hi),
        N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
    SDValue MaxShAmt =
        DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32),
                    N2: OppositeSign);
    // Count the leading sign bits.
    ShAmt = DAG.getNode(Opcode: AMDGPUISD::FFBH_I32, DL: SL, VT: MVT::i32, Operand: Hi);
    // Different from unsigned conversion, the shift should be one bit less to
    // preserve the sign bit.
    ShAmt = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: ShAmt,
                        N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
    ShAmt = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: ShAmt, N2: MaxShAmt);
  } else {
    if (Signed) {
      // Without 'ffbh_i32', only leading zeros could be counted. Take the
      // absolute value first.
      Sign = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i64, N1: Src,
                         N2: DAG.getConstant(Val: 63, DL: SL, VT: MVT::i64));
      // abs(x) computed as (x + sign) ^ sign, where sign is all 0s or all 1s.
      SDValue Abs =
          DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i64,
                      N1: DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i64, N1: Src, N2: Sign), N2: Sign);
      std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Abs, DAG);
    }
    // Count the leading zeros.
    ShAmt = DAG.getNode(Opcode: ISD::CTLZ, DL: SL, VT: MVT::i32, Operand: Hi);
    // The shift amount for signed integers is [0, 32].
  }
  // Normalize the given 64-bit integer.
  SDValue Norm = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i64, N1: Src, N2: ShAmt);
  // Split it again.
  std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Norm, DAG);
  // Calculate the adjust bit for rounding.
  // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
  SDValue Adjust = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32,
                               N1: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32), N2: Lo);
  // Get the 32-bit normalized integer.
  Norm = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Hi, N2: Adjust);
  // Convert the normalized 32-bit integer into f32.

  bool UseLDEXP = isOperationLegal(Op: ISD::FLDEXP, VT: MVT::f32);
  unsigned Opc = Signed && UseLDEXP ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  SDValue FVal = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::f32, Operand: Norm);

  // Finally, need to scale back the converted floating number as the original
  // 64-bit integer is converted as a 32-bit one.
  ShAmt = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32),
                      N2: ShAmt);
  // On GCN, use LDEXP directly.
  if (UseLDEXP)
    return DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT: MVT::f32, N1: FVal, N2: ShAmt);

  // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
  // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
  // exponent is enough to avoid overflowing into the sign bit.
  SDValue Exp = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: ShAmt,
                            N2: DAG.getConstant(Val: 23, DL: SL, VT: MVT::i32));
  SDValue IVal =
      DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32,
                  N1: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: FVal), N2: Exp);
  if (Signed) {
    // Set the sign bit.
    Sign = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32,
                       N1: DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Sign),
                       N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
    IVal = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: IVal, N2: Sign);
  }
  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: IVal);
}
3432
3433SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3434 bool Signed) const {
3435 SDLoc SL(Op);
3436 SDValue Src = Op.getOperand(i: 0);
3437
3438 SDValue Lo, Hi;
3439 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3440
3441 SDValue CvtHi = DAG.getNode(Opcode: Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3442 DL: SL, VT: MVT::f64, Operand: Hi);
3443
3444 SDValue CvtLo = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL: SL, VT: MVT::f64, Operand: Lo);
3445
3446 SDValue LdExp = DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT: MVT::f64, N1: CvtHi,
3447 N2: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32));
3448 // TODO: Should this propagate fast-math-flags?
3449 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: LdExp, N2: CvtLo);
3450}
3451
3452SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3453 SelectionDAG &DAG) const {
3454 // TODO: Factor out code common with LowerSINT_TO_FP.
3455 EVT DestVT = Op.getValueType();
3456 SDValue Src = Op.getOperand(i: 0);
3457 EVT SrcVT = Src.getValueType();
3458
3459 if (SrcVT == MVT::i16) {
3460 if (DestVT == MVT::f16)
3461 return Op;
3462 SDLoc DL(Op);
3463
3464 // Promote src to i32
3465 SDValue Ext = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: Src);
3466 return DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: DestVT, Operand: Ext);
3467 }
3468
3469 if (DestVT == MVT::bf16) {
3470 SDLoc SL(Op);
3471 SDValue ToF32 = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL: SL, VT: MVT::f32, Operand: Src);
3472 SDValue FPRoundFlag = DAG.getIntPtrConstant(Val: 0, DL: SL, /*isTarget=*/true);
3473 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::bf16, N1: ToF32, N2: FPRoundFlag);
3474 }
3475
3476 if (SrcVT != MVT::i64)
3477 return Op;
3478
3479 if (DestVT == MVT::f16 && isTypeLegal(VT: MVT::f16)) {
3480 SDLoc DL(Op);
3481
3482 SDValue IntToFp32 = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: MVT::f32, Operand: Src);
3483 SDValue FPRoundFlag =
3484 DAG.getIntPtrConstant(Val: 0, DL: SDLoc(Op), /*isTarget=*/true);
3485 SDValue FPRound =
3486 DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: IntToFp32, N2: FPRoundFlag);
3487
3488 return FPRound;
3489 }
3490
3491 if (DestVT == MVT::f32)
3492 return LowerINT_TO_FP32(Op, DAG, Signed: false);
3493
3494 assert(DestVT == MVT::f64);
3495 return LowerINT_TO_FP64(Op, DAG, Signed: false);
3496}
3497
3498SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3499 SelectionDAG &DAG) const {
3500 EVT DestVT = Op.getValueType();
3501
3502 SDValue Src = Op.getOperand(i: 0);
3503 EVT SrcVT = Src.getValueType();
3504
3505 if (SrcVT == MVT::i16) {
3506 if (DestVT == MVT::f16)
3507 return Op;
3508
3509 SDLoc DL(Op);
3510 // Promote src to i32
3511 SDValue Ext = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: MVT::i32, Operand: Src);
3512 return DAG.getNode(Opcode: ISD::SINT_TO_FP, DL, VT: DestVT, Operand: Ext);
3513 }
3514
3515 if (DestVT == MVT::bf16) {
3516 SDLoc SL(Op);
3517 SDValue ToF32 = DAG.getNode(Opcode: ISD::SINT_TO_FP, DL: SL, VT: MVT::f32, Operand: Src);
3518 SDValue FPRoundFlag = DAG.getIntPtrConstant(Val: 0, DL: SL, /*isTarget=*/true);
3519 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::bf16, N1: ToF32, N2: FPRoundFlag);
3520 }
3521
3522 if (SrcVT != MVT::i64)
3523 return Op;
3524
3525 // TODO: Factor out code common with LowerUINT_TO_FP.
3526
3527 if (DestVT == MVT::f16 && isTypeLegal(VT: MVT::f16)) {
3528 SDLoc DL(Op);
3529 SDValue Src = Op.getOperand(i: 0);
3530
3531 SDValue IntToFp32 = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: MVT::f32, Operand: Src);
3532 SDValue FPRoundFlag =
3533 DAG.getIntPtrConstant(Val: 0, DL: SDLoc(Op), /*isTarget=*/true);
3534 SDValue FPRound =
3535 DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: IntToFp32, N2: FPRoundFlag);
3536
3537 return FPRound;
3538 }
3539
3540 if (DestVT == MVT::f32)
3541 return LowerINT_TO_FP32(Op, DAG, Signed: true);
3542
3543 assert(DestVT == MVT::f64);
3544 return LowerINT_TO_FP64(Op, DAG, Signed: true);
3545}
3546
/// Lower f32/f64 -> i64 conversion (\p Signed selects fptosi vs fptoui) by
/// splitting the truncated value into two 32-bit halves that each fit a
/// native 32-bit fp-to-int conversion.
SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  SDLoc SL(Op);

  SDValue Src = Op.getOperand(i: 0);
  EVT SrcVT = Src.getValueType();

  assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //     tf := trunc(val);
  //    hif := floor(tf * 2^-32);
  //    lof := tf - hif * 2^32; // lof is always positive due to floor.
  //     hi := fptoi(hif);
  //     lo := fptoi(lof);
  //
  SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: SrcVT, Operand: Src);
  SDValue Sign;
  if (Signed && SrcVT == MVT::f32) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, We need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    //
    // Arithmetic-shift the sign bit down to get all-0s or all-1s.
    Sign = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i32,
                       N1: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Trunc),
                       N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
    Trunc = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: SrcVT, Operand: Trunc);
  }

  // K0 = 2^-32 and K1 = -2^32 in the source type, built from exact bit
  // patterns so the constants are precise.
  SDValue K0, K1;
  if (SrcVT == MVT::f64) {
    K0 = DAG.getConstantFP(
        Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), DL: SL,
        VT: SrcVT);
    K1 = DAG.getConstantFP(
        Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), DL: SL,
        VT: SrcVT);
  } else {
    K0 = DAG.getConstantFP(
        Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), DL: SL, VT: SrcVT);
    K1 = DAG.getConstantFP(
        Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), DL: SL, VT: SrcVT);
  }
  // TODO: Should this propagate fast-math-flags?
  SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: SrcVT, N1: Trunc, N2: K0);

  SDValue FloorMul = DAG.getNode(Opcode: ISD::FFLOOR, DL: SL, VT: SrcVT, Operand: Mul);

  // lof = fma(hif, -2^32, tf) = tf - hif * 2^32.
  SDValue Fma = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: SrcVT, N1: FloorMul, N2: K1, N3: Trunc);

  SDValue Hi = DAG.getNode(Opcode: (Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
                                                             : ISD::FP_TO_UINT,
                           DL: SL, VT: MVT::i32, Operand: FloorMul);
  SDValue Lo = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL: SL, VT: MVT::i32, Operand: Fma);

  SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64,
                               Operand: DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Lo, Hi}));

  if (Signed && SrcVT == MVT::f32) {
    assert(Sign);
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64,
                       Operand: DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Sign, Sign}));
    // r := xor(r, sign) - sign;
    Result =
        DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i64,
                    N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i64, N1: Result, N2: Sign), N2: Sign);
  }

  return Result;
}
3621
3622SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3623 SDLoc DL(Op);
3624 SDValue N0 = Op.getOperand(i: 0);
3625
3626 // Convert to target node to get known bits
3627 if (N0.getValueType() == MVT::f32)
3628 return DAG.getNode(Opcode: AMDGPUISD::FP_TO_FP16, DL, VT: Op.getValueType(), Operand: N0);
3629
3630 if (Op->getFlags().hasApproximateFuncs()) {
3631 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3632 return SDValue();
3633 }
3634
3635 return LowerF64ToF16Safe(Src: N0, DL, DAG);
3636}
3637
3638// return node in i32
3639SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL,
3640 SelectionDAG &DAG) const {
3641 assert(Src.getSimpleValueType() == MVT::f64);
3642
3643 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3644 // TODO: We can generate better code for True16.
3645 const unsigned ExpMask = 0x7ff;
3646 const unsigned ExpBiasf64 = 1023;
3647 const unsigned ExpBiasf16 = 15;
3648 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
3649 SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
3650 SDValue U = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: Src);
3651 SDValue UH = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64, N1: U,
3652 N2: DAG.getConstant(Val: 32, DL, VT: MVT::i64));
3653 UH = DAG.getZExtOrTrunc(Op: UH, DL, VT: MVT::i32);
3654 U = DAG.getZExtOrTrunc(Op: U, DL, VT: MVT::i32);
3655 SDValue E = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3656 N2: DAG.getConstant(Val: 20, DL, VT: MVT::i64));
3657 E = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: E,
3658 N2: DAG.getConstant(Val: ExpMask, DL, VT: MVT::i32));
3659 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3660 // add the f16 bias (15) to get the biased exponent for the f16 format.
3661 E = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: E,
3662 N2: DAG.getConstant(Val: -ExpBiasf64 + ExpBiasf16, DL, VT: MVT::i32));
3663
3664 SDValue M = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3665 N2: DAG.getConstant(Val: 8, DL, VT: MVT::i32));
3666 M = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: M,
3667 N2: DAG.getConstant(Val: 0xffe, DL, VT: MVT::i32));
3668
3669 SDValue MaskedSig = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: UH,
3670 N2: DAG.getConstant(Val: 0x1ff, DL, VT: MVT::i32));
3671 MaskedSig = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: MaskedSig, N2: U);
3672
3673 SDValue Lo40Set = DAG.getSelectCC(DL, LHS: MaskedSig, RHS: Zero, True: Zero, False: One, Cond: ISD::SETEQ);
3674 M = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M, N2: Lo40Set);
3675
3676 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3677 SDValue I = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32,
3678 N1: DAG.getSelectCC(DL, LHS: M, RHS: Zero, True: DAG.getConstant(Val: 0x0200, DL, VT: MVT::i32),
3679 False: Zero, Cond: ISD::SETNE), N2: DAG.getConstant(Val: 0x7c00, DL, VT: MVT::i32));
3680
3681 // N = M | (E << 12);
3682 SDValue N = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M,
3683 N2: DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: E,
3684 N2: DAG.getConstant(Val: 12, DL, VT: MVT::i32)));
3685
3686 // B = clamp(1-E, 0, 13);
3687 SDValue OneSubExp = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i32,
3688 N1: One, N2: E);
3689 SDValue B = DAG.getNode(Opcode: ISD::SMAX, DL, VT: MVT::i32, N1: OneSubExp, N2: Zero);
3690 B = DAG.getNode(Opcode: ISD::SMIN, DL, VT: MVT::i32, N1: B,
3691 N2: DAG.getConstant(Val: 13, DL, VT: MVT::i32));
3692
3693 SDValue SigSetHigh = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M,
3694 N2: DAG.getConstant(Val: 0x1000, DL, VT: MVT::i32));
3695
3696 SDValue D = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: SigSetHigh, N2: B);
3697 SDValue D0 = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: D, N2: B);
3698 SDValue D1 = DAG.getSelectCC(DL, LHS: D0, RHS: SigSetHigh, True: One, False: Zero, Cond: ISD::SETNE);
3699 D = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: D, N2: D1);
3700
3701 SDValue V = DAG.getSelectCC(DL, LHS: E, RHS: One, True: D, False: N, Cond: ISD::SETLT);
3702 SDValue VLow3 = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: V,
3703 N2: DAG.getConstant(Val: 0x7, DL, VT: MVT::i32));
3704 V = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: V,
3705 N2: DAG.getConstant(Val: 2, DL, VT: MVT::i32));
3706 SDValue V0 = DAG.getSelectCC(DL, LHS: VLow3, RHS: DAG.getConstant(Val: 3, DL, VT: MVT::i32),
3707 True: One, False: Zero, Cond: ISD::SETEQ);
3708 SDValue V1 = DAG.getSelectCC(DL, LHS: VLow3, RHS: DAG.getConstant(Val: 5, DL, VT: MVT::i32),
3709 True: One, False: Zero, Cond: ISD::SETGT);
3710 V1 = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: V0, N2: V1);
3711 V = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: V, N2: V1);
3712
3713 V = DAG.getSelectCC(DL, LHS: E, RHS: DAG.getConstant(Val: 30, DL, VT: MVT::i32),
3714 True: DAG.getConstant(Val: 0x7c00, DL, VT: MVT::i32), False: V, Cond: ISD::SETGT);
3715 V = DAG.getSelectCC(DL, LHS: E, RHS: DAG.getConstant(Val: 1039, DL, VT: MVT::i32),
3716 True: I, False: V, Cond: ISD::SETEQ);
3717
3718 // Extract the sign bit.
3719 SDValue Sign = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3720 N2: DAG.getConstant(Val: 16, DL, VT: MVT::i32));
3721 Sign = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: Sign,
3722 N2: DAG.getConstant(Val: 0x8000, DL, VT: MVT::i32));
3723
3724 return DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Sign, N2: V);
3725}
3726
3727SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3728 SelectionDAG &DAG) const {
3729 SDValue Src = Op.getOperand(i: 0);
3730 unsigned OpOpcode = Op.getOpcode();
3731 EVT SrcVT = Src.getValueType();
3732 EVT DestVT = Op.getValueType();
3733
3734 // Will be selected natively
3735 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3736 return Op;
3737
3738 if (SrcVT == MVT::bf16) {
3739 SDLoc DL(Op);
3740 SDValue PromotedSrc = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: Src);
3741 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DestVT, Operand: PromotedSrc);
3742 }
3743
3744 // Promote i16 to i32
3745 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3746 SDLoc DL(Op);
3747
3748 SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, Operand: Src);
3749 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToInt32);
3750 }
3751
3752 if (DestVT != MVT::i64)
3753 return Op;
3754
3755 if (SrcVT == MVT::f16 ||
3756 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3757 SDLoc DL(Op);
3758
3759 SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, Operand: Src);
3760 unsigned Ext =
3761 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3762 return DAG.getNode(Opcode: Ext, DL, VT: MVT::i64, Operand: FpToInt32);
3763 }
3764
3765 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3766 return LowerFP_TO_INT64(Op, DAG, Signed: OpOpcode == ISD::FP_TO_SINT);
3767
3768 return SDValue();
3769}
3770
3771SDValue AMDGPUTargetLowering::LowerFP_TO_INT_SAT(const SDValue Op,
3772 SelectionDAG &DAG) const {
3773 SDValue Src = Op.getOperand(i: 0);
3774 unsigned OpOpcode = Op.getOpcode();
3775 EVT SrcVT = Src.getValueType();
3776 EVT DstVT = Op.getValueType();
3777 SDValue SatVTOp = Op.getNode()->getOperand(Num: 1);
3778 EVT SatVT = cast<VTSDNode>(Val&: SatVTOp)->getVT();
3779 SDLoc DL(Op);
3780
3781 uint64_t DstWidth = DstVT.getScalarSizeInBits();
3782 uint64_t SatWidth = SatVT.getScalarSizeInBits();
3783 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
3784
3785 // Will be selected natively
3786 if (DstVT == MVT::i32 && SatWidth == DstWidth &&
3787 (SrcVT == MVT::f32 || SrcVT == MVT::f64))
3788 return Op;
3789
3790 const SDValue Int32VT = DAG.getValueType(MVT::i32);
3791
3792 // Perform all saturation at i32 and truncate
3793 if (SatWidth < DstWidth) {
3794 const uint64_t Int32Width = 32;
3795 SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, N1: Src, N2: Int32VT);
3796 SDValue Int32SatVal;
3797
3798 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
3799 SDValue MinConst = DAG.getConstant(
3800 Val: APInt::getSignedMaxValue(numBits: SatWidth).sext(width: Int32Width), DL, VT: MVT::i32);
3801 SDValue MaxConst = DAG.getConstant(
3802 Val: APInt::getSignedMinValue(numBits: SatWidth).sext(width: Int32Width), DL, VT: MVT::i32);
3803 SDValue MinVal =
3804 DAG.getNode(Opcode: ISD::SMIN, DL, VT: MVT::i32, N1: FpToInt32, N2: MinConst);
3805 Int32SatVal = DAG.getNode(Opcode: ISD::SMAX, DL, VT: MVT::i32, N1: MinVal, N2: MaxConst);
3806 } else {
3807 SDValue MinConst = DAG.getConstant(
3808 Val: APInt::getMaxValue(numBits: SatWidth).zext(width: Int32Width), DL, VT: MVT::i32);
3809 Int32SatVal = DAG.getNode(Opcode: ISD::UMIN, DL, VT: MVT::i32, N1: FpToInt32, N2: MinConst);
3810 }
3811
3812 if (DstWidth == Int32Width)
3813 return Int32SatVal;
3814 if (DstWidth < Int32Width)
3815 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: Int32SatVal);
3816
3817 // DstWidth > Int32Width
3818 const unsigned Ext =
3819 OpOpcode == ISD::FP_TO_SINT_SAT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3820 return DAG.getNode(Opcode: Ext, DL, VT: DstVT, Operand: FpToInt32);
3821 }
3822
3823 // SatWidth == DstWidth
3824
3825 // Saturate at i32 for i64 dst and 16b src (will invoke f16 promotion below)
3826 if (DstVT == MVT::i64 &&
3827 (SrcVT == MVT::f16 || SrcVT == MVT::bf16 ||
3828 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP))) {
3829 return DAG.getNode(Opcode: OpOpcode, DL, VT: DstVT, N1: Src, N2: Int32VT);
3830 }
3831
3832 // Promote f16/bf16 src to f32
3833 if (SrcVT == MVT::f16 || SrcVT == MVT::bf16) {
3834 SDValue PromotedSrc = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: Src);
3835 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: PromotedSrc, N2: SatVTOp);
3836 }
3837
3838 // Promote sub-i32 dst to i32 with sub-i32 saturation
3839 if (DstWidth < 32) {
3840 // Note: this triggers SatWidth < DstWidth above to generate saturated
3841 // truncate by requesting MVT::i32 destination with SatWidth < 32.
3842 SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, N1: Src, N2: SatVTOp);
3843 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: FpToInt32);
3844 }
3845
3846 // TODO: can we implement i64 dst for f32/f64?
3847
3848 return SDValue();
3849}
3850
3851SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3852 SelectionDAG &DAG) const {
3853 EVT ExtraVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
3854 MVT VT = Op.getSimpleValueType();
3855 MVT ScalarVT = VT.getScalarType();
3856
3857 assert(VT.isVector());
3858
3859 SDValue Src = Op.getOperand(i: 0);
3860 SDLoc DL(Op);
3861
3862 // TODO: Don't scalarize on Evergreen?
3863 unsigned NElts = VT.getVectorNumElements();
3864 SmallVector<SDValue, 8> Args;
3865 DAG.ExtractVectorElements(Op: Src, Args, Start: 0, Count: NElts);
3866
3867 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3868 for (unsigned I = 0; I < NElts; ++I)
3869 Args[I] = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: ScalarVT, N1: Args[I], N2: VTOp);
3870
3871 return DAG.getBuildVector(VT, DL, Ops: Args);
3872}
3873
3874//===----------------------------------------------------------------------===//
3875// Custom DAG optimizations
3876//===----------------------------------------------------------------------===//
3877
3878static bool isU24(SDValue Op, SelectionDAG &DAG) {
3879 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3880}
3881
3882static bool isI24(SDValue Op, SelectionDAG &DAG) {
3883 EVT VT = Op.getValueType();
3884 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3885 // as unsigned 24-bit values.
3886 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3887}
3888
// Shrink the operands of a 24-bit multiply (an AMDGPUISD MUL*_24/MULHI_*24
// node or one of the corresponding amdgcn intrinsics): only the low 24 bits
// of each operand are demanded, so bits above that can be simplified away.
// Intrinsic forms are rewritten to the equivalent AMDGPUISD nodes.
static SDValue simplifyMul24(SDNode *Node24,
                             TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;

  // Intrinsic nodes carry the intrinsic ID as operand 0, so the multiply
  // operands are offset by one.
  SDValue LHS = IsIntrin ? Node24->getOperand(Num: 1) : Node24->getOperand(Num: 0);
  SDValue RHS = IsIntrin ? Node24->getOperand(Num: 2) : Node24->getOperand(Num: 1);
  unsigned NewOpcode = Node24->getOpcode();
  if (IsIntrin) {
    // Map the intrinsic ID onto the equivalent target node.
    unsigned IID = Node24->getConstantOperandVal(Num: 0);
    switch (IID) {
    case Intrinsic::amdgcn_mul_i24:
      NewOpcode = AMDGPUISD::MUL_I24;
      break;
    case Intrinsic::amdgcn_mul_u24:
      NewOpcode = AMDGPUISD::MUL_U24;
      break;
    case Intrinsic::amdgcn_mulhi_i24:
      NewOpcode = AMDGPUISD::MULHI_I24;
      break;
    case Intrinsic::amdgcn_mulhi_u24:
      NewOpcode = AMDGPUISD::MULHI_U24;
      break;
    default:
      llvm_unreachable("Expected 24-bit mul intrinsic");
    }
  }

  // Only the low 24 bits of each operand contribute to the result.
  APInt Demanded = APInt::getLowBitsSet(numBits: LHS.getValueSizeInBits(), loBitsSet: 24);

  // First try to simplify using SimplifyMultipleUseDemandedBits which allows
  // the operands to have other uses, but will only perform simplifications that
  // involve bypassing some nodes for this user.
  SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(Op: LHS, DemandedBits: Demanded, DAG);
  SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(Op: RHS, DemandedBits: Demanded, DAG);
  if (DemandedLHS || DemandedRHS)
    return DAG.getNode(Opcode: NewOpcode, DL: SDLoc(Node24), VTList: Node24->getVTList(),
                       N1: DemandedLHS ? DemandedLHS : LHS,
                       N2: DemandedRHS ? DemandedRHS : RHS);

  // Now try SimplifyDemandedBits which can simplify the nodes used by our
  // operands if this node is the only user.
  if (TLI.SimplifyDemandedBits(Op: LHS, DemandedBits: Demanded, DCI))
    return SDValue(Node24, 0);
  if (TLI.SimplifyDemandedBits(Op: RHS, DemandedBits: Demanded, DCI))
    return SDValue(Node24, 0);

  return SDValue();
}
3939
3940template <typename IntTy>
3941static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3942 uint32_t Width, const SDLoc &DL) {
3943 if (Width + Offset < 32) {
3944 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3945 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3946 if constexpr (std::is_signed_v<IntTy>) {
3947 return DAG.getSignedConstant(Val: Result, DL, VT: MVT::i32);
3948 } else {
3949 return DAG.getConstant(Result, DL, MVT::i32);
3950 }
3951 }
3952
3953 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3954}
3955
3956static bool hasVolatileUser(SDNode *Val) {
3957 for (SDNode *U : Val->users()) {
3958 if (MemSDNode *M = dyn_cast<MemSDNode>(Val: U)) {
3959 if (M->isVolatile())
3960 return true;
3961 }
3962 }
3963
3964 return false;
3965}
3966
3967bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3968 // i32 vectors are the canonical memory type.
3969 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3970 return false;
3971
3972 if (!VT.isByteSized())
3973 return false;
3974
3975 unsigned Size = VT.getStoreSize();
3976
3977 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3978 return false;
3979
3980 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3981 return false;
3982
3983 return true;
3984}
3985
3986// Replace load of an illegal type with a bitcast from a load of a friendlier
3987// type.
SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  // Only rewrite memory types before legalization has run.
  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Restrict to simple, non-extending/indexed loads with no volatile users;
  // a volatile user needs to observe the original value type.
  LoadSDNode *LN = cast<LoadSDNode>(Val: N);
  if (!LN->isSimple() || !ISD::isNormalLoad(N: LN) || hasVolatileUser(Val: LN))
    return SDValue();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = LN->getMemoryVT();

  unsigned Size = VT.getStoreSize();
  Align Alignment = LN->getAlign();
  if (Alignment < Size && isTypeLegal(VT)) {
    unsigned IsFast;
    unsigned AS = LN->getAddressSpace();

    // Expand unaligned loads earlier than legalization. Due to visitation order
    // problems during legalization, the emitted instructions to pack and unpack
    // the bytes again are not eliminated in the case of an unaligned copy.
    if (!allowsMisalignedMemoryAccesses(
            VT, AddrSpace: AS, Alignment, Flags: LN->getMemOperand()->getFlags(), &IsFast)) {
      if (VT.isVector())
        return SplitVectorLoad(Op: SDValue(LN, 0), DAG);

      SDValue Ops[2];
      std::tie(args&: Ops[0], args&: Ops[1]) = expandUnalignedLoad(LD: LN, DAG);

      return DAG.getMergeValues(Ops, dl: SDLoc(N));
    }

    // Allowed but slow misaligned access; leave the load alone.
    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  // Load as the equivalent i32-based type and bitcast back to the original.
  EVT NewVT = getEquivalentMemType(Ctx&: *DAG.getContext(), VT);

  SDValue NewLoad
    = DAG.getLoad(VT: NewVT, dl: SL, Chain: LN->getChain(),
                  Ptr: LN->getBasePtr(), MMO: LN->getMemOperand());

  // Replace both the value and the chain so users of the old load's chain
  // are rewired to the new load.
  SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewLoad);
  DCI.CombineTo(N, Res0: BC, Res1: NewLoad.getValue(R: 1));
  return SDValue(N, 0);
}
4038
4039// Replace store of an illegal type with a store of a bitcast to a friendlier
4040// type.
SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  // Only rewrite memory types before legalization has run.
  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Restrict to simple, non-truncating/indexed stores.
  StoreSDNode *SN = cast<StoreSDNode>(Val: N);
  if (!SN->isSimple() || !ISD::isNormalStore(N: SN))
    return SDValue();

  EVT VT = SN->getMemoryVT();
  unsigned Size = VT.getStoreSize();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  Align Alignment = SN->getAlign();
  if (Alignment < Size && isTypeLegal(VT)) {
    unsigned IsFast;
    unsigned AS = SN->getAddressSpace();

    // Expand unaligned stores earlier than legalization. Due to visitation
    // order problems during legalization, the emitted instructions to pack and
    // unpack the bytes again are not eliminated in the case of an unaligned
    // copy.
    if (!allowsMisalignedMemoryAccesses(
            VT, AddrSpace: AS, Alignment, Flags: SN->getMemOperand()->getFlags(), &IsFast)) {
      if (VT.isVector())
        return SplitVectorStore(Op: SDValue(SN, 0), DAG);

      return expandUnalignedStore(ST: SN, DAG);
    }

    // Allowed but slow misaligned access; leave the store alone.
    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  // Store the value as the equivalent i32-based type.
  EVT NewVT = getEquivalentMemType(Ctx&: *DAG.getContext(), VT);
  SDValue Val = SN->getValue();

  //DCI.AddToWorklist(Val.getNode());

  // If the stored value has other uses, rewrite them through a bitcast back
  // to the original type so the cast pair can fold away later.
  bool OtherUses = !Val.hasOneUse();
  SDValue CastVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: Val);
  if (OtherUses) {
    SDValue CastBack = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: CastVal);
    DAG.ReplaceAllUsesOfValueWith(From: Val, To: CastBack);
  }

  return DAG.getStore(Chain: SN->getChain(), dl: SL, Val: CastVal,
                      Ptr: SN->getBasePtr(), MMO: SN->getMemOperand());
}
4094
4095// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
4096// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
4097// issues.
4098SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
4099 DAGCombinerInfo &DCI) const {
4100 SelectionDAG &DAG = DCI.DAG;
4101 SDValue N0 = N->getOperand(Num: 0);
4102
4103 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
4104 // (vt2 (truncate (assertzext vt0:x, vt1)))
4105 if (N0.getOpcode() == ISD::TRUNCATE) {
4106 SDValue N1 = N->getOperand(Num: 1);
4107 EVT ExtVT = cast<VTSDNode>(Val&: N1)->getVT();
4108 SDLoc SL(N);
4109
4110 SDValue Src = N0.getOperand(i: 0);
4111 EVT SrcVT = Src.getValueType();
4112 if (SrcVT.bitsGE(VT: ExtVT)) {
4113 SDValue NewInReg = DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: SrcVT, N1: Src, N2: N1);
4114 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: N->getValueType(ResNo: 0), Operand: NewInReg);
4115 }
4116 }
4117
4118 return SDValue();
4119}
4120
// Combine chainless AMDGPU intrinsic calls: shrink the operands of the 24-bit
// multiplies, fold several unary intrinsics of undef to undef, and let
// frexp_exp look through sign-bit-only operations on its source.
SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
  SDNode *N, DAGCombinerInfo &DCI) const {
  // Operand 0 of INTRINSIC_WO_CHAIN is the intrinsic ID.
  unsigned IID = N->getConstantOperandVal(Num: 0);
  switch (IID) {
  case Intrinsic::amdgcn_mul_i24:
  case Intrinsic::amdgcn_mul_u24:
  case Intrinsic::amdgcn_mulhi_i24:
  case Intrinsic::amdgcn_mulhi_u24:
    return simplifyMul24(Node24: N, DCI);
  case Intrinsic::amdgcn_fract:
  case Intrinsic::amdgcn_rsq:
  case Intrinsic::amdgcn_rcp_legacy:
  case Intrinsic::amdgcn_rsq_legacy:
  case Intrinsic::amdgcn_rsq_clamp:
  case Intrinsic::amdgcn_tanh:
  case Intrinsic::amdgcn_prng_b32: {
    // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
    SDValue Src = N->getOperand(Num: 1);
    return Src.isUndef() ? Src : SDValue();
  }
  case Intrinsic::amdgcn_frexp_exp: {
    // frexp_exp (fneg x) -> frexp_exp x
    // frexp_exp (fabs x) -> frexp_exp x
    // frexp_exp (fneg (fabs x)) -> frexp_exp x
    SDValue Src = N->getOperand(Num: 1);
    SDValue PeekSign = peekFPSignOps(Val: Src);
    if (PeekSign == Src)
      return SDValue();
    // Update the existing node in place rather than building a new one.
    return SDValue(DCI.DAG.UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), Op2: PeekSign),
                   0);
  }
  default:
    return SDValue();
  }
}
4156
4157/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4158/// binary operation \p Opc to it with the corresponding constant operands.
4159SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
4160 DAGCombinerInfo &DCI, const SDLoc &SL,
4161 unsigned Opc, SDValue LHS,
4162 uint32_t ValLo, uint32_t ValHi) const {
4163 SelectionDAG &DAG = DCI.DAG;
4164 SDValue Lo, Hi;
4165 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: LHS, DAG);
4166
4167 SDValue LoRHS = DAG.getConstant(Val: ValLo, DL: SL, VT: MVT::i32);
4168 SDValue HiRHS = DAG.getConstant(Val: ValHi, DL: SL, VT: MVT::i32);
4169
4170 SDValue LoAnd = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, N1: Lo, N2: LoRHS);
4171 SDValue HiAnd = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, N1: Hi, N2: HiRHS);
4172
4173 // Re-visit the ands. It's possible we eliminated one of them and it could
4174 // simplify the vector.
4175 DCI.AddToWorklist(N: Lo.getNode());
4176 DCI.AddToWorklist(N: Hi.getNode());
4177
4178 SDValue Vec = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {LoAnd, HiAnd});
4179 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
4180}
4181
// Combine SHL:
//  * shl x, 0 -> x
//  * shl ([asz]ext i16:x), 16 -> build_vector 0, x (when v2i16 is legal)
//  * shl (ext x), c -> zext (shl x, c) when no set bits are shifted out
//  * 64-bit shl by an amount known >= 32 -> 32-bit shl of the low half,
//    placed into the high half of the result (low half becomes zero).
SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(ResNo: 0);
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);
  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;

  unsigned RHSVal; // Only assigned when the shift amount is constant (CRHS).
  if (CRHS) {
    RHSVal = CRHS->getZExtValue();
    // shl x, 0 -> x
    if (!RHSVal)
      return LHS;

    switch (LHS->getOpcode()) {
    default:
      break;
    case ISD::ZERO_EXTEND:
    case ISD::SIGN_EXTEND:
    case ISD::ANY_EXTEND: {
      SDValue X = LHS->getOperand(Num: 0);

      if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
          isOperationLegal(Op: ISD::BUILD_VECTOR, VT: MVT::v2i16)) {
        // Prefer build_vector as the canonical form if packed types are legal.
        // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
        SDValue Vec = DAG.getBuildVector(
            VT: MVT::v2i16, DL: SL,
            Ops: {DAG.getConstant(Val: 0, DL: SL, VT: MVT::i16), LHS->getOperand(Num: 0)});
        return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Vec);
      }

      // shl (ext x) => zext (shl x), if shift does not overflow int
      if (VT != MVT::i64)
        break;
      KnownBits Known = DAG.computeKnownBits(Op: X);
      unsigned LZ = Known.countMinLeadingZeros();
      // All bits shifted out of the narrow type must be known zero.
      if (LZ < RHSVal)
        break;
      EVT XVT = X.getValueType();
      SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: XVT, N1: X, N2: SDValue(CRHS, 0));
      return DAG.getZExtOrTrunc(Op: Shl, DL: SL, VT);
    }
    }
  }

  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  KnownBits Known = DAG.computeKnownBits(Op: RHS);

  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
  EVT TargetType = VT.changeElementType(Context&: *DAG.getContext(), EltVT: TargetScalarType);

  // Only profitable when the shift amount is known to be at least 32 (the
  // low half of the result is then known to be zero).
  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();
  SDValue ShiftAmt;

  if (CRHS) {
    ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
                               VT: TargetType);
  } else {
    SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
    const SDValue ShiftMask =
        DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
    ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
  }

  SDValue Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: LHS);
  SDValue NewShift =
      DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: TargetType, N1: Lo, N2: ShiftAmt, Flags: N->getFlags());

  // Build the result as pairs of (0, shifted-lo) 32-bit halves and bitcast
  // back to the original 64-bit (element) type.
  const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: TargetScalarType);
  SDValue Vec;

  if (VT.isVector()) {
    EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
    unsigned NElts = TargetType.getVectorNumElements();
    SmallVector<SDValue, 8> HiOps;
    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);

    // Even interleaved slots (lo halves) stay zero; odd slots get the shifts.
    DAG.ExtractVectorElements(Op: NewShift, Args&: HiOps, Start: 0, Count: NElts);
    for (unsigned I = 0; I != NElts; ++I)
      HiAndLoOps[2 * I + 1] = HiOps[I];
    Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
  } else {
    EVT ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
    Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {Zero, NewShift});
  }
  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
}
4280
// Combine 64-bit SRA with a shift amount known to be >= 32: only the high
// 32 bits of the source contribute, so perform a 32-bit sra on the extracted
// high half and fill the result's high half with its sign bits.
SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue RHS = N->getOperand(Num: 1);
  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
  EVT VT = N->getValueType(ResNo: 0);
  SDValue LHS = N->getOperand(Num: 0);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // For C >= 32
  // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), sra hi_32(x), 31))

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  KnownBits Known = DAG.computeKnownBits(Op: RHS);

  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
  EVT TargetType = VT.changeElementType(Context&: *DAG.getContext(), EltVT: TargetScalarType);

  // Only profitable when the shift amount is known to be at least 32.
  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();

  SDValue ShiftFullAmt =
      DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
  SDValue ShiftAmt;
  if (CRHS) {
    unsigned RHSVal = CRHS->getZExtValue();
    ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
                               VT: TargetType);
  } else if (Known.getMinValue().getZExtValue() ==
             (ElementType.getSizeInBits() - 1)) {
    // Shift amount is known to be exactly 63: every result bit is a copy of
    // the sign bit, i.e. (sra hi, 31) in both halves.
    ShiftAmt = ShiftFullAmt;
  } else {
    SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
    const SDValue ShiftMask =
        DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
    ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
  }

  EVT ConcatType;
  SDValue Hi;
  SDLoc LHSSL(LHS);
  // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
    SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
    SmallVector<SDValue, 8> HiOps(NElts);
    SmallVector<SDValue, 16> HiAndLoOps;

    // Odd interleaved slots hold the hi half of each 64-bit element.
    DAG.ExtractVectorElements(Op: SplitLHS, Args&: HiAndLoOps, Start: 0, Count: NElts * 2);
    for (unsigned I = 0; I != NElts; ++I) {
      HiOps[I] = HiAndLoOps[2 * I + 1];
    }
    Hi = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: LHSSL, VT: TargetType, Ops: HiOps);
  } else {
    const SDValue One = DAG.getConstant(Val: 1, DL: LHSSL, VT: TargetScalarType);
    ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
    SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
    Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: LHSSL, VT: TargetType, N1: SplitLHS, N2: One);
  }

  KnownBits KnownLHS = DAG.computeKnownBits(Op: LHS);
  SDValue HiShift;
  if (KnownLHS.isNegative()) {
    // Known-negative input: the high half of the result is all ones.
    HiShift = DAG.getAllOnesConstant(DL: SL, VT: TargetType);
  } else {
    // Hi feeds both shifts below; freeze it so both uses see the same value.
    Hi = DAG.getFreeze(V: Hi);
    HiShift = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: TargetType, N1: Hi, N2: ShiftFullAmt);
  }
  SDValue NewShift =
      DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: TargetType, N1: Hi, N2: ShiftAmt, Flags: N->getFlags());

  SDValue Vec;
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    SmallVector<SDValue, 8> HiOps;
    SmallVector<SDValue, 8> LoOps;
    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);

    // Re-interleave: even slots take the shifted value (new lo halves), odd
    // slots take the sign fill (new hi halves).
    DAG.ExtractVectorElements(Op: HiShift, Args&: HiOps, Start: 0, Count: NElts);
    DAG.ExtractVectorElements(Op: NewShift, Args&: LoOps, Start: 0, Count: NElts);
    for (unsigned I = 0; I != NElts; ++I) {
      HiAndLoOps[2 * I + 1] = HiOps[I];
      HiAndLoOps[2 * I] = LoOps[I];
    }
    Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
  } else {
    Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {NewShift, HiShift});
  }
  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
}
4380
// Combine SRL:
//  * fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1) to expose BFE
//    patterns to instruction selection.
//  * 64-bit srl with a shift amount known >= 32 -> 32-bit srl of the high
//    half, with the result's high half zeroed.
SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue RHS = N->getOperand(Num: 1);
  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
  EVT VT = N->getValueType(ResNo: 0);
  SDValue LHS = N->getOperand(Num: 0);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  unsigned RHSVal; // Only assigned when the shift amount is constant (CRHS).

  if (CRHS) {
    RHSVal = CRHS->getZExtValue();

    // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
    // this improves the ability to match BFE patterns in isel.
    if (LHS.getOpcode() == ISD::AND) {
      if (auto *Mask = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1))) {
        unsigned MaskIdx, MaskLen;
        // Only when the mask's low bit position lines up with the shift
        // amount, so the shifted mask has no offset.
        if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
            MaskIdx == RHSVal) {
          return DAG.getNode(Opcode: ISD::AND, DL: SL, VT,
                             N1: DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: LHS.getOperand(i: 0),
                                         N2: N->getOperand(Num: 1)),
                             N2: DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: LHS.getOperand(i: 1),
                                         N2: N->getOperand(Num: 1)));
        }
      }
    }
  }

  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // for C >= 32
  // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  KnownBits Known = DAG.computeKnownBits(Op: RHS);

  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
  EVT TargetType = VT.changeElementType(Context&: *DAG.getContext(), EltVT: TargetScalarType);

  // Only profitable when the shift amount is known to be at least 32.
  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();

  SDValue ShiftAmt;
  if (CRHS) {
    ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
                               VT: TargetType);
  } else {
    SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
    const SDValue ShiftMask =
        DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
    ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
  }

  const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: TargetScalarType);
  EVT ConcatType;
  SDValue Hi;
  SDLoc LHSSL(LHS);
  // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
    SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
    SmallVector<SDValue, 8> HiOps(NElts);
    SmallVector<SDValue, 16> HiAndLoOps;

    // Odd interleaved slots hold the hi half of each 64-bit element.
    DAG.ExtractVectorElements(Op: SplitLHS, Args&: HiAndLoOps, /*Start=*/0, Count: NElts * 2);
    for (unsigned I = 0; I != NElts; ++I)
      HiOps[I] = HiAndLoOps[2 * I + 1];
    Hi = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: LHSSL, VT: TargetType, Ops: HiOps);
  } else {
    const SDValue One = DAG.getConstant(Val: 1, DL: LHSSL, VT: TargetScalarType);
    ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
    SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
    Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: LHSSL, VT: TargetType, N1: SplitLHS, N2: One);
  }

  SDValue NewShift =
      DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: TargetType, N1: Hi, N2: ShiftAmt, Flags: N->getFlags());

  SDValue Vec;
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    SmallVector<SDValue, 8> LoOps;
    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);

    // Shifted values land in the even (lo) slots; odd (hi) slots stay zero.
    DAG.ExtractVectorElements(Op: NewShift, Args&: LoOps, Start: 0, Count: NElts);
    for (unsigned I = 0; I != NElts; ++I)
      HiAndLoOps[2 * I] = LoOps[I];
    Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
  } else {
    Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {NewShift, Zero});
  }
  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
}
4483
// Combine on ISD::TRUNCATE: extract narrow values directly out of
// build_vector sources and shrink >32-bit shifts whose interesting bits fit
// in 32 bits.
SDValue AMDGPUTargetLowering::performTruncateCombine(
  SDNode *N, DAGCombinerInfo &DCI) const {
  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);
  SDValue Src = N->getOperand(Num: 0);

  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
  if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
    SDValue Vec = Src.getOperand(i: 0);
    if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
      SDValue Elt0 = Vec.getOperand(i: 0);
      EVT EltVT = Elt0.getValueType();
      // Only valid when the truncated result is contained in element 0.
      if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
        if (EltVT.isFloatingPoint()) {
          // TRUNCATE needs an integer input; cast FP elements first.
          Elt0 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL,
                             VT: EltVT.changeTypeToInteger(), Operand: Elt0);
        }

        return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Elt0);
      }
    }
  }

  // Equivalent of above for accessing the high element of a vector as an
  // integer operation.
  // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
  if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
    if (auto *K = isConstOrConstSplat(N: Src.getOperand(i: 1))) {
      SDValue BV = stripBitcast(Val: Src.getOperand(i: 0));
      if (BV.getOpcode() == ISD::BUILD_VECTOR) {
        EVT SrcEltVT = BV.getOperand(i: 0).getValueType();
        unsigned SrcEltSize = SrcEltVT.getSizeInBits();
        unsigned BitIndex = K->getZExtValue();
        unsigned PartIndex = BitIndex / SrcEltSize;

        // The shift amount must land exactly on an element boundary and
        // select an element that actually exists.
        if (PartIndex * SrcEltSize == BitIndex &&
            PartIndex < BV.getNumOperands()) {
          if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
            SDValue SrcElt =
                DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: SrcEltVT.changeTypeToInteger(),
                            Operand: BV.getOperand(i: PartIndex));
            return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: SrcElt);
          }
        }
      }
    }
  }

  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
  //
  // i16 (trunc (srl i64:x, K)), K <= 16 ->
  // i16 (trunc (srl (i32 (trunc x), K)))
  if (VT.getScalarSizeInBits() < 32) {
    EVT SrcVT = Src.getValueType();
    if (SrcVT.getScalarSizeInBits() > 32 &&
        (Src.getOpcode() == ISD::SRL ||
         Src.getOpcode() == ISD::SRA ||
         Src.getOpcode() == ISD::SHL)) {
      SDValue Amt = Src.getOperand(i: 1);
      KnownBits Known = DAG.computeKnownBits(Op: Amt);

      // - For left shifts, do the transform as long as the shift
      //   amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
      // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
      //   losing information stored in the high bits when truncating.
      const unsigned MaxCstSize =
        (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
      if (Known.getMaxValue().ule(RHS: MaxCstSize)) {
        EVT MidVT = VT.isVector() ?
          EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
                           NumElements: VT.getVectorNumElements()) : MVT::i32;

        EVT NewShiftVT = getShiftAmountTy(LHSTy: MidVT, DL: DAG.getDataLayout());
        SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MidVT,
                                    Operand: Src.getOperand(i: 0));
        DCI.AddToWorklist(N: Trunc.getNode());

        // The shift amount type may change when shrinking the shift.
        if (Amt.getValueType() != NewShiftVT) {
          Amt = DAG.getZExtOrTrunc(Op: Amt, DL: SL, VT: NewShiftVT);
          DCI.AddToWorklist(N: Amt.getNode());
        }

        SDValue ShrunkShift = DAG.getNode(Opcode: Src.getOpcode(), DL: SL, VT: MidVT,
                                          N1: Trunc, N2: Amt);
        return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: ShrunkShift);
      }
    }
  }

  return SDValue();
}
4576
4577// We need to specifically handle i64 mul here to avoid unnecessary conversion
4578// instructions. If we only match on the legalized i64 mul expansion,
4579// SimplifyDemandedBits will be unable to remove them because there will be
4580// multiple uses due to the separate mul + mulh[su].
4581static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4582 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4583 if (Size <= 32) {
4584 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4585 return DAG.getNode(Opcode: MulOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4586 }
4587
4588 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4589 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4590
4591 SDValue MulLo = DAG.getNode(Opcode: MulLoOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4592 SDValue MulHi = DAG.getNode(Opcode: MulHiOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4593
4594 return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SL, VT: MVT::i64, N1: MulLo, N2: MulHi);
4595}
4596
4597/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4598/// return SDValue().
4599static SDValue getAddOneOp(const SDNode *V) {
4600 if (V->getOpcode() != ISD::ADD)
4601 return SDValue();
4602
4603 return isOneConstant(V: V->getOperand(Num: 1)) ? V->getOperand(Num: 0) : SDValue();
4604}
4605
// Combine on ISD::MUL: undo InstCombine's (add y, 1) canonicalization to
// enable mad matching, and form fast 24-bit multiplies when both operands
// are known to fit in 24 bits.
SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::MUL);
  EVT VT = N->getValueType(ResNo: 0);

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  if (!N->isDivergent())
    return SDValue();

  unsigned Size = VT.getSizeInBits();
  if (VT.isVector() || Size > 64)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue N0 = N->getOperand(Num: 0);
  SDValue N1 = N->getOperand(Num: 1);

  // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
  // matching.

  // mul x, (add y, 1) -> add (mul x, y), x
  auto IsFoldableAdd = [](SDValue V) -> SDValue {
    SDValue AddOp = getAddOneOp(V: V.getNode());
    if (!AddOp)
      return SDValue();

    // Multiple uses are acceptable only when every user is also a mul that
    // would benefit from the same fold.
    if (V.hasOneUse() || all_of(Range: V->users(), P: [](const SDNode *U) -> bool {
          return U->getOpcode() == ISD::MUL;
        }))
      return AddOp;

    return SDValue();
  };

  // FIXME: The selection pattern is not properly checking for commuted
  // operands, so we have to place the mul in the LHS
  if (SDValue MulOper = IsFoldableAdd(N0)) {
    SDValue MulVal = DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1, N2: MulOper);
    return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: MulVal, N2: N1);
  }

  if (SDValue MulOper = IsFoldableAdd(N1)) {
    SDValue MulVal = DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1: N0, N2: MulOper);
    return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: MulVal, N2: N0);
  }

  // There are i16 integer mul/mad.
  if (isTypeLegal(VT: MVT::i16) && VT.getScalarType().bitsLE(VT: MVT::i16))
    return SDValue();

  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
  // in the source into any_extends if the result of the mul is truncated. Since
  // we can assume the high bits are whatever we want, use the underlying value
  // to avoid the unknown high bits from interfering.
  if (N0.getOpcode() == ISD::ANY_EXTEND)
    N0 = N0.getOperand(i: 0);

  if (N1.getOpcode() == ISD::ANY_EXTEND)
    N1 = N1.getOperand(i: 0);

  SDValue Mul;

  // Prefer the unsigned form; fall back to the signed 24-bit multiply.
  if (Subtarget->hasMulU24() && isU24(Op: N0, DAG) && isU24(Op: N1, DAG)) {
    N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
    N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
    Mul = getMul24(DAG, SL: DL, N0, N1, Size, Signed: false);
  } else if (Subtarget->hasMulI24() && isI24(Op: N0, DAG) && isI24(Op: N1, DAG)) {
    N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
    N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
    Mul = getMul24(DAG, SL: DL, N0, N1, Size, Signed: true);
  } else {
    return SDValue();
  }

  // We need to use sext even for MUL_U24, because MUL_U24 is used
  // for signed multiply of 8 and 16-bit types.
  return DAG.getSExtOrTrunc(Op: Mul, DL, VT);
}
4689
4690SDValue
4691AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4692 DAGCombinerInfo &DCI) const {
4693 if (N->getValueType(ResNo: 0) != MVT::i32)
4694 return SDValue();
4695
4696 SelectionDAG &DAG = DCI.DAG;
4697 SDLoc DL(N);
4698
4699 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4700 SDValue N0 = N->getOperand(Num: 0);
4701 SDValue N1 = N->getOperand(Num: 1);
4702
4703 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4704 // in the source into any_extends if the result of the mul is truncated. Since
4705 // we can assume the high bits are whatever we want, use the underlying value
4706 // to avoid the unknown high bits from interfering.
4707 if (N0.getOpcode() == ISD::ANY_EXTEND)
4708 N0 = N0.getOperand(i: 0);
4709 if (N1.getOpcode() == ISD::ANY_EXTEND)
4710 N1 = N1.getOperand(i: 0);
4711
4712 // Try to use two fast 24-bit multiplies (one for each half of the result)
4713 // instead of one slow extending multiply.
4714 unsigned LoOpcode = 0;
4715 unsigned HiOpcode = 0;
4716 if (Signed) {
4717 if (Subtarget->hasMulI24() && isI24(Op: N0, DAG) && isI24(Op: N1, DAG)) {
4718 N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4719 N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4720 LoOpcode = AMDGPUISD::MUL_I24;
4721 HiOpcode = AMDGPUISD::MULHI_I24;
4722 }
4723 } else {
4724 if (Subtarget->hasMulU24() && isU24(Op: N0, DAG) && isU24(Op: N1, DAG)) {
4725 N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4726 N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4727 LoOpcode = AMDGPUISD::MUL_U24;
4728 HiOpcode = AMDGPUISD::MULHI_U24;
4729 }
4730 }
4731 if (!LoOpcode)
4732 return SDValue();
4733
4734 SDValue Lo = DAG.getNode(Opcode: LoOpcode, DL, VT: MVT::i32, N1: N0, N2: N1);
4735 SDValue Hi = DAG.getNode(Opcode: HiOpcode, DL, VT: MVT::i32, N1: N0, N2: N1);
4736 DCI.CombineTo(N, Res0: Lo, Res1: Hi);
4737 return SDValue(N, 0);
4738}
4739
4740SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4741 DAGCombinerInfo &DCI) const {
4742 EVT VT = N->getValueType(ResNo: 0);
4743
4744 if (!Subtarget->hasMulI24() || VT.isVector())
4745 return SDValue();
4746
4747 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4748 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4749 // unnecessarily). isDivergent() is used as an approximation of whether the
4750 // value is in an SGPR.
4751 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4752 // valu op anyway)
4753 if (Subtarget->hasSMulHi() && !N->isDivergent())
4754 return SDValue();
4755
4756 SelectionDAG &DAG = DCI.DAG;
4757 SDLoc DL(N);
4758
4759 SDValue N0 = N->getOperand(Num: 0);
4760 SDValue N1 = N->getOperand(Num: 1);
4761
4762 if (!isI24(Op: N0, DAG) || !isI24(Op: N1, DAG))
4763 return SDValue();
4764
4765 N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4766 N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4767
4768 SDValue Mulhi = DAG.getNode(Opcode: AMDGPUISD::MULHI_I24, DL, VT: MVT::i32, N1: N0, N2: N1);
4769 DCI.AddToWorklist(N: Mulhi.getNode());
4770 return DAG.getSExtOrTrunc(Op: Mulhi, DL, VT);
4771}
4772
4773SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4774 DAGCombinerInfo &DCI) const {
4775 EVT VT = N->getValueType(ResNo: 0);
4776
4777 if (VT.isVector() || VT.getSizeInBits() > 32 || !Subtarget->hasMulU24())
4778 return SDValue();
4779
4780 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4781 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4782 // unnecessarily). isDivergent() is used as an approximation of whether the
4783 // value is in an SGPR.
4784 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4785 // valu op anyway)
4786 if (!N->isDivergent() && Subtarget->hasSMulHi())
4787 return SDValue();
4788
4789 SelectionDAG &DAG = DCI.DAG;
4790 SDLoc DL(N);
4791
4792 SDValue N0 = N->getOperand(Num: 0);
4793 SDValue N1 = N->getOperand(Num: 1);
4794
4795 if (!isU24(Op: N0, DAG) || !isU24(Op: N1, DAG))
4796 return SDValue();
4797
4798 N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4799 N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4800
4801 SDValue Mulhi = DAG.getNode(Opcode: AMDGPUISD::MULHI_U24, DL, VT: MVT::i32, N1: N0, N2: N1);
4802 DCI.AddToWorklist(N: Mulhi.getNode());
4803 return DAG.getZExtOrTrunc(Op: Mulhi, DL, VT);
4804}
4805
4806SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4807 SDValue Op,
4808 const SDLoc &DL,
4809 unsigned Opc) const {
4810 EVT VT = Op.getValueType();
4811 if (VT.bitsGT(VT: MVT::i32))
4812 return SDValue();
4813
4814 if (VT != MVT::i32)
4815 Op = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: Op);
4816
4817 SDValue FFBX = DAG.getNode(Opcode: Opc, DL, VT: MVT::i32, Operand: Op);
4818 if (VT != MVT::i32)
4819 FFBX = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: FFBX);
4820
4821 return FFBX;
4822}
4823
4824// The native instructions return -1 on 0 input. Optimize out a select that
4825// produces -1 on 0.
4826//
4827// TODO: If zero is not undef, we could also do this if the output is compared
4828// against the bitwidth.
4829//
4830// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4831SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4832 SDValue LHS, SDValue RHS,
4833 DAGCombinerInfo &DCI) const {
4834 if (!isNullConstant(V: Cond.getOperand(i: 1)))
4835 return SDValue();
4836
4837 SelectionDAG &DAG = DCI.DAG;
4838 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val: Cond.getOperand(i: 2))->get();
4839 SDValue CmpLHS = Cond.getOperand(i: 0);
4840
4841 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4842 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4843 if (CCOpcode == ISD::SETEQ &&
4844 (isCtlzOpc(Opc: RHS.getOpcode()) || isCttzOpc(Opc: RHS.getOpcode())) &&
4845 RHS.getOperand(i: 0) == CmpLHS && isAllOnesConstant(V: LHS)) {
4846 unsigned Opc =
4847 isCttzOpc(Opc: RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4848 return getFFBX_U32(DAG, Op: CmpLHS, DL: SL, Opc);
4849 }
4850
4851 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4852 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4853 if (CCOpcode == ISD::SETNE &&
4854 (isCtlzOpc(Opc: LHS.getOpcode()) || isCttzOpc(Opc: LHS.getOpcode())) &&
4855 LHS.getOperand(i: 0) == CmpLHS && isAllOnesConstant(V: RHS)) {
4856 unsigned Opc =
4857 isCttzOpc(Opc: LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4858
4859 return getFFBX_U32(DAG, Op: CmpLHS, DL: SL, Opc);
4860 }
4861
4862 return SDValue();
4863}
4864
4865static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4866 unsigned Op,
4867 const SDLoc &SL,
4868 SDValue Cond,
4869 SDValue N1,
4870 SDValue N2) {
4871 SelectionDAG &DAG = DCI.DAG;
4872 EVT VT = N1.getValueType();
4873
4874 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Cond,
4875 N2: N1.getOperand(i: 0), N3: N2.getOperand(i: 0));
4876 DCI.AddToWorklist(N: NewSelect.getNode());
4877 return DAG.getNode(Opcode: Op, DL: SL, VT, Operand: NewSelect);
4878}
4879
// Pull a free FP operation out of a select so it may fold into uses.
//
// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
// select c, (fneg x), k -> fneg (select c, x, (fneg k))
//
// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
// select c, (fabs x), +k -> fabs (select c, x, k)
SDValue
AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
                                           SDValue N) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Cond = N.getOperand(i: 0);
  SDValue LHS = N.getOperand(i: 1);
  SDValue RHS = N.getOperand(i: 2);

  EVT VT = N.getValueType();
  // Both arms wrapped in the same op: hoist it above the select, but only if
  // every user of the select can absorb it as a source modifier.
  if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
      (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
    if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N: N.getNode()))
      return SDValue();

    return distributeOpThroughSelect(DCI, Op: LHS.getOpcode(),
                                     SL: SDLoc(N), Cond, N1: LHS, N2: RHS);
  }

  // Canonicalize so the fneg/fabs arm is on the LHS; remember to swap back.
  bool Inv = false;
  if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
    std::swap(a&: LHS, b&: RHS);
    Inv = true;
  }

  // TODO: Support vector constants.
  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
      !selectSupportsSourceMods(N: N.getNode())) {
    SDLoc SL(N);
    // If one side is an fneg/fabs and the other is a constant, we can push the
    // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
    SDValue NewLHS = LHS.getOperand(i: 0);
    SDValue NewRHS = RHS;

    // Careful: if the neg can be folded up, don't try to pull it back down.
    bool ShouldFoldNeg = true;

    if (NewLHS.hasOneUse()) {
      unsigned Opc = NewLHS.getOpcode();
      if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(N: NewLHS.getNode()))
        ShouldFoldNeg = false;
      if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
        ShouldFoldNeg = false;
    }

    if (ShouldFoldNeg) {
      // fabs of a negative constant cannot be rewritten this way.
      if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
        return SDValue();

      // We're going to be forced to use a source modifier anyway, there's no
      // point to pulling the negate out unless we can get a size reduction by
      // negating the constant.
      //
      // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
      // about cheaper constants.
      if (NewLHS.getOpcode() == ISD::FABS &&
          getConstantNegateCost(C: CRHS) != NegatibleCost::Cheaper)
        return SDValue();

      if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N: N.getNode()))
        return SDValue();

      // For fneg, the constant arm must be negated to compensate.
      if (LHS.getOpcode() == ISD::FNEG)
        NewRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);

      // Restore the original operand order if we swapped above.
      if (Inv)
        std::swap(a&: NewLHS, b&: NewRHS);

      SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT,
                                      N1: Cond, N2: NewLHS, N3: NewRHS);
      DCI.AddToWorklist(N: NewSelect.getNode());
      return DAG.getNode(Opcode: LHS.getOpcode(), DL: SL, VT, Operand: NewSelect);
    }
  }

  return SDValue();
}
4964
// Combine on ISD::SELECT: pull free FP ops out of the select, canonicalize
// constants into the false arm, form legacy fmin/fmax, and fold the
// ctlz/cttz zero-check pattern into ffbh/ffbl.
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  if (SDValue Folded = foldFreeOpFromSelect(DCI, N: SDValue(N, 0)))
    return Folded;

  SDValue Cond = N->getOperand(Num: 0);
  if (Cond.getOpcode() != ISD::SETCC)
    return SDValue();

  EVT VT = N->getValueType(ResNo: 0);
  SDValue LHS = Cond.getOperand(i: 0);
  SDValue RHS = Cond.getOperand(i: 1);
  SDValue CC = Cond.getOperand(i: 2);

  SDValue True = N->getOperand(Num: 1);
  SDValue False = N->getOperand(Num: 2);

  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
    SelectionDAG &DAG = DCI.DAG;
    if (DAG.isConstantValueOfAnyType(N: True) &&
        !DAG.isConstantValueOfAnyType(N: False)) {
      // Swap cmp + select pair to move constant to false input.
      // This will allow using VOPC cndmasks more often.
      // select (setcc x, y), k, x -> select (setccinv x, y), x, k

      SDLoc SL(N);
      ISD::CondCode NewCC =
          getSetCCInverse(Operation: cast<CondCodeSDNode>(Val&: CC)->get(), Type: LHS.getValueType());

      SDValue NewCond = DAG.getSetCC(DL: SL, VT: Cond.getValueType(), LHS, RHS, Cond: NewCC);
      return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NewCond, N2: False, N3: True);
    }

    if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
      SDValue MinMax
        = combineFMinMaxLegacy(DL: SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
      // Revisit this node so we can catch min3/max3/med3 patterns.
      //DCI.AddToWorklist(MinMax.getNode());
      return MinMax;
    }
  }

  // There's no reason to not do this if the condition has other uses.
  return performCtlz_CttzCombine(SL: SDLoc(N), Cond, LHS: True, RHS: False, DCI);
}
5010
5011static bool isInv2Pi(const APFloat &APF) {
5012 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
5013 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
5014 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
5015
5016 return APF.bitwiseIsEqual(RHS: KF16) ||
5017 APF.bitwiseIsEqual(RHS: KF32) ||
5018 APF.bitwiseIsEqual(RHS: KF64);
5019}
5020
5021// 0 and 1.0 / (0.5 * pi) do not have inline immmediates, so there is an
5022// additional cost to negate them.
5023TargetLowering::NegatibleCost
5024AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
5025 if (C->isZero())
5026 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5027
5028 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(APF: C->getValueAPF()))
5029 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5030
5031 return NegatibleCost::Neutral;
5032}
5033
5034bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
5035 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
5036 return getConstantNegateCost(C) == NegatibleCost::Expensive;
5037 return false;
5038}
5039
5040bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
5041 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
5042 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
5043 return false;
5044}
5045
5046static unsigned inverseMinMax(unsigned Opc) {
5047 switch (Opc) {
5048 case ISD::FMAXNUM:
5049 return ISD::FMINNUM;
5050 case ISD::FMINNUM:
5051 return ISD::FMAXNUM;
5052 case ISD::FMAXNUM_IEEE:
5053 return ISD::FMINNUM_IEEE;
5054 case ISD::FMINNUM_IEEE:
5055 return ISD::FMAXNUM_IEEE;
5056 case ISD::FMAXIMUM:
5057 return ISD::FMINIMUM;
5058 case ISD::FMINIMUM:
5059 return ISD::FMAXIMUM;
5060 case ISD::FMAXIMUMNUM:
5061 return ISD::FMINIMUMNUM;
5062 case ISD::FMINIMUMNUM:
5063 return ISD::FMAXIMUMNUM;
5064 case AMDGPUISD::FMAX_LEGACY:
5065 return AMDGPUISD::FMIN_LEGACY;
5066 case AMDGPUISD::FMIN_LEGACY:
5067 return AMDGPUISD::FMAX_LEGACY;
5068 default:
5069 llvm_unreachable("invalid min/max opcode");
5070 }
5071}
5072
5073/// \return true if it's profitable to try to push an fneg into its source
5074/// instruction.
5075bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
5076 // If the input has multiple uses and we can either fold the negate down, or
5077 // the other uses cannot, give up. This both prevents unprofitable
5078 // transformations and infinite loops: we won't repeatedly try to fold around
5079 // a negate that has no 'good' form.
5080 if (N0.hasOneUse()) {
5081 // This may be able to fold into the source, but at a code size cost. Don't
5082 // fold if the fold into the user is free.
5083 if (allUsesHaveSourceMods(N, CostThreshold: 0))
5084 return false;
5085 } else {
5086 if (fnegFoldsIntoOp(N: N0.getNode()) &&
5087 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N: N0.getNode())))
5088 return false;
5089 }
5090
5091 return true;
5092}
5093
// Combine on ISD::FNEG: push the negate into its source operation so it can
// be matched as a free source modifier. Each case rebuilds the source op with
// negated operands; if the source had other uses, they are redirected to an
// fneg of the new node so the DAG stays consistent.
SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(Num: 0);
  EVT VT = N->getValueType(ResNo: 0);

  unsigned Opc = N0.getOpcode();

  if (!shouldFoldFNegIntoSrc(N, N0))
    return SDValue();

  SDLoc SL(N);
  switch (Opc) {
  case ISD::FADD: {
    // Distributing the negate is only valid when -0.0 can be ignored.
    if (!mayIgnoreSignedZero(Op: N0) && !N->getFlags().hasNoSignedZeros())
      return SDValue();

    // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
    SDValue LHS = N0.getOperand(i: 0);
    SDValue RHS = N0.getOperand(i: 1);

    // Cancel an existing fneg on either operand instead of stacking one.
    if (LHS.getOpcode() != ISD::FNEG)
      LHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS);
    else
      LHS = LHS.getOperand(i: 0);

    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
    else
      RHS = RHS.getOperand(i: 0);

    SDValue Res = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: LHS, N2: RHS, Flags: N0->getFlags());
    if (Res.getOpcode() != ISD::FADD)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
    return Res;
  }
  case ISD::FMUL:
  case AMDGPUISD::FMUL_LEGACY: {
    // (fneg (fmul x, y)) -> (fmul x, (fneg y))
    // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
    SDValue LHS = N0.getOperand(i: 0);
    SDValue RHS = N0.getOperand(i: 1);

    // Only one operand needs the sign flip; prefer canceling an existing fneg.
    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(i: 0);
    else if (RHS.getOpcode() == ISD::FNEG)
      RHS = RHS.getOperand(i: 0);
    else
      RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);

    SDValue Res = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: LHS, N2: RHS, Flags: N0->getFlags());
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
    return Res;
  }
  case ISD::FMA:
  case ISD::FMAD: {
    // TODO: handle llvm.amdgcn.fma.legacy
    if (!mayIgnoreSignedZero(Op: N0) && !N->getFlags().hasNoSignedZeros())
      return SDValue();

    // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
    SDValue LHS = N0.getOperand(i: 0);
    SDValue MHS = N0.getOperand(i: 1);
    SDValue RHS = N0.getOperand(i: 2);

    // Flip exactly one of the multiply operands, canceling an existing fneg
    // when possible.
    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(i: 0);
    else if (MHS.getOpcode() == ISD::FNEG)
      MHS = MHS.getOperand(i: 0);
    else
      MHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: MHS);

    // The addend is always negated.
    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
    else
      RHS = RHS.getOperand(i: 0);

    SDValue Res = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: LHS, N2: MHS, N3: RHS);
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
    return Res;
  }
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMIN_LEGACY: {
    // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
    // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
    // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
    // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)

    SDValue LHS = N0.getOperand(i: 0);
    SDValue RHS = N0.getOperand(i: 1);

    // 0 doesn't have a negated inline immediate.
    // TODO: This constant check should be generalized to other operations.
    if (isConstantCostlierToNegate(N: RHS))
      return SDValue();

    SDValue NegLHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS);
    SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
    unsigned Opposite = inverseMinMax(Opc);

    SDValue Res = DAG.getNode(Opcode: Opposite, DL: SL, VT, N1: NegLHS, N2: NegRHS, Flags: N0->getFlags());
    if (Res.getOpcode() != Opposite)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
    return Res;
  }
  case AMDGPUISD::FMED3: {
    // fneg (fmed3 x, y, z) -> fmed3 (fneg x), (fneg y), (fneg z)
    SDValue Ops[3];
    for (unsigned I = 0; I < 3; ++I)
      Ops[I] = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: N0->getOperand(Num: I), Flags: N0->getFlags());

    SDValue Res = DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT, Ops, Flags: N0->getFlags());
    if (Res.getOpcode() != AMDGPUISD::FMED3)
      return SDValue(); // Op got folded away.

    if (!N0.hasOneUse()) {
      // Revisit the redirected users so follow-up combines can run on them.
      SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res);
      DAG.ReplaceAllUsesWith(From: N0, To: Neg);

      for (SDNode *U : Neg->users())
        DCI.AddToWorklist(N: U);
    }

    return Res;
  }
  case ISD::FP_EXTEND:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT: // XXX - Should fround be handled?
  case ISD::FROUNDEVEN:
  case ISD::FSIN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW: {
    SDValue CvtSrc = N0.getOperand(i: 0);
    if (CvtSrc.getOpcode() == ISD::FNEG) {
      // (fneg (fp_extend (fneg x))) -> (fp_extend x)
      // (fneg (rcp (fneg x))) -> (rcp x)
      return DAG.getNode(Opcode: Opc, DL: SL, VT, Operand: CvtSrc.getOperand(i: 0));
    }

    if (!N0.hasOneUse())
      return SDValue();

    // (fneg (fp_extend x)) -> (fp_extend (fneg x))
    // (fneg (rcp x)) -> (rcp (fneg x))
    SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: CvtSrc.getValueType(), Operand: CvtSrc);
    return DAG.getNode(Opcode: Opc, DL: SL, VT, Operand: Neg, Flags: N0->getFlags());
  }
  case ISD::FP_ROUND: {
    // Same as above, but FP_ROUND carries a second (rounding-control) operand
    // that must be preserved.
    SDValue CvtSrc = N0.getOperand(i: 0);

    if (CvtSrc.getOpcode() == ISD::FNEG) {
      // (fneg (fp_round (fneg x))) -> (fp_round x)
      return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT,
                         N1: CvtSrc.getOperand(i: 0), N2: N0.getOperand(i: 1));
    }

    if (!N0.hasOneUse())
      return SDValue();

    // (fneg (fp_round x)) -> (fp_round (fneg x))
    SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: CvtSrc.getValueType(), Operand: CvtSrc);
    return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Neg, N2: N0.getOperand(i: 1));
  }
  case ISD::FP16_TO_FP: {
    // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
    // f16, but legalization of f16 fneg ends up pulling it out of the source.
    // Put the fneg back as a legal source operation that can be matched later.
    SDLoc SL(N);

    SDValue Src = N0.getOperand(i: 0);
    EVT SrcVT = Src.getValueType();

    // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
    SDValue IntFNeg = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: SrcVT, N1: Src,
                                  N2: DAG.getConstant(Val: 0x8000, DL: SL, VT: SrcVT));
    return DAG.getNode(Opcode: ISD::FP16_TO_FP, DL: SL, VT: N->getValueType(ResNo: 0), Operand: IntFNeg);
  }
  case ISD::SELECT: {
    // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
    // TODO: Invert conditions of foldFreeOpFromSelect
    return SDValue();
  }
  case ISD::BITCAST: {
    SDLoc SL(N);
    SDValue BCSrc = N0.getOperand(i: 0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      SDValue HighBits = BCSrc.getOperand(i: BCSrc.getNumOperands() - 1);
      if (HighBits.getValueType().getSizeInBits() != 32 ||
          !fnegFoldsIntoOp(N: HighBits.getNode()))
        return SDValue();

      // f64 fneg only really needs to operate on the high half of the
      // register, so try to force it to an f32 operation to help make use of
      // source modifiers.
      //
      //
      // fneg (f64 (bitcast (build_vector x, y))) ->
      // f64 (bitcast (build_vector (bitcast i32:x to f32),
      //                            (fneg (bitcast i32:y to f32)))

      SDValue CastHi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: HighBits);
      SDValue NegHi = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: CastHi);
      SDValue CastBack =
          DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: HighBits.getValueType(), Operand: NegHi);

      // Rebuild the vector with only the last (sign-carrying) element changed.
      SmallVector<SDValue, 8> Ops(BCSrc->ops());
      Ops.back() = CastBack;
      DCI.AddToWorklist(N: NegHi.getNode());
      SDValue Build =
          DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: BCSrc.getValueType(), Ops);
      SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Build);

      if (!N0.hasOneUse())
        DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Result));
      return Result;
    }

    if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
        BCSrc.hasOneUse()) {
      // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
      //   select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)

      // TODO: Cast back result for multiple uses is beneficial in some cases.

      SDValue LHS =
          DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: BCSrc.getOperand(i: 1));
      SDValue RHS =
          DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: BCSrc.getOperand(i: 2));

      SDValue NegLHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: LHS);
      SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: RHS);

      return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f32, N1: BCSrc.getOperand(i: 0), N2: NegLHS,
                         N3: NegRHS);
    }

    return SDValue();
  }
  default:
    return SDValue();
  }
}
5357
5358SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5359 DAGCombinerInfo &DCI) const {
5360 SelectionDAG &DAG = DCI.DAG;
5361 SDValue N0 = N->getOperand(Num: 0);
5362
5363 if (!N0.hasOneUse())
5364 return SDValue();
5365
5366 switch (N0.getOpcode()) {
5367 case ISD::FP16_TO_FP: {
5368 assert(!isTypeLegal(MVT::f16) && "should only see if f16 is illegal");
5369 SDLoc SL(N);
5370 SDValue Src = N0.getOperand(i: 0);
5371 EVT SrcVT = Src.getValueType();
5372
5373 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5374 SDValue IntFAbs = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SrcVT, N1: Src,
5375 N2: DAG.getConstant(Val: 0x7fff, DL: SL, VT: SrcVT));
5376 return DAG.getNode(Opcode: ISD::FP16_TO_FP, DL: SL, VT: N->getValueType(ResNo: 0), Operand: IntFAbs);
5377 }
5378 default:
5379 return SDValue();
5380 }
5381}
5382
5383SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5384 DAGCombinerInfo &DCI) const {
5385 const auto *CFP = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: 0));
5386 if (!CFP)
5387 return SDValue();
5388
5389 // XXX - Should this flush denormals?
5390 const APFloat &Val = CFP->getValueAPF();
5391 APFloat One(Val.getSemantics(), "1.0");
5392 return DCI.DAG.getConstantFP(Val: One / Val, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
5393}
5394
// Top-level target DAG combine dispatcher. Routes opcodes to their dedicated
// combine helpers and implements a few folds inline: pushing bitcasts through
// build_vector, folding 64-bit bitcasts of constants, BFE simplification, and
// constant folding of FMAD_FTZ.
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  switch(N->getOpcode()) {
  default:
    break;
  case ISD::BITCAST: {
    EVT DestVT = N->getValueType(ResNo: 0);

    // Push casts through vector builds. This helps avoid emitting a large
    // number of copies when materializing floating point vector constants.
    //
    // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
    //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
    if (DestVT.isVector()) {
      SDValue Src = N->getOperand(Num: 0);
      if (Src.getOpcode() == ISD::BUILD_VECTOR &&
          (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
           isOperationLegal(Op: ISD::BUILD_VECTOR, VT: DestVT))) {
        EVT SrcVT = Src.getValueType();
        unsigned NElts = DestVT.getVectorNumElements();

        // Only handle the element-wise case (same element count both sides).
        if (SrcVT.getVectorNumElements() == NElts) {
          EVT DestEltVT = DestVT.getVectorElementType();

          SmallVector<SDValue, 8> CastedElts;
          SDLoc SL(N);
          for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
            SDValue Elt = Src.getOperand(i: I);
            CastedElts.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: DestEltVT, Operand: Elt));
          }

          return DAG.getBuildVector(VT: DestVT, DL: SL, Ops: CastedElts);
        }
      }
    }

    // The constant folds below only apply to 64-bit vector destinations.
    if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
      break;

    // Fold bitcasts of constants.
    //
    // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
    // TODO: Generalize and move to DAGCombiner
    SDValue Src = N->getOperand(Num: 0);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Src)) {
      SDLoc SL(N);
      uint64_t CVal = C->getZExtValue();
      SDValue BV = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32,
                               N1: DAG.getConstant(Val: Lo_32(Value: CVal), DL: SL, VT: MVT::i32),
                               N2: DAG.getConstant(Val: Hi_32(Value: CVal), DL: SL, VT: MVT::i32));
      return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: DestVT, Operand: BV);
    }

    // Same fold for FP constants, going through the bit pattern.
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Src)) {
      const APInt &Val = C->getValueAPF().bitcastToAPInt();
      SDLoc SL(N);
      uint64_t CVal = Val.getZExtValue();
      SDValue Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32,
                                N1: DAG.getConstant(Val: Lo_32(Value: CVal), DL: SL, VT: MVT::i32),
                                N2: DAG.getConstant(Val: Hi_32(Value: CVal), DL: SL, VT: MVT::i32));

      return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: DestVT, Operand: Vec);
    }

    break;
  }
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL: {
    // Range metadata can be invalidated when loads are converted to legal types
    // (e.g. v2i64 -> v4i32).
    // Try to convert vector shl/sra/srl before type legalization so that range
    // metadata can be utilized.
    if (!(N->getValueType(ResNo: 0).isVector() &&
          DCI.getDAGCombineLevel() == BeforeLegalizeTypes) &&
        DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;
    if (N->getOpcode() == ISD::SHL)
      return performShlCombine(N, DCI);
    if (N->getOpcode() == ISD::SRA)
      return performSraCombine(N, DCI);
    return performSrlCombine(N, DCI);
  }
  case ISD::TRUNCATE:
    return performTruncateCombine(N, DCI);
  case ISD::MUL:
    return performMulCombine(N, DCI);
  case AMDGPUISD::MUL_U24:
  case AMDGPUISD::MUL_I24: {
    if (SDValue Simplified = simplifyMul24(Node24: N, DCI))
      return Simplified;
    break;
  }
  case AMDGPUISD::MULHI_I24:
  case AMDGPUISD::MULHI_U24:
    return simplifyMul24(Node24: N, DCI);
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return performMulLoHiCombine(N, DCI);
  case ISD::MULHS:
    return performMulhsCombine(N, DCI);
  case ISD::MULHU:
    return performMulhuCombine(N, DCI);
  case ISD::SELECT:
    return performSelectCombine(N, DCI);
  case ISD::FNEG:
    return performFNegCombine(N, DCI);
  case ISD::FABS:
    return performFAbsCombine(N, DCI);
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    assert(!N->getValueType(0).isVector() &&
           "Vector handling of BFE not implemented");
    // Operands are (src, offset, width); only constant widths are simplified.
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 2));
    if (!Width)
      break;

    // Hardware only honors the low 5 bits of the width operand.
    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
    if (WidthVal == 0)
      return DAG.getConstant(Val: 0, DL, VT: MVT::i32);

    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
    if (!Offset)
      break;

    SDValue BitsFrom = N->getOperand(Num: 0);
    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;

    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;

    if (OffsetVal == 0) {
      // This is already sign / zero extended, so try to fold away extra BFEs.
      unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);

      unsigned OpSignBits = DAG.ComputeNumSignBits(Op: BitsFrom);
      if (OpSignBits >= SignBits)
        return BitsFrom;

      EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WidthVal);
      if (Signed) {
        // This is a sign_extend_inreg. Replace it to take advantage of existing
        // DAG Combines. If not eliminated, we will match back to BFE during
        // selection.

        // TODO: The sext_inreg of extended types ends, although we can could
        // handle them in a single BFE.
        return DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: MVT::i32, N1: BitsFrom,
                           N2: DAG.getValueType(SmallVT));
      }

      return DAG.getZeroExtendInReg(Op: BitsFrom, DL, VT: SmallVT);
    }

    // Fully constant-fold when the source is also a constant.
    if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(Val&: BitsFrom)) {
      if (Signed) {
        return constantFoldBFE<int32_t>(DAG,
                                        Src0: CVal->getSExtValue(),
                                        Offset: OffsetVal,
                                        Width: WidthVal,
                                        DL);
      }

      return constantFoldBFE<uint32_t>(DAG,
                                       Src0: CVal->getZExtValue(),
                                       Offset: OffsetVal,
                                       Width: WidthVal,
                                       DL);
    }

    // An extract reaching the top bit is just a shift; prefer the plain shift
    // except for the 16/16 split, which SDWA can handle directly.
    if ((OffsetVal + WidthVal) >= 32 &&
        !(OffsetVal == 16 && WidthVal == 16 && Subtarget->hasSDWA())) {
      SDValue ShiftVal = DAG.getConstant(Val: OffsetVal, DL, VT: MVT::i32);
      return DAG.getNode(Opcode: Signed ? ISD::SRA : ISD::SRL, DL, VT: MVT::i32,
                         N1: BitsFrom, N2: ShiftVal);
    }

    // Only the extracted bit range of the source is demanded; try to simplify
    // the source based on that.
    if (BitsFrom.hasOneUse()) {
      APInt Demanded = APInt::getBitsSet(numBits: 32,
                                         loBit: OffsetVal,
                                         hiBit: OffsetVal + WidthVal);

      KnownBits Known;
      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                            !DCI.isBeforeLegalizeOps());
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.ShrinkDemandedConstant(Op: BitsFrom, DemandedBits: Demanded, TLO) ||
          TLI.SimplifyDemandedBits(Op: BitsFrom, DemandedBits: Demanded, Known, TLO)) {
        DCI.CommitTargetLoweringOpt(TLO);
      }
    }

    break;
  }
  case ISD::LOAD:
    return performLoadCombine(N, DCI);
  case ISD::STORE:
    return performStoreCombine(N, DCI);
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_IFLAG:
    return performRcpCombine(N, DCI);
  case ISD::AssertZext:
  case ISD::AssertSext:
    return performAssertSZExtCombine(N, DCI);
  case ISD::INTRINSIC_WO_CHAIN:
    return performIntrinsicWOChainCombine(N, DCI);
  case AMDGPUISD::FMAD_FTZ: {
    SDValue N0 = N->getOperand(Num: 0);
    SDValue N1 = N->getOperand(Num: 1);
    SDValue N2 = N->getOperand(Num: 2);
    EVT VT = N->getValueType(ResNo: 0);

    // FMAD_FTZ is a FMAD + flush denormals to zero.
    // We flush the inputs, the intermediate step, and the output.
    ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(Val&: N0);
    ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(Val&: N1);
    ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(Val&: N2);
    if (N0CFP && N1CFP && N2CFP) {
      // Flush a denormal value to a zero of the same sign.
      const auto FTZ = [](const APFloat &V) {
        if (V.isDenormal()) {
          APFloat Zero(V.getSemantics(), 0);
          return V.isNegative() ? -Zero : Zero;
        }
        return V;
      };

      APFloat V0 = FTZ(N0CFP->getValueAPF());
      APFloat V1 = FTZ(N1CFP->getValueAPF());
      APFloat V2 = FTZ(N2CFP->getValueAPF());
      V0.multiply(RHS: V1, RM: APFloat::rmNearestTiesToEven);
      V0 = FTZ(V0);
      V0.add(RHS: V2, RM: APFloat::rmNearestTiesToEven);
      return DAG.getConstantFP(Val: FTZ(V0), DL, VT);
    }
    break;
  }
  }
  return SDValue();
}
5636
5637//===----------------------------------------------------------------------===//
5638// Helper functions
5639//===----------------------------------------------------------------------===//
5640
5641SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5642 const TargetRegisterClass *RC,
5643 Register Reg, EVT VT,
5644 const SDLoc &SL,
5645 bool RawReg) const {
5646 MachineFunction &MF = DAG.getMachineFunction();
5647 MachineRegisterInfo &MRI = MF.getRegInfo();
5648 Register VReg;
5649
5650 if (!MRI.isLiveIn(Reg)) {
5651 VReg = MRI.createVirtualRegister(RegClass: RC);
5652 MRI.addLiveIn(Reg, vreg: VReg);
5653 } else {
5654 VReg = MRI.getLiveInVirtReg(PReg: Reg);
5655 }
5656
5657 if (RawReg)
5658 return DAG.getRegister(Reg: VReg, VT);
5659
5660 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: VReg, VT);
5661}
5662
5663// This may be called multiple times, and nothing prevents creating multiple
5664// objects at the same offset. See if we already defined this object.
5665static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5666 int64_t Offset) {
5667 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5668 if (MFI.getObjectOffset(ObjectIdx: I) == Offset) {
5669 assert(MFI.getObjectSize(I) == Size);
5670 return I;
5671 }
5672 }
5673
5674 return MFI.CreateFixedObject(Size, SPOffset: Offset, IsImmutable: true);
5675}
5676
5677SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5678 EVT VT,
5679 const SDLoc &SL,
5680 int64_t Offset) const {
5681 MachineFunction &MF = DAG.getMachineFunction();
5682 MachineFrameInfo &MFI = MF.getFrameInfo();
5683 int FI = getOrCreateFixedStackObject(MFI, Size: VT.getStoreSize(), Offset);
5684
5685 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5686 SDValue Ptr = DAG.getFrameIndex(FI, VT: MVT::i32);
5687
5688 return DAG.getLoad(VT, dl: SL, Chain: DAG.getEntryNode(), Ptr, PtrInfo: SrcPtrInfo, Alignment: Align(4),
5689 MMOFlags: MachineMemOperand::MODereferenceable |
5690 MachineMemOperand::MOInvariant);
5691}
5692
5693SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5694 const SDLoc &SL,
5695 SDValue Chain,
5696 SDValue ArgVal,
5697 int64_t Offset) const {
5698 MachineFunction &MF = DAG.getMachineFunction();
5699 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5700 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5701
5702 SDValue Ptr = DAG.getConstant(Val: Offset, DL: SL, VT: MVT::i32);
5703 // Stores to the argument stack area are relative to the stack pointer.
5704 SDValue SP =
5705 DAG.getCopyFromReg(Chain, dl: SL, Reg: Info->getStackPtrOffsetReg(), VT: MVT::i32);
5706 Ptr = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: SP, N2: Ptr);
5707 SDValue Store = DAG.getStore(Chain, dl: SL, Val: ArgVal, Ptr, PtrInfo: DstInfo, Alignment: Align(4),
5708 MMOFlags: MachineMemOperand::MODereferenceable);
5709 return Store;
5710}
5711
5712SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5713 const TargetRegisterClass *RC,
5714 EVT VT, const SDLoc &SL,
5715 const ArgDescriptor &Arg) const {
5716 assert(Arg && "Attempting to load missing argument");
5717
5718 SDValue V = Arg.isRegister() ?
5719 CreateLiveInRegister(DAG, RC, Reg: Arg.getRegister(), VT, SL) :
5720 loadStackInputValue(DAG, VT, SL, Offset: Arg.getStackOffset());
5721
5722 if (!Arg.isMasked())
5723 return V;
5724
5725 unsigned Mask = Arg.getMask();
5726 unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);
5727 V = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: V,
5728 N2: DAG.getShiftAmountConstant(Val: Shift, VT, DL: SL));
5729 return DAG.getNode(Opcode: ISD::AND, DL: SL, VT, N1: V,
5730 N2: DAG.getConstant(Val: Mask >> Shift, DL: SL, VT));
5731}
5732
5733uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5734 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5735 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5736 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5737 uint64_t ArgOffset =
5738 alignTo(Size: ExplicitKernArgSize, A: Alignment) + ExplicitArgOffset;
5739 switch (Param) {
5740 case FIRST_IMPLICIT:
5741 return ArgOffset;
5742 case PRIVATE_BASE:
5743 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5744 case SHARED_BASE:
5745 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5746 case QUEUE_PTR:
5747 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5748 }
5749 llvm_unreachable("unexpected implicit parameter type");
5750}
5751
5752uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5753 const MachineFunction &MF, const ImplicitParameter Param) const {
5754 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5755 return getImplicitParameterOffset(ExplicitKernArgSize: MFI->getExplicitKernArgSize(), Param);
5756}
5757
5758SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5759 SelectionDAG &DAG, int Enabled,
5760 int &RefinementSteps,
5761 bool &UseOneConstNR,
5762 bool Reciprocal) const {
5763 EVT VT = Operand.getValueType();
5764
5765 if (VT == MVT::f32) {
5766 RefinementSteps = 0;
5767 return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc(Operand), VT, Operand);
5768 }
5769
5770 // TODO: There is also f64 rsq instruction, but the documentation is less
5771 // clear on its precision.
5772
5773 return SDValue();
5774}
5775
5776SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5777 SelectionDAG &DAG, int Enabled,
5778 int &RefinementSteps) const {
5779 EVT VT = Operand.getValueType();
5780
5781 if (VT == MVT::f32) {
5782 // Reciprocal, < 1 ulp error.
5783 //
5784 // This reciprocal approximation converges to < 0.5 ulp error with one
5785 // newton rhapson performed with two fused multiple adds (FMAs).
5786
5787 RefinementSteps = 0;
5788 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SDLoc(Operand), VT, Operand);
5789 }
5790
5791 // TODO: There is also f64 rcp instruction, but the documentation is less
5792 // clear on its precision.
5793
5794 return SDValue();
5795}
5796
5797static unsigned workitemIntrinsicDim(unsigned ID) {
5798 switch (ID) {
5799 case Intrinsic::amdgcn_workitem_id_x:
5800 return 0;
5801 case Intrinsic::amdgcn_workitem_id_y:
5802 return 1;
5803 case Intrinsic::amdgcn_workitem_id_z:
5804 return 2;
5805 default:
5806 llvm_unreachable("not a workitem intrinsic");
5807 }
5808}
5809
// Compute known zero/one bits for AMDGPU-specific nodes so generic
// SelectionDAG analyses can reason about them. Known starts fully reset;
// cases that cannot conclude anything simply leave it unknown.
void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, KnownBits &Known,
    const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {

  Known.resetAll(); // Don't know anything.

  unsigned Opc = Op.getOpcode();

  switch (Opc) {
  default:
    break;
  case AMDGPUISD::CARRY:
  case AMDGPUISD::BORROW: {
    // Carry/borrow results are 0 or 1: all bits above bit 0 are zero.
    Known.Zero = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 31);
    break;
  }

  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
    if (!CWidth)
      return;

    // Hardware only honors the low 5 bits of the width operand.
    uint32_t Width = CWidth->getZExtValue() & 0x1f;

    // An unsigned extract zero-fills everything above the field width.
    if (Opc == AMDGPUISD::BFE_U32)
      Known.Zero = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 32 - Width);

    break;
  }
  case AMDGPUISD::FP_TO_FP16: {
    unsigned BitWidth = Known.getBitWidth();

    // High bits are zero.
    Known.Zero = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - 16);
    break;
  }
  case AMDGPUISD::MUL_U24:
  case AMDGPUISD::MUL_I24: {
    KnownBits LHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
    KnownBits RHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
    // The product has at least as many trailing zeros as the factors combined.
    unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
                      RHSKnown.countMinTrailingZeros();
    Known.Zero.setLowBits(std::min(a: TrailZ, b: 32u));
    // Skip extra check if all bits are known zeros.
    if (TrailZ >= 32)
      break;

    // Truncate to 24 bits.
    LHSKnown = LHSKnown.trunc(BitWidth: 24);
    RHSKnown = RHSKnown.trunc(BitWidth: 24);

    if (Opc == AMDGPUISD::MUL_I24) {
      // Signed 24-bit multiply: if the total significant bits of both
      // operands fit in 32, the sign bits above them are determined by the
      // operand signs.
      unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
      unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
      unsigned MaxValBits = LHSValBits + RHSValBits;
      if (MaxValBits > 32)
        break;
      unsigned SignBits = 32 - MaxValBits + 1;
      bool LHSNegative = LHSKnown.isNegative();
      bool LHSNonNegative = LHSKnown.isNonNegative();
      bool LHSPositive = LHSKnown.isStrictlyPositive();
      bool RHSNegative = RHSKnown.isNegative();
      bool RHSNonNegative = RHSKnown.isNonNegative();
      bool RHSPositive = RHSKnown.isStrictlyPositive();

      // Same-sign product is non-negative; strictly-opposite-sign product is
      // negative, so the high bits are all ones.
      if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
        Known.Zero.setHighBits(SignBits);
      else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
        Known.One.setHighBits(SignBits);
    } else {
      // Unsigned: bits above the maximum possible product width are zero.
      unsigned LHSValBits = LHSKnown.countMaxActiveBits();
      unsigned RHSValBits = RHSKnown.countMaxActiveBits();
      unsigned MaxValBits = LHSValBits + RHSValBits;
      if (MaxValBits >= 32)
        break;
      Known.Zero.setBitsFrom(MaxValBits);
    }
    break;
  }
  case AMDGPUISD::PERM: {
    ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
    if (!CMask)
      return;

    KnownBits LHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
    KnownBits RHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
    unsigned Sel = CMask->getZExtValue();

    // Each selector byte picks a source byte (0-3 from RHS, 4-7 from LHS) or
    // a constant (0x0c -> 0x00; larger values -> 0xff as handled here).
    for (unsigned I = 0; I < 32; I += 8) {
      unsigned SelBits = Sel & 0xff;
      if (SelBits < 4) {
        SelBits *= 8;
        Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
        Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
      } else if (SelBits < 7) {
        SelBits = (SelBits & 3) * 8;
        Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
        Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
      } else if (SelBits == 0x0c) {
        Known.Zero |= 0xFFull << I;
      } else if (SelBits > 0x0c) {
        Known.One |= 0xFFull << I;
      }
      Sel >>= 8;
    }
    break;
  }
  case AMDGPUISD::BUFFER_LOAD_UBYTE: {
    // Zero-extended byte load: top 24 bits are zero.
    Known.Zero.setHighBits(24);
    break;
  }
  case AMDGPUISD::BUFFER_LOAD_USHORT: {
    // Zero-extended short load: top 16 bits are zero.
    Known.Zero.setHighBits(16);
    break;
  }
  case AMDGPUISD::LDS: {
    auto *GA = cast<GlobalAddressSDNode>(Val: Op.getOperand(i: 0).getNode());
    Align Alignment = GA->getGlobal()->getPointerAlignment(DL: DAG.getDataLayout());

    // LDS addresses fit in 16 bits and are aligned to the global's alignment.
    Known.Zero.setHighBits(16);
    Known.Zero.setLowBits(Log2(A: Alignment));
    break;
  }
  case AMDGPUISD::SMIN3:
  case AMDGPUISD::SMAX3:
  case AMDGPUISD::SMED3:
  case AMDGPUISD::UMIN3:
  case AMDGPUISD::UMAX3:
  case AMDGPUISD::UMED3: {
    KnownBits Known2 = DAG.computeKnownBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
    if (Known2.isUnknown())
      break;

    KnownBits Known1 = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
    if (Known1.isUnknown())
      break;

    KnownBits Known0 = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
    if (Known0.isUnknown())
      break;

    // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
    // The result is always one of the three operands, so only bits known in
    // all three are known in the result.
    Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
    Known.One = Known0.One & Known1.One & Known2.One;
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = Op.getConstantOperandVal(i: 0);
    switch (IID) {
    case Intrinsic::amdgcn_workitem_id_x:
    case Intrinsic::amdgcn_workitem_id_y:
    case Intrinsic::amdgcn_workitem_id_z: {
      // Workitem ids are bounded by the subtarget's maximum for the dimension.
      unsigned MaxValue = Subtarget->getMaxWorkitemID(
          Kernel: DAG.getMachineFunction().getFunction(), Dimension: workitemIntrinsicDim(ID: IID));
      Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue));
      break;
    }
    default:
      break;
    }
  }
  }
}
5974
// Report a lower bound on the number of sign bits for AMDGPU-specific nodes.
// Returning 1 means "nothing known".
unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    unsigned Depth) const {
  switch (Op.getOpcode()) {
  case AMDGPUISD::BFE_I32: {
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
    if (!Width)
      return 1;

    // A signed extract of Width bits sign-extends bit (Width-1) upward.
    unsigned SignBits = 32 - Width->getZExtValue() + 1;
    if (!isNullConstant(V: Op.getOperand(i: 1)))
      return SignBits;

    // TODO: Could probably figure something out with non-0 offsets.
    // With a zero offset the source may already have more sign bits.
    unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
    return std::max(a: SignBits, b: Op0SignBits);
  }

  case AMDGPUISD::BFE_U32: {
    // An unsigned extract zero-fills above the field, giving 32-Width
    // leading zeros (which count as sign bits for a non-negative value).
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
    return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
  }

  case AMDGPUISD::CARRY:
  case AMDGPUISD::BORROW:
    return 31;
  case AMDGPUISD::BUFFER_LOAD_BYTE:
    return 25;
  case AMDGPUISD::BUFFER_LOAD_SHORT:
    return 17;
  case AMDGPUISD::BUFFER_LOAD_UBYTE:
    return 24;
  case AMDGPUISD::BUFFER_LOAD_USHORT:
    return 16;
  case AMDGPUISD::FP_TO_FP16:
    return 16;
  case AMDGPUISD::SMIN3:
  case AMDGPUISD::SMAX3:
  case AMDGPUISD::SMED3:
  case AMDGPUISD::UMIN3:
  case AMDGPUISD::UMAX3:
  case AMDGPUISD::UMED3: {
    // The result is one of the three operands, so it has at least the
    // minimum of their sign-bit counts.
    unsigned Tmp2 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
    if (Tmp2 == 1)
      return 1; // Early out.

    unsigned Tmp1 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
    if (Tmp1 == 1)
      return 1; // Early out.

    unsigned Tmp0 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
    if (Tmp0 == 1)
      return 1; // Early out.

    return std::min(l: {Tmp0, Tmp1, Tmp2});
  }
  default:
    return 1;
  }
}
6035
// GlobalISel counterpart of ComputeNumSignBitsForTargetNode: lower-bound the
// sign bits of a virtual register defined by a target-specific instruction.
unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
    GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
    const MachineRegisterInfo &MRI, unsigned Depth) const {
  const MachineInstr *MI = MRI.getVRegDef(Reg: R);
  if (!MI)
    return 1;

  // TODO: Check range metadata on MMO.
  switch (MI->getOpcode()) {
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
    return 25;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
    return 17;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    return 24;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    return 16;
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    // med3 returns one of its three sources, so the result has at least the
    // minimum of their sign-bit counts; bail early on any unknown source.
    auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
    unsigned Tmp2 = Analysis.computeNumSignBits(R: Src2, DemandedElts, Depth: Depth + 1);
    if (Tmp2 == 1)
      return 1;
    unsigned Tmp1 = Analysis.computeNumSignBits(R: Src1, DemandedElts, Depth: Depth + 1);
    if (Tmp1 == 1)
      return 1;
    unsigned Tmp0 = Analysis.computeNumSignBits(R: Src0, DemandedElts, Depth: Depth + 1);
    if (Tmp0 == 1)
      return 1;
    return std::min(l: {Tmp0, Tmp1, Tmp2});
  }
  default:
    return 1;
  }
}
6071
6072bool AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode(
6073 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6074 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
6075 unsigned Opcode = Op.getOpcode();
6076 switch (Opcode) {
6077 case AMDGPUISD::BFE_I32:
6078 case AMDGPUISD::BFE_U32:
6079 return false;
6080 }
6081 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
6082 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
6083}
6084
// Determine whether an AMDGPU-specific node can never produce a NaN (or, when
// SNaN is set, never a signaling NaN). Conservatively returns false for
// anything not explicitly handled.
bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
    unsigned Depth) const {
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY: {
    // Legacy min/max quiet their result, so a signaling NaN never escapes.
    if (SNaN)
      return true;

    // TODO: Can check no nans on one of the operands for each one, but which
    // one?
    return false;
  }
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::CVT_PKRTZ_F16_F32: {
    if (SNaN)
      return true;
    // Non-NaN inputs yield a non-NaN result for these binary ops.
    return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1) &&
           DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1);
  }
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMINIMUM3:
  case AMDGPUISD::FMAXIMUM3:
  case AMDGPUISD::FMAD_FTZ: {
    if (SNaN)
      return true;
    // Ternary ops: NaN-free if all three operands are NaN-free.
    return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1) &&
           DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
           DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1);
  }
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    // Integer byte-to-float conversions can never produce a NaN.
    return true;

  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_CLAMP: {
    if (SNaN)
      return true;

    // TODO: Need is known positive check.
    return false;
  }
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT: {
    if (SNaN)
      return true;
    // Result is NaN only if the (first) source is NaN.
    return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1);
  }
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
    // TODO: Refine on operands.
    return SNaN;
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::COS_HW: {
    // TODO: Need check for infinity
    return SNaN;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cvt_off_f32_i4:
      return true;

    case Intrinsic::amdgcn_frexp_mant: {
      if (SNaN)
        return true;
      // Operand 0 is the intrinsic id; operand 1 is the value.
      return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1);
    }
    case Intrinsic::amdgcn_cvt_pkrtz: {
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
             DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1);
    }
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_tanh: {
      if (SNaN)
        return true;

      // TODO: Need is known positive check.
      return false;
    }
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_fdot2:
      // TODO: Refine on operand
      return SNaN;
    case Intrinsic::amdgcn_fma_legacy:
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
             DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1) &&
             DAG.isKnownNeverNaN(Op: Op.getOperand(i: 3), SNaN, Depth: Depth + 1);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}
6199
// Reassociation is only considered profitable when the first operand has a
// single non-debug use, so rewriting cannot duplicate work for other users.
bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
                                               Register N0, Register N1) const {
  return MRI.hasOneNonDBGUse(RegNo: N0); // FIXME: handle regbanks
}
6204