1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUMachineFunction.h"
19#include "AMDGPUMemoryUtils.h"
20#include "SIMachineFunctionInfo.h"
21#include "llvm/CodeGen/Analysis.h"
22#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
23#include "llvm/CodeGen/MachineFrameInfo.h"
24#include "llvm/IR/DiagnosticInfo.h"
25#include "llvm/IR/IntrinsicsAMDGPU.h"
26#include "llvm/Support/CommandLine.h"
27#include "llvm/Support/KnownBits.h"
28#include "llvm/Target/TargetMachine.h"
29
30using namespace llvm;
31
32#include "AMDGPUGenCallingConv.inc"
33
34static cl::opt<bool> AMDGPUBypassSlowDiv(
35 "amdgpu-bypass-slow-div",
36 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
37 cl::init(Val: true));
38
39// Find a larger type to do a load / store of a vector with.
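// For example, a 64-bit value maps to v2i32 and a 96-bit vector to v3i32;
// sizes above 32 bits that are not a multiple of 32 are returned unchanged.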
40EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
41 unsigned StoreSize = VT.getStoreSizeInBits();
42 if (StoreSize <= 32)
43 return EVT::getIntegerVT(Context&: Ctx, BitWidth: StoreSize);
44
45 if (StoreSize % 32 == 0)
46 return EVT::getVectorVT(Context&: Ctx, VT: MVT::i32, NumElements: StoreSize / 32);
47
48 return VT;
49}
50
51unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
52 return DAG.computeKnownBits(Op).countMaxActiveBits();
53}
54
55unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
58 return DAG.ComputeMaxSignificantBits(Op);
59}
60
61AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
62 const AMDGPUSubtarget &STI)
63 : TargetLowering(TM), Subtarget(&STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
66 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
67 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
68 MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;
69
70 // Enable ganging up loads and stores in the memcpy DAG lowering.
71 MaxGluedStoresPerMemcpy = 16;
72
73 // Lower floating point store/load to integer store/load to reduce the number
74 // of patterns in tablegen.
75 setOperationAction(Op: ISD::LOAD, VT: MVT::f32, Action: Promote);
76 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f32, DestVT: MVT::i32);
77
78 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f32, Action: Promote);
79 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
80
81 setOperationAction(Op: ISD::LOAD, VT: MVT::v3f32, Action: Promote);
82 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
83
84 setOperationAction(Op: ISD::LOAD, VT: MVT::v4f32, Action: Promote);
85 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
86
87 setOperationAction(Op: ISD::LOAD, VT: MVT::v5f32, Action: Promote);
88 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
89
90 setOperationAction(Op: ISD::LOAD, VT: MVT::v6f32, Action: Promote);
91 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
92
93 setOperationAction(Op: ISD::LOAD, VT: MVT::v7f32, Action: Promote);
94 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
95
96 setOperationAction(Op: ISD::LOAD, VT: MVT::v8f32, Action: Promote);
97 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f32, DestVT: MVT::v8i32);
98
99 setOperationAction(Op: ISD::LOAD, VT: MVT::v9f32, Action: Promote);
100 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
101
102 setOperationAction(Op: ISD::LOAD, VT: MVT::v10f32, Action: Promote);
103 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
104
105 setOperationAction(Op: ISD::LOAD, VT: MVT::v11f32, Action: Promote);
106 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
107
108 setOperationAction(Op: ISD::LOAD, VT: MVT::v12f32, Action: Promote);
109 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
110
111 setOperationAction(Op: ISD::LOAD, VT: MVT::v16f32, Action: Promote);
112 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f32, DestVT: MVT::v16i32);
113
114 setOperationAction(Op: ISD::LOAD, VT: MVT::v32f32, Action: Promote);
115 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32f32, DestVT: MVT::v32i32);
116
117 setOperationAction(Op: ISD::LOAD, VT: MVT::i64, Action: Promote);
118 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::i64, DestVT: MVT::v2i32);
119
120 setOperationAction(Op: ISD::LOAD, VT: MVT::v2i64, Action: Promote);
121 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2i64, DestVT: MVT::v4i32);
122
123 setOperationAction(Op: ISD::LOAD, VT: MVT::f64, Action: Promote);
124 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f64, DestVT: MVT::v2i32);
125
126 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f64, Action: Promote);
127 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f64, DestVT: MVT::v4i32);
128
129 setOperationAction(Op: ISD::LOAD, VT: MVT::v3i64, Action: Promote);
130 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3i64, DestVT: MVT::v6i32);
131
132 setOperationAction(Op: ISD::LOAD, VT: MVT::v4i64, Action: Promote);
133 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4i64, DestVT: MVT::v8i32);
134
135 setOperationAction(Op: ISD::LOAD, VT: MVT::v3f64, Action: Promote);
136 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3f64, DestVT: MVT::v6i32);
137
138 setOperationAction(Op: ISD::LOAD, VT: MVT::v4f64, Action: Promote);
139 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f64, DestVT: MVT::v8i32);
140
141 setOperationAction(Op: ISD::LOAD, VT: MVT::v8i64, Action: Promote);
142 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8i64, DestVT: MVT::v16i32);
143
144 setOperationAction(Op: ISD::LOAD, VT: MVT::v8f64, Action: Promote);
145 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f64, DestVT: MVT::v16i32);
146
147 setOperationAction(Op: ISD::LOAD, VT: MVT::v16i64, Action: Promote);
148 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16i64, DestVT: MVT::v32i32);
149
150 setOperationAction(Op: ISD::LOAD, VT: MVT::v16f64, Action: Promote);
151 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f64, DestVT: MVT::v32i32);
152
153 setOperationAction(Op: ISD::LOAD, VT: MVT::i128, Action: Promote);
154 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::i128, DestVT: MVT::v4i32);
155
  // TODO: Would be better to handle these as directly legal.
157 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f32, Action: Promote);
158 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f32, DestVT: MVT::i32);
159
160 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f64, Action: Promote);
161 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f64, DestVT: MVT::i64);
162
163 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f16, Action: Promote);
164 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f16, DestVT: MVT::i16);
165
166 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::bf16, Action: Promote);
167 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::bf16, DestVT: MVT::i16);
168
169 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f32, Action: Promote);
170 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f32, DestVT: MVT::i32);
171
172 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f64, Action: Promote);
173 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f64, DestVT: MVT::i64);
174
175 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f16, Action: Promote);
176 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f16, DestVT: MVT::i16);
177
178 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::bf16, Action: Promote);
179 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::bf16, DestVT: MVT::i16);
180
181 // There are no 64-bit extloads. These should be done as a 32-bit extload and
182 // an extension to 64-bit.
183 for (MVT VT : MVT::integer_valuetypes())
184 setLoadExtAction(ExtTypes: {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, ValVT: MVT::i64, MemVT: VT,
185 Action: Expand);
186
187 for (MVT VT : MVT::integer_valuetypes()) {
188 if (VT == MVT::i64)
189 continue;
190
191 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
192 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i1, Action: Promote);
193 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i8, Action: Legal);
194 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i16, Action: Legal);
195 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i32, Action: Expand);
196 }
197 }
198
199 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
200 for (auto MemVT :
201 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
202 setLoadExtAction(ExtTypes: {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, ValVT: VT, MemVT,
203 Action: Expand);
204
205 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f32, MemVT: MVT::f16, Action: Expand);
206 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f32, MemVT: MVT::bf16, Action: Expand);
207 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f32, MemVT: MVT::v2f16, Action: Expand);
208 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f32, MemVT: MVT::v2bf16, Action: Expand);
209 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f32, MemVT: MVT::v3f16, Action: Expand);
210 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f32, MemVT: MVT::v3bf16, Action: Expand);
211 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f32, MemVT: MVT::v4f16, Action: Expand);
212 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f32, MemVT: MVT::v4bf16, Action: Expand);
213 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f32, MemVT: MVT::v8f16, Action: Expand);
214 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f32, MemVT: MVT::v8bf16, Action: Expand);
215 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f32, MemVT: MVT::v16f16, Action: Expand);
216 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f32, MemVT: MVT::v16bf16, Action: Expand);
217 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v32f32, MemVT: MVT::v32f16, Action: Expand);
218 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v32f32, MemVT: MVT::v32bf16, Action: Expand);
219
220 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f32, Action: Expand);
221 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2f32, Action: Expand);
222 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3f32, Action: Expand);
223 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4f32, Action: Expand);
224 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8f32, Action: Expand);
225 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16f32, Action: Expand);
226
227 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f16, Action: Expand);
228 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::bf16, Action: Expand);
229 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2f16, Action: Expand);
230 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2bf16, Action: Expand);
231 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3f16, Action: Expand);
232 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3bf16, Action: Expand);
233 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4f16, Action: Expand);
234 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4bf16, Action: Expand);
235 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8f16, Action: Expand);
236 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8bf16, Action: Expand);
237 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16f16, Action: Expand);
238 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16bf16, Action: Expand);
239
240 setOperationAction(Op: ISD::STORE, VT: MVT::f32, Action: Promote);
241 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f32, DestVT: MVT::i32);
242
243 setOperationAction(Op: ISD::STORE, VT: MVT::v2f32, Action: Promote);
244 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
245
246 setOperationAction(Op: ISD::STORE, VT: MVT::v3f32, Action: Promote);
247 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
248
249 setOperationAction(Op: ISD::STORE, VT: MVT::v4f32, Action: Promote);
250 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
251
252 setOperationAction(Op: ISD::STORE, VT: MVT::v5f32, Action: Promote);
253 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
254
255 setOperationAction(Op: ISD::STORE, VT: MVT::v6f32, Action: Promote);
256 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
257
258 setOperationAction(Op: ISD::STORE, VT: MVT::v7f32, Action: Promote);
259 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
260
261 setOperationAction(Op: ISD::STORE, VT: MVT::v8f32, Action: Promote);
262 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f32, DestVT: MVT::v8i32);
263
264 setOperationAction(Op: ISD::STORE, VT: MVT::v9f32, Action: Promote);
265 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
266
267 setOperationAction(Op: ISD::STORE, VT: MVT::v10f32, Action: Promote);
268 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
269
270 setOperationAction(Op: ISD::STORE, VT: MVT::v11f32, Action: Promote);
271 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
272
273 setOperationAction(Op: ISD::STORE, VT: MVT::v12f32, Action: Promote);
274 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
275
276 setOperationAction(Op: ISD::STORE, VT: MVT::v16f32, Action: Promote);
277 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f32, DestVT: MVT::v16i32);
278
279 setOperationAction(Op: ISD::STORE, VT: MVT::v32f32, Action: Promote);
280 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32f32, DestVT: MVT::v32i32);
281
282 setOperationAction(Op: ISD::STORE, VT: MVT::i64, Action: Promote);
283 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::i64, DestVT: MVT::v2i32);
284
285 setOperationAction(Op: ISD::STORE, VT: MVT::v2i64, Action: Promote);
286 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2i64, DestVT: MVT::v4i32);
287
288 setOperationAction(Op: ISD::STORE, VT: MVT::f64, Action: Promote);
289 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f64, DestVT: MVT::v2i32);
290
291 setOperationAction(Op: ISD::STORE, VT: MVT::v2f64, Action: Promote);
292 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f64, DestVT: MVT::v4i32);
293
294 setOperationAction(Op: ISD::STORE, VT: MVT::v3i64, Action: Promote);
295 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3i64, DestVT: MVT::v6i32);
296
297 setOperationAction(Op: ISD::STORE, VT: MVT::v3f64, Action: Promote);
298 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3f64, DestVT: MVT::v6i32);
299
300 setOperationAction(Op: ISD::STORE, VT: MVT::v4i64, Action: Promote);
301 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i64, DestVT: MVT::v8i32);
302
303 setOperationAction(Op: ISD::STORE, VT: MVT::v4f64, Action: Promote);
304 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f64, DestVT: MVT::v8i32);
305
306 setOperationAction(Op: ISD::STORE, VT: MVT::v8i64, Action: Promote);
307 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8i64, DestVT: MVT::v16i32);
308
309 setOperationAction(Op: ISD::STORE, VT: MVT::v8f64, Action: Promote);
310 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f64, DestVT: MVT::v16i32);
311
312 setOperationAction(Op: ISD::STORE, VT: MVT::v16i64, Action: Promote);
313 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16i64, DestVT: MVT::v32i32);
314
315 setOperationAction(Op: ISD::STORE, VT: MVT::v16f64, Action: Promote);
316 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f64, DestVT: MVT::v32i32);
317
318 setOperationAction(Op: ISD::STORE, VT: MVT::i128, Action: Promote);
319 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::i128, DestVT: MVT::v4i32);
320
321 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i1, Action: Expand);
322 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i8, Action: Expand);
323 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i16, Action: Expand);
324 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i32, Action: Expand);
325
326 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i1, Action: Expand);
327 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i8, Action: Expand);
328 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i16, Action: Expand);
329 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i32, Action: Expand);
330
331 setTruncStoreAction(ValVT: MVT::f32, MemVT: MVT::bf16, Action: Expand);
332 setTruncStoreAction(ValVT: MVT::f32, MemVT: MVT::f16, Action: Expand);
333 setTruncStoreAction(ValVT: MVT::v2f32, MemVT: MVT::v2bf16, Action: Expand);
334 setTruncStoreAction(ValVT: MVT::v2f32, MemVT: MVT::v2f16, Action: Expand);
335 setTruncStoreAction(ValVT: MVT::v3f32, MemVT: MVT::v3bf16, Action: Expand);
336 setTruncStoreAction(ValVT: MVT::v3f32, MemVT: MVT::v3f16, Action: Expand);
337 setTruncStoreAction(ValVT: MVT::v4f32, MemVT: MVT::v4bf16, Action: Expand);
338 setTruncStoreAction(ValVT: MVT::v4f32, MemVT: MVT::v4f16, Action: Expand);
339 setTruncStoreAction(ValVT: MVT::v8f32, MemVT: MVT::v8bf16, Action: Expand);
340 setTruncStoreAction(ValVT: MVT::v8f32, MemVT: MVT::v8f16, Action: Expand);
341 setTruncStoreAction(ValVT: MVT::v16f32, MemVT: MVT::v16bf16, Action: Expand);
342 setTruncStoreAction(ValVT: MVT::v16f32, MemVT: MVT::v16f16, Action: Expand);
343 setTruncStoreAction(ValVT: MVT::v32f32, MemVT: MVT::v32bf16, Action: Expand);
344 setTruncStoreAction(ValVT: MVT::v32f32, MemVT: MVT::v32f16, Action: Expand);
345
346 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::bf16, Action: Expand);
347 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f16, Action: Expand);
348 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f32, Action: Expand);
349
350 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2f32, Action: Expand);
351 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2bf16, Action: Expand);
352 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2f16, Action: Expand);
353
354 setTruncStoreAction(ValVT: MVT::v3i32, MemVT: MVT::v3i8, Action: Expand);
355
356 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i32, Action: Expand);
357 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i16, Action: Expand);
358 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i8, Action: Expand);
359 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i1, Action: Expand);
360 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3f32, Action: Expand);
361 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3bf16, Action: Expand);
362 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3f16, Action: Expand);
363
364 setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i32, Action: Expand);
365 setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i16, Action: Expand);
366 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4f32, Action: Expand);
367 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4bf16, Action: Expand);
368 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4f16, Action: Expand);
369
370 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8f32, Action: Expand);
371 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8bf16, Action: Expand);
372 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8f16, Action: Expand);
373
374 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16f32, Action: Expand);
375 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16bf16, Action: Expand);
376 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16f16, Action: Expand);
377 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i16, Action: Expand);
379 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i8, Action: Expand);
381 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i1, Action: Expand);
382
383 setOperationAction(Ops: ISD::Constant, VTs: {MVT::i32, MVT::i64}, Action: Legal);
384 setOperationAction(Ops: ISD::ConstantFP, VTs: {MVT::f32, MVT::f64}, Action: Legal);
385
386 setOperationAction(Ops: {ISD::BR_JT, ISD::BRIND}, VT: MVT::Other, Action: Expand);
387
388 // For R600, this is totally unsupported, just custom lower to produce an
389 // error.
390 setOperationAction(Op: ISD::DYNAMIC_STACKALLOC, VT: MVT::i32, Action: Custom);
391
392 // Library functions. These default to Expand, but we have instructions
393 // for them.
394 setOperationAction(Ops: {ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
395 ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
396 VT: MVT::f32, Action: Legal);
397
398 setOperationAction(Op: ISD::FLOG2, VT: MVT::f32, Action: Custom);
399 setOperationAction(Ops: ISD::FROUND, VTs: {MVT::f32, MVT::f64}, Action: Custom);
400 setOperationAction(Ops: {ISD::LROUND, ISD::LLROUND},
401 VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Expand);
402
403 setOperationAction(
404 Ops: {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, VT: MVT::f32,
405 Action: Custom);
406
407 setOperationAction(Ops: ISD::FNEARBYINT, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Custom);
408
409 setOperationAction(Ops: ISD::FRINT, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Custom);
410
411 setOperationAction(Ops: {ISD::LRINT, ISD::LLRINT}, VTs: {MVT::f16, MVT::f32, MVT::f64},
412 Action: Expand);
413
414 setOperationAction(Ops: ISD::FREM, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Custom);
415
416 if (Subtarget->has16BitInsts())
417 setOperationAction(Ops: ISD::IS_FPCLASS, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Legal);
418 else {
419 setOperationAction(Ops: ISD::IS_FPCLASS, VTs: {MVT::f32, MVT::f64}, Action: Legal);
420 setOperationAction(Ops: {ISD::FLOG2, ISD::FEXP2}, VT: MVT::f16, Action: Custom);
421 }
422
423 setOperationAction(Ops: {ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, VT: MVT::f16,
424 Action: Custom);
425
426 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
427 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
428 // default unless marked custom/legal.
429 setOperationAction(Ops: ISD::IS_FPCLASS,
430 VTs: {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
431 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
432 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
433 MVT::v16f64},
434 Action: Custom);
435
436 if (isTypeLegal(VT: MVT::f16))
437 setOperationAction(Ops: ISD::IS_FPCLASS,
438 VTs: {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16},
439 Action: Custom);
440
441 // Expand to fneg + fadd.
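  // i.e. (fsub x, y) is lowered as (fadd x, (fneg y)); the fneg is typically
  // folded into a source modifier on the add.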
442 setOperationAction(Op: ISD::FSUB, VT: MVT::f64, Action: Expand);
443
444 setOperationAction(Ops: ISD::CONCAT_VECTORS,
445 VTs: {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
446 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
447 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
448 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
449 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
450 Action: Custom);
451
452 setOperationAction(
453 Ops: ISD::EXTRACT_SUBVECTOR,
454 VTs: {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
455 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
456 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
457 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
458 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
459 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
460 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
461 Action: Custom);
462
463 setOperationAction(Op: ISD::FP16_TO_FP, VT: MVT::f64, Action: Expand);
464 setOperationAction(Ops: ISD::FP_TO_FP16, VTs: {MVT::f64, MVT::f32}, Action: Custom);
465
466 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
467 for (MVT VT : ScalarIntVTs) {
468 // These should use [SU]DIVREM, so set them to expand
469 setOperationAction(Ops: {ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
470 Action: Expand);
471
472 // GPU does not have divrem function for signed or unsigned.
473 setOperationAction(Ops: {ISD::SDIVREM, ISD::UDIVREM}, VT, Action: Custom);
474
475 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
476 setOperationAction(Ops: {ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Action: Expand);
477
478 setOperationAction(Ops: {ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Action: Expand);
479
480 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
481 setOperationAction(Ops: {ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Action: Legal);
482 }
483
484 // The hardware supports 32-bit FSHR, but not FSHL.
485 setOperationAction(Op: ISD::FSHR, VT: MVT::i32, Action: Legal);
486
487 // The hardware supports 32-bit ROTR, but not ROTL.
488 setOperationAction(Ops: ISD::ROTL, VTs: {MVT::i32, MVT::i64}, Action: Expand);
489 setOperationAction(Op: ISD::ROTR, VT: MVT::i64, Action: Expand);
490
491 setOperationAction(Ops: {ISD::MULHU, ISD::MULHS}, VT: MVT::i16, Action: Expand);
492
493 setOperationAction(Ops: {ISD::MUL, ISD::MULHU, ISD::MULHS}, VT: MVT::i64, Action: Expand);
494 setOperationAction(
495 Ops: {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
496 VT: MVT::i64, Action: Custom);
497 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::i64, Action: Expand);
498
499 setOperationAction(Ops: {ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, VT: MVT::i32,
500 Action: Legal);
501
502 setOperationAction(
503 Ops: {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
504 VT: MVT::i64, Action: Custom);
505
506 for (auto VT : {MVT::i8, MVT::i16})
507 setOperationAction(Ops: {ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Action: Custom);
508
509 static const MVT::SimpleValueType VectorIntTypes[] = {
510 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
511 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
512
513 for (MVT VT : VectorIntTypes) {
514 // Expand the following operations for the current type by default.
515 setOperationAction(Ops: {ISD::ADD, ISD::AND, ISD::FP_TO_SINT,
516 ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU,
517 ISD::MULHS, ISD::OR, ISD::SHL,
518 ISD::SRA, ISD::SRL, ISD::ROTL,
519 ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP,
520 ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV,
521 ISD::SREM, ISD::UREM, ISD::SMUL_LOHI,
522 ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM,
523 ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC,
524 ISD::XOR, ISD::BSWAP, ISD::CTPOP,
525 ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE,
526 ISD::SETCC, ISD::ADDRSPACECAST},
527 VT, Action: Expand);
528 }
529
530 static const MVT::SimpleValueType FloatVectorTypes[] = {
531 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
532 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
533
534 for (MVT VT : FloatVectorTypes) {
535 setOperationAction(
536 Ops: {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
537 ISD::FADD, ISD::FCEIL, ISD::FCOS,
538 ISD::FDIV, ISD::FEXP2, ISD::FEXP,
539 ISD::FEXP10, ISD::FLOG2, ISD::FREM,
540 ISD::FLOG, ISD::FLOG10, ISD::FPOW,
541 ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
542 ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
543 ISD::FSQRT, ISD::FSIN, ISD::FSUB,
544 ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
545 ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC,
546 ISD::FCANONICALIZE, ISD::FROUNDEVEN},
547 VT, Action: Expand);
548 }
549
  // This causes an unrolled select operation to be used rather than expansion
  // with bit operations. This is in general better, but the alternative using
  // BFI instructions may be better if the select sources are SGPRs.
553 setOperationAction(Op: ISD::SELECT, VT: MVT::v2f32, Action: Promote);
554 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
555
556 setOperationAction(Op: ISD::SELECT, VT: MVT::v3f32, Action: Promote);
557 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
558
559 setOperationAction(Op: ISD::SELECT, VT: MVT::v4f32, Action: Promote);
560 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
561
562 setOperationAction(Op: ISD::SELECT, VT: MVT::v5f32, Action: Promote);
563 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
564
565 setOperationAction(Op: ISD::SELECT, VT: MVT::v6f32, Action: Promote);
566 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
567
568 setOperationAction(Op: ISD::SELECT, VT: MVT::v7f32, Action: Promote);
569 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
570
571 setOperationAction(Op: ISD::SELECT, VT: MVT::v9f32, Action: Promote);
572 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
573
574 setOperationAction(Op: ISD::SELECT, VT: MVT::v10f32, Action: Promote);
575 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
576
577 setOperationAction(Op: ISD::SELECT, VT: MVT::v11f32, Action: Promote);
578 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
579
580 setOperationAction(Op: ISD::SELECT, VT: MVT::v12f32, Action: Promote);
581 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
582
583 setSchedulingPreference(Sched::RegPressure);
584 setJumpIsExpensive(true);
585
586 // FIXME: This is only partially true. If we have to do vector compares, any
587 // SGPR pair can be a condition register. If we have a uniform condition, we
588 // are better off doing SALU operations, where there is only one SCC. For now,
589 // we don't have a way of knowing during instruction selection if a condition
590 // will be uniform and we always use vector compares. Assume we are using
591 // vector compares until that is fixed.
592 setHasMultipleConditionRegisters(true);
593
594 setMinCmpXchgSizeInBits(32);
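  // Atomic cmpxchg on types narrower than 32 bits is widened by AtomicExpand
  // into a masked 32-bit operation.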
595 setSupportsUnalignedAtomics(false);
596
597 PredictableSelectIsExpensive = false;
598
  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors of more than 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we usually have to split the loads.
  // When we can more precisely specify load legality per address space, we
  // should be able to make FindBetterChain/MergeConsecutiveStores smarter so
  // that they can figure out what to do in 2 iterations without all N > 4
  // stores on the same chain.
607 GatherAllAliasesMaxDepth = 16;
608
609 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
610 // about these during lowering.
611 MaxStoresPerMemcpy = 0xffffffff;
612 MaxStoresPerMemmove = 0xffffffff;
613 MaxStoresPerMemset = 0xffffffff;
614
615 // The expansion for 64-bit division is enormous.
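  // With the bypass, a 64-bit division whose operands dynamically fit in 32
  // bits is dispatched at run time to the much cheaper 32-bit expansion.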
616 if (AMDGPUBypassSlowDiv)
617 addBypassSlowDiv(SlowBitWidth: 64, FastBitWidth: 32);
618
619 setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
620 ISD::SRA, ISD::SRL,
621 ISD::TRUNCATE, ISD::MUL,
622 ISD::SMUL_LOHI, ISD::UMUL_LOHI,
623 ISD::MULHU, ISD::MULHS,
624 ISD::SELECT, ISD::SELECT_CC,
625 ISD::STORE, ISD::FADD,
626 ISD::FSUB, ISD::FNEG,
627 ISD::FABS, ISD::AssertZext,
628 ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
629
630 setMaxAtomicSizeInBitsSupported(64);
631 setMaxDivRemBitWidthSupported(64);
632 setMaxLargeFPConvertBitWidthSupported(64);
633}
634
635bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
636 if (getTargetMachine().Options.NoSignedZerosFPMath)
637 return true;
638
639 const auto Flags = Op.getNode()->getFlags();
640 if (Flags.hasNoSignedZeros())
641 return true;
642
643 return false;
644}
645
646//===----------------------------------------------------------------------===//
647// Target Information
648//===----------------------------------------------------------------------===//
649
650LLVM_READNONE
651static bool fnegFoldsIntoOpcode(unsigned Opc) {
652 switch (Opc) {
653 case ISD::FADD:
654 case ISD::FSUB:
655 case ISD::FMUL:
656 case ISD::FMA:
657 case ISD::FMAD:
658 case ISD::FMINNUM:
659 case ISD::FMAXNUM:
660 case ISD::FMINNUM_IEEE:
661 case ISD::FMAXNUM_IEEE:
662 case ISD::FMINIMUM:
663 case ISD::FMAXIMUM:
664 case ISD::FMINIMUMNUM:
665 case ISD::FMAXIMUMNUM:
666 case ISD::SELECT:
667 case ISD::FSIN:
668 case ISD::FTRUNC:
669 case ISD::FRINT:
670 case ISD::FNEARBYINT:
671 case ISD::FROUNDEVEN:
672 case ISD::FCANONICALIZE:
673 case AMDGPUISD::RCP:
674 case AMDGPUISD::RCP_LEGACY:
675 case AMDGPUISD::RCP_IFLAG:
676 case AMDGPUISD::SIN_HW:
677 case AMDGPUISD::FMUL_LEGACY:
678 case AMDGPUISD::FMIN_LEGACY:
679 case AMDGPUISD::FMAX_LEGACY:
680 case AMDGPUISD::FMED3:
681 // TODO: handle llvm.amdgcn.fma.legacy
682 return true;
683 case ISD::BITCAST:
684 llvm_unreachable("bitcast is special cased");
685 default:
686 return false;
687 }
688}
689
690static bool fnegFoldsIntoOp(const SDNode *N) {
691 unsigned Opc = N->getOpcode();
692 if (Opc == ISD::BITCAST) {
693 // TODO: Is there a benefit to checking the conditions performFNegCombine
694 // does? We don't for the other cases.
695 SDValue BCSrc = N->getOperand(Num: 0);
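    // e.g. an f64 assembled from two 32-bit halves: the fneg only needs to be
    // applied to the high element, which holds the sign bit.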
696 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
697 return BCSrc.getNumOperands() == 2 &&
698 BCSrc.getOperand(i: 1).getValueSizeInBits() == 32;
699 }
700
701 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
702 }
703
704 return fnegFoldsIntoOpcode(Opc);
705}
706
/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
710LLVM_READONLY
711static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
712 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
713 VT == MVT::f64;
714}
715
/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
/// the given type when used for ISD::SELECT.
718LLVM_READONLY
719static bool selectSupportsSourceMods(const SDNode *N) {
720 // TODO: Only applies if select will be vector
721 return N->getValueType(ResNo: 0) == MVT::f32;
722}
723
724// Most FP instructions support source modifiers, but this could be refined
725// slightly.
726LLVM_READONLY
727static bool hasSourceMods(const SDNode *N) {
728 if (isa<MemSDNode>(Val: N))
729 return false;
730
731 switch (N->getOpcode()) {
732 case ISD::CopyToReg:
733 case ISD::FDIV:
734 case ISD::FREM:
735 case ISD::INLINEASM:
736 case ISD::INLINEASM_BR:
737 case AMDGPUISD::DIV_SCALE:
738 case ISD::INTRINSIC_W_CHAIN:
739
740 // TODO: Should really be looking at the users of the bitcast. These are
741 // problematic because bitcasts are used to legalize all stores to integer
742 // types.
743 case ISD::BITCAST:
744 return false;
745 case ISD::INTRINSIC_WO_CHAIN: {
746 switch (N->getConstantOperandVal(Num: 0)) {
747 case Intrinsic::amdgcn_interp_p1:
748 case Intrinsic::amdgcn_interp_p2:
749 case Intrinsic::amdgcn_interp_mov:
750 case Intrinsic::amdgcn_interp_p1_f16:
751 case Intrinsic::amdgcn_interp_p2_f16:
752 return false;
753 default:
754 return true;
755 }
756 }
757 case ISD::SELECT:
758 return selectSupportsSourceMods(N);
759 default:
760 return true;
761 }
762}
763
764bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
765 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, so for
  // them a source modifier is truly free. If there are multiple users and each
  // one would have to switch to a VOP3 encoding to take the modifier, there
  // will be a net code size increase. Try to avoid increasing code size unless
  // we know it will save on the instruction count.
771 unsigned NumMayIncreaseSize = 0;
772 MVT VT = N->getValueType(ResNo: 0).getScalarType().getSimpleVT();
773
774 assert(!N->use_empty());
775
776 // XXX - Should this limit number of uses to check?
777 for (const SDNode *U : N->users()) {
778 if (!hasSourceMods(N: U))
779 return false;
780
781 if (!opMustUseVOP3Encoding(N: U, VT)) {
782 if (++NumMayIncreaseSize > CostThreshold)
783 return false;
784 }
785 }
786
787 return true;
788}
789
790EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
791 ISD::NodeType ExtendKind) const {
792 assert(!VT.isVector() && "only scalar expected");
793
794 // Round to the next multiple of 32-bits.
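  // For example, an i16 return value is extended to i32 and an i48 to i64.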
795 unsigned Size = VT.getSizeInBits();
796 if (Size <= 32)
797 return MVT::i32;
798 return EVT::getIntegerVT(Context, BitWidth: 32 * ((Size + 31) / 32));
799}
800
801unsigned AMDGPUTargetLowering::getVectorIdxWidth(const DataLayout &) const {
802 return 32;
803}
804
805bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
806 return true;
807}
808
// The backend supports 32- and 64-bit floating-point immediates, and 16-bit
// ones when the subtarget has 16-bit instructions.
// FIXME: Why are we reporting vectors of FP immediates as legal?
811bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
812 bool ForCodeSize) const {
813 EVT ScalarVT = VT.getScalarType();
814 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
815 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
816}
817
818// We don't want to shrink f64 / f32 constants.
819bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
820 EVT ScalarVT = VT.getScalarType();
821 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
822}
823
824bool AMDGPUTargetLowering::shouldReduceLoadWidth(
825 SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
826 std::optional<unsigned> ByteOffset) const {
827 // TODO: This may be worth removing. Check regression tests for diffs.
828 if (!TargetLoweringBase::shouldReduceLoadWidth(Load: N, ExtTy, NewVT, ByteOffset))
829 return false;
830
831 unsigned NewSize = NewVT.getStoreSizeInBits();
832
833 // If we are reducing to a 32-bit load or a smaller multi-dword load,
834 // this is always better.
835 if (NewSize >= 32)
836 return true;
837
838 EVT OldVT = N->getValueType(ResNo: 0);
839 unsigned OldSize = OldVT.getStoreSizeInBits();
840
841 MemSDNode *MN = cast<MemSDNode>(Val: N);
842 unsigned AS = MN->getAddressSpace();
843 // Do not shrink an aligned scalar load to sub-dword.
844 // Scalar engine cannot do sub-dword loads.
845 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
846 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
847 (AS == AMDGPUAS::CONSTANT_ADDRESS ||
848 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
849 (isa<LoadSDNode>(Val: N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
850 MN->isInvariant())) &&
851 AMDGPU::isUniformMMO(MMO: MN->getMemOperand()))
852 return false;
853
854 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
855 // extloads, so doing one requires using a buffer_load. In cases where we
856 // still couldn't use a scalar load, using the wider load shouldn't really
857 // hurt anything.
858
859 // If the old size already had to be an extload, there's no harm in continuing
860 // to reduce the width.
861 return (OldSize < 32);
862}
863
864bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
865 const SelectionDAG &DAG,
866 const MachineMemOperand &MMO) const {
867
868 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
869
870 if (LoadTy.getScalarType() == MVT::i32)
871 return false;
872
873 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
874 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
875
876 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
877 return false;
878
879 unsigned Fast = 0;
880 return allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
881 VT: CastTy, MMO, Fast: &Fast) &&
882 Fast;
883}
884
// SI+ has instructions for cttz / ctlz on 32-bit values. This is probably also
// profitable for 64-bit, even with the expansion, since it is generally good
// to speculate these operations.
888bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
889 return true;
890}
891
892bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
893 return true;
894}
895
896bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
897 switch (N->getOpcode()) {
898 case ISD::EntryToken:
899 case ISD::TokenFactor:
900 return true;
901 case ISD::INTRINSIC_WO_CHAIN: {
902 unsigned IntrID = N->getConstantOperandVal(Num: 0);
903 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
904 }
905 case ISD::INTRINSIC_W_CHAIN: {
906 unsigned IntrID = N->getConstantOperandVal(Num: 1);
907 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
908 }
909 case ISD::LOAD:
910 if (cast<LoadSDNode>(Val: N)->getMemOperand()->getAddrSpace() ==
911 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
912 return true;
913 return false;
914 case AMDGPUISD::SETCC: // ballot-style instruction
915 return true;
916 }
917 return false;
918}
919
920SDValue AMDGPUTargetLowering::getNegatedExpression(
921 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
922 NegatibleCost &Cost, unsigned Depth) const {
923
924 switch (Op.getOpcode()) {
925 case ISD::FMA:
926 case ISD::FMAD: {
927 // Negating a fma is not free if it has users without source mods.
928 if (!allUsesHaveSourceMods(N: Op.getNode()))
929 return SDValue();
930 break;
931 }
932 case AMDGPUISD::RCP: {
933 SDValue Src = Op.getOperand(i: 0);
934 EVT VT = Op.getValueType();
935 SDLoc SL(Op);
936
937 SDValue NegSrc = getNegatedExpression(Op: Src, DAG, LegalOperations,
938 ForCodeSize, Cost, Depth: Depth + 1);
939 if (NegSrc)
940 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: NegSrc, Flags: Op->getFlags());
941 return SDValue();
942 }
943 default:
944 break;
945 }
946
947 return TargetLowering::getNegatedExpression(Op, DAG, LegalOps: LegalOperations,
948 OptForSize: ForCodeSize, Cost, Depth);
949}
950
951//===---------------------------------------------------------------------===//
952// Target Properties
953//===---------------------------------------------------------------------===//
954
955bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
956 assert(VT.isFloatingPoint());
957
958 // Packed operations do not have a fabs modifier.
959 return VT == MVT::f32 || VT == MVT::f64 ||
960 (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
961}
962
963bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
964 assert(VT.isFloatingPoint());
965 // Report this based on the end legalized type.
966 VT = VT.getScalarType();
967 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
968}
969
bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
971 unsigned NumElem,
972 unsigned AS) const {
973 return true;
974}
975
976bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
977 // There are few operations which truly have vector input operands. Any vector
978 // operation is going to involve operations on each component, and a
979 // build_vector will be a copy per element, so it always makes sense to use a
980 // build_vector input in place of the extracted element to avoid a copy into a
981 // super register.
982 //
983 // We should probably only do this if all users are extracts only, but this
984 // should be the common case.
985 return true;
986}
987
988bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
989 // Truncate is just accessing a subregister.
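  // e.g. truncating i64 to i32 just reads the low 32-bit subregister of the
  // 64-bit register pair.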
990
991 unsigned SrcSize = Source.getSizeInBits();
992 unsigned DestSize = Dest.getSizeInBits();
993
  return DestSize < SrcSize && DestSize % 32 == 0;
995}
996
997bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
998 // Truncate is just accessing a subregister.
999
1000 unsigned SrcSize = Source->getScalarSizeInBits();
1001 unsigned DestSize = Dest->getScalarSizeInBits();
1002
  if (DestSize == 16 && Subtarget->has16BitInsts())
1004 return SrcSize >= 32;
1005
1006 return DestSize < SrcSize && DestSize % 32 == 0;
1007}
1008
1009bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
1010 unsigned SrcSize = Src->getScalarSizeInBits();
1011 unsigned DestSize = Dest->getScalarSizeInBits();
1012
1013 if (SrcSize == 16 && Subtarget->has16BitInsts())
1014 return DestSize >= 32;
1015
1016 return SrcSize == 32 && DestSize == 64;
1017}
1018
1019bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For
  // all practical purposes, the extra mov 0 to load a 64-bit value is free. As
  // used, this will enable reducing 64-bit operations to 32-bit ones, which is
  // always good.
1024
1025 if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;
1027
1028 return Src == MVT::i32 && Dest == MVT::i64;
1029}
1030
1031bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
1032 EVT DestVT) const {
1033 switch (N->getOpcode()) {
1034 case ISD::ADD:
1035 case ISD::SUB:
1036 case ISD::SHL:
1037 case ISD::SRL:
1038 case ISD::SRA:
1039 case ISD::AND:
1040 case ISD::OR:
1041 case ISD::XOR:
1042 case ISD::MUL:
1043 case ISD::SETCC:
1044 case ISD::SELECT:
1045 case ISD::SMIN:
1046 case ISD::SMAX:
1047 case ISD::UMIN:
1048 case ISD::UMAX:
1049 if (Subtarget->has16BitInsts() &&
1050 (!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) {
1051 // Don't narrow back down to i16 if promoted to i32 already.
1052 if (!N->isDivergent() && DestVT.isInteger() &&
1053 DestVT.getScalarSizeInBits() > 1 &&
1054 DestVT.getScalarSizeInBits() <= 16 &&
1055 SrcVT.getScalarSizeInBits() > 16) {
1056 return false;
1057 }
1058 }
1059 return true;
1060 default:
1061 break;
1062 }
1063
1064 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1065 // limited number of native 64-bit operations. Shrinking an operation to fit
1066 // in a single 32-bit register should always be helpful. As currently used,
1067 // this is much less general than the name suggests, and is only used in
1068 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1069 // not profitable, and may actually be harmful.
1070 if (isa<LoadSDNode>(Val: N))
1071 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1072
1073 return true;
1074}
1075
1076bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
1077 const SDNode* N, CombineLevel Level) const {
1078 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1079 N->getOpcode() == ISD::SRL) &&
1080 "Expected shift op");
1081
1082 SDValue ShiftLHS = N->getOperand(Num: 0);
1083 if (!ShiftLHS->hasOneUse())
1084 return false;
1085
1086 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1087 !ShiftLHS.getOperand(i: 0)->hasOneUse())
1088 return false;
1089
1090 // Always commute pre-type legalization and right shifts.
1091 // We're looking for shl(or(x,y),z) patterns.
1092 if (Level < CombineLevel::AfterLegalizeTypes ||
1093 N->getOpcode() != ISD::SHL || N->getOperand(Num: 0).getOpcode() != ISD::OR)
1094 return true;
1095
  // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
1097 if (N->getValueType(ResNo: 0) == MVT::i32 && N->hasOneUse() &&
1098 (N->user_begin()->getOpcode() == ISD::SRA ||
1099 N->user_begin()->getOpcode() == ISD::SRL))
1100 return false;
1101
1102 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1103 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1104 if (LHS.getOpcode() != ISD::SHL)
1105 return false;
1106 auto *RHSLd = dyn_cast<LoadSDNode>(Val&: RHS);
1107 auto *LHS0 = dyn_cast<LoadSDNode>(Val: LHS.getOperand(i: 0));
1108 auto *LHS1 = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
1109 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1110 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1111 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1112 };
1113 SDValue LHS = N->getOperand(Num: 0).getOperand(i: 0);
1114 SDValue RHS = N->getOperand(Num: 0).getOperand(i: 1);
1115 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1116}
1117
1118//===---------------------------------------------------------------------===//
1119// TargetLowering Callbacks
1120//===---------------------------------------------------------------------===//
1121
1122CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
1123 bool IsVarArg) {
1124 switch (CC) {
1125 case CallingConv::AMDGPU_VS:
1126 case CallingConv::AMDGPU_GS:
1127 case CallingConv::AMDGPU_PS:
1128 case CallingConv::AMDGPU_CS:
1129 case CallingConv::AMDGPU_HS:
1130 case CallingConv::AMDGPU_ES:
1131 case CallingConv::AMDGPU_LS:
1132 return CC_AMDGPU;
1133 case CallingConv::AMDGPU_CS_Chain:
1134 case CallingConv::AMDGPU_CS_ChainPreserve:
1135 return CC_AMDGPU_CS_CHAIN;
1136 case CallingConv::C:
1137 case CallingConv::Fast:
1138 case CallingConv::Cold:
1139 return CC_AMDGPU_Func;
1140 case CallingConv::AMDGPU_Gfx:
1141 return CC_SI_Gfx;
1142 case CallingConv::AMDGPU_KERNEL:
1143 case CallingConv::SPIR_KERNEL:
1144 default:
1145 reportFatalUsageError(reason: "unsupported calling convention for call");
1146 }
1147}
1148
1149CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
1150 bool IsVarArg) {
1151 switch (CC) {
1152 case CallingConv::AMDGPU_KERNEL:
1153 case CallingConv::SPIR_KERNEL:
1154 llvm_unreachable("kernels should not be handled here");
1155 case CallingConv::AMDGPU_VS:
1156 case CallingConv::AMDGPU_GS:
1157 case CallingConv::AMDGPU_PS:
1158 case CallingConv::AMDGPU_CS:
1159 case CallingConv::AMDGPU_CS_Chain:
1160 case CallingConv::AMDGPU_CS_ChainPreserve:
1161 case CallingConv::AMDGPU_HS:
1162 case CallingConv::AMDGPU_ES:
1163 case CallingConv::AMDGPU_LS:
1164 return RetCC_SI_Shader;
1165 case CallingConv::AMDGPU_Gfx:
1166 return RetCC_SI_Gfx;
1167 case CallingConv::C:
1168 case CallingConv::Fast:
1169 case CallingConv::Cold:
1170 return RetCC_AMDGPU_Func;
1171 default:
1172 reportFatalUsageError(reason: "unsupported calling convention");
1173 }
1174}
1175
/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original type sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments().
1182
1183/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1184/// input values across multiple registers. Each item in the Ins array
1185/// represents a single value that will be stored in registers. Ins[x].VT is
1186/// the value type of the value that will be stored in the register, so
1187/// whatever SDNode we lower the argument to needs to be this type.
1188///
1189/// In order to correctly lower the arguments we need to know the size of each
1190/// argument. Since Ins[x].VT gives us the size of the register that will
1191/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1192/// for the original function argument so that we can deduce the correct memory
1193/// type to use for Ins[x]. In most cases the correct memory type will be
1194/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1195/// we have a kernel argument of type v8i8, this argument will be split into
1196/// 8 parts and each part will be represented by its own item in the Ins array.
1197/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1198/// the argument before it was split. From this, we deduce that the memory type
1199/// for each individual part is i8. We pass the memory type as LocVT to the
1200/// calling convention analysis function and the register type (Ins[x].VT) as
1201/// the ValVT.
1202void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
1203 CCState &State,
1204 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1205 const MachineFunction &MF = State.getMachineFunction();
1206 const Function &Fn = MF.getFunction();
1207 LLVMContext &Ctx = Fn.getParent()->getContext();
1208 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1209 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1210 CallingConv::ID CC = Fn.getCallingConv();
1211
1212 Align MaxAlign = Align(1);
1213 uint64_t ExplicitArgOffset = 0;
1214 const DataLayout &DL = Fn.getDataLayout();
1215
1216 unsigned InIndex = 0;
1217
1218 for (const Argument &Arg : Fn.args()) {
1219 const bool IsByRef = Arg.hasByRefAttr();
1220 Type *BaseArgTy = Arg.getType();
1221 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1222 Align Alignment = DL.getValueOrABITypeAlignment(
1223 Alignment: IsByRef ? Arg.getParamAlign() : std::nullopt, Ty: MemArgTy);
1224 MaxAlign = std::max(a: Alignment, b: MaxAlign);
1225 uint64_t AllocSize = DL.getTypeAllocSize(Ty: MemArgTy);
1226
1227 uint64_t ArgOffset = alignTo(Size: ExplicitArgOffset, A: Alignment) + ExplicitOffset;
1228 ExplicitArgOffset = alignTo(Size: ExplicitArgOffset, A: Alignment) + AllocSize;
1229
1230 // We're basically throwing away everything passed into us and starting over
1231 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1232 // to us as computed in Ins.
1233 //
1234 // We also need to figure out what type legalization is trying to do to get
1235 // the correct memory offsets.
1236
1237 SmallVector<EVT, 16> ValueVTs;
1238 SmallVector<uint64_t, 16> Offsets;
1239 ComputeValueVTs(TLI: *this, DL, Ty: BaseArgTy, ValueVTs, FixedOffsets: &Offsets, StartingOffset: ArgOffset);
1240
1241 for (unsigned Value = 0, NumValues = ValueVTs.size();
1242 Value != NumValues; ++Value) {
1243 uint64_t BasePartOffset = Offsets[Value];
1244
1245 EVT ArgVT = ValueVTs[Value];
1246 EVT MemVT = ArgVT;
1247 MVT RegisterVT = getRegisterTypeForCallingConv(Context&: Ctx, CC, VT: ArgVT);
1248 unsigned NumRegs = getNumRegistersForCallingConv(Context&: Ctx, CC, VT: ArgVT);
1249
1250 if (NumRegs == 1) {
1251 // This argument is not split, so the IR type is the memory type.
1252 if (ArgVT.isExtended()) {
1253 // We have an extended type, like i24, so we should just use the
1254 // register type.
1255 MemVT = RegisterVT;
1256 } else {
1257 MemVT = ArgVT;
1258 }
1259 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1260 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1261 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1262 // We have a vector value which has been split into a vector with
1263 // the same scalar type, but fewer elements. This should handle
1264 // all the floating-point vector types.
1265 MemVT = RegisterVT;
1266 } else if (ArgVT.isVector() &&
1267 ArgVT.getVectorNumElements() == NumRegs) {
1268 // This arg has been split so that each element is stored in a separate
1269 // register.
1270 MemVT = ArgVT.getScalarType();
1271 } else if (ArgVT.isExtended()) {
1272 // We have an extended type, like i65.
1273 MemVT = RegisterVT;
1274 } else {
1275 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1276 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1277 if (RegisterVT.isInteger()) {
1278 MemVT = EVT::getIntegerVT(Context&: State.getContext(), BitWidth: MemoryBits);
1279 } else if (RegisterVT.isVector()) {
1280 assert(!RegisterVT.getScalarType().isFloatingPoint());
1281 unsigned NumElements = RegisterVT.getVectorNumElements();
1282 assert(MemoryBits % NumElements == 0);
1283 // This vector type has been split into another vector type with
1284 // a different elements size.
1285 EVT ScalarVT = EVT::getIntegerVT(Context&: State.getContext(),
1286 BitWidth: MemoryBits / NumElements);
1287 MemVT = EVT::getVectorVT(Context&: State.getContext(), VT: ScalarVT, NumElements);
1288 } else {
1289 llvm_unreachable("cannot deduce memory type.");
1290 }
1291 }
1292
1293 // Convert one element vectors to scalar.
1294 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1295 MemVT = MemVT.getScalarType();
1296
1297 // Round up vec3/vec5 argument.
1298 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1299 MemVT = MemVT.getPow2VectorType(Context&: State.getContext());
1300 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1301 MemVT = MemVT.getRoundIntegerType(Context&: State.getContext());
1302 }
1303
1304 unsigned PartOffset = 0;
1305 for (unsigned i = 0; i != NumRegs; ++i) {
1306 State.addLoc(V: CCValAssign::getCustomMem(ValNo: InIndex++, ValVT: RegisterVT,
1307 Offset: BasePartOffset + PartOffset,
1308 LocVT: MemVT.getSimpleVT(),
1309 HTP: CCValAssign::Full));
1310 PartOffset += MemVT.getStoreSize();
1311 }
1312 }
1313 }
1314}
1315
1316SDValue AMDGPUTargetLowering::LowerReturn(
1317 SDValue Chain, CallingConv::ID CallConv,
1318 bool isVarArg,
1319 const SmallVectorImpl<ISD::OutputArg> &Outs,
1320 const SmallVectorImpl<SDValue> &OutVals,
1321 const SDLoc &DL, SelectionDAG &DAG) const {
1322 // FIXME: Fails for r600 tests
1323 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1324 // "wave terminate should not have return values");
1325 return DAG.getNode(Opcode: AMDGPUISD::ENDPGM, DL, VT: MVT::Other, Operand: Chain);
1326}
1327
1328//===---------------------------------------------------------------------===//
1329// Target specific lowering
1330//===---------------------------------------------------------------------===//
1331
1332/// Selects the correct CCAssignFn for a given CallingConvention value.
1333CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1334 bool IsVarArg) {
1335 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1336}
1337
1338CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1339 bool IsVarArg) {
1340 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1341}
1342
1343SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1344 SelectionDAG &DAG,
1345 MachineFrameInfo &MFI,
1346 int ClobberedFI) const {
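// Collect the chains of loads already emitted from stack slots that overlap
// the slot about to be clobbered (ClobberedFI), so that a store to that slot
// cannot be reordered ahead of those loads.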
1347 SmallVector<SDValue, 8> ArgChains;
1348 int64_t FirstByte = MFI.getObjectOffset(ObjectIdx: ClobberedFI);
1349 int64_t LastByte = FirstByte + MFI.getObjectSize(ObjectIdx: ClobberedFI) - 1;
1350
1351 // Include the original chain at the beginning of the list. When this is
1352 // used by target LowerCall hooks, this helps legalize find the
1353 // CALLSEQ_BEGIN node.
1354 ArgChains.push_back(Elt: Chain);
1355
// Add a chain value for each load of an incoming stack argument that
// overlaps the clobbered frame object.
1357 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1358 if (LoadSDNode *L = dyn_cast<LoadSDNode>(Val: U)) {
1359 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: L->getBasePtr())) {
1360 if (FI->getIndex() < 0) {
1361 int64_t InFirstByte = MFI.getObjectOffset(ObjectIdx: FI->getIndex());
1362 int64_t InLastByte = InFirstByte;
1363 InLastByte += MFI.getObjectSize(ObjectIdx: FI->getIndex()) - 1;
1364
1365 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1366 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1367 ArgChains.push_back(Elt: SDValue(L, 1));
1368 }
1369 }
1370 }
1371 }
1372
1373 // Build a tokenfactor for all the chains.
1374 return DAG.getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ArgChains);
1375}
1376
1377SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1378 SmallVectorImpl<SDValue> &InVals,
1379 StringRef Reason) const {
1380 SDValue Callee = CLI.Callee;
1381 SelectionDAG &DAG = CLI.DAG;
1382
1383 const Function &Fn = DAG.getMachineFunction().getFunction();
1384
1385 StringRef FuncName("<unknown>");
1386
1387 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Val&: Callee))
1388 FuncName = G->getSymbol();
1389 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee))
1390 FuncName = G->getGlobal()->getName();
1391
1392 DAG.getContext()->diagnose(
1393 DI: DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1394
1395 if (!CLI.IsTailCall) {
1396 for (ISD::InputArg &Arg : CLI.Ins)
1397 InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
1398 }
1399
1400 return DAG.getEntryNode();
1401}
1402
1403SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1404 SmallVectorImpl<SDValue> &InVals) const {
1405 return lowerUnhandledCall(CLI, InVals, Reason: "unsupported call to function ");
1406}
1407
1408SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1409 SelectionDAG &DAG) const {
1410 const Function &Fn = DAG.getMachineFunction().getFunction();
1411
1412 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
1413 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
1414 auto Ops = {DAG.getConstant(Val: 0, DL: SDLoc(), VT: Op.getValueType()), Op.getOperand(i: 0)};
1415 return DAG.getMergeValues(Ops, dl: SDLoc());
1416}
1417
1418SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1419 SelectionDAG &DAG) const {
1420 switch (Op.getOpcode()) {
1421 default:
1422 Op->print(OS&: errs(), G: &DAG);
1423 llvm_unreachable("Custom lowering code for this "
1424 "instruction is not implemented yet!");
1425 break;
1426 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1427 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1428 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1429 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1430 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1431 case ISD::FREM: return LowerFREM(Op, DAG);
1432 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1433 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1434 case ISD::FRINT: return LowerFRINT(Op, DAG);
1435 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1436 case ISD::FROUNDEVEN:
1437 return LowerFROUNDEVEN(Op, DAG);
1438 case ISD::FROUND: return LowerFROUND(Op, DAG);
1439 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1440 case ISD::FLOG2:
1441 return LowerFLOG2(Op, DAG);
1442 case ISD::FLOG:
1443 case ISD::FLOG10:
1444 return LowerFLOGCommon(Op, DAG);
1445 case ISD::FEXP:
1446 case ISD::FEXP10:
1447 return lowerFEXP(Op, DAG);
1448 case ISD::FEXP2:
1449 return lowerFEXP2(Op, DAG);
1450 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1451 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1452 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1453 case ISD::FP_TO_SINT:
1454 case ISD::FP_TO_UINT:
1455 return LowerFP_TO_INT(Op, DAG);
1456 case ISD::CTTZ:
1457 case ISD::CTTZ_ZERO_UNDEF:
1458 case ISD::CTLZ:
1459 case ISD::CTLZ_ZERO_UNDEF:
1460 return LowerCTLZ_CTTZ(Op, DAG);
1461 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1462 }
1463 return Op;
1464}
1465
1466void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1467 SmallVectorImpl<SDValue> &Results,
1468 SelectionDAG &DAG) const {
1469 switch (N->getOpcode()) {
1470 case ISD::SIGN_EXTEND_INREG:
1471 // Different parts of legalization seem to interpret which type of
1472 // sign_extend_inreg is the one to check for custom lowering. The extended
1473 // from type is what really matters, but some places check for custom
1474 // lowering of the result type. This results in trying to use
1475 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1476 // nothing here and let the illegal result integer be handled normally.
1477 return;
1478 case ISD::FLOG2:
1479 if (SDValue Lowered = LowerFLOG2(Op: SDValue(N, 0), DAG))
1480 Results.push_back(Elt: Lowered);
1481 return;
1482 case ISD::FLOG:
1483 case ISD::FLOG10:
1484 if (SDValue Lowered = LowerFLOGCommon(Op: SDValue(N, 0), DAG))
1485 Results.push_back(Elt: Lowered);
1486 return;
1487 case ISD::FEXP2:
1488 if (SDValue Lowered = lowerFEXP2(Op: SDValue(N, 0), DAG))
1489 Results.push_back(Elt: Lowered);
1490 return;
1491 case ISD::FEXP:
1492 case ISD::FEXP10:
1493 if (SDValue Lowered = lowerFEXP(Op: SDValue(N, 0), DAG))
1494 Results.push_back(Elt: Lowered);
1495 return;
1496 case ISD::CTLZ:
1497 case ISD::CTLZ_ZERO_UNDEF:
1498 if (auto Lowered = lowerCTLZResults(Op: SDValue(N, 0u), DAG))
1499 Results.push_back(Elt: Lowered);
1500 return;
1501 default:
1502 return;
1503 }
1504}
1505
1506SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1507 SDValue Op,
1508 SelectionDAG &DAG) const {
1509
1510 const DataLayout &DL = DAG.getDataLayout();
1511 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Val&: Op);
1512 const GlobalValue *GV = G->getGlobal();
1513
1514 if (!MFI->isModuleEntryFunction()) {
1515 if (std::optional<uint32_t> Address =
1516 AMDGPUMachineFunction::getLDSAbsoluteAddress(GV: *GV)) {
1517 return DAG.getConstant(Val: *Address, DL: SDLoc(Op), VT: Op.getValueType());
1518 }
1519 }
1520
1521 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1522 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1523 if (!MFI->isModuleEntryFunction() &&
1524 GV->getName() != "llvm.amdgcn.module.lds" &&
1525 !AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV))) {
1526 SDLoc DL(Op);
1527 const Function &Fn = DAG.getMachineFunction().getFunction();
1528 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
1529 Fn, "local memory global used by non-kernel function",
1530 DL.getDebugLoc(), DS_Warning));
1531
1532 // We currently don't have a way to correctly allocate LDS objects that
1533 // aren't directly associated with a kernel. We do force inlining of
1534 // functions that use local objects. However, if these dead functions are
1535 // not eliminated, we don't want a compile time error. Just emit a warning
1536 // and a trap, since there should be no callable path here.
1537 SDValue Trap = DAG.getNode(Opcode: ISD::TRAP, DL, VT: MVT::Other, Operand: DAG.getEntryNode());
1538 SDValue OutputChain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other,
1539 N1: Trap, N2: DAG.getRoot());
1540 DAG.setRoot(OutputChain);
1541 return DAG.getPOISON(VT: Op.getValueType());
1542 }
1543
1544 // XXX: What does the value of G->getOffset() mean?
1545 assert(G->getOffset() == 0 &&
"Do not know what to do with a non-zero offset");
1547
1548 // TODO: We could emit code to handle the initialization somewhere.
1549 // We ignore the initializer for now and legalize it to allow selection.
// The initializer will be diagnosed as an error during assembly emission
// anyway.
1551 unsigned Offset = MFI->allocateLDSGlobal(DL, GV: *cast<GlobalVariable>(Val: GV));
1552 return DAG.getConstant(Val: Offset, DL: SDLoc(Op), VT: Op.getValueType());
1553 }
1554 return SDValue();
1555}
1556
1557SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1558 SelectionDAG &DAG) const {
1559 SmallVector<SDValue, 8> Args;
1560 SDLoc SL(Op);
1561
1562 EVT VT = Op.getValueType();
1563 if (VT.getVectorElementType().getSizeInBits() < 32) {
1564 unsigned OpBitSize = Op.getOperand(i: 0).getValueType().getSizeInBits();
1565 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
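// Bitcast each operand to i32 (or a vector of i32), gather those 32-bit
// pieces, and bitcast the combined build_vector back to VT, so the
// concatenation is done with 32-bit elements.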
1566 unsigned NewNumElt = OpBitSize / 32;
1567 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1568 : EVT::getVectorVT(Context&: *DAG.getContext(),
1569 VT: MVT::i32, NumElements: NewNumElt);
1570 for (const SDUse &U : Op->ops()) {
1571 SDValue In = U.get();
1572 SDValue NewIn = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewEltVT, Operand: In);
1573 if (NewNumElt > 1)
1574 DAG.ExtractVectorElements(Op: NewIn, Args);
1575 else
1576 Args.push_back(Elt: NewIn);
1577 }
1578
1579 EVT NewVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
1580 NumElements: NewNumElt * Op.getNumOperands());
1581 SDValue BV = DAG.getBuildVector(VT: NewVT, DL: SL, Ops: Args);
1582 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: BV);
1583 }
1584 }
1585
1586 for (const SDUse &U : Op->ops())
1587 DAG.ExtractVectorElements(Op: U.get(), Args);
1588
1589 return DAG.getBuildVector(VT: Op.getValueType(), DL: SL, Ops: Args);
1590}
1591
1592SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1593 SelectionDAG &DAG) const {
1594 SDLoc SL(Op);
1595 SmallVector<SDValue, 8> Args;
1596 unsigned Start = Op.getConstantOperandVal(i: 1);
1597 EVT VT = Op.getValueType();
1598 EVT SrcVT = Op.getOperand(i: 0).getValueType();
1599
1600 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1601 unsigned NumElt = VT.getVectorNumElements();
1602 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1603 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1604
1605 // Extract 32-bit registers at a time.
1606 EVT NewSrcVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: NumSrcElt / 2);
1607 EVT NewVT = NumElt == 2
1608 ? MVT::i32
1609 : EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: NumElt / 2);
1610 SDValue Tmp = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewSrcVT, Operand: Op.getOperand(i: 0));
1611
1612 DAG.ExtractVectorElements(Op: Tmp, Args, Start: Start / 2, Count: NumElt / 2);
1613 if (NumElt == 2)
1614 Tmp = Args[0];
1615 else
1616 Tmp = DAG.getBuildVector(VT: NewVT, DL: SL, Ops: Args);
1617
1618 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Tmp);
1619 }
1620
1621 DAG.ExtractVectorElements(Op: Op.getOperand(i: 0), Args, Start,
1622 Count: VT.getVectorNumElements());
1623
1624 return DAG.getBuildVector(VT: Op.getValueType(), DL: SL, Ops: Args);
1625}
1626
1627// TODO: Handle fabs too
1628static SDValue peekFNeg(SDValue Val) {
1629 if (Val.getOpcode() == ISD::FNEG)
1630 return Val.getOperand(i: 0);
1631
1632 return Val;
1633}
1634
1635static SDValue peekFPSignOps(SDValue Val) {
1636 if (Val.getOpcode() == ISD::FNEG)
1637 Val = Val.getOperand(i: 0);
1638 if (Val.getOpcode() == ISD::FABS)
1639 Val = Val.getOperand(i: 0);
1640 if (Val.getOpcode() == ISD::FCOPYSIGN)
1641 Val = Val.getOperand(i: 0);
1642 return Val;
1643}
1644
1645SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
1646 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1647 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1648 SelectionDAG &DAG = DCI.DAG;
1649 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val&: CC)->get();
1650 switch (CCOpcode) {
1651 case ISD::SETOEQ:
1652 case ISD::SETONE:
1653 case ISD::SETUNE:
1654 case ISD::SETNE:
1655 case ISD::SETUEQ:
1656 case ISD::SETEQ:
1657 case ISD::SETFALSE:
1658 case ISD::SETFALSE2:
1659 case ISD::SETTRUE:
1660 case ISD::SETTRUE2:
1661 case ISD::SETUO:
1662 case ISD::SETO:
1663 break;
1664 case ISD::SETULE:
1665 case ISD::SETULT: {
1666 if (LHS == True)
1667 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: RHS, N2: LHS);
1668 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: LHS, N2: RHS);
1669 }
1670 case ISD::SETOLE:
1671 case ISD::SETOLT:
1672 case ISD::SETLE:
1673 case ISD::SETLT: {
1674 // Ordered. Assume ordered for undefined.
1675
1676 // Only do this after legalization to avoid interfering with other combines
1677 // which might occur.
1678 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1679 !DCI.isCalledByLegalizer())
1680 return SDValue();
1681
// We need to permute the operands to get the correct NaN behavior: the legacy
// min/max ops return their second operand when the compare fails on a NaN, so
// pick the operand order to match the compare the hardware performs.
1685 if (LHS == True)
1686 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: LHS, N2: RHS);
1687 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: RHS, N2: LHS);
1688 }
1689 case ISD::SETUGE:
1690 case ISD::SETUGT: {
1691 if (LHS == True)
1692 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: RHS, N2: LHS);
1693 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: LHS, N2: RHS);
1694 }
1695 case ISD::SETGT:
1696 case ISD::SETGE:
1697 case ISD::SETOGE:
1698 case ISD::SETOGT: {
1699 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1700 !DCI.isCalledByLegalizer())
1701 return SDValue();
1702
1703 if (LHS == True)
1704 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: LHS, N2: RHS);
1705 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: RHS, N2: LHS);
1706 }
1707 case ISD::SETCC_INVALID:
1708 llvm_unreachable("Invalid setcc condcode!");
1709 }
1710 return SDValue();
1711}
1712
1713/// Generate Min/Max node
1714SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1715 SDValue LHS, SDValue RHS,
1716 SDValue True, SDValue False,
1717 SDValue CC,
1718 DAGCombinerInfo &DCI) const {
1719 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1720 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1721
1722 SelectionDAG &DAG = DCI.DAG;
1723
1724 // If we can't directly match this, try to see if we can fold an fneg to
1725 // match.
1726
1727 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
1728 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(Val&: False);
1729 SDValue NegTrue = peekFNeg(Val: True);
1730
1731 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1732 // fmin/fmax.
1733 //
1734 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1735 // -> fneg (fmin_legacy lhs, K)
1736 //
1737 // TODO: Use getNegatedExpression
1738 if (LHS == NegTrue && CFalse && CRHS) {
1739 APFloat NegRHS = neg(X: CRHS->getValueAPF());
1740 if (NegRHS == CFalse->getValueAPF()) {
1741 SDValue Combined =
1742 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True: NegTrue, False, CC, DCI);
1743 if (Combined)
1744 return DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: Combined);
1745 return SDValue();
1746 }
1747 }
1748
1749 return SDValue();
1750}
1751
1752std::pair<SDValue, SDValue>
1753AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1754 SDLoc SL(Op);
1755
1756 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1757
1758 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
1759 const SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
1760
1761 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: Zero);
1762 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: One);
1763
1764 return std::pair(Lo, Hi);
1765}
1766
1767SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1768 SDLoc SL(Op);
1769
1770 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1771 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
1772 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: Zero);
1773}
1774
1775SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1776 SDLoc SL(Op);
1777
1778 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1779 const SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
1780 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: One);
1781}
1782
// Split a vector type into two parts. The first part has a power-of-two
// number of elements.
1784// The second part is whatever is left over, and is a scalar if it would
1785// otherwise be a 1-vector.
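// For example, a 3-element vector splits into <2 x T> plus a T scalar, a
// 5-element vector into <4 x T> plus a T scalar, and a 6-element vector into
// <4 x T> plus <2 x T>.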
1786std::pair<EVT, EVT>
1787AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1788 EVT LoVT, HiVT;
1789 EVT EltVT = VT.getVectorElementType();
1790 unsigned NumElts = VT.getVectorNumElements();
1791 unsigned LoNumElts = PowerOf2Ceil(A: (NumElts + 1) / 2);
1792 LoVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: LoNumElts);
1793 HiVT = NumElts - LoNumElts == 1
1794 ? EltVT
1795 : EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NumElts - LoNumElts);
1796 return std::pair(LoVT, HiVT);
1797}
1798
1799// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1800// scalar.
1801std::pair<SDValue, SDValue>
1802AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1803 const EVT &LoVT, const EVT &HiVT,
1804 SelectionDAG &DAG) const {
1805 assert(LoVT.getVectorNumElements() +
1806 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1807 N.getValueType().getVectorNumElements() &&
1808 "More vector elements requested than available!");
1809 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: LoVT, N1: N,
1810 N2: DAG.getVectorIdxConstant(Val: 0, DL));
1811 SDValue Hi = DAG.getNode(
1812 Opcode: HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1813 VT: HiVT, N1: N, N2: DAG.getVectorIdxConstant(Val: LoVT.getVectorNumElements(), DL));
1814 return std::pair(Lo, Hi);
1815}
1816
1817SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1818 SelectionDAG &DAG) const {
1819 LoadSDNode *Load = cast<LoadSDNode>(Val: Op);
1820 EVT VT = Op.getValueType();
1821 SDLoc SL(Op);
1822
1823
1824 // If this is a 2 element vector, we really want to scalarize and not create
1825 // weird 1 element vectors.
1826 if (VT.getVectorNumElements() == 2) {
1827 SDValue Ops[2];
1828 std::tie(args&: Ops[0], args&: Ops[1]) = scalarizeVectorLoad(LD: Load, DAG);
1829 return DAG.getMergeValues(Ops, dl: SL);
1830 }
1831
1832 SDValue BasePtr = Load->getBasePtr();
1833 EVT MemVT = Load->getMemoryVT();
1834
1835 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1836
1837 EVT LoVT, HiVT;
1838 EVT LoMemVT, HiMemVT;
1839 SDValue Lo, Hi;
1840
1841 std::tie(args&: LoVT, args&: HiVT) = getSplitDestVTs(VT, DAG);
1842 std::tie(args&: LoMemVT, args&: HiMemVT) = getSplitDestVTs(VT: MemVT, DAG);
1843 std::tie(args&: Lo, args&: Hi) = splitVector(N: Op, DL: SL, LoVT, HiVT, DAG);
1844
1845 unsigned Size = LoMemVT.getStoreSize();
1846 Align BaseAlign = Load->getAlign();
1847 Align HiAlign = commonAlignment(A: BaseAlign, Offset: Size);
1848
1849 SDValue LoLoad = DAG.getExtLoad(ExtType: Load->getExtensionType(), dl: SL, VT: LoVT,
1850 Chain: Load->getChain(), Ptr: BasePtr, PtrInfo: SrcValue, MemVT: LoMemVT,
1851 Alignment: BaseAlign, MMOFlags: Load->getMemOperand()->getFlags());
1852 SDValue HiPtr = DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Size));
1853 SDValue HiLoad =
1854 DAG.getExtLoad(ExtType: Load->getExtensionType(), dl: SL, VT: HiVT, Chain: Load->getChain(),
1855 Ptr: HiPtr, PtrInfo: SrcValue.getWithOffset(O: LoMemVT.getStoreSize()),
1856 MemVT: HiMemVT, Alignment: HiAlign, MMOFlags: Load->getMemOperand()->getFlags());
1857
1858 SDValue Join;
1859 if (LoVT == HiVT) {
// This is the case where the vector element count is a power of two, so it
// was split evenly.
1861 Join = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT, N1: LoLoad, N2: HiLoad);
1862 } else {
1863 Join = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: SL, VT, N1: DAG.getPOISON(VT), N2: LoLoad,
1864 N3: DAG.getVectorIdxConstant(Val: 0, DL: SL));
1865 Join = DAG.getNode(
1866 Opcode: HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, DL: SL,
1867 VT, N1: Join, N2: HiLoad,
1868 N3: DAG.getVectorIdxConstant(Val: LoVT.getVectorNumElements(), DL: SL));
1869 }
1870
1871 SDValue Ops[] = {Join, DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other,
1872 N1: LoLoad.getValue(R: 1), N2: HiLoad.getValue(R: 1))};
1873
1874 return DAG.getMergeValues(Ops, dl: SL);
1875}
1876
1877SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1878 SelectionDAG &DAG) const {
1879 LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
1880 EVT VT = Op.getValueType();
1881 SDValue BasePtr = Load->getBasePtr();
1882 EVT MemVT = Load->getMemoryVT();
1883 SDLoc SL(Op);
1884 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1885 Align BaseAlign = Load->getAlign();
1886 unsigned NumElements = MemVT.getVectorNumElements();
1887
1888 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1889 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1890 if (NumElements != 3 ||
1891 (BaseAlign < Align(8) &&
1892 !SrcValue.isDereferenceable(Size: 16, C&: *DAG.getContext(), DL: DAG.getDataLayout())))
1893 return SplitVectorLoad(Op, DAG);
1894
1895 assert(NumElements == 3);
1896
1897 EVT WideVT =
1898 EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: 4);
1899 EVT WideMemVT =
1900 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(), NumElements: 4);
1901 SDValue WideLoad = DAG.getExtLoad(
1902 ExtType: Load->getExtensionType(), dl: SL, VT: WideVT, Chain: Load->getChain(), Ptr: BasePtr, PtrInfo: SrcValue,
1903 MemVT: WideMemVT, Alignment: BaseAlign, MMOFlags: Load->getMemOperand()->getFlags());
1904 return DAG.getMergeValues(
1905 Ops: {DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT, N1: WideLoad,
1906 N2: DAG.getVectorIdxConstant(Val: 0, DL: SL)),
1907 WideLoad.getValue(R: 1)},
1908 dl: SL);
1909}
1910
1911SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1912 SelectionDAG &DAG) const {
1913 StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
1914 SDValue Val = Store->getValue();
1915 EVT VT = Val.getValueType();
1916
1917 // If this is a 2 element vector, we really want to scalarize and not create
1918 // weird 1 element vectors.
1919 if (VT.getVectorNumElements() == 2)
1920 return scalarizeVectorStore(ST: Store, DAG);
1921
1922 EVT MemVT = Store->getMemoryVT();
1923 SDValue Chain = Store->getChain();
1924 SDValue BasePtr = Store->getBasePtr();
1925 SDLoc SL(Op);
1926
1927 EVT LoVT, HiVT;
1928 EVT LoMemVT, HiMemVT;
1929 SDValue Lo, Hi;
1930
1931 std::tie(args&: LoVT, args&: HiVT) = getSplitDestVTs(VT, DAG);
1932 std::tie(args&: LoMemVT, args&: HiMemVT) = getSplitDestVTs(VT: MemVT, DAG);
1933 std::tie(args&: Lo, args&: Hi) = splitVector(N: Val, DL: SL, LoVT, HiVT, DAG);
1934
1935 SDValue HiPtr = DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: LoMemVT.getStoreSize());
1936
1937 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1938 Align BaseAlign = Store->getAlign();
1939 unsigned Size = LoMemVT.getStoreSize();
1940 Align HiAlign = commonAlignment(A: BaseAlign, Offset: Size);
1941
1942 SDValue LoStore =
1943 DAG.getTruncStore(Chain, dl: SL, Val: Lo, Ptr: BasePtr, PtrInfo: SrcValue, SVT: LoMemVT, Alignment: BaseAlign,
1944 MMOFlags: Store->getMemOperand()->getFlags());
1945 SDValue HiStore =
1946 DAG.getTruncStore(Chain, dl: SL, Val: Hi, Ptr: HiPtr, PtrInfo: SrcValue.getWithOffset(O: Size),
1947 SVT: HiMemVT, Alignment: HiAlign, MMOFlags: Store->getMemOperand()->getFlags());
1948
1949 return DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: LoStore, N2: HiStore);
1950}
1951
1952// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The 24-bit significand
// of an f32 is enough to represent integers of up to 24 bits exactly.
1955SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1956 bool Sign) const {
1957 SDLoc DL(Op);
1958 EVT VT = Op.getValueType();
1959 SDValue LHS = Op.getOperand(i: 0);
1960 SDValue RHS = Op.getOperand(i: 1);
1961 MVT IntVT = MVT::i32;
1962 MVT FltVT = MVT::f32;
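// Require at least 9 sign bits on each operand so that, for an i32 divide, at
// most 24 bits (including the sign) are significant, which the 24-bit f32
// significand can represent exactly.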
1963
1964 unsigned LHSSignBits = DAG.ComputeNumSignBits(Op: LHS);
1965 if (LHSSignBits < 9)
1966 return SDValue();
1967
1968 unsigned RHSSignBits = DAG.ComputeNumSignBits(Op: RHS);
1969 if (RHSSignBits < 9)
1970 return SDValue();
1971
1972 unsigned BitSize = VT.getSizeInBits();
1973 unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
1974 unsigned DivBits = BitSize - SignBits;
1975 if (Sign)
1976 ++DivBits;
1977
1978 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1979 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1980
1981 SDValue jq = DAG.getConstant(Val: 1, DL, VT: IntVT);
1982
1983 if (Sign) {
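// jq evaluates to +1 when the operand signs match and to -1 when they differ;
// it is the one-step quotient correction conditionally applied further below.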
1984 // char|short jq = ia ^ ib;
1985 jq = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHS, N2: RHS);
1986
1987 // jq = jq >> (bitsize - 2)
1988 jq = DAG.getNode(Opcode: ISD::SRA, DL, VT, N1: jq,
1989 N2: DAG.getConstant(Val: BitSize - 2, DL, VT));
1990
1991 // jq = jq | 0x1
1992 jq = DAG.getNode(Opcode: ISD::OR, DL, VT, N1: jq, N2: DAG.getConstant(Val: 1, DL, VT));
1993 }
1994
1995 // int ia = (int)LHS;
1996 SDValue ia = LHS;
1997
// int ib = (int)RHS;
1999 SDValue ib = RHS;
2000
2001 // float fa = (float)ia;
2002 SDValue fa = DAG.getNode(Opcode: ToFp, DL, VT: FltVT, Operand: ia);
2003
2004 // float fb = (float)ib;
2005 SDValue fb = DAG.getNode(Opcode: ToFp, DL, VT: FltVT, Operand: ib);
2006
2007 SDValue fq = DAG.getNode(Opcode: ISD::FMUL, DL, VT: FltVT,
2008 N1: fa, N2: DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT: FltVT, Operand: fb));
2009
2010 // fq = trunc(fq);
2011 fq = DAG.getNode(Opcode: ISD::FTRUNC, DL, VT: FltVT, Operand: fq);
2012
2013 // float fqneg = -fq;
2014 SDValue fqneg = DAG.getNode(Opcode: ISD::FNEG, DL, VT: FltVT, Operand: fq);
2015
2016 MachineFunction &MF = DAG.getMachineFunction();
2017
2018 bool UseFmadFtz = false;
2019 if (Subtarget->isGCN()) {
2020 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2021 UseFmadFtz =
2022 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
2023 }
2024
2025 // float fr = mad(fqneg, fb, fa);
2026 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2027 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2028 : (unsigned)ISD::FMAD;
2029 SDValue fr = DAG.getNode(Opcode: OpCode, DL, VT: FltVT, N1: fqneg, N2: fb, N3: fa);
2030
2031 // int iq = (int)fq;
2032 SDValue iq = DAG.getNode(Opcode: ToInt, DL, VT: IntVT, Operand: fq);
2033
2034 // fr = fabs(fr);
2035 fr = DAG.getNode(Opcode: ISD::FABS, DL, VT: FltVT, Operand: fr);
2036
2037 // fb = fabs(fb);
2038 fb = DAG.getNode(Opcode: ISD::FABS, DL, VT: FltVT, Operand: fb);
2039
2040 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2041
2042 // int cv = fr >= fb;
2043 SDValue cv = DAG.getSetCC(DL, VT: SetCCVT, LHS: fr, RHS: fb, Cond: ISD::SETOGE);
2044
2045 // jq = (cv ? jq : 0);
2046 jq = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: cv, N2: jq, N3: DAG.getConstant(Val: 0, DL, VT));
2047
2048 // dst = iq + jq;
2049 SDValue Div = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: iq, N2: jq);
2050
// Rem needs compensation; it's easier to recompute it:
2052 SDValue Rem = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Div, N2: RHS);
2053 Rem = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: LHS, N2: Rem);
2054
// Truncate to the number of bits this divide actually produces.
2056 if (Sign) {
2057 SDValue InRegSize
2058 = DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: DivBits));
2059 Div = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Div, N2: InRegSize);
2060 Rem = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Rem, N2: InRegSize);
2061 } else {
2062 SDValue TruncMask = DAG.getConstant(Val: (UINT64_C(1) << DivBits) - 1, DL, VT);
2063 Div = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Div, N2: TruncMask);
2064 Rem = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Rem, N2: TruncMask);
2065 }
2066
2067 return DAG.getMergeValues(Ops: { Div, Rem }, dl: DL);
2068}
2069
2070void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2071 SelectionDAG &DAG,
2072 SmallVectorImpl<SDValue> &Results) const {
2073 SDLoc DL(Op);
2074 EVT VT = Op.getValueType();
2075
2076 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2077
2078 EVT HalfVT = VT.getHalfSizedIntegerVT(Context&: *DAG.getContext());
2079
2080 SDValue One = DAG.getConstant(Val: 1, DL, VT: HalfVT);
2081 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: HalfVT);
2082
// HiLo split
2084 SDValue LHS_Lo, LHS_Hi;
2085 SDValue LHS = Op.getOperand(i: 0);
2086 std::tie(args&: LHS_Lo, args&: LHS_Hi) = DAG.SplitScalar(N: LHS, DL, LoVT: HalfVT, HiVT: HalfVT);
2087
2088 SDValue RHS_Lo, RHS_Hi;
2089 SDValue RHS = Op.getOperand(i: 1);
2090 std::tie(args&: RHS_Lo, args&: RHS_Hi) = DAG.SplitScalar(N: RHS, DL, LoVT: HalfVT, HiVT: HalfVT);
2091
2092 if (DAG.MaskedValueIsZero(Op: RHS, Mask: APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32)) &&
2093 DAG.MaskedValueIsZero(Op: LHS, Mask: APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32))) {
2094
2095 SDValue Res = DAG.getNode(Opcode: ISD::UDIVREM, DL, VTList: DAG.getVTList(VT1: HalfVT, VT2: HalfVT),
2096 N1: LHS_Lo, N2: RHS_Lo);
2097
2098 SDValue DIV = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Res.getValue(R: 0), Zero});
2099 SDValue REM = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Res.getValue(R: 1), Zero});
2100
2101 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: DIV));
2102 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: REM));
2103 return;
2104 }
2105
2106 if (isTypeLegal(VT: MVT::i64)) {
2107 // The algorithm here is based on ideas from "Software Integer Division",
2108 // Tom Rodeheffer, August 2008.
2109
2110 MachineFunction &MF = DAG.getMachineFunction();
2111 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2112
2113 // Compute denominator reciprocal.
2114 unsigned FMAD =
2115 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2116 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2117 ? (unsigned)ISD::FMAD
2118 : (unsigned)AMDGPUISD::FMAD_FTZ;
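// The f32 constants below are bit patterns for powers of two: 0x4f800000 is
// 2^32, 0x2f800000 is 2^-32 and 0xcf800000 is -2^32; 0x5f7ffffc is just below
// 2^64. Together they effectively form a first estimate of 2^64 / RHS, split
// into Rcp_Hi:Rcp_Lo, which the Newton-Raphson rounds below then refine.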
2119
2120 SDValue Cvt_Lo = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: MVT::f32, Operand: RHS_Lo);
2121 SDValue Cvt_Hi = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: MVT::f32, Operand: RHS_Hi);
2122 SDValue Mad1 = DAG.getNode(Opcode: FMAD, DL, VT: MVT::f32, N1: Cvt_Hi,
2123 N2: DAG.getConstantFP(Val: APInt(32, 0x4f800000).bitsToFloat(), DL, VT: MVT::f32),
2124 N3: Cvt_Lo);
2125 SDValue Rcp = DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT: MVT::f32, Operand: Mad1);
2126 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f32, N1: Rcp,
2127 N2: DAG.getConstantFP(Val: APInt(32, 0x5f7ffffc).bitsToFloat(), DL, VT: MVT::f32));
2128 SDValue Mul2 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f32, N1: Mul1,
2129 N2: DAG.getConstantFP(Val: APInt(32, 0x2f800000).bitsToFloat(), DL, VT: MVT::f32));
2130 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL, VT: MVT::f32, Operand: Mul2);
2131 SDValue Mad2 = DAG.getNode(Opcode: FMAD, DL, VT: MVT::f32, N1: Trunc,
2132 N2: DAG.getConstantFP(Val: APInt(32, 0xcf800000).bitsToFloat(), DL, VT: MVT::f32),
2133 N3: Mul1);
2134 SDValue Rcp_Lo = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL, VT: HalfVT, Operand: Mad2);
2135 SDValue Rcp_Hi = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL, VT: HalfVT, Operand: Trunc);
2136 SDValue Rcp64 = DAG.getBitcast(VT,
2137 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Rcp_Lo, Rcp_Hi}));
2138
2139 SDValue Zero64 = DAG.getConstant(Val: 0, DL, VT);
2140 SDValue One64 = DAG.getConstant(Val: 1, DL, VT);
2141 SDValue Zero1 = DAG.getConstant(Val: 0, DL, VT: MVT::i1);
2142 SDVTList HalfCarryVT = DAG.getVTList(VT1: HalfVT, VT2: MVT::i1);
2143
2144 // First round of UNR (Unsigned integer Newton-Raphson).
2145 SDValue Neg_RHS = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero64, N2: RHS);
2146 SDValue Mullo1 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Neg_RHS, N2: Rcp64);
2147 SDValue Mulhi1 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Rcp64, N2: Mullo1);
2148 SDValue Mulhi1_Lo, Mulhi1_Hi;
2149 std::tie(args&: Mulhi1_Lo, args&: Mulhi1_Hi) =
2150 DAG.SplitScalar(N: Mulhi1, DL, LoVT: HalfVT, HiVT: HalfVT);
2151 SDValue Add1_Lo = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Rcp_Lo,
2152 N2: Mulhi1_Lo, N3: Zero1);
2153 SDValue Add1_Hi = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Rcp_Hi,
2154 N2: Mulhi1_Hi, N3: Add1_Lo.getValue(R: 1));
2155 SDValue Add1 = DAG.getBitcast(VT,
2156 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Add1_Lo, Add1_Hi}));
2157
2158 // Second round of UNR.
2159 SDValue Mullo2 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Neg_RHS, N2: Add1);
2160 SDValue Mulhi2 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Add1, N2: Mullo2);
2161 SDValue Mulhi2_Lo, Mulhi2_Hi;
2162 std::tie(args&: Mulhi2_Lo, args&: Mulhi2_Hi) =
2163 DAG.SplitScalar(N: Mulhi2, DL, LoVT: HalfVT, HiVT: HalfVT);
2164 SDValue Add2_Lo = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Add1_Lo,
2165 N2: Mulhi2_Lo, N3: Zero1);
2166 SDValue Add2_Hi = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Add1_Hi,
2167 N2: Mulhi2_Hi, N3: Add2_Lo.getValue(R: 1));
2168 SDValue Add2 = DAG.getBitcast(VT,
2169 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Add2_Lo, Add2_Hi}));
2170
2171 SDValue Mulhi3 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: LHS, N2: Add2);
2172
2173 SDValue Mul3 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: RHS, N2: Mulhi3);
2174
2175 SDValue Mul3_Lo, Mul3_Hi;
2176 std::tie(args&: Mul3_Lo, args&: Mul3_Hi) = DAG.SplitScalar(N: Mul3, DL, LoVT: HalfVT, HiVT: HalfVT);
2177 SDValue Sub1_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: LHS_Lo,
2178 N2: Mul3_Lo, N3: Zero1);
2179 SDValue Sub1_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: LHS_Hi,
2180 N2: Mul3_Hi, N3: Sub1_Lo.getValue(R: 1));
2181 SDValue Sub1_Mi = DAG.getNode(Opcode: ISD::SUB, DL, VT: HalfVT, N1: LHS_Hi, N2: Mul3_Hi);
2182 SDValue Sub1 = DAG.getBitcast(VT,
2183 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub1_Lo, Sub1_Hi}));
2184
2185 SDValue MinusOne = DAG.getConstant(Val: 0xffffffffu, DL, VT: HalfVT);
2186 SDValue C1 = DAG.getSelectCC(DL, LHS: Sub1_Hi, RHS: RHS_Hi, True: MinusOne, False: Zero,
2187 Cond: ISD::SETUGE);
2188 SDValue C2 = DAG.getSelectCC(DL, LHS: Sub1_Lo, RHS: RHS_Lo, True: MinusOne, False: Zero,
2189 Cond: ISD::SETUGE);
2190 SDValue C3 = DAG.getSelectCC(DL, LHS: Sub1_Hi, RHS: RHS_Hi, True: C2, False: C1, Cond: ISD::SETEQ);
2191
// TODO: Here and below, portions of the code could be enclosed in if/endif
// blocks. Currently the control flow is unconditional and we have 4 selects
// after the potential endif to substitute for PHIs.
2195
2196 // if C3 != 0 ...
2197 SDValue Sub2_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub1_Lo,
2198 N2: RHS_Lo, N3: Zero1);
2199 SDValue Sub2_Mi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub1_Mi,
2200 N2: RHS_Hi, N3: Sub1_Lo.getValue(R: 1));
2201 SDValue Sub2_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Mi,
2202 N2: Zero, N3: Sub2_Lo.getValue(R: 1));
2203 SDValue Sub2 = DAG.getBitcast(VT,
2204 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub2_Lo, Sub2_Hi}));
2205
2206 SDValue Add3 = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Mulhi3, N2: One64);
2207
2208 SDValue C4 = DAG.getSelectCC(DL, LHS: Sub2_Hi, RHS: RHS_Hi, True: MinusOne, False: Zero,
2209 Cond: ISD::SETUGE);
2210 SDValue C5 = DAG.getSelectCC(DL, LHS: Sub2_Lo, RHS: RHS_Lo, True: MinusOne, False: Zero,
2211 Cond: ISD::SETUGE);
2212 SDValue C6 = DAG.getSelectCC(DL, LHS: Sub2_Hi, RHS: RHS_Hi, True: C5, False: C4, Cond: ISD::SETEQ);
2213
2214 // if (C6 != 0)
2215 SDValue Add4 = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Add3, N2: One64);
2216
2217 SDValue Sub3_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Lo,
2218 N2: RHS_Lo, N3: Zero1);
2219 SDValue Sub3_Mi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Mi,
2220 N2: RHS_Hi, N3: Sub2_Lo.getValue(R: 1));
2221 SDValue Sub3_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub3_Mi,
2222 N2: Zero, N3: Sub3_Lo.getValue(R: 1));
2223 SDValue Sub3 = DAG.getBitcast(VT,
2224 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub3_Lo, Sub3_Hi}));
2225
2226 // endif C6
2227 // endif C3
2228
2229 SDValue Sel1 = DAG.getSelectCC(DL, LHS: C6, RHS: Zero, True: Add4, False: Add3, Cond: ISD::SETNE);
2230 SDValue Div = DAG.getSelectCC(DL, LHS: C3, RHS: Zero, True: Sel1, False: Mulhi3, Cond: ISD::SETNE);
2231
2232 SDValue Sel2 = DAG.getSelectCC(DL, LHS: C6, RHS: Zero, True: Sub3, False: Sub2, Cond: ISD::SETNE);
2233 SDValue Rem = DAG.getSelectCC(DL, LHS: C3, RHS: Zero, True: Sel2, False: Sub1, Cond: ISD::SETNE);
2234
2235 Results.push_back(Elt: Div);
2236 Results.push_back(Elt: Rem);
2237
2238 return;
2239 }
2240
// r600 expansion.
// Get speculative values.
2243 SDValue DIV_Part = DAG.getNode(Opcode: ISD::UDIV, DL, VT: HalfVT, N1: LHS_Hi, N2: RHS_Lo);
2244 SDValue REM_Part = DAG.getNode(Opcode: ISD::UREM, DL, VT: HalfVT, N1: LHS_Hi, N2: RHS_Lo);
2245
2246 SDValue REM_Lo = DAG.getSelectCC(DL, LHS: RHS_Hi, RHS: Zero, True: REM_Part, False: LHS_Hi, Cond: ISD::SETEQ);
2247 SDValue REM = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {REM_Lo, Zero});
2248 REM = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: REM);
2249
2250 SDValue DIV_Hi = DAG.getSelectCC(DL, LHS: RHS_Hi, RHS: Zero, True: DIV_Part, False: Zero, Cond: ISD::SETEQ);
2251 SDValue DIV_Lo = Zero;
2252
2253 const unsigned halfBitWidth = HalfVT.getSizeInBits();
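// Restoring long division, one bit per iteration: shift the next bit of
// LHS_Lo into REM, and whenever REM >= RHS subtract RHS and set the
// corresponding bit of DIV_Lo.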
2254
2255 for (unsigned i = 0; i < halfBitWidth; ++i) {
2256 const unsigned bitPos = halfBitWidth - i - 1;
2257 SDValue POS = DAG.getConstant(Val: bitPos, DL, VT: HalfVT);
2258 // Get value of high bit
2259 SDValue HBit = DAG.getNode(Opcode: ISD::SRL, DL, VT: HalfVT, N1: LHS_Lo, N2: POS);
2260 HBit = DAG.getNode(Opcode: ISD::AND, DL, VT: HalfVT, N1: HBit, N2: One);
2261 HBit = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: HBit);
2262
2263 // Shift
2264 REM = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: REM, N2: DAG.getConstant(Val: 1, DL, VT));
2265 // Add LHS high bit
2266 REM = DAG.getNode(Opcode: ISD::OR, DL, VT, N1: REM, N2: HBit);
2267
2268 SDValue BIT = DAG.getConstant(Val: 1ULL << bitPos, DL, VT: HalfVT);
2269 SDValue realBIT = DAG.getSelectCC(DL, LHS: REM, RHS, True: BIT, False: Zero, Cond: ISD::SETUGE);
2270
2271 DIV_Lo = DAG.getNode(Opcode: ISD::OR, DL, VT: HalfVT, N1: DIV_Lo, N2: realBIT);
2272
2273 // Update REM
2274 SDValue REM_sub = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: REM, N2: RHS);
2275 REM = DAG.getSelectCC(DL, LHS: REM, RHS, True: REM_sub, False: REM, Cond: ISD::SETUGE);
2276 }
2277
2278 SDValue DIV = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {DIV_Lo, DIV_Hi});
2279 DIV = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: DIV);
2280 Results.push_back(Elt: DIV);
2281 Results.push_back(Elt: REM);
2282}
2283
2284SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2285 SelectionDAG &DAG) const {
2286 SDLoc DL(Op);
2287 EVT VT = Op.getValueType();
2288
2289 if (VT == MVT::i64) {
2290 SmallVector<SDValue, 2> Results;
2291 LowerUDIVREM64(Op, DAG, Results);
2292 return DAG.getMergeValues(Ops: Results, dl: DL);
2293 }
2294
2295 if (VT == MVT::i32) {
2296 if (SDValue Res = LowerDIVREM24(Op, DAG, Sign: false))
2297 return Res;
2298 }
2299
2300 SDValue X = Op.getOperand(i: 0);
2301 SDValue Y = Op.getOperand(i: 1);
2302
2303 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2304 // algorithm used here.
2305
2306 // Initial estimate of inv(y).
2307 SDValue Z = DAG.getNode(Opcode: AMDGPUISD::URECIP, DL, VT, Operand: Y);
2308
2309 // One round of UNR.
2310 SDValue NegY = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), N2: Y);
2311 SDValue NegYZ = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: NegY, N2: Z);
2312 Z = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Z,
2313 N2: DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Z, N2: NegYZ));
2314
2315 // Quotient/remainder estimate.
2316 SDValue Q = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: X, N2: Z);
2317 SDValue R =
2318 DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: X, N2: DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Q, N2: Y));
2319
2320 // First quotient/remainder refinement.
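// The refinements only ever adjust upward: each one conditionally adds 1 to Q
// and subtracts Y from R while R is still >= Y, so the initial estimate is
// assumed never to overshoot.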
2321 EVT CCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2322 SDValue One = DAG.getConstant(Val: 1, DL, VT);
2323 SDValue Cond = DAG.getSetCC(DL, VT: CCVT, LHS: R, RHS: Y, Cond: ISD::SETUGE);
2324 Q = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2325 N2: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Q, N2: One), N3: Q);
2326 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2327 N2: DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: R, N2: Y), N3: R);
2328
2329 // Second quotient/remainder refinement.
2330 Cond = DAG.getSetCC(DL, VT: CCVT, LHS: R, RHS: Y, Cond: ISD::SETUGE);
2331 Q = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2332 N2: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Q, N2: One), N3: Q);
2333 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2334 N2: DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: R, N2: Y), N3: R);
2335
2336 return DAG.getMergeValues(Ops: {Q, R}, dl: DL);
2337}
2338
2339SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2340 SelectionDAG &DAG) const {
2341 SDLoc DL(Op);
2342 EVT VT = Op.getValueType();
2343
2344 SDValue LHS = Op.getOperand(i: 0);
2345 SDValue RHS = Op.getOperand(i: 1);
2346
2347 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
2348 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2349
2350 if (VT == MVT::i32) {
2351 if (SDValue Res = LowerDIVREM24(Op, DAG, Sign: true))
2352 return Res;
2353 }
2354
2355 if (VT == MVT::i64 &&
2356 DAG.ComputeNumSignBits(Op: LHS) > 32 &&
2357 DAG.ComputeNumSignBits(Op: RHS) > 32) {
2358 EVT HalfVT = VT.getHalfSizedIntegerVT(Context&: *DAG.getContext());
2359
// HiLo split
2361 SDValue LHS_Lo = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL, VT: HalfVT, N1: LHS, N2: Zero);
2362 SDValue RHS_Lo = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL, VT: HalfVT, N1: RHS, N2: Zero);
2363 SDValue DIVREM = DAG.getNode(Opcode: ISD::SDIVREM, DL, VTList: DAG.getVTList(VT1: HalfVT, VT2: HalfVT),
2364 N1: LHS_Lo, N2: RHS_Lo);
2365 SDValue Res[2] = {
2366 DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: DIVREM.getValue(R: 0)),
2367 DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: DIVREM.getValue(R: 1))
2368 };
2369 return DAG.getMergeValues(Ops: Res, dl: DL);
2370 }
2371
2372 SDValue LHSign = DAG.getSelectCC(DL, LHS, RHS: Zero, True: NegOne, False: Zero, Cond: ISD::SETLT);
2373 SDValue RHSign = DAG.getSelectCC(DL, LHS: RHS, RHS: Zero, True: NegOne, False: Zero, Cond: ISD::SETLT);
2374 SDValue DSign = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHSign, N2: RHSign);
2375 SDValue RSign = LHSign; // Remainder sign is the same as LHS
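// Compute |LHS| and |RHS| by conditional negation: when the sign mask is all
// ones, (x + (-1)) ^ (-1) == -x; when it is zero, the add and xor are no-ops.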
2376
2377 LHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: LHS, N2: LHSign);
2378 RHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: RHSign);
2379
2380 LHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHS, N2: LHSign);
2381 RHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: RHS, N2: RHSign);
2382
2383 SDValue Div = DAG.getNode(Opcode: ISD::UDIVREM, DL, VTList: DAG.getVTList(VT1: VT, VT2: VT), N1: LHS, N2: RHS);
2384 SDValue Rem = Div.getValue(R: 1);
2385
2386 Div = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: Div, N2: DSign);
2387 Rem = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: Rem, N2: RSign);
2388
2389 Div = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Div, N2: DSign);
2390 Rem = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Rem, N2: RSign);
2391
2392 SDValue Res[2] = {
2393 Div,
2394 Rem
2395 };
2396 return DAG.getMergeValues(Ops: Res, dl: DL);
2397}
2398
2399// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2400SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2401 SDLoc SL(Op);
2402 EVT VT = Op.getValueType();
2403 auto Flags = Op->getFlags();
2404 SDValue X = Op.getOperand(i: 0);
2405 SDValue Y = Op.getOperand(i: 1);
2406
2407 SDValue Div = DAG.getNode(Opcode: ISD::FDIV, DL: SL, VT, N1: X, N2: Y, Flags);
2408 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: Div, Flags);
2409 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Trunc, Flags);
2410 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2411 return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Neg, N2: Y, N3: X, Flags);
2412}
2413
2414SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2415 SDLoc SL(Op);
2416 SDValue Src = Op.getOperand(i: 0);
2417
2418 // result = trunc(src)
2419 // if (src > 0.0 && src != result)
2420 // result += 1.0
2421
2422 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: MVT::f64, Operand: Src);
2423
2424 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT: MVT::f64);
2425 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f64);
2426
2427 EVT SetCCVT =
2428 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2429
2430 SDValue Lt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Zero, Cond: ISD::SETOGT);
2431 SDValue NeTrunc = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Trunc, Cond: ISD::SETONE);
2432 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SetCCVT, N1: Lt0, N2: NeTrunc);
2433
2434 SDValue Add = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f64, N1: And, N2: One, N3: Zero);
2435 // TODO: Should this propagate fast-math-flags?
2436 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Trunc, N2: Add);
2437}
2438
2439static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2440 SelectionDAG &DAG) {
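// The f64 exponent field occupies bits [62:52] of the value, i.e. bits
// [30:20] of the high 32-bit word, and is biased by 1023.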
2441 const unsigned FractBits = 52;
2442 const unsigned ExpBits = 11;
2443
2444 SDValue ExpPart = DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT: MVT::i32,
2445 N1: Hi,
2446 N2: DAG.getConstant(Val: FractBits - 32, DL: SL, VT: MVT::i32),
2447 N3: DAG.getConstant(Val: ExpBits, DL: SL, VT: MVT::i32));
2448 SDValue Exp = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: ExpPart,
2449 N2: DAG.getConstant(Val: 1023, DL: SL, VT: MVT::i32));
2450
2451 return Exp;
2452}
2453
2454SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2455 SDLoc SL(Op);
2456 SDValue Src = Op.getOperand(i: 0);
2457
2458 assert(Op.getValueType() == MVT::f64);
2459
2460 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
2461
2462 // Extract the upper half, since this is where we will find the sign and
2463 // exponent.
2464 SDValue Hi = getHiHalf64(Op: Src, DAG);
2465
2466 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2467
2468 const unsigned FractBits = 52;
2469
2470 // Extract the sign bit.
2471 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, DL: SL, VT: MVT::i32);
2472 SDValue SignBit = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: Hi, N2: SignBitMask);
2473
2474 // Extend back to 64-bits.
2475 SDValue SignBit64 = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Zero, SignBit});
2476 SignBit64 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: SignBit64);
2477
2478 SDValue BcInt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Src);
2479 const SDValue FractMask
2480 = DAG.getConstant(Val: (UINT64_C(1) << FractBits) - 1, DL: SL, VT: MVT::i64);
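// Shifting FractMask right (arithmetically) by Exp leaves ones in exactly the
// fractional bit positions for that exponent; ANDing with the complement
// clears them. Exp < 0 means |Src| < 1.0, so only the sign is kept, and
// Exp > 51 means Src is already an integer and is passed through unchanged.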
2481
2482 SDValue Shr = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i64, N1: FractMask, N2: Exp);
2483 SDValue Not = DAG.getNOT(DL: SL, Val: Shr, VT: MVT::i64);
2484 SDValue Tmp0 = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i64, N1: BcInt, N2: Not);
2485
2486 EVT SetCCVT =
2487 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::i32);
2488
2489 const SDValue FiftyOne = DAG.getConstant(Val: FractBits - 1, DL: SL, VT: MVT::i32);
2490
2491 SDValue ExpLt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Exp, RHS: Zero, Cond: ISD::SETLT);
2492 SDValue ExpGt51 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Exp, RHS: FiftyOne, Cond: ISD::SETGT);
2493
2494 SDValue Tmp1 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: ExpLt0, N2: SignBit64, N3: Tmp0);
2495 SDValue Tmp2 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: ExpGt51, N2: BcInt, N3: Tmp1);
2496
2497 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f64, Operand: Tmp2);
2498}
2499
2500SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2501 SelectionDAG &DAG) const {
2502 SDLoc SL(Op);
2503 SDValue Src = Op.getOperand(i: 0);
2504
2505 assert(Op.getValueType() == MVT::f64);
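// Round to nearest even by adding and then subtracting 2^52 with the sign of
// the input: at magnitude 2^52 every representable f64 is an integer, so the
// FADD discards the fractional bits under the default round-to-nearest-even
// mode and the FSUB restores the magnitude. Inputs whose magnitude already
// exceeds 0x1.fffffffffffffp+51 are integers and are returned unchanged.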
2506
2507 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2508 SDValue C1 = DAG.getConstantFP(Val: C1Val, DL: SL, VT: MVT::f64);
2509 SDValue CopySign = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT: MVT::f64, N1: C1, N2: Src);
2510
2511 // TODO: Should this propagate fast-math-flags?
2512
2513 SDValue Tmp1 = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Src, N2: CopySign);
2514 SDValue Tmp2 = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT: MVT::f64, N1: Tmp1, N2: CopySign);
2515
2516 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: MVT::f64, Operand: Src);
2517
2518 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2519 SDValue C2 = DAG.getConstantFP(Val: C2Val, DL: SL, VT: MVT::f64);
2520
2521 EVT SetCCVT =
2522 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2523 SDValue Cond = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Fabs, RHS: C2, Cond: ISD::SETOGT);
2524
2525 return DAG.getSelect(DL: SL, VT: MVT::f64, Cond, LHS: Src, RHS: Tmp2);
2526}
2527
2528SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2529 SelectionDAG &DAG) const {
2530 // FNEARBYINT and FRINT are the same, except in their handling of FP
2531 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2532 // rint, so just treat them as equivalent.
2533 return DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SDLoc(Op), VT: Op.getValueType(),
2534 Operand: Op.getOperand(i: 0));
2535}
2536
2537SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2538 auto VT = Op.getValueType();
2539 auto Arg = Op.getOperand(i: 0u);
2540 return DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SDLoc(Op), VT, Operand: Arg);
2541}
2542
2543// XXX - May require not supporting f32 denormals?
2544
2545// Don't handle v2f16. The extra instructions to scalarize and repack around the
2546// compare and vselect end up producing worse code than scalarizing the whole
2547// operation.
2548SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2549 SDLoc SL(Op);
2550 SDValue X = Op.getOperand(i: 0);
2551 EVT VT = Op.getValueType();
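// Round half away from zero:
//   result = trunc(x) + copysign((|x - trunc(x)| >= 0.5) ? 1.0 : 0.0, x)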
2552
2553 SDValue T = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: X);
2554
2555 // TODO: Should this propagate fast-math-flags?
2556
2557 SDValue Diff = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: X, N2: T);
2558
2559 SDValue AbsDiff = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: Diff);
2560
2561 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2562 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2563
2564 EVT SetCCVT =
2565 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2566
2567 const SDValue Half = DAG.getConstantFP(Val: 0.5, DL: SL, VT);
2568 SDValue Cmp = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsDiff, RHS: Half, Cond: ISD::SETOGE);
2569 SDValue OneOrZeroFP = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Cmp, N2: One, N3: Zero);
2570
2571 SDValue SignedOffset = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT, N1: OneOrZeroFP, N2: X);
2572 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: T, N2: SignedOffset);
2573}
2574
2575SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2576 SDLoc SL(Op);
2577 SDValue Src = Op.getOperand(i: 0);
2578
2579 // result = trunc(src);
2580 // if (src < 0.0 && src != result)
2581 // result += -1.0.
2582
2583 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: MVT::f64, Operand: Src);
2584
2585 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT: MVT::f64);
2586 const SDValue NegOne = DAG.getConstantFP(Val: -1.0, DL: SL, VT: MVT::f64);
2587
2588 EVT SetCCVT =
2589 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2590
2591 SDValue Lt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Zero, Cond: ISD::SETOLT);
2592 SDValue NeTrunc = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Trunc, Cond: ISD::SETONE);
2593 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SetCCVT, N1: Lt0, N2: NeTrunc);
2594
2595 SDValue Add = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f64, N1: And, N2: NegOne, N3: Zero);
2596 // TODO: Should this propagate fast-math-flags?
2597 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Trunc, N2: Add);
2598}
2599
2600/// Return true if it's known that \p Src can never be an f32 denormal value.
2601static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2602 switch (Src.getOpcode()) {
2603 case ISD::FP_EXTEND:
2604 return Src.getOperand(i: 0).getValueType() == MVT::f16;
2605 case ISD::FP16_TO_FP:
2606 case ISD::FFREXP:
2607 return true;
2608 case ISD::INTRINSIC_WO_CHAIN: {
2609 unsigned IntrinsicID = Src.getConstantOperandVal(i: 0);
2610 switch (IntrinsicID) {
2611 case Intrinsic::amdgcn_frexp_mant:
2612 return true;
2613 default:
2614 return false;
2615 }
2616 }
2617 default:
2618 return false;
2619 }
2620
2621 llvm_unreachable("covered opcode switch");
2622}
2623
2624bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
2625 SDNodeFlags Flags) {
2626 if (Flags.hasApproximateFuncs())
2627 return true;
2628 auto &Options = DAG.getTarget().Options;
2629 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2630}
2631
2632bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
2633 SDValue Src,
2634 SDNodeFlags Flags) {
2635 return !valueIsKnownNeverF32Denorm(Src) &&
2636 DAG.getMachineFunction()
2637 .getDenormalMode(FPType: APFloat::IEEEsingle())
2638 .Input != DenormalMode::PreserveSign;
2639}
2640
2641SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2642 SDValue Src,
2643 SDNodeFlags Flags) const {
2644 SDLoc SL(Src);
2645 EVT VT = Src.getValueType();
2646 const fltSemantics &Semantics = VT.getFltSemantics();
2647 SDValue SmallestNormal =
2648 DAG.getConstantFP(Val: APFloat::getSmallestNormalized(Sem: Semantics), DL: SL, VT);
2649
2650 // Want to scale denormals up, but negatives and 0 work just as well on the
2651 // scaled path.
2652 SDValue IsLtSmallestNormal = DAG.getSetCC(
2653 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Src,
2654 RHS: SmallestNormal, Cond: ISD::SETOLT);
2655
2656 return IsLtSmallestNormal;
2657}
2658
2659SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2660 SDNodeFlags Flags) const {
2661 SDLoc SL(Src);
2662 EVT VT = Src.getValueType();
2663 const fltSemantics &Semantics = VT.getFltSemantics();
2664 SDValue Inf = DAG.getConstantFP(Val: APFloat::getInf(Sem: Semantics), DL: SL, VT);
2665
2666 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: Src, Flags);
2667 SDValue IsFinite = DAG.getSetCC(
2668 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Fabs,
2669 RHS: Inf, Cond: ISD::SETOLT);
2670 return IsFinite;
2671}
2672
/// If denormal handling is required, return the scaled input to FLOG2 and the
/// check for the denormal range. Otherwise, return null values.
2675std::pair<SDValue, SDValue>
2676AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
2677 SDValue Src, SDNodeFlags Flags) const {
2678 if (!needsDenormHandlingF32(DAG, Src, Flags))
2679 return {};
2680
2681 MVT VT = MVT::f32;
2682 const fltSemantics &Semantics = APFloat::IEEEsingle();
2683 SDValue SmallestNormal =
2684 DAG.getConstantFP(Val: APFloat::getSmallestNormalized(Sem: Semantics), DL: SL, VT);
2685
2686 SDValue IsLtSmallestNormal = DAG.getSetCC(
2687 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Src,
2688 RHS: SmallestNormal, Cond: ISD::SETOLT);
2689
2690 SDValue Scale32 = DAG.getConstantFP(Val: 0x1.0p+32, DL: SL, VT);
2691 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2692 SDValue ScaleFactor =
2693 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLtSmallestNormal, N2: Scale32, N3: One, Flags);
2694
2695 SDValue ScaledInput = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Src, N2: ScaleFactor, Flags);
2696 return {ScaledInput, IsLtSmallestNormal};
2697}
2698
2699SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2700 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2701 // If we have to handle denormals, scale up the input and adjust the result.
2702
2703 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2704 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
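  // e.g. for a denormal x = 0x1.0p-130f, the scaled input 0x1.0p-98f is
  // normal, v_log_f32 returns about -98.0, and subtracting 32.0 recovers
  // log2(x) = -130.0.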
2705
2706 SDLoc SL(Op);
2707 EVT VT = Op.getValueType();
2708 SDValue Src = Op.getOperand(i: 0);
2709 SDNodeFlags Flags = Op->getFlags();
2710
2711 if (VT == MVT::f16) {
2712 // Nothing in half is a denormal when promoted to f32.
2713 assert(!Subtarget->has16BitInsts());
2714 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src, Flags);
2715 SDValue Log = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT: MVT::f32, Operand: Ext, Flags);
2716 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Log,
2717 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
2718 }
2719
2720 auto [ScaledInput, IsLtSmallestNormal] =
2721 getScaledLogInput(DAG, SL, Src, Flags);
2722 if (!ScaledInput)
2723 return DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: Src, Flags);
2724
2725 SDValue Log2 = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: ScaledInput, Flags);
2726
2727 SDValue ThirtyTwo = DAG.getConstantFP(Val: 32.0, DL: SL, VT);
2728 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2729 SDValue ResultOffset =
2730 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLtSmallestNormal, N2: ThirtyTwo, N3: Zero);
2731 return DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: Log2, N2: ResultOffset, Flags);
2732}
2733
2734static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2735 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2736 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: Y, Flags);
2737 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Mul, N2: C, Flags);
2738}
2739
2740SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2741 SelectionDAG &DAG) const {
2742 SDValue X = Op.getOperand(i: 0);
2743 EVT VT = Op.getValueType();
2744 SDNodeFlags Flags = Op->getFlags();
2745 SDLoc DL(Op);
2746
2747 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2748 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2749
2750 const auto &Options = getTargetMachine().Options;
2751 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2752 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2753
2754 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2755 // Log and multiply in f32 is good enough for f16.
2756 X = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: X, Flags);
2757 }
2758
2759 SDValue Lowered = LowerFLOGUnsafe(Op: X, SL: DL, DAG, IsLog10, Flags);
2760 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2761 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Lowered,
2762 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32), Flags);
2763 }
2764
2765 return Lowered;
2766 }
2767
2768 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL: DL, Src: X, Flags);
2769 if (ScaledInput)
2770 X = ScaledInput;
2771
2772 SDValue Y = DAG.getNode(Opcode: AMDGPUISD::LOG, DL, VT, Operand: X, Flags);
2773
2774 SDValue R;
2775 if (Subtarget->hasFastFMAF32()) {
    // c + cc is ln(2)/ln(10) to more than 49 bits
2777 const float c_log10 = 0x1.344134p-2f;
2778 const float cc_log10 = 0x1.09f79ep-26f;
2779
2780 // c + cc is ln(2) to more than 49 bits
2781 const float c_log = 0x1.62e42ep-1f;
2782 const float cc_log = 0x1.efa39ep-25f;
2783
2784 SDValue C = DAG.getConstantFP(Val: IsLog10 ? c_log10 : c_log, DL, VT);
2785 SDValue CC = DAG.getConstantFP(Val: IsLog10 ? cc_log10 : cc_log, DL, VT);
2786
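    // Compute Y * (C + CC) in roughly double the working precision: R is the
    // rounded product Y*C, fma(Y, C, -R) recovers its rounding error, which is
    // then folded in with Y*CC and added back onto R.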
2787 R = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Y, N2: C, Flags);
2788 SDValue NegR = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: R, Flags);
2789 SDValue FMA0 = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: Y, N2: C, N3: NegR, Flags);
2790 SDValue FMA1 = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: Y, N2: CC, N3: FMA0, Flags);
2791 R = DAG.getNode(Opcode: ISD::FADD, DL, VT, N1: R, N2: FMA1, Flags);
2792 } else {
2793 // ch+ct is ln(2)/ln(10) to more than 36 bits
2794 const float ch_log10 = 0x1.344000p-2f;
2795 const float ct_log10 = 0x1.3509f6p-18f;
2796
2797 // ch + ct is ln(2) to more than 36 bits
2798 const float ch_log = 0x1.62e000p-1f;
2799 const float ct_log = 0x1.0bfbe8p-15f;
2800
2801 SDValue CH = DAG.getConstantFP(Val: IsLog10 ? ch_log10 : ch_log, DL, VT);
2802 SDValue CT = DAG.getConstantFP(Val: IsLog10 ? ct_log10 : ct_log, DL, VT);
2803
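    // Without fast FMA, split Y into a high part YH (low 12 mantissa bits
    // cleared) and a tail YT = Y - YH. CH also has its low mantissa bits
    // cleared, so the product YH*CH is exact, and the smaller partial products
    // are accumulated into it first.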
2804 SDValue YAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Y);
2805 SDValue MaskConst = DAG.getConstant(Val: 0xfffff000, DL, VT: MVT::i32);
2806 SDValue YHInt = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: YAsInt, N2: MaskConst);
2807 SDValue YH = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: YHInt);
2808 SDValue YT = DAG.getNode(Opcode: ISD::FSUB, DL, VT, N1: Y, N2: YH, Flags);
2809
2810 SDValue YTCT = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: YT, N2: CT, Flags);
2811 SDValue Mad0 = getMad(DAG, SL: DL, VT, X: YH, Y: CT, C: YTCT, Flags);
2812 SDValue Mad1 = getMad(DAG, SL: DL, VT, X: YT, Y: CH, C: Mad0, Flags);
2813 R = getMad(DAG, SL: DL, VT, X: YH, Y: CH, C: Mad1);
2814 }
2815
2816 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2817 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2818
2819 // TODO: Check if known finite from source value.
2820 if (!IsFiniteOnly) {
2821 SDValue IsFinite = getIsFinite(DAG, Src: Y, Flags);
2822 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsFinite, N2: R, N3: Y, Flags);
2823 }
2824
2825 if (IsScaled) {
2826 SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL, VT);
2827 SDValue ShiftK =
2828 DAG.getConstantFP(Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2829 SDValue Shift =
2830 DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsScaled, N2: ShiftK, N3: Zero, Flags);
2831 R = DAG.getNode(Opcode: ISD::FSUB, DL, VT, N1: R, N2: Shift, Flags);
2832 }
2833
2834 return R;
2835}
2836
2837SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2838 return LowerFLOGCommon(Op, DAG);
2839}
2840
// Do f32 fast math expansion for flog2 or flog10. This is accurate enough
// for a promoted f16 operation.
2843SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2844 SelectionDAG &DAG, bool IsLog10,
2845 SDNodeFlags Flags) const {
2846 EVT VT = Src.getValueType();
2847 unsigned LogOp =
2848 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2849
2850 double Log2BaseInverted =
2851 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2852
2853 if (VT == MVT::f32) {
2854 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2855 if (ScaledInput) {
2856 SDValue LogSrc = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: ScaledInput, Flags);
2857 SDValue ScaledResultOffset =
2858 DAG.getConstantFP(Val: -32.0 * Log2BaseInverted, DL: SL, VT);
2859
2860 SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL: SL, VT);
2861
2862 SDValue ResultOffset = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsScaled,
2863 N2: ScaledResultOffset, N3: Zero, Flags);
2864
2865 SDValue Log2Inv = DAG.getConstantFP(Val: Log2BaseInverted, DL: SL, VT);
2866
2867 if (Subtarget->hasFastFMAF32())
2868 return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: LogSrc, N2: Log2Inv, N3: ResultOffset,
2869 Flags);
2870 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LogSrc, N2: Log2Inv, Flags);
2871 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Mul, N2: ResultOffset);
2872 }
2873 }
2874
2875 SDValue Log2Operand = DAG.getNode(Opcode: LogOp, DL: SL, VT, Operand: Src, Flags);
2876 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Val: Log2BaseInverted, DL: SL, VT);
2877
2878 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Log2Operand, N2: Log2BaseInvertedOperand,
2879 Flags);
2880}
2881
2882SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2883 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2884 // If we have to handle denormals, scale up the input and adjust the result.
2885
2886 SDLoc SL(Op);
2887 EVT VT = Op.getValueType();
2888 SDValue Src = Op.getOperand(i: 0);
2889 SDNodeFlags Flags = Op->getFlags();
2890
2891 if (VT == MVT::f16) {
2892 // Nothing in half is a denormal when promoted to f32.
2893 assert(!Subtarget->has16BitInsts());
2894 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src, Flags);
2895 SDValue Log = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT: MVT::f32, Operand: Ext, Flags);
2896 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Log,
2897 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
2898 }
2899
2900 assert(VT == MVT::f32);
2901
2902 if (!needsDenormHandlingF32(DAG, Src, Flags))
2903 return DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT: MVT::f32, Operand: Src, Flags);
2904
2905 // bool needs_scaling = x < -0x1.f80000p+6f;
2906 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
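  // e.g. for x = -140.0 the exact result 0x1.0p-140f is a denormal; instead
  // v_exp_f32 evaluates 2^(-140 + 64) = 0x1.0p-76f and the final multiply by
  // 0x1.0p-64f, which does respect the denormal mode, produces the denormal
  // result.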
2907
  // -126.0, the smallest x for which exp2(x) still gives a normal f32 result.
2909 SDValue RangeCheckConst = DAG.getConstantFP(Val: -0x1.f80000p+6f, DL: SL, VT);
2910
2911 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2912
2913 SDValue NeedsScaling =
2914 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: RangeCheckConst, Cond: ISD::SETOLT);
2915
2916 SDValue SixtyFour = DAG.getConstantFP(Val: 0x1.0p+6f, DL: SL, VT);
2917 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2918
2919 SDValue AddOffset =
2920 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: SixtyFour, N3: Zero);
2921
2922 SDValue AddInput = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Src, N2: AddOffset, Flags);
2923 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: AddInput, Flags);
2924
2925 SDValue TwoExpNeg64 = DAG.getConstantFP(Val: 0x1.0p-64f, DL: SL, VT);
2926 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2927 SDValue ResultScale =
2928 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: TwoExpNeg64, N3: One);
2929
2930 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2, N2: ResultScale, Flags);
2931}
2932
2933SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2934 SelectionDAG &DAG,
2935 SDNodeFlags Flags) const {
2936 EVT VT = X.getValueType();
2937 const SDValue Log2E = DAG.getConstantFP(Val: numbers::log2e, DL: SL, VT);
2938
2939 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, Src: X, Flags)) {
2940 // exp2(M_LOG2E_F * f);
2941 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: Log2E, Flags);
2942 return DAG.getNode(Opcode: VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2943 : (unsigned)ISD::FEXP2,
2944 DL: SL, VT, Operand: Mul, Flags);
2945 }
2946
2947 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2948
2949 SDValue Threshold = DAG.getConstantFP(Val: -0x1.5d58a0p+6f, DL: SL, VT);
2950 SDValue NeedsScaling = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: Threshold, Cond: ISD::SETOLT);
2951
2952 SDValue ScaleOffset = DAG.getConstantFP(Val: 0x1.0p+6f, DL: SL, VT);
2953
2954 SDValue ScaledX = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: X, N2: ScaleOffset, Flags);
2955
2956 SDValue AdjustedX =
2957 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: ScaledX, N3: X);
2958
2959 SDValue ExpInput = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: Log2E, Flags);
2960
2961 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: ExpInput, Flags);
2962
2963 SDValue ResultScaleFactor = DAG.getConstantFP(Val: 0x1.969d48p-93f, DL: SL, VT);
2964 SDValue AdjustedResult =
2965 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2, N2: ResultScaleFactor, Flags);
2966
2967 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: AdjustedResult, N3: Exp2,
2968 Flags);
2969}
2970
2971/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
2972/// handled correctly.
2973SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
2974 SelectionDAG &DAG,
2975 SDNodeFlags Flags) const {
2976 const EVT VT = X.getValueType();
2977 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
2978 : static_cast<unsigned>(ISD::FEXP2);
2979
2980 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, Src: X, Flags)) {
2981 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
2982 SDValue K0 = DAG.getConstantFP(Val: 0x1.a92000p+1f, DL: SL, VT);
2983 SDValue K1 = DAG.getConstantFP(Val: 0x1.4f0978p-11f, DL: SL, VT);
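    // K0 + K1 approximates log2(10) (~3.3219281), so the two exp2 factors
    // multiply out to exp2(x * log2(10)) == exp10(x); splitting the constant
    // keeps more of the precision a single rounded x * log2(10) product would
    // lose.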
2984
2985 SDValue Mul0 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: K0, Flags);
2986 SDValue Exp2_0 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul0, Flags);
2987 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: K1, Flags);
2988 SDValue Exp2_1 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul1, Flags);
2989 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2_0, N2: Exp2_1);
2990 }
2991
2992 // bool s = x < -0x1.2f7030p+5f;
2993 // x += s ? 0x1.0p+5f : 0.0f;
2994 // exp10 = exp2(x * 0x1.a92000p+1f) *
2995 // exp2(x * 0x1.4f0978p-11f) *
2996 // (s ? 0x1.9f623ep-107f : 1.0f);
2997
2998 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2999
3000 SDValue Threshold = DAG.getConstantFP(Val: -0x1.2f7030p+5f, DL: SL, VT);
3001 SDValue NeedsScaling = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: Threshold, Cond: ISD::SETOLT);
3002
3003 SDValue ScaleOffset = DAG.getConstantFP(Val: 0x1.0p+5f, DL: SL, VT);
3004 SDValue ScaledX = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: X, N2: ScaleOffset, Flags);
3005 SDValue AdjustedX =
3006 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: ScaledX, N3: X);
3007
3008 SDValue K0 = DAG.getConstantFP(Val: 0x1.a92000p+1f, DL: SL, VT);
3009 SDValue K1 = DAG.getConstantFP(Val: 0x1.4f0978p-11f, DL: SL, VT);
3010
3011 SDValue Mul0 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: K0, Flags);
3012 SDValue Exp2_0 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul0, Flags);
3013 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: K1, Flags);
3014 SDValue Exp2_1 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul1, Flags);
3015
3016 SDValue MulExps = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2_0, N2: Exp2_1, Flags);
3017
3018 SDValue ResultScaleFactor = DAG.getConstantFP(Val: 0x1.9f623ep-107f, DL: SL, VT);
3019 SDValue AdjustedResult =
3020 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: MulExps, N2: ResultScaleFactor, Flags);
3021
3022 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: AdjustedResult, N3: MulExps,
3023 Flags);
3024}
3025
3026SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
3027 EVT VT = Op.getValueType();
3028 SDLoc SL(Op);
3029 SDValue X = Op.getOperand(i: 0);
3030 SDNodeFlags Flags = Op->getFlags();
3031 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3032
3033 if (VT.getScalarType() == MVT::f16) {
3034 // v_exp_f16 (fmul x, log2e)
3035 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
3036 return lowerFEXPUnsafe(X, SL, DAG, Flags);
3037
3038 if (VT.isVector())
3039 return SDValue();
3040
3041 // exp(f16 x) ->
3042 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3043
3044 // Nothing in half is a denormal when promoted to f32.
3045 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: X, Flags);
3046 SDValue Lowered = lowerFEXPUnsafe(X: Ext, SL, DAG, Flags);
3047 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Lowered,
3048 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
3049 }
3050
3051 assert(VT == MVT::f32);
3052
3053 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3054 // library behavior. Also, is known-not-daz source sufficient?
3055 if (allowApproxFunc(DAG, Flags)) {
3056 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3057 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3058 }
3059
3060 // Algorithm:
3061 //
3062 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3063 //
3064 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3065 // n = 64*m + j, 0 <= j < 64
3066 //
3067 // e^x = 2^((64*m + j + f)/64)
3068 // = (2^m) * (2^(j/64)) * 2^(f/64)
3069 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3070 //
3071 // f = x*(64/ln(2)) - n
3072 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3073 //
3074 // e^x = (2^m) * (2^(j/64)) * e^r
3075 //
3076 // (2^(j/64)) is precomputed
3077 //
3078 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3079 // e^r = 1 + q
3080 //
3081 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3082 //
3083 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
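  //
  // In the code below, PH + PL approximates x * log2(e) (or x * log2(10) for
  // exp10) in extended precision, E = roundeven(PH) plays the role of n, the
  // reduced argument A = (PH - E) + PL is fed to v_exp_f32, and the final
  // ldexp applies the 2^E scaling.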
3084 SDNodeFlags FlagsNoContract = Flags;
3085 FlagsNoContract.setAllowContract(false);
3086
3087 SDValue PH, PL;
3088 if (Subtarget->hasFastFMAF32()) {
3089 const float c_exp = numbers::log2ef;
3090 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3091 const float c_exp10 = 0x1.a934f0p+1f;
3092 const float cc_exp10 = 0x1.2f346ep-24f;
3093
3094 SDValue C = DAG.getConstantFP(Val: IsExp10 ? c_exp10 : c_exp, DL: SL, VT);
3095 SDValue CC = DAG.getConstantFP(Val: IsExp10 ? cc_exp10 : cc_exp, DL: SL, VT);
3096
3097 PH = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: C, Flags);
3098 SDValue NegPH = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: PH, Flags);
3099 SDValue FMA0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: X, N2: C, N3: NegPH, Flags);
3100 PL = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: X, N2: CC, N3: FMA0, Flags);
3101 } else {
3102 const float ch_exp = 0x1.714000p+0f;
3103 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3104
3105 const float ch_exp10 = 0x1.a92000p+1f;
3106 const float cl_exp10 = 0x1.4f0978p-11f;
3107
3108 SDValue CH = DAG.getConstantFP(Val: IsExp10 ? ch_exp10 : ch_exp, DL: SL, VT);
3109 SDValue CL = DAG.getConstantFP(Val: IsExp10 ? cl_exp10 : cl_exp, DL: SL, VT);
3110
3111 SDValue XAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: X);
3112 SDValue MaskConst = DAG.getConstant(Val: 0xfffff000, DL: SL, VT: MVT::i32);
3113 SDValue XHAsInt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: XAsInt, N2: MaskConst);
3114 SDValue XH = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: XHAsInt);
3115 SDValue XL = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: X, N2: XH, Flags);
3116
3117 PH = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: XH, N2: CH, Flags);
3118
3119 SDValue XLCL = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: XL, N2: CL, Flags);
3120 SDValue Mad0 = getMad(DAG, SL, VT, X: XL, Y: CH, C: XLCL, Flags);
3121 PL = getMad(DAG, SL, VT, X: XH, Y: CL, C: Mad0, Flags);
3122 }
3123
3124 SDValue E = DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SL, VT, Operand: PH, Flags);
3125
3126 // It is unsafe to contract this fsub into the PH multiply.
3127 SDValue PHSubE = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: PH, N2: E, Flags: FlagsNoContract);
3128
3129 SDValue A = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: PHSubE, N2: PL, Flags);
3130 SDValue IntE = DAG.getNode(Opcode: ISD::FP_TO_SINT, DL: SL, VT: MVT::i32, Operand: E);
3131 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: A, Flags);
3132
3133 SDValue R = DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT, N1: Exp2, N2: IntE, Flags);
3134
3135 SDValue UnderflowCheckConst =
3136 DAG.getConstantFP(Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, DL: SL, VT);
3137
3138 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
3139 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
3140 SDValue Underflow =
3141 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: UnderflowCheckConst, Cond: ISD::SETOLT);
3142
3143 R = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Underflow, N2: Zero, N3: R);
3144 const auto &Options = getTargetMachine().Options;
3145
3146 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3147 SDValue OverflowCheckConst =
3148 DAG.getConstantFP(Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, DL: SL, VT);
3149 SDValue Overflow =
3150 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: OverflowCheckConst, Cond: ISD::SETOGT);
3151 SDValue Inf =
3152 DAG.getConstantFP(Val: APFloat::getInf(Sem: APFloat::IEEEsingle()), DL: SL, VT);
3153 R = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Overflow, N2: Inf, N3: R);
3154 }
3155
3156 return R;
3157}
3158
3159static bool isCtlzOpc(unsigned Opc) {
3160 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3161}
3162
3163static bool isCttzOpc(unsigned Opc) {
3164 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3165}
3166
3167SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3168 SelectionDAG &DAG) const {
3169 auto SL = SDLoc(Op);
3170 auto Opc = Op.getOpcode();
3171 auto Arg = Op.getOperand(i: 0u);
3172 auto ResultVT = Op.getValueType();
3173
3174 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3175 return {};
3176
3177 assert(isCtlzOpc(Opc));
3178 assert(ResultVT == Arg.getValueType());
3179
3180 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3181 SDValue NumExtBits = DAG.getConstant(Val: 32u - NumBits, DL: SL, VT: MVT::i32);
3182 SDValue NewOp;
3183
3184 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3185 NewOp = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Arg);
3186 NewOp = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: NewOp, N2: NumExtBits);
3187 NewOp = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, Operand: NewOp);
3188 } else {
3189 NewOp = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Arg);
3190 NewOp = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, Operand: NewOp);
3191 NewOp = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: NewOp, N2: NumExtBits);
3192 }
3193
3194 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ResultVT, Operand: NewOp);
3195}
3196
3197SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3198 SDLoc SL(Op);
3199 SDValue Src = Op.getOperand(i: 0);
3200
3201 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3202 bool Ctlz = isCtlzOpc(Opc: Op.getOpcode());
3203 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3204
3205 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3206 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3207 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3208
3209 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3210 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3211 // (cttz hi:lo) -> (umin (ffbl src), 32)
3212 // (ctlz_zero_undef src) -> (ffbh src)
3213 // (cttz_zero_undef src) -> (ffbl src)
3214
    // The 64-bit scalar version produces a 32-bit result:
3216 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3217 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3218 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3219 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3220 SDValue NewOpr = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Src);
3221 if (!ZeroUndef) {
3222 const SDValue ConstVal = DAG.getConstant(
3223 Val: Op.getValueType().getScalarSizeInBits(), DL: SL, VT: MVT::i32);
3224 NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewOpr, N2: ConstVal);
3225 }
3226 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: Src.getValueType(), Operand: NewOpr);
3227 }
3228
3229 SDValue Lo, Hi;
3230 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3231
3232 SDValue OprLo = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Lo);
3233 SDValue OprHi = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Hi);
3234
3235 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3236 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3237 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3238 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
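  //
  // FFBH_U32 / FFBL_B32 return -1 for a zero input, which the uaddsat and the
  // final umin with 64 absorb: e.g. for ctlz of 0x000000000000ffff, ffbh(Hi)
  // is -1 while uaddsat(ffbh(Lo), 32) = 16 + 32 = 48, so the result is 48.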
3239
3240 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3241 const SDValue Const32 = DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32);
3242 if (Ctlz)
3243 OprLo = DAG.getNode(Opcode: AddOpc, DL: SL, VT: MVT::i32, N1: OprLo, N2: Const32);
3244 else
3245 OprHi = DAG.getNode(Opcode: AddOpc, DL: SL, VT: MVT::i32, N1: OprHi, N2: Const32);
3246
3247 SDValue NewOpr;
3248 NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: OprLo, N2: OprHi);
3249 if (!ZeroUndef) {
3250 const SDValue Const64 = DAG.getConstant(Val: 64, DL: SL, VT: MVT::i32);
3251 NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewOpr, N2: Const64);
3252 }
3253
3254 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i64, Operand: NewOpr);
3255}
3256
3257SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3258 bool Signed) const {
  // The regular method of converting a 64-bit integer to float roughly
  // consists of 2 steps: normalization and rounding. After normalization, the
3261 // conversion from a 64-bit integer to a float is essentially the same as the
3262 // one from a 32-bit integer. The only difference is that it has more
3263 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3264 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3265 // converted into the correct float number. The basic steps for the unsigned
3266 // conversion are illustrated in the following pseudo code:
3267 //
3268 // f32 uitofp(i64 u) {
3269 // i32 hi, lo = split(u);
3270 // // Only count the leading zeros in hi as we have native support of the
3271 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3272 // // reduced to a 32-bit one automatically.
3273 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3274 // u <<= shamt;
3275 // hi, lo = split(u);
3276 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3277 // // convert it as a 32-bit integer and scale the result back.
3278 // return uitofp(hi) * 2^(32 - shamt);
3279 // }
3280 //
3281 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3282 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
  // converted instead, followed by negation based on its sign bit.
3284
3285 SDLoc SL(Op);
3286 SDValue Src = Op.getOperand(i: 0);
3287
3288 SDValue Lo, Hi;
3289 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3290 SDValue Sign;
3291 SDValue ShAmt;
3292 if (Signed && Subtarget->isGCN()) {
3293 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3294 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3295 // account. That is, the maximal shift is
3296 // - 32 if Lo and Hi have opposite signs;
3297 // - 33 if Lo and Hi have the same sign.
3298 //
3299 // Or, MaxShAmt = 33 + OppositeSign, where
3300 //
3301 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3302 // - -1 if Lo and Hi have opposite signs; and
3303 // - 0 otherwise.
3304 //
3305 // All in all, ShAmt is calculated as
3306 //
3307 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3308 //
3309 // or
3310 //
3311 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3312 //
3313 // to reduce the critical path.
3314 SDValue OppositeSign = DAG.getNode(
3315 Opcode: ISD::SRA, DL: SL, VT: MVT::i32, N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: Lo, N2: Hi),
3316 N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
3317 SDValue MaxShAmt =
3318 DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32),
3319 N2: OppositeSign);
3320 // Count the leading sign bits.
3321 ShAmt = DAG.getNode(Opcode: AMDGPUISD::FFBH_I32, DL: SL, VT: MVT::i32, Operand: Hi);
3322 // Different from unsigned conversion, the shift should be one bit less to
3323 // preserve the sign bit.
3324 ShAmt = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: ShAmt,
3325 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
3326 ShAmt = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: ShAmt, N2: MaxShAmt);
3327 } else {
3328 if (Signed) {
3329 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3330 // absolute value first.
3331 Sign = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i64, N1: Src,
3332 N2: DAG.getConstant(Val: 63, DL: SL, VT: MVT::i64));
3333 SDValue Abs =
3334 DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i64,
3335 N1: DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i64, N1: Src, N2: Sign), N2: Sign);
3336 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Abs, DAG);
3337 }
3338 // Count the leading zeros.
3339 ShAmt = DAG.getNode(Opcode: ISD::CTLZ, DL: SL, VT: MVT::i32, Operand: Hi);
3340 // The shift amount for signed integers is [0, 32].
3341 }
3342 // Normalize the given 64-bit integer.
3343 SDValue Norm = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i64, N1: Src, N2: ShAmt);
3344 // Split it again.
3345 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Norm, DAG);
3346 // Calculate the adjust bit for rounding.
3347 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3348 SDValue Adjust = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32,
3349 N1: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32), N2: Lo);
3350 // Get the 32-bit normalized integer.
3351 Norm = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Hi, N2: Adjust);
3352 // Convert the normalized 32-bit integer into f32.
3353 unsigned Opc =
3354 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3355 SDValue FVal = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::f32, Operand: Norm);
3356
  // Finally, we need to scale the converted floating-point number back, as
  // the original 64-bit integer was converted as a 32-bit one.
3359 ShAmt = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32),
3360 N2: ShAmt);
3361 // On GCN, use LDEXP directly.
3362 if (Subtarget->isGCN())
3363 return DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT: MVT::f32, N1: FVal, N2: ShAmt);
3364
  // Otherwise, align 'ShAmt' to the exponent part and add it into the
  // exponent field directly to emulate the multiplication by 2^ShAmt. The
  // 8-bit exponent is wide enough to avoid overflowing into the sign bit.
3368 SDValue Exp = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: ShAmt,
3369 N2: DAG.getConstant(Val: 23, DL: SL, VT: MVT::i32));
3370 SDValue IVal =
3371 DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32,
3372 N1: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: FVal), N2: Exp);
3373 if (Signed) {
3374 // Set the sign bit.
3375 Sign = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32,
3376 N1: DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Sign),
3377 N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
3378 IVal = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: IVal, N2: Sign);
3379 }
3380 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: IVal);
3381}
3382
3383SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3384 bool Signed) const {
3385 SDLoc SL(Op);
3386 SDValue Src = Op.getOperand(i: 0);
3387
3388 SDValue Lo, Hi;
3389 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
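  // Each 32-bit half converts to f64 exactly (f64 has a 53-bit significand),
  // and scaling CvtHi by 2^32 with ldexp is exact as well, so only the final
  // add rounds.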
3390
3391 SDValue CvtHi = DAG.getNode(Opcode: Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3392 DL: SL, VT: MVT::f64, Operand: Hi);
3393
3394 SDValue CvtLo = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL: SL, VT: MVT::f64, Operand: Lo);
3395
3396 SDValue LdExp = DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT: MVT::f64, N1: CvtHi,
3397 N2: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32));
3398 // TODO: Should this propagate fast-math-flags?
3399 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: LdExp, N2: CvtLo);
3400}
3401
3402SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3403 SelectionDAG &DAG) const {
3404 // TODO: Factor out code common with LowerSINT_TO_FP.
3405 EVT DestVT = Op.getValueType();
3406 SDValue Src = Op.getOperand(i: 0);
3407 EVT SrcVT = Src.getValueType();
3408
3409 if (SrcVT == MVT::i16) {
3410 if (DestVT == MVT::f16)
3411 return Op;
3412 SDLoc DL(Op);
3413
3414 // Promote src to i32
3415 SDValue Ext = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: Src);
3416 return DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: DestVT, Operand: Ext);
3417 }
3418
3419 if (DestVT == MVT::bf16) {
3420 SDLoc SL(Op);
3421 SDValue ToF32 = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL: SL, VT: MVT::f32, Operand: Src);
3422 SDValue FPRoundFlag = DAG.getIntPtrConstant(Val: 0, DL: SL, /*isTarget=*/true);
3423 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::bf16, N1: ToF32, N2: FPRoundFlag);
3424 }
3425
3426 if (SrcVT != MVT::i64)
3427 return Op;
3428
3429 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3430 SDLoc DL(Op);
3431
3432 SDValue IntToFp32 = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: MVT::f32, Operand: Src);
3433 SDValue FPRoundFlag =
3434 DAG.getIntPtrConstant(Val: 0, DL: SDLoc(Op), /*isTarget=*/true);
3435 SDValue FPRound =
3436 DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: IntToFp32, N2: FPRoundFlag);
3437
3438 return FPRound;
3439 }
3440
3441 if (DestVT == MVT::f32)
3442 return LowerINT_TO_FP32(Op, DAG, Signed: false);
3443
3444 assert(DestVT == MVT::f64);
3445 return LowerINT_TO_FP64(Op, DAG, Signed: false);
3446}
3447
3448SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3449 SelectionDAG &DAG) const {
3450 EVT DestVT = Op.getValueType();
3451
3452 SDValue Src = Op.getOperand(i: 0);
3453 EVT SrcVT = Src.getValueType();
3454
3455 if (SrcVT == MVT::i16) {
3456 if (DestVT == MVT::f16)
3457 return Op;
3458
3459 SDLoc DL(Op);
3460 // Promote src to i32
3461 SDValue Ext = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: MVT::i32, Operand: Src);
3462 return DAG.getNode(Opcode: ISD::SINT_TO_FP, DL, VT: DestVT, Operand: Ext);
3463 }
3464
3465 if (DestVT == MVT::bf16) {
3466 SDLoc SL(Op);
3467 SDValue ToF32 = DAG.getNode(Opcode: ISD::SINT_TO_FP, DL: SL, VT: MVT::f32, Operand: Src);
3468 SDValue FPRoundFlag = DAG.getIntPtrConstant(Val: 0, DL: SL, /*isTarget=*/true);
3469 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::bf16, N1: ToF32, N2: FPRoundFlag);
3470 }
3471
3472 if (SrcVT != MVT::i64)
3473 return Op;
3474
3475 // TODO: Factor out code common with LowerUINT_TO_FP.
3476
3477 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3478 SDLoc DL(Op);
3479 SDValue Src = Op.getOperand(i: 0);
3480
3481 SDValue IntToFp32 = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: MVT::f32, Operand: Src);
3482 SDValue FPRoundFlag =
3483 DAG.getIntPtrConstant(Val: 0, DL: SDLoc(Op), /*isTarget=*/true);
3484 SDValue FPRound =
3485 DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: IntToFp32, N2: FPRoundFlag);
3486
3487 return FPRound;
3488 }
3489
3490 if (DestVT == MVT::f32)
3491 return LowerINT_TO_FP32(Op, DAG, Signed: true);
3492
3493 assert(DestVT == MVT::f64);
3494 return LowerINT_TO_FP64(Op, DAG, Signed: true);
3495}
3496
3497SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3498 bool Signed) const {
3499 SDLoc SL(Op);
3500
3501 SDValue Src = Op.getOperand(i: 0);
3502 EVT SrcVT = Src.getValueType();
3503
3504 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3505
3506 // The basic idea of converting a floating point number into a pair of 32-bit
3507 // integers is illustrated as follows:
3508 //
3509 // tf := trunc(val);
3510 // hif := floor(tf * 2^-32);
3511 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3512 // hi := fptoi(hif);
3513 // lo := fptoi(lof);
3514 //
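  // Below, Mul corresponds to tf * 2^-32, FloorMul to hif, and the fma
  // computes lof = tf - hif * 2^32 with a single rounding.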
3515 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: SrcVT, Operand: Src);
3516 SDValue Sign;
3517 if (Signed && SrcVT == MVT::f32) {
    // However, a 32-bit floating-point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we take the absolute value
    // after truncating and flip the result back based on the original
    // signedness.
3523 Sign = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i32,
3524 N1: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Trunc),
3525 N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
3526 Trunc = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: SrcVT, Operand: Trunc);
3527 }
3528
3529 SDValue K0, K1;
3530 if (SrcVT == MVT::f64) {
3531 K0 = DAG.getConstantFP(
3532 Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), DL: SL,
3533 VT: SrcVT);
3534 K1 = DAG.getConstantFP(
3535 Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), DL: SL,
3536 VT: SrcVT);
3537 } else {
3538 K0 = DAG.getConstantFP(
3539 Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), DL: SL, VT: SrcVT);
3540 K1 = DAG.getConstantFP(
3541 Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), DL: SL, VT: SrcVT);
3542 }
3543 // TODO: Should this propagate fast-math-flags?
3544 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: SrcVT, N1: Trunc, N2: K0);
3545
3546 SDValue FloorMul = DAG.getNode(Opcode: ISD::FFLOOR, DL: SL, VT: SrcVT, Operand: Mul);
3547
3548 SDValue Fma = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: SrcVT, N1: FloorMul, N2: K1, N3: Trunc);
3549
3550 SDValue Hi = DAG.getNode(Opcode: (Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3551 : ISD::FP_TO_UINT,
3552 DL: SL, VT: MVT::i32, Operand: FloorMul);
3553 SDValue Lo = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL: SL, VT: MVT::i32, Operand: Fma);
3554
3555 SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64,
3556 Operand: DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Lo, Hi}));
3557
3558 if (Signed && SrcVT == MVT::f32) {
3559 assert(Sign);
3560 // Flip the result based on the signedness, which is either all 0s or 1s.
3561 Sign = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64,
3562 Operand: DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Sign, Sign}));
3563 // r := xor(r, sign) - sign;
3564 Result =
3565 DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i64,
3566 N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i64, N1: Result, N2: Sign), N2: Sign);
3567 }
3568
3569 return Result;
3570}
3571
3572SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3573 SDLoc DL(Op);
3574 SDValue N0 = Op.getOperand(i: 0);
3575
3576 // Convert to target node to get known bits
3577 if (N0.getValueType() == MVT::f32)
3578 return DAG.getNode(Opcode: AMDGPUISD::FP_TO_FP16, DL, VT: Op.getValueType(), Operand: N0);
3579
3580 if (getTargetMachine().Options.UnsafeFPMath) {
3581 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3582 return SDValue();
3583 }
3584
3585 return LowerF64ToF16Safe(Src: N0, DL, DAG);
3586}
3587
// Returns the f16 result bits in an i32 node.
3589SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL,
3590 SelectionDAG &DAG) const {
3591 assert(Src.getSimpleValueType() == MVT::f64);
3592
3593 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3594 // TODO: We can generate better code for True16.
3595 const unsigned ExpMask = 0x7ff;
3596 const unsigned ExpBiasf64 = 1023;
3597 const unsigned ExpBiasf16 = 15;
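  // Roughly: E below is the f16-biased exponent (f64 exponent - 1023 + 15)
  // and M holds the top mantissa bits plus a sticky bit for anything shifted
  // out. E < 1 takes the denormal path D, E > 30 overflows to 0x7c00 (inf),
  // and E == 1039 (an all-ones f64 exponent field) selects the inf/NaN
  // encoding I.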
3598 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
3599 SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
3600 SDValue U = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: Src);
3601 SDValue UH = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64, N1: U,
3602 N2: DAG.getConstant(Val: 32, DL, VT: MVT::i64));
3603 UH = DAG.getZExtOrTrunc(Op: UH, DL, VT: MVT::i32);
3604 U = DAG.getZExtOrTrunc(Op: U, DL, VT: MVT::i32);
3605 SDValue E = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3606 N2: DAG.getConstant(Val: 20, DL, VT: MVT::i64));
3607 E = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: E,
3608 N2: DAG.getConstant(Val: ExpMask, DL, VT: MVT::i32));
3609 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3610 // add the f16 bias (15) to get the biased exponent for the f16 format.
3611 E = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: E,
3612 N2: DAG.getConstant(Val: -ExpBiasf64 + ExpBiasf16, DL, VT: MVT::i32));
3613
3614 SDValue M = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3615 N2: DAG.getConstant(Val: 8, DL, VT: MVT::i32));
3616 M = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: M,
3617 N2: DAG.getConstant(Val: 0xffe, DL, VT: MVT::i32));
3618
3619 SDValue MaskedSig = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: UH,
3620 N2: DAG.getConstant(Val: 0x1ff, DL, VT: MVT::i32));
3621 MaskedSig = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: MaskedSig, N2: U);
3622
3623 SDValue Lo40Set = DAG.getSelectCC(DL, LHS: MaskedSig, RHS: Zero, True: Zero, False: One, Cond: ISD::SETEQ);
3624 M = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M, N2: Lo40Set);
3625
3626 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3627 SDValue I = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32,
3628 N1: DAG.getSelectCC(DL, LHS: M, RHS: Zero, True: DAG.getConstant(Val: 0x0200, DL, VT: MVT::i32),
3629 False: Zero, Cond: ISD::SETNE), N2: DAG.getConstant(Val: 0x7c00, DL, VT: MVT::i32));
3630
3631 // N = M | (E << 12);
3632 SDValue N = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M,
3633 N2: DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: E,
3634 N2: DAG.getConstant(Val: 12, DL, VT: MVT::i32)));
3635
3636 // B = clamp(1-E, 0, 13);
3637 SDValue OneSubExp = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i32,
3638 N1: One, N2: E);
3639 SDValue B = DAG.getNode(Opcode: ISD::SMAX, DL, VT: MVT::i32, N1: OneSubExp, N2: Zero);
3640 B = DAG.getNode(Opcode: ISD::SMIN, DL, VT: MVT::i32, N1: B,
3641 N2: DAG.getConstant(Val: 13, DL, VT: MVT::i32));
3642
3643 SDValue SigSetHigh = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M,
3644 N2: DAG.getConstant(Val: 0x1000, DL, VT: MVT::i32));
3645
3646 SDValue D = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: SigSetHigh, N2: B);
3647 SDValue D0 = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: D, N2: B);
3648 SDValue D1 = DAG.getSelectCC(DL, LHS: D0, RHS: SigSetHigh, True: One, False: Zero, Cond: ISD::SETNE);
3649 D = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: D, N2: D1);
3650
3651 SDValue V = DAG.getSelectCC(DL, LHS: E, RHS: One, True: D, False: N, Cond: ISD::SETLT);
3652 SDValue VLow3 = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: V,
3653 N2: DAG.getConstant(Val: 0x7, DL, VT: MVT::i32));
3654 V = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: V,
3655 N2: DAG.getConstant(Val: 2, DL, VT: MVT::i32));
3656 SDValue V0 = DAG.getSelectCC(DL, LHS: VLow3, RHS: DAG.getConstant(Val: 3, DL, VT: MVT::i32),
3657 True: One, False: Zero, Cond: ISD::SETEQ);
3658 SDValue V1 = DAG.getSelectCC(DL, LHS: VLow3, RHS: DAG.getConstant(Val: 5, DL, VT: MVT::i32),
3659 True: One, False: Zero, Cond: ISD::SETGT);
3660 V1 = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: V0, N2: V1);
3661 V = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: V, N2: V1);
3662
3663 V = DAG.getSelectCC(DL, LHS: E, RHS: DAG.getConstant(Val: 30, DL, VT: MVT::i32),
3664 True: DAG.getConstant(Val: 0x7c00, DL, VT: MVT::i32), False: V, Cond: ISD::SETGT);
3665 V = DAG.getSelectCC(DL, LHS: E, RHS: DAG.getConstant(Val: 1039, DL, VT: MVT::i32),
3666 True: I, False: V, Cond: ISD::SETEQ);
3667
3668 // Extract the sign bit.
3669 SDValue Sign = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3670 N2: DAG.getConstant(Val: 16, DL, VT: MVT::i32));
3671 Sign = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: Sign,
3672 N2: DAG.getConstant(Val: 0x8000, DL, VT: MVT::i32));
3673
3674 return DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Sign, N2: V);
3675}
3676
3677SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3678 SelectionDAG &DAG) const {
3679 SDValue Src = Op.getOperand(i: 0);
3680 unsigned OpOpcode = Op.getOpcode();
3681 EVT SrcVT = Src.getValueType();
3682 EVT DestVT = Op.getValueType();
3683
3684 // Will be selected natively
3685 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3686 return Op;
3687
3688 if (SrcVT == MVT::bf16) {
3689 SDLoc DL(Op);
3690 SDValue PromotedSrc = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: Src);
3691 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DestVT, Operand: PromotedSrc);
3692 }
3693
3694 // Promote i16 to i32
3695 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3696 SDLoc DL(Op);
3697
3698 SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, Operand: Src);
3699 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToInt32);
3700 }
3701
3702 if (DestVT != MVT::i64)
3703 return Op;
3704
3705 if (SrcVT == MVT::f16 ||
3706 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3707 SDLoc DL(Op);
3708
3709 SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, Operand: Src);
3710 unsigned Ext =
3711 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3712 return DAG.getNode(Opcode: Ext, DL, VT: MVT::i64, Operand: FpToInt32);
3713 }
3714
3715 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3716 return LowerFP_TO_INT64(Op, DAG, Signed: OpOpcode == ISD::FP_TO_SINT);
3717
3718 return SDValue();
3719}
3720
3721SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3722 SelectionDAG &DAG) const {
3723 EVT ExtraVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
3724 MVT VT = Op.getSimpleValueType();
3725 MVT ScalarVT = VT.getScalarType();
3726
3727 assert(VT.isVector());
3728
3729 SDValue Src = Op.getOperand(i: 0);
3730 SDLoc DL(Op);
3731
3732 // TODO: Don't scalarize on Evergreen?
3733 unsigned NElts = VT.getVectorNumElements();
3734 SmallVector<SDValue, 8> Args;
3735 DAG.ExtractVectorElements(Op: Src, Args, Start: 0, Count: NElts);
3736
3737 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3738 for (unsigned I = 0; I < NElts; ++I)
3739 Args[I] = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: ScalarVT, N1: Args[I], N2: VTOp);
3740
3741 return DAG.getBuildVector(VT, DL, Ops: Args);
3742}
3743
3744//===----------------------------------------------------------------------===//
3745// Custom DAG optimizations
3746//===----------------------------------------------------------------------===//
3747
3748static bool isU24(SDValue Op, SelectionDAG &DAG) {
3749 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3750}
3751
3752static bool isI24(SDValue Op, SelectionDAG &DAG) {
3753 EVT VT = Op.getValueType();
3754 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3755 // as unsigned 24-bit values.
3756 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3757}
3758
3759static SDValue simplifyMul24(SDNode *Node24,
3760 TargetLowering::DAGCombinerInfo &DCI) {
3761 SelectionDAG &DAG = DCI.DAG;
3762 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3763 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3764
3765 SDValue LHS = IsIntrin ? Node24->getOperand(Num: 1) : Node24->getOperand(Num: 0);
3766 SDValue RHS = IsIntrin ? Node24->getOperand(Num: 2) : Node24->getOperand(Num: 1);
3767 unsigned NewOpcode = Node24->getOpcode();
3768 if (IsIntrin) {
3769 unsigned IID = Node24->getConstantOperandVal(Num: 0);
3770 switch (IID) {
3771 case Intrinsic::amdgcn_mul_i24:
3772 NewOpcode = AMDGPUISD::MUL_I24;
3773 break;
3774 case Intrinsic::amdgcn_mul_u24:
3775 NewOpcode = AMDGPUISD::MUL_U24;
3776 break;
3777 case Intrinsic::amdgcn_mulhi_i24:
3778 NewOpcode = AMDGPUISD::MULHI_I24;
3779 break;
3780 case Intrinsic::amdgcn_mulhi_u24:
3781 NewOpcode = AMDGPUISD::MULHI_U24;
3782 break;
3783 default:
3784 llvm_unreachable("Expected 24-bit mul intrinsic");
3785 }
3786 }
3787
3788 APInt Demanded = APInt::getLowBitsSet(numBits: LHS.getValueSizeInBits(), loBitsSet: 24);
3789
3790 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3791 // the operands to have other uses, but will only perform simplifications that
3792 // involve bypassing some nodes for this user.
3793 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(Op: LHS, DemandedBits: Demanded, DAG);
3794 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(Op: RHS, DemandedBits: Demanded, DAG);
3795 if (DemandedLHS || DemandedRHS)
3796 return DAG.getNode(Opcode: NewOpcode, DL: SDLoc(Node24), VTList: Node24->getVTList(),
3797 N1: DemandedLHS ? DemandedLHS : LHS,
3798 N2: DemandedRHS ? DemandedRHS : RHS);
3799
3800 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3801 // operands if this node is the only user.
3802 if (TLI.SimplifyDemandedBits(Op: LHS, DemandedBits: Demanded, DCI))
3803 return SDValue(Node24, 0);
3804 if (TLI.SimplifyDemandedBits(Op: RHS, DemandedBits: Demanded, DCI))
3805 return SDValue(Node24, 0);
3806
3807 return SDValue();
3808}
3809
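// Constant fold a bitfield extract: the result is the Width bits of Src0
// starting at bit Offset, sign- or zero-extended depending on whether IntTy
// is signed. When Width + Offset < 32, shifting left and then right performs
// the extract and the extension together; otherwise a plain right shift by
// Offset suffices.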
3810template <typename IntTy>
3811static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3812 uint32_t Width, const SDLoc &DL) {
3813 if (Width + Offset < 32) {
3814 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3815 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3816 if constexpr (std::is_signed_v<IntTy>) {
3817 return DAG.getSignedConstant(Val: Result, DL, VT: MVT::i32);
3818 } else {
3819 return DAG.getConstant(Result, DL, MVT::i32);
3820 }
3821 }
3822
3823 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3824}
3825
3826static bool hasVolatileUser(SDNode *Val) {
3827 for (SDNode *U : Val->users()) {
3828 if (MemSDNode *M = dyn_cast<MemSDNode>(Val: U)) {
3829 if (M->isVolatile())
3830 return true;
3831 }
3832 }
3833
3834 return false;
3835}
3836
3837bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3838 // i32 vectors are the canonical memory type.
3839 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3840 return false;
3841
3842 if (!VT.isByteSized())
3843 return false;
3844
3845 unsigned Size = VT.getStoreSize();
3846
3847 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3848 return false;
3849
3850 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3851 return false;
3852
3853 return true;
3854}
3855
// Replace a load of an illegal type with a load of a bitcast to a friendlier
// type.
3858SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3859 DAGCombinerInfo &DCI) const {
3860 if (!DCI.isBeforeLegalize())
3861 return SDValue();
3862
3863 LoadSDNode *LN = cast<LoadSDNode>(Val: N);
3864 if (!LN->isSimple() || !ISD::isNormalLoad(N: LN) || hasVolatileUser(Val: LN))
3865 return SDValue();
3866
3867 SDLoc SL(N);
3868 SelectionDAG &DAG = DCI.DAG;
3869 EVT VT = LN->getMemoryVT();
3870
3871 unsigned Size = VT.getStoreSize();
3872 Align Alignment = LN->getAlign();
3873 if (Alignment < Size && isTypeLegal(VT)) {
3874 unsigned IsFast;
3875 unsigned AS = LN->getAddressSpace();
3876
3877 // Expand unaligned loads earlier than legalization. Due to visitation order
3878 // problems during legalization, the emitted instructions to pack and unpack
3879 // the bytes again are not eliminated in the case of an unaligned copy.
3880 if (!allowsMisalignedMemoryAccesses(
3881 VT, AddrSpace: AS, Alignment, Flags: LN->getMemOperand()->getFlags(), &IsFast)) {
3882 if (VT.isVector())
3883 return SplitVectorLoad(Op: SDValue(LN, 0), DAG);
3884
3885 SDValue Ops[2];
3886 std::tie(args&: Ops[0], args&: Ops[1]) = expandUnalignedLoad(LD: LN, DAG);
3887
3888 return DAG.getMergeValues(Ops, dl: SDLoc(N));
3889 }
3890
3891 if (!IsFast)
3892 return SDValue();
3893 }
3894
3895 if (!shouldCombineMemoryType(VT))
3896 return SDValue();
3897
3898 EVT NewVT = getEquivalentMemType(Ctx&: *DAG.getContext(), VT);
3899
3900 SDValue NewLoad
3901 = DAG.getLoad(VT: NewVT, dl: SL, Chain: LN->getChain(),
3902 Ptr: LN->getBasePtr(), MMO: LN->getMemOperand());
3903
3904 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewLoad);
3905 DCI.CombineTo(N, Res0: BC, Res1: NewLoad.getValue(R: 1));
3906 return SDValue(N, 0);
3907}
3908
3909// Replace store of an illegal type with a store of a bitcast to a friendlier
3910// type.
3911SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3912 DAGCombinerInfo &DCI) const {
3913 if (!DCI.isBeforeLegalize())
3914 return SDValue();
3915
3916 StoreSDNode *SN = cast<StoreSDNode>(Val: N);
3917 if (!SN->isSimple() || !ISD::isNormalStore(N: SN))
3918 return SDValue();
3919
3920 EVT VT = SN->getMemoryVT();
3921 unsigned Size = VT.getStoreSize();
3922
3923 SDLoc SL(N);
3924 SelectionDAG &DAG = DCI.DAG;
3925 Align Alignment = SN->getAlign();
3926 if (Alignment < Size && isTypeLegal(VT)) {
3927 unsigned IsFast;
3928 unsigned AS = SN->getAddressSpace();
3929
3930 // Expand unaligned stores earlier than legalization. Due to visitation
3931 // order problems during legalization, the emitted instructions to pack and
3932 // unpack the bytes again are not eliminated in the case of an unaligned
3933 // copy.
3934 if (!allowsMisalignedMemoryAccesses(
3935 VT, AddrSpace: AS, Alignment, Flags: SN->getMemOperand()->getFlags(), &IsFast)) {
3936 if (VT.isVector())
3937 return SplitVectorStore(Op: SDValue(SN, 0), DAG);
3938
3939 return expandUnalignedStore(ST: SN, DAG);
3940 }
3941
3942 if (!IsFast)
3943 return SDValue();
3944 }
3945
3946 if (!shouldCombineMemoryType(VT))
3947 return SDValue();
3948
3949 EVT NewVT = getEquivalentMemType(Ctx&: *DAG.getContext(), VT);
3950 SDValue Val = SN->getValue();
3951
3952 //DCI.AddToWorklist(Val.getNode());
3953
3954 bool OtherUses = !Val.hasOneUse();
3955 SDValue CastVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: Val);
3956 if (OtherUses) {
3957 SDValue CastBack = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: CastVal);
3958 DAG.ReplaceAllUsesOfValueWith(From: Val, To: CastBack);
3959 }
3960
3961 return DAG.getStore(Chain: SN->getChain(), dl: SL, Val: CastVal,
3962 Ptr: SN->getBasePtr(), MMO: SN->getMemOperand());
3963}
3964
3965// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3966// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3967// issues.
3968SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3969 DAGCombinerInfo &DCI) const {
3970 SelectionDAG &DAG = DCI.DAG;
3971 SDValue N0 = N->getOperand(Num: 0);
3972
3973 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3974 // (vt2 (truncate (assertzext vt0:x, vt1)))
3975 if (N0.getOpcode() == ISD::TRUNCATE) {
3976 SDValue N1 = N->getOperand(Num: 1);
3977 EVT ExtVT = cast<VTSDNode>(Val&: N1)->getVT();
3978 SDLoc SL(N);
3979
3980 SDValue Src = N0.getOperand(i: 0);
3981 EVT SrcVT = Src.getValueType();
3982 if (SrcVT.bitsGE(VT: ExtVT)) {
3983 SDValue NewInReg = DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: SrcVT, N1: Src, N2: N1);
3984 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: N->getValueType(ResNo: 0), Operand: NewInReg);
3985 }
3986 }
3987
3988 return SDValue();
3989}
3990
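// Combine chain-less AMDGPU intrinsics: simplify the 24-bit multiply
// intrinsics, fold fract/rsq/rcp-style intrinsics of undef to undef, and
// strip fneg/fabs source operations from amdgcn_frexp_exp.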
3991SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3992 SDNode *N, DAGCombinerInfo &DCI) const {
3993 unsigned IID = N->getConstantOperandVal(Num: 0);
3994 switch (IID) {
3995 case Intrinsic::amdgcn_mul_i24:
3996 case Intrinsic::amdgcn_mul_u24:
3997 case Intrinsic::amdgcn_mulhi_i24:
3998 case Intrinsic::amdgcn_mulhi_u24:
3999 return simplifyMul24(Node24: N, DCI);
4000 case Intrinsic::amdgcn_fract:
4001 case Intrinsic::amdgcn_rsq:
4002 case Intrinsic::amdgcn_rcp_legacy:
4003 case Intrinsic::amdgcn_rsq_legacy:
4004 case Intrinsic::amdgcn_rsq_clamp: {
4005 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
4006 SDValue Src = N->getOperand(Num: 1);
4007 return Src.isUndef() ? Src : SDValue();
4008 }
4009 case Intrinsic::amdgcn_frexp_exp: {
4010 // frexp_exp (fneg x) -> frexp_exp x
4011 // frexp_exp (fabs x) -> frexp_exp x
4012 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4013 SDValue Src = N->getOperand(Num: 1);
4014 SDValue PeekSign = peekFPSignOps(Val: Src);
4015 if (PeekSign == Src)
4016 return SDValue();
4017 return SDValue(DCI.DAG.UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), Op2: PeekSign),
4018 0);
4019 }
4020 default:
4021 return SDValue();
4022 }
4023}
4024
4025 /// Split the 64-bit value \p LHS into two 32-bit components, and apply the
4026 /// binary operation \p Opc to each half with the constants \p ValLo and \p ValHi.
4027SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
4028 DAGCombinerInfo &DCI, const SDLoc &SL,
4029 unsigned Opc, SDValue LHS,
4030 uint32_t ValLo, uint32_t ValHi) const {
4031 SelectionDAG &DAG = DCI.DAG;
4032 SDValue Lo, Hi;
4033 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: LHS, DAG);
4034
4035 SDValue LoRHS = DAG.getConstant(Val: ValLo, DL: SL, VT: MVT::i32);
4036 SDValue HiRHS = DAG.getConstant(Val: ValHi, DL: SL, VT: MVT::i32);
4037
4038 SDValue LoAnd = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, N1: Lo, N2: LoRHS);
4039 SDValue HiAnd = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, N1: Hi, N2: HiRHS);
4040
4041 // Re-visit the ands. It's possible we eliminated one of them and it could
4042 // simplify the vector.
4043 DCI.AddToWorklist(N: Lo.getNode());
4044 DCI.AddToWorklist(N: Hi.getNode());
4045
4046 SDValue Vec = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {LoAnd, HiAnd});
4047 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
4048}
4049
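// Combine shl: canonicalize (shl (ext i16:x), 16) to a v2i16 build_vector when
// packed types are legal, narrow shl of an extended value when the known
// leading zeros cover the shift amount, and split a 64-bit shl whose shift
// amount is known to be >= 32 into a 32-bit shift placed into the high half.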
4050SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
4051 DAGCombinerInfo &DCI) const {
4052 EVT VT = N->getValueType(ResNo: 0);
4053 SDValue LHS = N->getOperand(Num: 0);
4054 SDValue RHS = N->getOperand(Num: 1);
4055 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
4056 SDLoc SL(N);
4057 SelectionDAG &DAG = DCI.DAG;
4058
4059 unsigned RHSVal;
4060 if (CRHS) {
4061 RHSVal = CRHS->getZExtValue();
4062 if (!RHSVal)
4063 return LHS;
4064
4065 switch (LHS->getOpcode()) {
4066 default:
4067 break;
4068 case ISD::ZERO_EXTEND:
4069 case ISD::SIGN_EXTEND:
4070 case ISD::ANY_EXTEND: {
4071 SDValue X = LHS->getOperand(Num: 0);
4072
4073 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4074 isOperationLegal(Op: ISD::BUILD_VECTOR, VT: MVT::v2i16)) {
4075 // Prefer build_vector as the canonical form if packed types are legal.
4076 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
4077 SDValue Vec = DAG.getBuildVector(
4078 VT: MVT::v2i16, DL: SL,
4079 Ops: {DAG.getConstant(Val: 0, DL: SL, VT: MVT::i16), LHS->getOperand(Num: 0)});
4080 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Vec);
4081 }
4082
4083 // shl (ext x) => zext (shl x), if shift does not overflow int
4084 if (VT != MVT::i64)
4085 break;
4086 KnownBits Known = DAG.computeKnownBits(Op: X);
4087 unsigned LZ = Known.countMinLeadingZeros();
4088 if (LZ < RHSVal)
4089 break;
4090 EVT XVT = X.getValueType();
4091 SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: XVT, N1: X, N2: SDValue(CRHS, 0));
4092 return DAG.getZExtOrTrunc(Op: Shl, DL: SL, VT);
4093 }
4094 }
4095 }
4096
4097 if (VT.getScalarType() != MVT::i64)
4098 return SDValue();
4099
4100 // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
4101
4102 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4103 // common case, splitting this into a move and a 32-bit shift is faster and
4104 // the same code size.
4105 KnownBits Known = DAG.computeKnownBits(Op: RHS);
4106
4107 EVT ElementType = VT.getScalarType();
4108 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
4109 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(EltVT: TargetScalarType)
4110 : TargetScalarType;
4111
4112 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4113 return SDValue();
4114 SDValue ShiftAmt;
4115
4116 if (CRHS) {
4117 ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
4118 VT: TargetType);
4119 } else {
4120 SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
4121 const SDValue ShiftMask =
4122 DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
4123 // This AND instruction will clamp out of bounds shift values.
4124 // It will also be removed during later instruction selection.
4125 ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
4126 }
4127
4128 SDValue Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: LHS);
4129 SDValue NewShift =
4130 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: TargetType, N1: Lo, N2: ShiftAmt, Flags: N->getFlags());
4131
4132 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: TargetScalarType);
4133 SDValue Vec;
4134
4135 if (VT.isVector()) {
4136 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
4137 unsigned NElts = TargetType.getVectorNumElements();
4138 SmallVector<SDValue, 8> HiOps;
4139 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4140
4141 DAG.ExtractVectorElements(Op: NewShift, Args&: HiOps, Start: 0, Count: NElts);
4142 for (unsigned I = 0; I != NElts; ++I)
4143 HiAndLoOps[2 * I + 1] = HiOps[I];
4144 Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
4145 } else {
4146 EVT ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
4147 Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {Zero, NewShift});
4148 }
4149 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
4150}
4151
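// Combine sra: split a 64-bit sra whose shift amount is known to be >= 32 into
// 32-bit shifts of the high half; the new high half becomes the sign bits of
// the old high half.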
4152SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4153 DAGCombinerInfo &DCI) const {
4154 SDValue RHS = N->getOperand(Num: 1);
4155 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
4156 EVT VT = N->getValueType(ResNo: 0);
4157 SDValue LHS = N->getOperand(Num: 0);
4158 SelectionDAG &DAG = DCI.DAG;
4159 SDLoc SL(N);
4160
4161 if (VT.getScalarType() != MVT::i64)
4162 return SDValue();
4163
4164 // For C >= 32
4165 // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), (sra hi_32(x), 31))
4166
4167 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4168 // common case, splitting this into a move and a 32-bit shift is faster and
4169 // the same code size.
4170 KnownBits Known = DAG.computeKnownBits(Op: RHS);
4171
4172 EVT ElementType = VT.getScalarType();
4173 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
4174 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(EltVT: TargetScalarType)
4175 : TargetScalarType;
4176
4177 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4178 return SDValue();
4179
4180 SDValue ShiftFullAmt =
4181 DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
4182 SDValue ShiftAmt;
4183 if (CRHS) {
4184 unsigned RHSVal = CRHS->getZExtValue();
4185 ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
4186 VT: TargetType);
4187 } else if (Known.getMinValue().getZExtValue() ==
4188 (ElementType.getSizeInBits() - 1)) {
4189 ShiftAmt = ShiftFullAmt;
4190 } else {
4191 SDValue truncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
4192 const SDValue ShiftMask =
4193 DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
4194 // This AND instruction will clamp out of bounds shift values.
4195 // It will also be removed during later instruction selection.
4196 ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: truncShiftAmt, N2: ShiftMask);
4197 }
4198
4199 EVT ConcatType;
4200 SDValue Hi;
4201 SDLoc LHSSL(LHS);
4202 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4203 if (VT.isVector()) {
4204 unsigned NElts = TargetType.getVectorNumElements();
4205 ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
4206 SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
4207 SmallVector<SDValue, 8> HiOps(NElts);
4208 SmallVector<SDValue, 16> HiAndLoOps;
4209
4210 DAG.ExtractVectorElements(Op: SplitLHS, Args&: HiAndLoOps, Start: 0, Count: NElts * 2);
4211 for (unsigned I = 0; I != NElts; ++I) {
4212 HiOps[I] = HiAndLoOps[2 * I + 1];
4213 }
4214 Hi = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: LHSSL, VT: TargetType, Ops: HiOps);
4215 } else {
4216 const SDValue One = DAG.getConstant(Val: 1, DL: LHSSL, VT: TargetScalarType);
4217 ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
4218 SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
4219 Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: LHSSL, VT: TargetType, N1: SplitLHS, N2: One);
4220 }
4221 Hi = DAG.getFreeze(V: Hi);
4222
4223 SDValue HiShift = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: TargetType, N1: Hi, N2: ShiftFullAmt);
4224 SDValue NewShift = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: TargetType, N1: Hi, N2: ShiftAmt);
4225
4226 SDValue Vec;
4227 if (VT.isVector()) {
4228 unsigned NElts = TargetType.getVectorNumElements();
4229 SmallVector<SDValue, 8> HiOps;
4230 SmallVector<SDValue, 8> LoOps;
4231 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);
4232
4233 DAG.ExtractVectorElements(Op: HiShift, Args&: HiOps, Start: 0, Count: NElts);
4234 DAG.ExtractVectorElements(Op: NewShift, Args&: LoOps, Start: 0, Count: NElts);
4235 for (unsigned I = 0; I != NElts; ++I) {
4236 HiAndLoOps[2 * I + 1] = HiOps[I];
4237 HiAndLoOps[2 * I] = LoOps[I];
4238 }
4239 Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
4240 } else {
4241 Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {NewShift, HiShift});
4242 }
4243 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
4244}
4245
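// Combine srl: distribute a constant srl over an AND with a shifted mask to
// expose BFE patterns, and split a 64-bit srl whose shift amount is known to
// be >= 32 into a 32-bit shift of the high half with a zero high result.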
4246SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4247 DAGCombinerInfo &DCI) const {
4248 SDValue RHS = N->getOperand(Num: 1);
4249 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
4250 EVT VT = N->getValueType(ResNo: 0);
4251 SDValue LHS = N->getOperand(Num: 0);
4252 SelectionDAG &DAG = DCI.DAG;
4253 SDLoc SL(N);
4254 unsigned RHSVal;
4255
4256 if (CRHS) {
4257 RHSVal = CRHS->getZExtValue();
4258
4259 // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
4260 // This improves the ability to match BFE patterns in isel.
4261 if (LHS.getOpcode() == ISD::AND) {
4262 if (auto *Mask = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1))) {
4263 unsigned MaskIdx, MaskLen;
4264 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4265 MaskIdx == RHSVal) {
4266 return DAG.getNode(Opcode: ISD::AND, DL: SL, VT,
4267 N1: DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: LHS.getOperand(i: 0),
4268 N2: N->getOperand(Num: 1)),
4269 N2: DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: LHS.getOperand(i: 1),
4270 N2: N->getOperand(Num: 1)));
4271 }
4272 }
4273 }
4274 }
4275
4276 if (VT.getScalarType() != MVT::i64)
4277 return SDValue();
4278
4279 // for C >= 32
4280 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
4281
4282 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4283 // common case, splitting this into a move and a 32-bit shift is faster and
4284 // the same code size.
4285 KnownBits Known = DAG.computeKnownBits(Op: RHS);
4286
4287 EVT ElementType = VT.getScalarType();
4288 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
4289 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(EltVT: TargetScalarType)
4290 : TargetScalarType;
4291
4292 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4293 return SDValue();
4294
4295 SDValue ShiftAmt;
4296 if (CRHS) {
4297 ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
4298 VT: TargetType);
4299 } else {
4300 SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
4301 const SDValue ShiftMask =
4302 DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
4303 // This AND instruction will clamp out of bounds shift values.
4304 // It will also be removed during later instruction selection.
4305 ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
4306 }
4307
4308 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: TargetScalarType);
4309 EVT ConcatType;
4310 SDValue Hi;
4311 SDLoc LHSSL(LHS);
4312 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4313 if (VT.isVector()) {
4314 unsigned NElts = TargetType.getVectorNumElements();
4315 ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
4316 SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
4317 SmallVector<SDValue, 8> HiOps(NElts);
4318 SmallVector<SDValue, 16> HiAndLoOps;
4319
4320 DAG.ExtractVectorElements(Op: SplitLHS, Args&: HiAndLoOps, /*Start=*/0, Count: NElts * 2);
4321 for (unsigned I = 0; I != NElts; ++I)
4322 HiOps[I] = HiAndLoOps[2 * I + 1];
4323 Hi = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: LHSSL, VT: TargetType, Ops: HiOps);
4324 } else {
4325 const SDValue One = DAG.getConstant(Val: 1, DL: LHSSL, VT: TargetScalarType);
4326 ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
4327 SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
4328 Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: LHSSL, VT: TargetType, N1: SplitLHS, N2: One);
4329 }
4330
4331 SDValue NewShift = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: TargetType, N1: Hi, N2: ShiftAmt);
4332
4333 SDValue Vec;
4334 if (VT.isVector()) {
4335 unsigned NElts = TargetType.getVectorNumElements();
4336 SmallVector<SDValue, 8> LoOps;
4337 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4338
4339 DAG.ExtractVectorElements(Op: NewShift, Args&: LoOps, Start: 0, Count: NElts);
4340 for (unsigned I = 0; I != NElts; ++I)
4341 HiAndLoOps[2 * I] = LoOps[I];
4342 Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
4343 } else {
4344 Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {NewShift, Zero});
4345 }
4346 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
4347}
4348
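// Combine truncate: fold truncates of bitcast build_vectors into a direct read
// of one element (including reads of a higher element through a constant srl),
// and shrink 64-bit shifts to 32 bits when only a narrow result is used.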
4349SDValue AMDGPUTargetLowering::performTruncateCombine(
4350 SDNode *N, DAGCombinerInfo &DCI) const {
4351 SDLoc SL(N);
4352 SelectionDAG &DAG = DCI.DAG;
4353 EVT VT = N->getValueType(ResNo: 0);
4354 SDValue Src = N->getOperand(Num: 0);
4355
4356 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4357 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4358 SDValue Vec = Src.getOperand(i: 0);
4359 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4360 SDValue Elt0 = Vec.getOperand(i: 0);
4361 EVT EltVT = Elt0.getValueType();
4362 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4363 if (EltVT.isFloatingPoint()) {
4364 Elt0 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL,
4365 VT: EltVT.changeTypeToInteger(), Operand: Elt0);
4366 }
4367
4368 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Elt0);
4369 }
4370 }
4371 }
4372
4373 // Equivalent of above for accessing the high element of a vector as an
4374 // integer operation.
4375 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4376 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4377 if (auto *K = isConstOrConstSplat(N: Src.getOperand(i: 1))) {
4378 SDValue BV = stripBitcast(Val: Src.getOperand(i: 0));
4379 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4380 EVT SrcEltVT = BV.getOperand(i: 0).getValueType();
4381 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4382 unsigned BitIndex = K->getZExtValue();
4383 unsigned PartIndex = BitIndex / SrcEltSize;
4384
4385 if (PartIndex * SrcEltSize == BitIndex &&
4386 PartIndex < BV.getNumOperands()) {
4387 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4388 SDValue SrcElt =
4389 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: SrcEltVT.changeTypeToInteger(),
4390 Operand: BV.getOperand(i: PartIndex));
4391 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: SrcElt);
4392 }
4393 }
4394 }
4395 }
4396 }
4397
4398 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4399 //
4400 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4401 // i16 (trunc (srl (i32 (trunc x), K)))
4402 if (VT.getScalarSizeInBits() < 32) {
4403 EVT SrcVT = Src.getValueType();
4404 if (SrcVT.getScalarSizeInBits() > 32 &&
4405 (Src.getOpcode() == ISD::SRL ||
4406 Src.getOpcode() == ISD::SRA ||
4407 Src.getOpcode() == ISD::SHL)) {
4408 SDValue Amt = Src.getOperand(i: 1);
4409 KnownBits Known = DAG.computeKnownBits(Op: Amt);
4410
4411 // - For left shifts, do the transform as long as the shift
4412 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4413 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4414 // losing information stored in the high bits when truncating.
4415 const unsigned MaxCstSize =
4416 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4417 if (Known.getMaxValue().ule(RHS: MaxCstSize)) {
4418 EVT MidVT = VT.isVector() ?
4419 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
4420 NumElements: VT.getVectorNumElements()) : MVT::i32;
4421
4422 EVT NewShiftVT = getShiftAmountTy(LHSTy: MidVT, DL: DAG.getDataLayout());
4423 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MidVT,
4424 Operand: Src.getOperand(i: 0));
4425 DCI.AddToWorklist(N: Trunc.getNode());
4426
4427 if (Amt.getValueType() != NewShiftVT) {
4428 Amt = DAG.getZExtOrTrunc(Op: Amt, DL: SL, VT: NewShiftVT);
4429 DCI.AddToWorklist(N: Amt.getNode());
4430 }
4431
4432 SDValue ShrunkShift = DAG.getNode(Opcode: Src.getOpcode(), DL: SL, VT: MidVT,
4433 N1: Trunc, N2: Amt);
4434 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: ShrunkShift);
4435 }
4436 }
4437 }
4438
4439 return SDValue();
4440}
4441
4442// We need to specifically handle i64 mul here to avoid unnecessary conversion
4443// instructions. If we only match on the legalized i64 mul expansion,
4444// SimplifyDemandedBits will be unable to remove them because there will be
4445// multiple uses due to the separate mul + mulh[su].
4446static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4447 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4448 if (Size <= 32) {
4449 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4450 return DAG.getNode(Opcode: MulOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4451 }
4452
4453 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4454 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4455
4456 SDValue MulLo = DAG.getNode(Opcode: MulLoOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4457 SDValue MulHi = DAG.getNode(Opcode: MulHiOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4458
4459 return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SL, VT: MVT::i64, N1: MulLo, N2: MulHi);
4460}
4461
4462 /// If \p V is an add of a constant 1, returns the other operand. Otherwise
4463 /// returns an empty SDValue().
4464static SDValue getAddOneOp(const SDNode *V) {
4465 if (V->getOpcode() != ISD::ADD)
4466 return SDValue();
4467
4468 return isOneConstant(V: V->getOperand(Num: 1)) ? V->getOperand(Num: 0) : SDValue();
4469}
4470
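// Combine mul: undo the InstCombine canonicalization x * (y + 1) -> x * y + x
// so mad can be matched, and form MUL_I24/MUL_U24 (with MULHI for 64-bit
// results) for divergent multiplies whose operands fit in 24 bits.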
4471SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4472 DAGCombinerInfo &DCI) const {
4473 assert(N->getOpcode() == ISD::MUL);
4474 EVT VT = N->getValueType(ResNo: 0);
4475
4476 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4477 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4478 // unnecessarily). isDivergent() is used as an approximation of whether the
4479 // value is in an SGPR.
4480 if (!N->isDivergent())
4481 return SDValue();
4482
4483 unsigned Size = VT.getSizeInBits();
4484 if (VT.isVector() || Size > 64)
4485 return SDValue();
4486
4487 SelectionDAG &DAG = DCI.DAG;
4488 SDLoc DL(N);
4489
4490 SDValue N0 = N->getOperand(Num: 0);
4491 SDValue N1 = N->getOperand(Num: 1);
4492
4493 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4494 // matching.
4495
4496 // mul x, (add y, 1) -> add (mul x, y), x
4497 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4498 SDValue AddOp = getAddOneOp(V: V.getNode());
4499 if (!AddOp)
4500 return SDValue();
4501
4502 if (V.hasOneUse() || all_of(Range: V->users(), P: [](const SDNode *U) -> bool {
4503 return U->getOpcode() == ISD::MUL;
4504 }))
4505 return AddOp;
4506
4507 return SDValue();
4508 };
4509
4510 // FIXME: The selection pattern is not properly checking for commuted
4511 // operands, so we have to place the mul in the LHS
4512 if (SDValue MulOper = IsFoldableAdd(N0)) {
4513 SDValue MulVal = DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1, N2: MulOper);
4514 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: MulVal, N2: N1);
4515 }
4516
4517 if (SDValue MulOper = IsFoldableAdd(N1)) {
4518 SDValue MulVal = DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1: N0, N2: MulOper);
4519 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: MulVal, N2: N0);
4520 }
4521
4522 // There are i16 integer mul/mad.
4523 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(VT: MVT::i16))
4524 return SDValue();
4525
4526 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4527 // in the source into any_extends if the result of the mul is truncated. Since
4528 // we can assume the high bits are whatever we want, use the underlying value
4529 // to keep the unknown high bits from interfering.
4530 if (N0.getOpcode() == ISD::ANY_EXTEND)
4531 N0 = N0.getOperand(i: 0);
4532
4533 if (N1.getOpcode() == ISD::ANY_EXTEND)
4534 N1 = N1.getOperand(i: 0);
4535
4536 SDValue Mul;
4537
4538 if (Subtarget->hasMulU24() && isU24(Op: N0, DAG) && isU24(Op: N1, DAG)) {
4539 N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4540 N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4541 Mul = getMul24(DAG, SL: DL, N0, N1, Size, Signed: false);
4542 } else if (Subtarget->hasMulI24() && isI24(Op: N0, DAG) && isI24(Op: N1, DAG)) {
4543 N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4544 N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4545 Mul = getMul24(DAG, SL: DL, N0, N1, Size, Signed: true);
4546 } else {
4547 return SDValue();
4548 }
4549
4550 // We need to use sext even for MUL_U24, because MUL_U24 is used
4551 // for signed multiply of 8 and 16-bit types.
4552 return DAG.getSExtOrTrunc(Op: Mul, DL, VT);
4553}
4554
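// Combine [su]mul_lohi of 24-bit operands into separate MUL_*24 and MULHI_*24
// nodes instead of a single extending multiply.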
4555SDValue
4556AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4557 DAGCombinerInfo &DCI) const {
4558 if (N->getValueType(ResNo: 0) != MVT::i32)
4559 return SDValue();
4560
4561 SelectionDAG &DAG = DCI.DAG;
4562 SDLoc DL(N);
4563
4564 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4565 SDValue N0 = N->getOperand(Num: 0);
4566 SDValue N1 = N->getOperand(Num: 1);
4567
4568 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4569 // in the source into any_extends if the result of the mul is truncated. Since
4570 // we can assume the high bits are whatever we want, use the underlying value
4571 // to keep the unknown high bits from interfering.
4572 if (N0.getOpcode() == ISD::ANY_EXTEND)
4573 N0 = N0.getOperand(i: 0);
4574 if (N1.getOpcode() == ISD::ANY_EXTEND)
4575 N1 = N1.getOperand(i: 0);
4576
4577 // Try to use two fast 24-bit multiplies (one for each half of the result)
4578 // instead of one slow extending multiply.
4579 unsigned LoOpcode = 0;
4580 unsigned HiOpcode = 0;
4581 if (Signed) {
4582 if (Subtarget->hasMulI24() && isI24(Op: N0, DAG) && isI24(Op: N1, DAG)) {
4583 N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4584 N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4585 LoOpcode = AMDGPUISD::MUL_I24;
4586 HiOpcode = AMDGPUISD::MULHI_I24;
4587 }
4588 } else {
4589 if (Subtarget->hasMulU24() && isU24(Op: N0, DAG) && isU24(Op: N1, DAG)) {
4590 N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4591 N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4592 LoOpcode = AMDGPUISD::MUL_U24;
4593 HiOpcode = AMDGPUISD::MULHI_U24;
4594 }
4595 }
4596 if (!LoOpcode)
4597 return SDValue();
4598
4599 SDValue Lo = DAG.getNode(Opcode: LoOpcode, DL, VT: MVT::i32, N1: N0, N2: N1);
4600 SDValue Hi = DAG.getNode(Opcode: HiOpcode, DL, VT: MVT::i32, N1: N0, N2: N1);
4601 DCI.CombineTo(N, Res0: Lo, Res1: Hi);
4602 return SDValue(N, 0);
4603}
4604
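// Combine mulhs of 24-bit values into MULHI_I24.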
4605SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4606 DAGCombinerInfo &DCI) const {
4607 EVT VT = N->getValueType(ResNo: 0);
4608
4609 if (!Subtarget->hasMulI24() || VT.isVector())
4610 return SDValue();
4611
4612 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4613 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4614 // unnecessarily). isDivergent() is used as an approximation of whether the
4615 // value is in an SGPR.
4616 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4617 // valu op anyway)
4618 if (Subtarget->hasSMulHi() && !N->isDivergent())
4619 return SDValue();
4620
4621 SelectionDAG &DAG = DCI.DAG;
4622 SDLoc DL(N);
4623
4624 SDValue N0 = N->getOperand(Num: 0);
4625 SDValue N1 = N->getOperand(Num: 1);
4626
4627 if (!isI24(Op: N0, DAG) || !isI24(Op: N1, DAG))
4628 return SDValue();
4629
4630 N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4631 N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4632
4633 SDValue Mulhi = DAG.getNode(Opcode: AMDGPUISD::MULHI_I24, DL, VT: MVT::i32, N1: N0, N2: N1);
4634 DCI.AddToWorklist(N: Mulhi.getNode());
4635 return DAG.getSExtOrTrunc(Op: Mulhi, DL, VT);
4636}
4637
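// Combine mulhu of 24-bit values into MULHI_U24.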
4638SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4639 DAGCombinerInfo &DCI) const {
4640 EVT VT = N->getValueType(ResNo: 0);
4641
4642 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4643 return SDValue();
4644
4645 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4646 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4647 // unnecessarily). isDivergent() is used as an approximation of whether the
4648 // value is in an SGPR.
4649 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4650 // valu op anyway)
4651 if (Subtarget->hasSMulHi() && !N->isDivergent())
4652 return SDValue();
4653
4654 SelectionDAG &DAG = DCI.DAG;
4655 SDLoc DL(N);
4656
4657 SDValue N0 = N->getOperand(Num: 0);
4658 SDValue N1 = N->getOperand(Num: 1);
4659
4660 if (!isU24(Op: N0, DAG) || !isU24(Op: N1, DAG))
4661 return SDValue();
4662
4663 N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4664 N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4665
4666 SDValue Mulhi = DAG.getNode(Opcode: AMDGPUISD::MULHI_U24, DL, VT: MVT::i32, N1: N0, N2: N1);
4667 DCI.AddToWorklist(N: Mulhi.getNode());
4668 return DAG.getZExtOrTrunc(Op: Mulhi, DL, VT);
4669}
4670
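// Emit Opc (ffbh/ffbl) as a 32-bit operation, zero-extending a 16-bit operand
// and truncating the result back to the original type.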
4671SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4672 SDValue Op,
4673 const SDLoc &DL,
4674 unsigned Opc) const {
4675 EVT VT = Op.getValueType();
4676 EVT LegalVT = getTypeToTransformTo(Context&: *DAG.getContext(), VT);
4677 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4678 LegalVT != MVT::i16))
4679 return SDValue();
4680
4681 if (VT != MVT::i32)
4682 Op = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: Op);
4683
4684 SDValue FFBX = DAG.getNode(Opcode: Opc, DL, VT: MVT::i32, Operand: Op);
4685 if (VT != MVT::i32)
4686 FFBX = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: FFBX);
4687
4688 return FFBX;
4689}
4690
4691// The native instructions return -1 on 0 input. Optimize out a select that
4692// produces -1 on 0.
4693//
4694// TODO: If zero is not undef, we could also do this if the output is compared
4695// against the bitwidth.
4696//
4697// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4698SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4699 SDValue LHS, SDValue RHS,
4700 DAGCombinerInfo &DCI) const {
4701 if (!isNullConstant(V: Cond.getOperand(i: 1)))
4702 return SDValue();
4703
4704 SelectionDAG &DAG = DCI.DAG;
4705 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val: Cond.getOperand(i: 2))->get();
4706 SDValue CmpLHS = Cond.getOperand(i: 0);
4707
4708 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4709 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4710 if (CCOpcode == ISD::SETEQ &&
4711 (isCtlzOpc(Opc: RHS.getOpcode()) || isCttzOpc(Opc: RHS.getOpcode())) &&
4712 RHS.getOperand(i: 0) == CmpLHS && isAllOnesConstant(V: LHS)) {
4713 unsigned Opc =
4714 isCttzOpc(Opc: RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4715 return getFFBX_U32(DAG, Op: CmpLHS, DL: SL, Opc);
4716 }
4717
4718 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4719 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4720 if (CCOpcode == ISD::SETNE &&
4721 (isCtlzOpc(Opc: LHS.getOpcode()) || isCttzOpc(Opc: LHS.getOpcode())) &&
4722 LHS.getOperand(i: 0) == CmpLHS && isAllOnesConstant(V: RHS)) {
4723 unsigned Opc =
4724 isCttzOpc(Opc: LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4725
4726 return getFFBX_U32(DAG, Op: CmpLHS, DL: SL, Opc);
4727 }
4728
4729 return SDValue();
4730}
4731
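// Given a select whose operands both use the unary operation Op, rewrite
// (select c, (op x), (op y)) as (op (select c, x, y)).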
4732static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4733 unsigned Op,
4734 const SDLoc &SL,
4735 SDValue Cond,
4736 SDValue N1,
4737 SDValue N2) {
4738 SelectionDAG &DAG = DCI.DAG;
4739 EVT VT = N1.getValueType();
4740
4741 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Cond,
4742 N2: N1.getOperand(i: 0), N3: N2.getOperand(i: 0));
4743 DCI.AddToWorklist(N: NewSelect.getNode());
4744 return DAG.getNode(Opcode: Op, DL: SL, VT, Operand: NewSelect);
4745}
4746
4747// Pull a free FP operation out of a select so it may fold into uses.
4748//
4749// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4750// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4751//
4752// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4753// select c, (fabs x), +k -> fabs (select c, x, k)
4754SDValue
4755AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
4756 SDValue N) const {
4757 SelectionDAG &DAG = DCI.DAG;
4758 SDValue Cond = N.getOperand(i: 0);
4759 SDValue LHS = N.getOperand(i: 1);
4760 SDValue RHS = N.getOperand(i: 2);
4761
4762 EVT VT = N.getValueType();
4763 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4764 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4765 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N: N.getNode()))
4766 return SDValue();
4767
4768 return distributeOpThroughSelect(DCI, Op: LHS.getOpcode(),
4769 SL: SDLoc(N), Cond, N1: LHS, N2: RHS);
4770 }
4771
4772 bool Inv = false;
4773 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4774 std::swap(a&: LHS, b&: RHS);
4775 Inv = true;
4776 }
4777
4778 // TODO: Support vector constants.
4779 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
4780 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4781 !selectSupportsSourceMods(N: N.getNode())) {
4782 SDLoc SL(N);
4783 // If one side is an fneg/fabs and the other is a constant, we can push the
4784 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4785 SDValue NewLHS = LHS.getOperand(i: 0);
4786 SDValue NewRHS = RHS;
4787
4788 // Careful: if the neg can be folded up, don't try to pull it back down.
4789 bool ShouldFoldNeg = true;
4790
4791 if (NewLHS.hasOneUse()) {
4792 unsigned Opc = NewLHS.getOpcode();
4793 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(N: NewLHS.getNode()))
4794 ShouldFoldNeg = false;
4795 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4796 ShouldFoldNeg = false;
4797 }
4798
4799 if (ShouldFoldNeg) {
4800 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4801 return SDValue();
4802
4803 // We're going to be forced to use a source modifier anyway, there's no
4804 // point to pulling the negate out unless we can get a size reduction by
4805 // negating the constant.
4806 //
4807 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4808 // about cheaper constants.
4809 if (NewLHS.getOpcode() == ISD::FABS &&
4810 getConstantNegateCost(C: CRHS) != NegatibleCost::Cheaper)
4811 return SDValue();
4812
4813 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N: N.getNode()))
4814 return SDValue();
4815
4816 if (LHS.getOpcode() == ISD::FNEG)
4817 NewRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
4818
4819 if (Inv)
4820 std::swap(a&: NewLHS, b&: NewRHS);
4821
4822 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT,
4823 N1: Cond, N2: NewLHS, N3: NewRHS);
4824 DCI.AddToWorklist(N: NewSelect.getNode());
4825 return DAG.getNode(Opcode: LHS.getOpcode(), DL: SL, VT, Operand: NewSelect);
4826 }
4827 }
4828
4829 return SDValue();
4830}
4831
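// Combine select: pull free fneg/fabs out of the select, invert the compare to
// move a constant into the false operand, form the legacy f32 min/max nodes,
// and match ffbh/ffbl from ctlz/cttz select patterns.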
4832SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4833 DAGCombinerInfo &DCI) const {
4834 if (SDValue Folded = foldFreeOpFromSelect(DCI, N: SDValue(N, 0)))
4835 return Folded;
4836
4837 SDValue Cond = N->getOperand(Num: 0);
4838 if (Cond.getOpcode() != ISD::SETCC)
4839 return SDValue();
4840
4841 EVT VT = N->getValueType(ResNo: 0);
4842 SDValue LHS = Cond.getOperand(i: 0);
4843 SDValue RHS = Cond.getOperand(i: 1);
4844 SDValue CC = Cond.getOperand(i: 2);
4845
4846 SDValue True = N->getOperand(Num: 1);
4847 SDValue False = N->getOperand(Num: 2);
4848
4849 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4850 SelectionDAG &DAG = DCI.DAG;
4851 if (DAG.isConstantValueOfAnyType(N: True) &&
4852 !DAG.isConstantValueOfAnyType(N: False)) {
4853 // Swap cmp + select pair to move constant to false input.
4854 // This will allow using VOPC cndmasks more often.
4855 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4856
4857 SDLoc SL(N);
4858 ISD::CondCode NewCC =
4859 getSetCCInverse(Operation: cast<CondCodeSDNode>(Val&: CC)->get(), Type: LHS.getValueType());
4860
4861 SDValue NewCond = DAG.getSetCC(DL: SL, VT: Cond.getValueType(), LHS, RHS, Cond: NewCC);
4862 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NewCond, N2: False, N3: True);
4863 }
4864
4865 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4866 SDValue MinMax
4867 = combineFMinMaxLegacy(DL: SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4868 // Revisit this node so we can catch min3/max3/med3 patterns.
4869 //DCI.AddToWorklist(MinMax.getNode());
4870 return MinMax;
4871 }
4872 }
4873
4874 // There's no reason to not do this if the condition has other uses.
4875 return performCtlz_CttzCombine(SL: SDLoc(N), Cond, LHS: True, RHS: False, DCI);
4876}
4877
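// Return true if the value is the bit pattern of 1/(2*pi) in half, single, or
// double precision.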
4878static bool isInv2Pi(const APFloat &APF) {
4879 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4880 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4881 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4882
4883 return APF.bitwiseIsEqual(RHS: KF16) ||
4884 APF.bitwiseIsEqual(RHS: KF32) ||
4885 APF.bitwiseIsEqual(RHS: KF64);
4886}
4887
4888 // Negating +0.0 and +1/(2*pi) does not produce an inline immediate, so there
4889 // is an additional cost to negate them.
4890TargetLowering::NegatibleCost
4891AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4892 if (C->isZero())
4893 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4894
4895 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(APF: C->getValueAPF()))
4896 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4897
4898 return NegatibleCost::Neutral;
4899}
4900
4901bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
4902 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4903 return getConstantNegateCost(C) == NegatibleCost::Expensive;
4904 return false;
4905}
4906
4907bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
4908 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4909 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
4910 return false;
4911}
4912
4913static unsigned inverseMinMax(unsigned Opc) {
4914 switch (Opc) {
4915 case ISD::FMAXNUM:
4916 return ISD::FMINNUM;
4917 case ISD::FMINNUM:
4918 return ISD::FMAXNUM;
4919 case ISD::FMAXNUM_IEEE:
4920 return ISD::FMINNUM_IEEE;
4921 case ISD::FMINNUM_IEEE:
4922 return ISD::FMAXNUM_IEEE;
4923 case ISD::FMAXIMUM:
4924 return ISD::FMINIMUM;
4925 case ISD::FMINIMUM:
4926 return ISD::FMAXIMUM;
4927 case ISD::FMAXIMUMNUM:
4928 return ISD::FMINIMUMNUM;
4929 case ISD::FMINIMUMNUM:
4930 return ISD::FMAXIMUMNUM;
4931 case AMDGPUISD::FMAX_LEGACY:
4932 return AMDGPUISD::FMIN_LEGACY;
4933 case AMDGPUISD::FMIN_LEGACY:
4934 return AMDGPUISD::FMAX_LEGACY;
4935 default:
4936 llvm_unreachable("invalid min/max opcode");
4937 }
4938}
4939
4940/// \return true if it's profitable to try to push an fneg into its source
4941/// instruction.
4942bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4943 // If the input has multiple uses and we can either fold the negate down, or
4944 // the other uses cannot, give up. This both prevents unprofitable
4945 // transformations and infinite loops: we won't repeatedly try to fold around
4946 // a negate that has no 'good' form.
4947 if (N0.hasOneUse()) {
4948 // This may be able to fold into the source, but at a code size cost. Don't
4949 // fold if the fold into the user is free.
4950 if (allUsesHaveSourceMods(N, CostThreshold: 0))
4951 return false;
4952 } else {
4953 if (fnegFoldsIntoOp(N: N0.getNode()) &&
4954 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N: N0.getNode())))
4955 return false;
4956 }
4957
4958 return true;
4959}
4960
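// Push an fneg into its source operation when profitable, e.g.
// (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y)); similar folds are applied
// for fmul, fma/fmad, min/max, fmed3, conversions, and bitcast sources.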
4961SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
4962 DAGCombinerInfo &DCI) const {
4963 SelectionDAG &DAG = DCI.DAG;
4964 SDValue N0 = N->getOperand(Num: 0);
4965 EVT VT = N->getValueType(ResNo: 0);
4966
4967 unsigned Opc = N0.getOpcode();
4968
4969 if (!shouldFoldFNegIntoSrc(N, N0))
4970 return SDValue();
4971
4972 SDLoc SL(N);
4973 switch (Opc) {
4974 case ISD::FADD: {
4975 if (!mayIgnoreSignedZero(Op: N0))
4976 return SDValue();
4977
4978 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
4979 SDValue LHS = N0.getOperand(i: 0);
4980 SDValue RHS = N0.getOperand(i: 1);
4981
4982 if (LHS.getOpcode() != ISD::FNEG)
4983 LHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS);
4984 else
4985 LHS = LHS.getOperand(i: 0);
4986
4987 if (RHS.getOpcode() != ISD::FNEG)
4988 RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
4989 else
4990 RHS = RHS.getOperand(i: 0);
4991
4992 SDValue Res = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: LHS, N2: RHS, Flags: N0->getFlags());
4993 if (Res.getOpcode() != ISD::FADD)
4994 return SDValue(); // Op got folded away.
4995 if (!N0.hasOneUse())
4996 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
4997 return Res;
4998 }
4999 case ISD::FMUL:
5000 case AMDGPUISD::FMUL_LEGACY: {
5001 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
5002 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
5003 SDValue LHS = N0.getOperand(i: 0);
5004 SDValue RHS = N0.getOperand(i: 1);
5005
5006 if (LHS.getOpcode() == ISD::FNEG)
5007 LHS = LHS.getOperand(i: 0);
5008 else if (RHS.getOpcode() == ISD::FNEG)
5009 RHS = RHS.getOperand(i: 0);
5010 else
5011 RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
5012
5013 SDValue Res = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: LHS, N2: RHS, Flags: N0->getFlags());
5014 if (Res.getOpcode() != Opc)
5015 return SDValue(); // Op got folded away.
5016 if (!N0.hasOneUse())
5017 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
5018 return Res;
5019 }
5020 case ISD::FMA:
5021 case ISD::FMAD: {
5022 // TODO: handle llvm.amdgcn.fma.legacy
5023 if (!mayIgnoreSignedZero(Op: N0))
5024 return SDValue();
5025
5026 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
5027 SDValue LHS = N0.getOperand(i: 0);
5028 SDValue MHS = N0.getOperand(i: 1);
5029 SDValue RHS = N0.getOperand(i: 2);
5030
5031 if (LHS.getOpcode() == ISD::FNEG)
5032 LHS = LHS.getOperand(i: 0);
5033 else if (MHS.getOpcode() == ISD::FNEG)
5034 MHS = MHS.getOperand(i: 0);
5035 else
5036 MHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: MHS);
5037
5038 if (RHS.getOpcode() != ISD::FNEG)
5039 RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
5040 else
5041 RHS = RHS.getOperand(i: 0);
5042
5043 SDValue Res = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: LHS, N2: MHS, N3: RHS);
5044 if (Res.getOpcode() != Opc)
5045 return SDValue(); // Op got folded away.
5046 if (!N0.hasOneUse())
5047 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
5048 return Res;
5049 }
5050 case ISD::FMAXNUM:
5051 case ISD::FMINNUM:
5052 case ISD::FMAXNUM_IEEE:
5053 case ISD::FMINNUM_IEEE:
5054 case ISD::FMINIMUM:
5055 case ISD::FMAXIMUM:
5056 case ISD::FMINIMUMNUM:
5057 case ISD::FMAXIMUMNUM:
5058 case AMDGPUISD::FMAX_LEGACY:
5059 case AMDGPUISD::FMIN_LEGACY: {
5060 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
5061 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
5062 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
5063 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
5064
5065 SDValue LHS = N0.getOperand(i: 0);
5066 SDValue RHS = N0.getOperand(i: 1);
5067
5068 // 0 doesn't have a negated inline immediate.
5069 // TODO: This constant check should be generalized to other operations.
5070 if (isConstantCostlierToNegate(N: RHS))
5071 return SDValue();
5072
5073 SDValue NegLHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS);
5074 SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
5075 unsigned Opposite = inverseMinMax(Opc);
5076
5077 SDValue Res = DAG.getNode(Opcode: Opposite, DL: SL, VT, N1: NegLHS, N2: NegRHS, Flags: N0->getFlags());
5078 if (Res.getOpcode() != Opposite)
5079 return SDValue(); // Op got folded away.
5080 if (!N0.hasOneUse())
5081 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
5082 return Res;
5083 }
5084 case AMDGPUISD::FMED3: {
5085 SDValue Ops[3];
5086 for (unsigned I = 0; I < 3; ++I)
5087 Ops[I] = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: N0->getOperand(Num: I), Flags: N0->getFlags());
5088
5089 SDValue Res = DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT, Ops, Flags: N0->getFlags());
5090 if (Res.getOpcode() != AMDGPUISD::FMED3)
5091 return SDValue(); // Op got folded away.
5092
5093 if (!N0.hasOneUse()) {
5094 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res);
5095 DAG.ReplaceAllUsesWith(From: N0, To: Neg);
5096
5097 for (SDNode *U : Neg->users())
5098 DCI.AddToWorklist(N: U);
5099 }
5100
5101 return Res;
5102 }
5103 case ISD::FP_EXTEND:
5104 case ISD::FTRUNC:
5105 case ISD::FRINT:
5106 case ISD::FNEARBYINT: // XXX - Should fround be handled?
5107 case ISD::FROUNDEVEN:
5108 case ISD::FSIN:
5109 case ISD::FCANONICALIZE:
5110 case AMDGPUISD::RCP:
5111 case AMDGPUISD::RCP_LEGACY:
5112 case AMDGPUISD::RCP_IFLAG:
5113 case AMDGPUISD::SIN_HW: {
5114 SDValue CvtSrc = N0.getOperand(i: 0);
5115 if (CvtSrc.getOpcode() == ISD::FNEG) {
5116 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
5117 // (fneg (rcp (fneg x))) -> (rcp x)
5118 return DAG.getNode(Opcode: Opc, DL: SL, VT, Operand: CvtSrc.getOperand(i: 0));
5119 }
5120
5121 if (!N0.hasOneUse())
5122 return SDValue();
5123
5124 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
5125 // (fneg (rcp x)) -> (rcp (fneg x))
5126 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: CvtSrc.getValueType(), Operand: CvtSrc);
5127 return DAG.getNode(Opcode: Opc, DL: SL, VT, Operand: Neg, Flags: N0->getFlags());
5128 }
5129 case ISD::FP_ROUND: {
5130 SDValue CvtSrc = N0.getOperand(i: 0);
5131
5132 if (CvtSrc.getOpcode() == ISD::FNEG) {
5133 // (fneg (fp_round (fneg x))) -> (fp_round x)
5134 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT,
5135 N1: CvtSrc.getOperand(i: 0), N2: N0.getOperand(i: 1));
5136 }
5137
5138 if (!N0.hasOneUse())
5139 return SDValue();
5140
5141 // (fneg (fp_round x)) -> (fp_round (fneg x))
5142 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: CvtSrc.getValueType(), Operand: CvtSrc);
5143 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Neg, N2: N0.getOperand(i: 1));
5144 }
5145 case ISD::FP16_TO_FP: {
5146 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
5147 // f16, but legalization of f16 fneg ends up pulling it out of the source.
5148 // Put the fneg back as a legal source operation that can be matched later.
5149 SDLoc SL(N);
5150
5151 SDValue Src = N0.getOperand(i: 0);
5152 EVT SrcVT = Src.getValueType();
5153
5154 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
5155 SDValue IntFNeg = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: SrcVT, N1: Src,
5156 N2: DAG.getConstant(Val: 0x8000, DL: SL, VT: SrcVT));
5157 return DAG.getNode(Opcode: ISD::FP16_TO_FP, DL: SL, VT: N->getValueType(ResNo: 0), Operand: IntFNeg);
5158 }
5159 case ISD::SELECT: {
5160 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5161 // TODO: Invert conditions of foldFreeOpFromSelect
5162 return SDValue();
5163 }
5164 case ISD::BITCAST: {
5165 SDLoc SL(N);
5166 SDValue BCSrc = N0.getOperand(i: 0);
5167 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5168 SDValue HighBits = BCSrc.getOperand(i: BCSrc.getNumOperands() - 1);
5169 if (HighBits.getValueType().getSizeInBits() != 32 ||
5170 !fnegFoldsIntoOp(N: HighBits.getNode()))
5171 return SDValue();
5172
5173 // f64 fneg only really needs to operate on the high half of the
5174 // register, so try to force it to an f32 operation to help make use of
5175 // source modifiers.
5176 //
5178 // fneg (f64 (bitcast (build_vector x, y))) ->
5179 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5180 // (fneg (bitcast i32:y to f32)))
5181
5182 SDValue CastHi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: HighBits);
5183 SDValue NegHi = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: CastHi);
5184 SDValue CastBack =
5185 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: HighBits.getValueType(), Operand: NegHi);
5186
5187 SmallVector<SDValue, 8> Ops(BCSrc->ops());
5188 Ops.back() = CastBack;
5189 DCI.AddToWorklist(N: NegHi.getNode());
5190 SDValue Build =
5191 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: BCSrc.getValueType(), Ops);
5192 SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Build);
5193
5194 if (!N0.hasOneUse())
5195 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Result));
5196 return Result;
5197 }
5198
5199 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5200 BCSrc.hasOneUse()) {
5201 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
5202 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
5203
5204 // TODO: Cast back result for multiple uses is beneficial in some cases.
5205
5206 SDValue LHS =
5207 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: BCSrc.getOperand(i: 1));
5208 SDValue RHS =
5209 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: BCSrc.getOperand(i: 2));
5210
5211 SDValue NegLHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: LHS);
5212 SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: RHS);
5213
5214 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f32, N1: BCSrc.getOperand(i: 0), N2: NegLHS,
5215 N3: NegRHS);
5216 }
5217
5218 return SDValue();
5219 }
5220 default:
5221 return SDValue();
5222 }
5223}
5224
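// Fold (fabs (fp16_to_fp x)) to (fp16_to_fp (and x, 0x7fff)) when f16 is
// illegal, applying the fabs as an integer mask on the source bits.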
5225SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5226 DAGCombinerInfo &DCI) const {
5227 SelectionDAG &DAG = DCI.DAG;
5228 SDValue N0 = N->getOperand(Num: 0);
5229
5230 if (!N0.hasOneUse())
5231 return SDValue();
5232
5233 switch (N0.getOpcode()) {
5234 case ISD::FP16_TO_FP: {
5235 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
5236 SDLoc SL(N);
5237 SDValue Src = N0.getOperand(i: 0);
5238 EVT SrcVT = Src.getValueType();
5239
5240 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5241 SDValue IntFAbs = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SrcVT, N1: Src,
5242 N2: DAG.getConstant(Val: 0x7fff, DL: SL, VT: SrcVT));
5243 return DAG.getNode(Opcode: ISD::FP16_TO_FP, DL: SL, VT: N->getValueType(ResNo: 0), Operand: IntFAbs);
5244 }
5245 default:
5246 return SDValue();
5247 }
5248}
5249
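// Constant fold (rcp c) to the floating-point constant 1.0 / c.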
5250SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5251 DAGCombinerInfo &DCI) const {
5252 const auto *CFP = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: 0));
5253 if (!CFP)
5254 return SDValue();
5255
5256 // XXX - Should this flush denormals?
5257 const APFloat &Val = CFP->getValueAPF();
5258 APFloat One(Val.getSemantics(), "1.0");
5259 return DCI.DAG.getConstantFP(Val: One / Val, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
5260}
5261
5262SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5263 DAGCombinerInfo &DCI) const {
5264 SelectionDAG &DAG = DCI.DAG;
5265 SDLoc DL(N);
5266
5267 switch(N->getOpcode()) {
5268 default:
5269 break;
5270 case ISD::BITCAST: {
5271 EVT DestVT = N->getValueType(ResNo: 0);
5272
5273 // Push casts through vector builds. This helps avoid emitting a large
5274 // number of copies when materializing floating point vector constants.
5275 //
5276 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5277 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5278 if (DestVT.isVector()) {
5279 SDValue Src = N->getOperand(Num: 0);
5280 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5281 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5282 isOperationLegal(Op: ISD::BUILD_VECTOR, VT: DestVT))) {
5283 EVT SrcVT = Src.getValueType();
5284 unsigned NElts = DestVT.getVectorNumElements();
5285
5286 if (SrcVT.getVectorNumElements() == NElts) {
5287 EVT DestEltVT = DestVT.getVectorElementType();
5288
5289 SmallVector<SDValue, 8> CastedElts;
5290 SDLoc SL(N);
5291 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5292 SDValue Elt = Src.getOperand(i: I);
5293 CastedElts.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: DestEltVT, Operand: Elt));
5294 }
5295
5296 return DAG.getBuildVector(VT: DestVT, DL: SL, Ops: CastedElts);
5297 }
5298 }
5299 }
5300
5301 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5302 break;
5303
5304 // Fold bitcasts of constants.
5305 //
5306 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5307 // TODO: Generalize and move to DAGCombiner
5308 SDValue Src = N->getOperand(Num: 0);
5309 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Src)) {
5310 SDLoc SL(N);
5311 uint64_t CVal = C->getZExtValue();
5312 SDValue BV = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32,
5313 N1: DAG.getConstant(Val: Lo_32(Value: CVal), DL: SL, VT: MVT::i32),
5314 N2: DAG.getConstant(Val: Hi_32(Value: CVal), DL: SL, VT: MVT::i32));
5315 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: DestVT, Operand: BV);
5316 }
5317
5318 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Src)) {
5319 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5320 SDLoc SL(N);
5321 uint64_t CVal = Val.getZExtValue();
5322 SDValue Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32,
5323 N1: DAG.getConstant(Val: Lo_32(Value: CVal), DL: SL, VT: MVT::i32),
5324 N2: DAG.getConstant(Val: Hi_32(Value: CVal), DL: SL, VT: MVT::i32));
5325
5326 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: DestVT, Operand: Vec);
5327 }
5328
5329 break;
5330 }
5331 case ISD::SHL:
5332 case ISD::SRA:
5333 case ISD::SRL: {
5334 // Range metadata can be invalidated when loads are converted to legal types
5335 // (e.g. v2i64 -> v4i32).
5336 // Try to convert vector shl/sra/srl before type legalization so that range
5337 // metadata can be utilized.
5338 if (!(N->getValueType(ResNo: 0).isVector() &&
5339 DCI.getDAGCombineLevel() == BeforeLegalizeTypes) &&
5340 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5341 break;
5342 if (N->getOpcode() == ISD::SHL)
5343 return performShlCombine(N, DCI);
5344 if (N->getOpcode() == ISD::SRA)
5345 return performSraCombine(N, DCI);
5346 return performSrlCombine(N, DCI);
5347 }
5348 case ISD::TRUNCATE:
5349 return performTruncateCombine(N, DCI);
5350 case ISD::MUL:
5351 return performMulCombine(N, DCI);
5352 case AMDGPUISD::MUL_U24:
5353 case AMDGPUISD::MUL_I24: {
5354 if (SDValue Simplified = simplifyMul24(Node24: N, DCI))
5355 return Simplified;
5356 break;
5357 }
5358 case AMDGPUISD::MULHI_I24:
5359 case AMDGPUISD::MULHI_U24:
5360 return simplifyMul24(Node24: N, DCI);
5361 case ISD::SMUL_LOHI:
5362 case ISD::UMUL_LOHI:
5363 return performMulLoHiCombine(N, DCI);
5364 case ISD::MULHS:
5365 return performMulhsCombine(N, DCI);
5366 case ISD::MULHU:
5367 return performMulhuCombine(N, DCI);
5368 case ISD::SELECT:
5369 return performSelectCombine(N, DCI);
5370 case ISD::FNEG:
5371 return performFNegCombine(N, DCI);
5372 case ISD::FABS:
5373 return performFAbsCombine(N, DCI);
5374 case AMDGPUISD::BFE_I32:
5375 case AMDGPUISD::BFE_U32: {
5376 assert(!N->getValueType(0).isVector() &&
5377 "Vector handling of BFE not implemented");
5378 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 2));
5379 if (!Width)
5380 break;
5381
5382 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5383 if (WidthVal == 0)
5384 return DAG.getConstant(Val: 0, DL, VT: MVT::i32);
5385
5386 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
5387 if (!Offset)
5388 break;
5389
5390 SDValue BitsFrom = N->getOperand(Num: 0);
5391 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5392
5393 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5394
5395 if (OffsetVal == 0) {
5396 // This is already sign / zero extended, so try to fold away extra BFEs.
5397 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5398
5399 unsigned OpSignBits = DAG.ComputeNumSignBits(Op: BitsFrom);
5400 if (OpSignBits >= SignBits)
5401 return BitsFrom;
5402
5403 EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WidthVal);
5404 if (Signed) {
5405 // This is a sign_extend_inreg. Replace it to take advantage of existing
5406 // DAG Combines. If not eliminated, we will match back to BFE during
5407 // selection.
5408
5409 // TODO: The sext_inreg of extended types ends up here, although we could
5410 // handle them in a single BFE.
5411 return DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: MVT::i32, N1: BitsFrom,
5412 N2: DAG.getValueType(SmallVT));
5413 }
5414
5415 return DAG.getZeroExtendInReg(Op: BitsFrom, DL, VT: SmallVT);
5416 }
5417
5418 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(Val&: BitsFrom)) {
5419 if (Signed) {
5420 return constantFoldBFE<int32_t>(DAG,
5421 Src0: CVal->getSExtValue(),
5422 Offset: OffsetVal,
5423 Width: WidthVal,
5424 DL);
5425 }
5426
5427 return constantFoldBFE<uint32_t>(DAG,
5428 Src0: CVal->getZExtValue(),
5429 Offset: OffsetVal,
5430 Width: WidthVal,
5431 DL);
5432 }
5433
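    // If the field extends to (or past) bit 31, the extract is just a shift
    // right; e.g. offset 24 and width 8 reduce to a plain shift by 24.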
5434 if ((OffsetVal + WidthVal) >= 32 &&
5435 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5436 SDValue ShiftVal = DAG.getConstant(Val: OffsetVal, DL, VT: MVT::i32);
5437 return DAG.getNode(Opcode: Signed ? ISD::SRA : ISD::SRL, DL, VT: MVT::i32,
5438 N1: BitsFrom, N2: ShiftVal);
5439 }
5440
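    // Only bits [OffsetVal, OffsetVal + WidthVal) of the source are observed,
    // so let the generic demanded-bits machinery try to simplify the source,
    // e.g. by shrinking an oversized constant mask feeding the BFE.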
5441 if (BitsFrom.hasOneUse()) {
5442 APInt Demanded = APInt::getBitsSet(numBits: 32,
5443 loBit: OffsetVal,
5444 hiBit: OffsetVal + WidthVal);
5445
5446 KnownBits Known;
5447 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5448 !DCI.isBeforeLegalizeOps());
5449 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5450 if (TLI.ShrinkDemandedConstant(Op: BitsFrom, DemandedBits: Demanded, TLO) ||
5451 TLI.SimplifyDemandedBits(Op: BitsFrom, DemandedBits: Demanded, Known, TLO)) {
5452 DCI.CommitTargetLoweringOpt(TLO);
5453 }
5454 }
5455
5456 break;
5457 }
5458 case ISD::LOAD:
5459 return performLoadCombine(N, DCI);
5460 case ISD::STORE:
5461 return performStoreCombine(N, DCI);
5462 case AMDGPUISD::RCP:
5463 case AMDGPUISD::RCP_IFLAG:
5464 return performRcpCombine(N, DCI);
5465 case ISD::AssertZext:
5466 case ISD::AssertSext:
5467 return performAssertSZExtCombine(N, DCI);
5468 case ISD::INTRINSIC_WO_CHAIN:
5469 return performIntrinsicWOChainCombine(N, DCI);
5470 case AMDGPUISD::FMAD_FTZ: {
5471 SDValue N0 = N->getOperand(Num: 0);
5472 SDValue N1 = N->getOperand(Num: 1);
5473 SDValue N2 = N->getOperand(Num: 2);
5474 EVT VT = N->getValueType(ResNo: 0);
5475
5476    // FMAD_FTZ is an FMAD that flushes denormals to zero.
5477    // We flush the inputs, the intermediate product, and the output.
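    // For example, with f32 operands any input smaller in magnitude than the
    // smallest normal value (~1.17549435e-38) is replaced by a signed zero
    // before the multiply, and the same flush is applied to the intermediate
    // product and to the final sum.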
5478 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(Val&: N0);
5479 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(Val&: N1);
5480 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(Val&: N2);
5481 if (N0CFP && N1CFP && N2CFP) {
5482 const auto FTZ = [](const APFloat &V) {
5483 if (V.isDenormal()) {
5484 APFloat Zero(V.getSemantics(), 0);
5485 return V.isNegative() ? -Zero : Zero;
5486 }
5487 return V;
5488 };
5489
5490 APFloat V0 = FTZ(N0CFP->getValueAPF());
5491 APFloat V1 = FTZ(N1CFP->getValueAPF());
5492 APFloat V2 = FTZ(N2CFP->getValueAPF());
5493 V0.multiply(RHS: V1, RM: APFloat::rmNearestTiesToEven);
5494 V0 = FTZ(V0);
5495 V0.add(RHS: V2, RM: APFloat::rmNearestTiesToEven);
5496 return DAG.getConstantFP(Val: FTZ(V0), DL, VT);
5497 }
5498 break;
5499 }
5500 }
5501 return SDValue();
5502}
5503
5504//===----------------------------------------------------------------------===//
5505// Helper functions
5506//===----------------------------------------------------------------------===//
5507
5508SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5509 const TargetRegisterClass *RC,
5510 Register Reg, EVT VT,
5511 const SDLoc &SL,
5512 bool RawReg) const {
5513 MachineFunction &MF = DAG.getMachineFunction();
5514 MachineRegisterInfo &MRI = MF.getRegInfo();
5515 Register VReg;
5516
5517 if (!MRI.isLiveIn(Reg)) {
5518 VReg = MRI.createVirtualRegister(RegClass: RC);
5519 MRI.addLiveIn(Reg, vreg: VReg);
5520 } else {
5521 VReg = MRI.getLiveInVirtReg(PReg: Reg);
5522 }
5523
5524 if (RawReg)
5525 return DAG.getRegister(Reg: VReg, VT);
5526
5527 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: VReg, VT);
5528}
5529
5530// This may be called multiple times, and nothing prevents creating multiple
5531// objects at the same offset. See if we already defined this object.
5532static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5533 int64_t Offset) {
5534 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5535 if (MFI.getObjectOffset(ObjectIdx: I) == Offset) {
5536 assert(MFI.getObjectSize(I) == Size);
5537 return I;
5538 }
5539 }
5540
5541 return MFI.CreateFixedObject(Size, SPOffset: Offset, IsImmutable: true);
5542}
5543
5544SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5545 EVT VT,
5546 const SDLoc &SL,
5547 int64_t Offset) const {
5548 MachineFunction &MF = DAG.getMachineFunction();
5549 MachineFrameInfo &MFI = MF.getFrameInfo();
5550 int FI = getOrCreateFixedStackObject(MFI, Size: VT.getStoreSize(), Offset);
5551
5552 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5553 SDValue Ptr = DAG.getFrameIndex(FI, VT: MVT::i32);
5554
5555 return DAG.getLoad(VT, dl: SL, Chain: DAG.getEntryNode(), Ptr, PtrInfo: SrcPtrInfo, Alignment: Align(4),
5556 MMOFlags: MachineMemOperand::MODereferenceable |
5557 MachineMemOperand::MOInvariant);
5558}
5559
5560SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5561 const SDLoc &SL,
5562 SDValue Chain,
5563 SDValue ArgVal,
5564 int64_t Offset) const {
5565 MachineFunction &MF = DAG.getMachineFunction();
5566 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5567 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5568
5569 SDValue Ptr = DAG.getConstant(Val: Offset, DL: SL, VT: MVT::i32);
5570 // Stores to the argument stack area are relative to the stack pointer.
5571 SDValue SP =
5572 DAG.getCopyFromReg(Chain, dl: SL, Reg: Info->getStackPtrOffsetReg(), VT: MVT::i32);
5573 Ptr = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: SP, N2: Ptr);
5574 SDValue Store = DAG.getStore(Chain, dl: SL, Val: ArgVal, Ptr, PtrInfo: DstInfo, Alignment: Align(4),
5575 MMOFlags: MachineMemOperand::MODereferenceable);
5576 return Store;
5577}
5578
5579SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5580 const TargetRegisterClass *RC,
5581 EVT VT, const SDLoc &SL,
5582 const ArgDescriptor &Arg) const {
5583 assert(Arg && "Attempting to load missing argument");
5584
5585 SDValue V = Arg.isRegister() ?
5586 CreateLiveInRegister(DAG, RC, Reg: Arg.getRegister(), VT, SL) :
5587 loadStackInputValue(DAG, VT, SL, Offset: Arg.getStackOffset());
5588
5589 if (!Arg.isMasked())
5590 return V;
5591
5592 unsigned Mask = Arg.getMask();
5593 unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);
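  // For example, a value packed in bits [19:10] has Mask = 0xFFC00, so
  // Shift = 10 and the loaded value becomes (V >> 10) & 0x3FF.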
5594 V = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: V,
5595 N2: DAG.getShiftAmountConstant(Val: Shift, VT, DL: SL));
5596 return DAG.getNode(Opcode: ISD::AND, DL: SL, VT, N1: V,
5597 N2: DAG.getConstant(Val: Mask >> Shift, DL: SL, VT));
5598}
5599
5600uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5601 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5602 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5603 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5604 uint64_t ArgOffset =
5605 alignTo(Size: ExplicitKernArgSize, A: Alignment) + ExplicitArgOffset;
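  // For example, with 36 bytes of explicit kernel arguments and an 8-byte
  // implicit-argument alignment, the implicit block starts at byte offset 40
  // (plus any target-specific explicit argument offset).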
5606 switch (Param) {
5607 case FIRST_IMPLICIT:
5608 return ArgOffset;
5609 case PRIVATE_BASE:
5610 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5611 case SHARED_BASE:
5612 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5613 case QUEUE_PTR:
5614 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5615 }
5616 llvm_unreachable("unexpected implicit parameter type");
5617}
5618
5619uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5620 const MachineFunction &MF, const ImplicitParameter Param) const {
5621 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5622 return getImplicitParameterOffset(ExplicitKernArgSize: MFI->getExplicitKernArgSize(), Param);
5623}
5624
5625#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5626
5627const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5628 switch ((AMDGPUISD::NodeType)Opcode) {
5629 case AMDGPUISD::FIRST_NUMBER: break;
5630 // AMDIL DAG nodes
5631  NODE_NAME_CASE(BRANCH_COND)
5632
5633 // AMDGPU DAG nodes
5634 NODE_NAME_CASE(IF)
5635 NODE_NAME_CASE(ELSE)
5636 NODE_NAME_CASE(LOOP)
5637 NODE_NAME_CASE(CALL)
5638 NODE_NAME_CASE(TC_RETURN)
5639 NODE_NAME_CASE(TC_RETURN_GFX)
5640 NODE_NAME_CASE(TC_RETURN_CHAIN)
5641 NODE_NAME_CASE(TC_RETURN_CHAIN_DVGPR)
5642 NODE_NAME_CASE(TRAP)
5643 NODE_NAME_CASE(RET_GLUE)
5644 NODE_NAME_CASE(WAVE_ADDRESS)
5645 NODE_NAME_CASE(RETURN_TO_EPILOG)
5646 NODE_NAME_CASE(ENDPGM)
5647 NODE_NAME_CASE(ENDPGM_TRAP)
5648 NODE_NAME_CASE(SIMULATED_TRAP)
5649 NODE_NAME_CASE(DWORDADDR)
5650 NODE_NAME_CASE(FRACT)
5651 NODE_NAME_CASE(SETCC)
5652 NODE_NAME_CASE(DENORM_MODE)
5653 NODE_NAME_CASE(FMA_W_CHAIN)
5654 NODE_NAME_CASE(FMUL_W_CHAIN)
5655 NODE_NAME_CASE(CLAMP)
5656 NODE_NAME_CASE(COS_HW)
5657 NODE_NAME_CASE(SIN_HW)
5658 NODE_NAME_CASE(FMAX_LEGACY)
5659 NODE_NAME_CASE(FMIN_LEGACY)
5660 NODE_NAME_CASE(FMAX3)
5661 NODE_NAME_CASE(SMAX3)
5662 NODE_NAME_CASE(UMAX3)
5663 NODE_NAME_CASE(FMIN3)
5664 NODE_NAME_CASE(SMIN3)
5665 NODE_NAME_CASE(UMIN3)
5666 NODE_NAME_CASE(FMED3)
5667 NODE_NAME_CASE(SMED3)
5668 NODE_NAME_CASE(UMED3)
5669 NODE_NAME_CASE(FMAXIMUM3)
5670 NODE_NAME_CASE(FMINIMUM3)
5671 NODE_NAME_CASE(FDOT2)
5672 NODE_NAME_CASE(URECIP)
5673 NODE_NAME_CASE(DIV_SCALE)
5674 NODE_NAME_CASE(DIV_FMAS)
5675 NODE_NAME_CASE(DIV_FIXUP)
5676 NODE_NAME_CASE(FMAD_FTZ)
5677 NODE_NAME_CASE(RCP)
5678 NODE_NAME_CASE(RSQ)
5679 NODE_NAME_CASE(RCP_LEGACY)
5680 NODE_NAME_CASE(RCP_IFLAG)
5681 NODE_NAME_CASE(LOG)
5682 NODE_NAME_CASE(EXP)
5683 NODE_NAME_CASE(FMUL_LEGACY)
5684 NODE_NAME_CASE(RSQ_CLAMP)
5685 NODE_NAME_CASE(FP_CLASS)
5686 NODE_NAME_CASE(DOT4)
5687 NODE_NAME_CASE(CARRY)
5688 NODE_NAME_CASE(BORROW)
5689 NODE_NAME_CASE(BFE_U32)
5690 NODE_NAME_CASE(BFE_I32)
5691 NODE_NAME_CASE(BFI)
5692 NODE_NAME_CASE(BFM)
5693 NODE_NAME_CASE(FFBH_U32)
5694 NODE_NAME_CASE(FFBH_I32)
5695 NODE_NAME_CASE(FFBL_B32)
5696 NODE_NAME_CASE(MUL_U24)
5697 NODE_NAME_CASE(MUL_I24)
5698 NODE_NAME_CASE(MULHI_U24)
5699 NODE_NAME_CASE(MULHI_I24)
5700 NODE_NAME_CASE(MAD_U24)
5701 NODE_NAME_CASE(MAD_I24)
5702 NODE_NAME_CASE(MAD_I64_I32)
5703 NODE_NAME_CASE(MAD_U64_U32)
5704 NODE_NAME_CASE(PERM)
5705 NODE_NAME_CASE(TEXTURE_FETCH)
5706 NODE_NAME_CASE(R600_EXPORT)
5707 NODE_NAME_CASE(CONST_ADDRESS)
5708 NODE_NAME_CASE(REGISTER_LOAD)
5709 NODE_NAME_CASE(REGISTER_STORE)
5710 NODE_NAME_CASE(CVT_F32_UBYTE0)
5711 NODE_NAME_CASE(CVT_F32_UBYTE1)
5712 NODE_NAME_CASE(CVT_F32_UBYTE2)
5713 NODE_NAME_CASE(CVT_F32_UBYTE3)
5714 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5715 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5716 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5717 NODE_NAME_CASE(CVT_PK_I16_I32)
5718 NODE_NAME_CASE(CVT_PK_U16_U32)
5719 NODE_NAME_CASE(FP_TO_FP16)
5720 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5721 NODE_NAME_CASE(CONST_DATA_PTR)
5722 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5723 NODE_NAME_CASE(LDS)
5724 NODE_NAME_CASE(DUMMY_CHAIN)
5725 NODE_NAME_CASE(LOAD_D16_HI)
5726 NODE_NAME_CASE(LOAD_D16_LO)
5727 NODE_NAME_CASE(LOAD_D16_HI_I8)
5728 NODE_NAME_CASE(LOAD_D16_HI_U8)
5729 NODE_NAME_CASE(LOAD_D16_LO_I8)
5730 NODE_NAME_CASE(LOAD_D16_LO_U8)
5731 NODE_NAME_CASE(STORE_MSKOR)
5732 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5733 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5734 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5735 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5736 NODE_NAME_CASE(DS_ORDERED_COUNT)
5737 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5738 NODE_NAME_CASE(BUFFER_LOAD)
5739 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5740 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5741 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5742 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5743 NODE_NAME_CASE(BUFFER_LOAD_TFE)
5744 NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE)
5745 NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE)
5746 NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE)
5747 NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE)
5748 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5749 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5750 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5751 NODE_NAME_CASE(SBUFFER_LOAD)
5752 NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5753 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5754 NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5755 NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5756 NODE_NAME_CASE(SBUFFER_PREFETCH_DATA)
5757 NODE_NAME_CASE(BUFFER_STORE)
5758 NODE_NAME_CASE(BUFFER_STORE_BYTE)
5759 NODE_NAME_CASE(BUFFER_STORE_SHORT)
5760 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5761 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5762 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5763 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5764 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5765 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5766 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5767 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5768 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5769 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5770 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5771 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5772 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5773 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5774 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5775 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5776 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5777 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5778 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5779 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
5780 }
5781 return nullptr;
5782}
5783
5784SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5785 SelectionDAG &DAG, int Enabled,
5786 int &RefinementSteps,
5787 bool &UseOneConstNR,
5788 bool Reciprocal) const {
5789 EVT VT = Operand.getValueType();
5790
5791 if (VT == MVT::f32) {
5792 RefinementSteps = 0;
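    // No refinement steps are requested; the f32 hardware rsq is treated as
    // accurate enough on its own.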
5793 return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc(Operand), VT, Operand);
5794 }
5795
5796  // TODO: There is also an f64 rsq instruction, but the documentation is less
5797  // clear on its precision.
5798
5799 return SDValue();
5800}
5801
5802SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5803 SelectionDAG &DAG, int Enabled,
5804 int &RefinementSteps) const {
5805 EVT VT = Operand.getValueType();
5806
5807 if (VT == MVT::f32) {
5808 // Reciprocal, < 1 ulp error.
5809 //
5810    // This reciprocal approximation converges to < 0.5 ulp error after one
5811    // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
5812
5813 RefinementSteps = 0;
5814 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SDLoc(Operand), VT, Operand);
5815 }
5816
5817  // TODO: There is also an f64 rcp instruction, but the documentation is less
5818  // clear on its precision.
5819
5820 return SDValue();
5821}
5822
5823static unsigned workitemIntrinsicDim(unsigned ID) {
5824 switch (ID) {
5825 case Intrinsic::amdgcn_workitem_id_x:
5826 return 0;
5827 case Intrinsic::amdgcn_workitem_id_y:
5828 return 1;
5829 case Intrinsic::amdgcn_workitem_id_z:
5830 return 2;
5831 default:
5832 llvm_unreachable("not a workitem intrinsic");
5833 }
5834}
5835
5836void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5837 const SDValue Op, KnownBits &Known,
5838 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5839
5840 Known.resetAll(); // Don't know anything.
5841
5842 unsigned Opc = Op.getOpcode();
5843
5844 switch (Opc) {
5845 default:
5846 break;
5847 case AMDGPUISD::CARRY:
5848 case AMDGPUISD::BORROW: {
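    // CARRY and BORROW produce either 0 or 1, so all but the lowest bit are
    // known to be zero.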
5849 Known.Zero = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 31);
5850 break;
5851 }
5852
5853 case AMDGPUISD::BFE_I32:
5854 case AMDGPUISD::BFE_U32: {
5855 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
5856 if (!CWidth)
5857 return;
5858
5859 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5860
5861 if (Opc == AMDGPUISD::BFE_U32)
5862 Known.Zero = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 32 - Width);
5863
5864 break;
5865 }
5866 case AMDGPUISD::FP_TO_FP16: {
5867 unsigned BitWidth = Known.getBitWidth();
5868
5869 // High bits are zero.
5870 Known.Zero = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - 16);
5871 break;
5872 }
5873 case AMDGPUISD::MUL_U24:
5874 case AMDGPUISD::MUL_I24: {
5875 KnownBits LHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
5876 KnownBits RHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
5877 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5878 RHSKnown.countMinTrailingZeros();
5879 Known.Zero.setLowBits(std::min(a: TrailZ, b: 32u));
5880    // Skip the extra checks if all result bits are already known to be zero.
5881 if (TrailZ >= 32)
5882 break;
5883
5884 // Truncate to 24 bits.
5885 LHSKnown = LHSKnown.trunc(BitWidth: 24);
5886 RHSKnown = RHSKnown.trunc(BitWidth: 24);
5887
5888 if (Opc == AMDGPUISD::MUL_I24) {
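      // A product of operands with at most N and M significant bits needs at
      // most N + M bits, so the remaining high bits of the 32-bit result are
      // sign copies; e.g. two 12-bit operands leave 32 - 24 + 1 = 9 sign bits.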
5889 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5890 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5891 unsigned MaxValBits = LHSValBits + RHSValBits;
5892 if (MaxValBits > 32)
5893 break;
5894 unsigned SignBits = 32 - MaxValBits + 1;
5895 bool LHSNegative = LHSKnown.isNegative();
5896 bool LHSNonNegative = LHSKnown.isNonNegative();
5897 bool LHSPositive = LHSKnown.isStrictlyPositive();
5898 bool RHSNegative = RHSKnown.isNegative();
5899 bool RHSNonNegative = RHSKnown.isNonNegative();
5900 bool RHSPositive = RHSKnown.isStrictlyPositive();
5901
5902 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5903 Known.Zero.setHighBits(SignBits);
5904 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5905 Known.One.setHighBits(SignBits);
5906 } else {
5907 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5908 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5909 unsigned MaxValBits = LHSValBits + RHSValBits;
5910 if (MaxValBits >= 32)
5911 break;
5912 Known.Zero.setBitsFrom(MaxValBits);
5913 }
5914 break;
5915 }
5916 case AMDGPUISD::PERM: {
5917 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
5918 if (!CMask)
5919 return;
5920
5921 KnownBits LHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
5922 KnownBits RHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
5923 unsigned Sel = CMask->getZExtValue();
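    // Each selector byte picks one byte of the result: values 0-3 take a byte
    // from operand 1, values 4-6 take a byte from operand 0, 0x0c produces
    // 0x00, and values above 0x0c produce 0xff; other selector values are left
    // unknown here.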
5924
5925 for (unsigned I = 0; I < 32; I += 8) {
5926 unsigned SelBits = Sel & 0xff;
5927 if (SelBits < 4) {
5928 SelBits *= 8;
5929 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5930 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5931 } else if (SelBits < 7) {
5932 SelBits = (SelBits & 3) * 8;
5933 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5934 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5935 } else if (SelBits == 0x0c) {
5936 Known.Zero |= 0xFFull << I;
5937 } else if (SelBits > 0x0c) {
5938 Known.One |= 0xFFull << I;
5939 }
5940 Sel >>= 8;
5941 }
5942 break;
5943 }
5944 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5945 Known.Zero.setHighBits(24);
5946 break;
5947 }
5948 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5949 Known.Zero.setHighBits(16);
5950 break;
5951 }
5952 case AMDGPUISD::LDS: {
5953 auto *GA = cast<GlobalAddressSDNode>(Val: Op.getOperand(i: 0).getNode());
5954 Align Alignment = GA->getGlobal()->getPointerAlignment(DL: DAG.getDataLayout());
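    // The LDS address fits in the low 16 bits and is at least as aligned as
    // the global, so both the high bits and the low alignment bits are known
    // to be zero.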
5955
5956 Known.Zero.setHighBits(16);
5957 Known.Zero.setLowBits(Log2(A: Alignment));
5958 break;
5959 }
5960 case AMDGPUISD::SMIN3:
5961 case AMDGPUISD::SMAX3:
5962 case AMDGPUISD::SMED3:
5963 case AMDGPUISD::UMIN3:
5964 case AMDGPUISD::UMAX3:
5965 case AMDGPUISD::UMED3: {
5966 KnownBits Known2 = DAG.computeKnownBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
5967 if (Known2.isUnknown())
5968 break;
5969
5970 KnownBits Known1 = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
5971 if (Known1.isUnknown())
5972 break;
5973
5974 KnownBits Known0 = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
5975 if (Known0.isUnknown())
5976 break;
5977
5978 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5979 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5980 Known.One = Known0.One & Known1.One & Known2.One;
5981 break;
5982 }
5983 case ISD::INTRINSIC_WO_CHAIN: {
5984 unsigned IID = Op.getConstantOperandVal(i: 0);
5985 switch (IID) {
5986 case Intrinsic::amdgcn_workitem_id_x:
5987 case Intrinsic::amdgcn_workitem_id_y:
5988 case Intrinsic::amdgcn_workitem_id_z: {
5989 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5990 Kernel: DAG.getMachineFunction().getFunction(), Dimension: workitemIntrinsicDim(ID: IID));
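      // For example, a maximum workitem ID of 1023 has 22 leading zero bits in
      // a 32-bit value, so the top 22 bits of the result are known zero.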
5991 Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue));
5992 break;
5993 }
5994 default:
5995 break;
5996 }
5997 }
5998 }
5999}
6000
6001unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
6002 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6003 unsigned Depth) const {
6004 switch (Op.getOpcode()) {
6005 case AMDGPUISD::BFE_I32: {
6006 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
6007 if (!Width)
6008 return 1;
6009
6010 unsigned SignBits = 32 - Width->getZExtValue() + 1;
6011 if (!isNullConstant(V: Op.getOperand(i: 1)))
6012 return SignBits;
6013
6014 // TODO: Could probably figure something out with non-0 offsets.
6015 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
6016 return std::max(a: SignBits, b: Op0SignBits);
6017 }
6018
6019 case AMDGPUISD::BFE_U32: {
6020 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
6021 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
6022 }
6023
6024 case AMDGPUISD::CARRY:
6025 case AMDGPUISD::BORROW:
6026 return 31;
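  // Sign-extending byte/short loads leave 32 - 8 + 1 = 25 and 32 - 16 + 1 = 17
  // sign bits respectively; zero-extending loads leave 24 and 16 known-zero
  // high bits, which also count as sign bits.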
6027 case AMDGPUISD::BUFFER_LOAD_BYTE:
6028 return 25;
6029 case AMDGPUISD::BUFFER_LOAD_SHORT:
6030 return 17;
6031 case AMDGPUISD::BUFFER_LOAD_UBYTE:
6032 return 24;
6033 case AMDGPUISD::BUFFER_LOAD_USHORT:
6034 return 16;
6035 case AMDGPUISD::FP_TO_FP16:
6036 return 16;
6037 case AMDGPUISD::SMIN3:
6038 case AMDGPUISD::SMAX3:
6039 case AMDGPUISD::SMED3:
6040 case AMDGPUISD::UMIN3:
6041 case AMDGPUISD::UMAX3:
6042 case AMDGPUISD::UMED3: {
6043 unsigned Tmp2 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
6044 if (Tmp2 == 1)
6045 return 1; // Early out.
6046
6047 unsigned Tmp1 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
6048 if (Tmp1 == 1)
6049 return 1; // Early out.
6050
6051 unsigned Tmp0 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
6052 if (Tmp0 == 1)
6053 return 1; // Early out.
6054
6055 return std::min(l: {Tmp0, Tmp1, Tmp2});
6056 }
6057 default:
6058 return 1;
6059 }
6060}
6061
6062unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
6063 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
6064 const MachineRegisterInfo &MRI, unsigned Depth) const {
6065 const MachineInstr *MI = MRI.getVRegDef(Reg: R);
6066 if (!MI)
6067 return 1;
6068
6069 // TODO: Check range metadata on MMO.
6070 switch (MI->getOpcode()) {
6071 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
6072 return 25;
6073 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
6074 return 17;
6075 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
6076 return 24;
6077 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
6078 return 16;
6079 case AMDGPU::G_AMDGPU_SMED3:
6080 case AMDGPU::G_AMDGPU_UMED3: {
6081 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
6082 unsigned Tmp2 = Analysis.computeNumSignBits(R: Src2, DemandedElts, Depth: Depth + 1);
6083 if (Tmp2 == 1)
6084 return 1;
6085 unsigned Tmp1 = Analysis.computeNumSignBits(R: Src1, DemandedElts, Depth: Depth + 1);
6086 if (Tmp1 == 1)
6087 return 1;
6088 unsigned Tmp0 = Analysis.computeNumSignBits(R: Src0, DemandedElts, Depth: Depth + 1);
6089 if (Tmp0 == 1)
6090 return 1;
6091 return std::min(l: {Tmp0, Tmp1, Tmp2});
6092 }
6093 default:
6094 return 1;
6095 }
6096}
6097
6098bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
6099 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
6100 unsigned Depth) const {
6101 unsigned Opcode = Op.getOpcode();
6102 switch (Opcode) {
6103 case AMDGPUISD::FMIN_LEGACY:
6104 case AMDGPUISD::FMAX_LEGACY: {
6105 if (SNaN)
6106 return true;
6107
6108    // TODO: It may be enough to check just one of the operands for NaN here,
6109    // but it is not obvious which one.
6110 return false;
6111 }
6112 case AMDGPUISD::FMUL_LEGACY:
6113 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
6114 if (SNaN)
6115 return true;
6116 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1) &&
6117 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1);
6118 }
6119 case AMDGPUISD::FMED3:
6120 case AMDGPUISD::FMIN3:
6121 case AMDGPUISD::FMAX3:
6122 case AMDGPUISD::FMINIMUM3:
6123 case AMDGPUISD::FMAXIMUM3:
6124 case AMDGPUISD::FMAD_FTZ: {
6125 if (SNaN)
6126 return true;
6127 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1) &&
6128 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
6129 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1);
6130 }
6131 case AMDGPUISD::CVT_F32_UBYTE0:
6132 case AMDGPUISD::CVT_F32_UBYTE1:
6133 case AMDGPUISD::CVT_F32_UBYTE2:
6134 case AMDGPUISD::CVT_F32_UBYTE3:
6135 return true;
6136
6137 case AMDGPUISD::RCP:
6138 case AMDGPUISD::RSQ:
6139 case AMDGPUISD::RCP_LEGACY:
6140 case AMDGPUISD::RSQ_CLAMP: {
6141 if (SNaN)
6142 return true;
6143
6144    // TODO: Needs an is-known-positive check.
6145 return false;
6146 }
6147 case ISD::FLDEXP:
6148 case AMDGPUISD::FRACT: {
6149 if (SNaN)
6150 return true;
6151 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1);
6152 }
6153 case AMDGPUISD::DIV_SCALE:
6154 case AMDGPUISD::DIV_FMAS:
6155 case AMDGPUISD::DIV_FIXUP:
6156 // TODO: Refine on operands.
6157 return SNaN;
6158 case AMDGPUISD::SIN_HW:
6159 case AMDGPUISD::COS_HW: {
6160    // TODO: Needs a check for infinity.
6161 return SNaN;
6162 }
6163 case ISD::INTRINSIC_WO_CHAIN: {
6164 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
6165 // TODO: Handle more intrinsics
6166 switch (IntrinsicID) {
6167 case Intrinsic::amdgcn_cubeid:
6168 case Intrinsic::amdgcn_cvt_off_f32_i4:
6169 return true;
6170
6171 case Intrinsic::amdgcn_frexp_mant: {
6172 if (SNaN)
6173 return true;
6174 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1);
6175 }
6176 case Intrinsic::amdgcn_cvt_pkrtz: {
6177 if (SNaN)
6178 return true;
6179 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
6180 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1);
6181 }
6182 case Intrinsic::amdgcn_rcp:
6183 case Intrinsic::amdgcn_rsq:
6184 case Intrinsic::amdgcn_rcp_legacy:
6185 case Intrinsic::amdgcn_rsq_legacy:
6186 case Intrinsic::amdgcn_rsq_clamp: {
6187 if (SNaN)
6188 return true;
6189
6190      // TODO: Needs an is-known-positive check.
6191 return false;
6192 }
6193 case Intrinsic::amdgcn_trig_preop:
6194 case Intrinsic::amdgcn_fdot2:
6195 // TODO: Refine on operand
6196 return SNaN;
6197 case Intrinsic::amdgcn_fma_legacy:
6198 if (SNaN)
6199 return true;
6200 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
6201 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1) &&
6202 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 3), SNaN, Depth: Depth + 1);
6203 default:
6204 return false;
6205 }
6206 }
6207 default:
6208 return false;
6209 }
6210}
6211
6212bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
6213 Register N0, Register N1) const {
6214 return MRI.hasOneNonDBGUse(RegNo: N0); // FIXME: handle regbanks
6215}
6216