//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUMachineFunctionInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUSelectionDAGInfo.h"
21#include "SIMachineFunctionInfo.h"
22#include "llvm/CodeGen/Analysis.h"
23#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
24#include "llvm/CodeGen/MachineFrameInfo.h"
25#include "llvm/IR/DiagnosticInfo.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27#include "llvm/Support/CommandLine.h"
28#include "llvm/Support/KnownBits.h"
29#include "llvm/Target/TargetMachine.h"
30
31using namespace llvm;
32
33#include "AMDGPUGenCallingConv.inc"
34
35static cl::opt<bool> AMDGPUBypassSlowDiv(
36 "amdgpu-bypass-slow-div",
37 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
38 cl::init(Val: true));
39
40// Find a larger type to do a load / store of a vector with.
41EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
42 unsigned StoreSize = VT.getStoreSizeInBits();
43 if (StoreSize <= 32)
44 return EVT::getIntegerVT(Context&: Ctx, BitWidth: StoreSize);
45
46 if (StoreSize % 32 == 0)
47 return EVT::getVectorVT(Context&: Ctx, VT: MVT::i32, NumElements: StoreSize / 32);
48
49 return VT;
50}
51
52unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
53 return DAG.computeKnownBits(Op).countMaxActiveBits();
54}
55
56unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
57 // In order for this to be a signed 24-bit value, bit 23, must
58 // be a sign bit.
59 return DAG.ComputeMaxSignificantBits(Op);
60}
61
62AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
63 const TargetSubtargetInfo &STI,
64 const AMDGPUSubtarget &AMDGPUSTI)
65 : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
66 // Always lower memset, memcpy, and memmove intrinsics to load/store
67 // instructions, rather then generating calls to memset, mempcy or memmove.
68 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
69 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
70 MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;
71
72 // Enable ganging up loads and stores in the memcpy DAG lowering.
73 MaxGluedStoresPerMemcpy = 16;
74
75 // Lower floating point store/load to integer store/load to reduce the number
76 // of patterns in tablegen.
77 setOperationAction(Op: ISD::LOAD, VT: MVT::f32, Action: Promote);
78 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f32, DestVT: MVT::i32);
79
80 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f32, Action: Promote);
81 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
82
83 setOperationAction(Op: ISD::LOAD, VT: MVT::v3f32, Action: Promote);
84 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
85
86 setOperationAction(Op: ISD::LOAD, VT: MVT::v4f32, Action: Promote);
87 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
88
89 setOperationAction(Op: ISD::LOAD, VT: MVT::v5f32, Action: Promote);
90 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
91
92 setOperationAction(Op: ISD::LOAD, VT: MVT::v6f32, Action: Promote);
93 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
94
95 setOperationAction(Op: ISD::LOAD, VT: MVT::v7f32, Action: Promote);
96 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
97
98 setOperationAction(Op: ISD::LOAD, VT: MVT::v8f32, Action: Promote);
99 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f32, DestVT: MVT::v8i32);
100
101 setOperationAction(Op: ISD::LOAD, VT: MVT::v9f32, Action: Promote);
102 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
103
104 setOperationAction(Op: ISD::LOAD, VT: MVT::v10f32, Action: Promote);
105 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
106
107 setOperationAction(Op: ISD::LOAD, VT: MVT::v11f32, Action: Promote);
108 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
109
110 setOperationAction(Op: ISD::LOAD, VT: MVT::v12f32, Action: Promote);
111 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
112
113 setOperationAction(Op: ISD::LOAD, VT: MVT::v16f32, Action: Promote);
114 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f32, DestVT: MVT::v16i32);
115
116 setOperationAction(Op: ISD::LOAD, VT: MVT::v32f32, Action: Promote);
117 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32f32, DestVT: MVT::v32i32);
118
119 setOperationAction(Op: ISD::LOAD, VT: MVT::i64, Action: Promote);
120 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::i64, DestVT: MVT::v2i32);
121
122 setOperationAction(Op: ISD::LOAD, VT: MVT::v2i64, Action: Promote);
123 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2i64, DestVT: MVT::v4i32);
124
125 setOperationAction(Op: ISD::LOAD, VT: MVT::f64, Action: Promote);
126 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f64, DestVT: MVT::v2i32);
127
128 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f64, Action: Promote);
129 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f64, DestVT: MVT::v4i32);
130
131 setOperationAction(Op: ISD::LOAD, VT: MVT::v3i64, Action: Promote);
132 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3i64, DestVT: MVT::v6i32);
133
134 setOperationAction(Op: ISD::LOAD, VT: MVT::v4i64, Action: Promote);
135 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4i64, DestVT: MVT::v8i32);
136
137 setOperationAction(Op: ISD::LOAD, VT: MVT::v3f64, Action: Promote);
138 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3f64, DestVT: MVT::v6i32);
139
140 setOperationAction(Op: ISD::LOAD, VT: MVT::v4f64, Action: Promote);
141 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f64, DestVT: MVT::v8i32);
142
143 setOperationAction(Op: ISD::LOAD, VT: MVT::v8i64, Action: Promote);
144 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8i64, DestVT: MVT::v16i32);
145
146 setOperationAction(Op: ISD::LOAD, VT: MVT::v8f64, Action: Promote);
147 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f64, DestVT: MVT::v16i32);
148
149 setOperationAction(Op: ISD::LOAD, VT: MVT::v16i64, Action: Promote);
150 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16i64, DestVT: MVT::v32i32);
151
152 setOperationAction(Op: ISD::LOAD, VT: MVT::v16f64, Action: Promote);
153 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f64, DestVT: MVT::v32i32);
154
155 setOperationAction(Op: ISD::LOAD, VT: MVT::i128, Action: Promote);
156 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::i128, DestVT: MVT::v4i32);
157
158 // TODO: Would be better to consume as directly legal
159 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f32, Action: Promote);
160 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f32, DestVT: MVT::i32);
161
162 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f64, Action: Promote);
163 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f64, DestVT: MVT::i64);
164
165 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f16, Action: Promote);
166 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f16, DestVT: MVT::i16);
167
168 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::bf16, Action: Promote);
169 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::bf16, DestVT: MVT::i16);
170
171 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f32, Action: Promote);
172 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f32, DestVT: MVT::i32);
173
174 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f64, Action: Promote);
175 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f64, DestVT: MVT::i64);
176
177 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f16, Action: Promote);
178 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f16, DestVT: MVT::i16);
179
180 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::bf16, Action: Promote);
181 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::bf16, DestVT: MVT::i16);
182
183 // There are no 64-bit extloads. These should be done as a 32-bit extload and
184 // an extension to 64-bit.
185 for (MVT VT : MVT::integer_valuetypes())
186 setLoadExtAction(ExtTypes: {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, ValVT: MVT::i64, MemVT: VT,
187 Action: Expand);
188
189 for (MVT VT : MVT::integer_valuetypes()) {
190 if (VT == MVT::i64)
191 continue;
192
193 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
194 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i1, Action: Promote);
195 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i8, Action: Legal);
196 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i16, Action: Legal);
197 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i32, Action: Expand);
198 }
199 }
200
201 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
202 for (auto MemVT :
203 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
204 setLoadExtAction(ExtTypes: {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, ValVT: VT, MemVT,
205 Action: Expand);
206
207 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f32, MemVT: MVT::f16, Action: Expand);
208 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f32, MemVT: MVT::bf16, Action: Expand);
209 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f32, MemVT: MVT::v2f16, Action: Expand);
210 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f32, MemVT: MVT::v2bf16, Action: Expand);
211 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f32, MemVT: MVT::v3f16, Action: Expand);
212 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f32, MemVT: MVT::v3bf16, Action: Expand);
213 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f32, MemVT: MVT::v4f16, Action: Expand);
214 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f32, MemVT: MVT::v4bf16, Action: Expand);
215 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f32, MemVT: MVT::v8f16, Action: Expand);
216 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f32, MemVT: MVT::v8bf16, Action: Expand);
217 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f32, MemVT: MVT::v16f16, Action: Expand);
218 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f32, MemVT: MVT::v16bf16, Action: Expand);
219 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v32f32, MemVT: MVT::v32f16, Action: Expand);
220 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v32f32, MemVT: MVT::v32bf16, Action: Expand);
221
222 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f32, Action: Expand);
223 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2f32, Action: Expand);
224 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3f32, Action: Expand);
225 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4f32, Action: Expand);
226 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8f32, Action: Expand);
227 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16f32, Action: Expand);
228
229 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f16, Action: Expand);
230 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::bf16, Action: Expand);
231 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2f16, Action: Expand);
232 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2bf16, Action: Expand);
233 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3f16, Action: Expand);
234 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3bf16, Action: Expand);
235 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4f16, Action: Expand);
236 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4bf16, Action: Expand);
237 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8f16, Action: Expand);
238 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8bf16, Action: Expand);
239 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16f16, Action: Expand);
240 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16bf16, Action: Expand);
241
242 setOperationAction(Op: ISD::STORE, VT: MVT::f32, Action: Promote);
243 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f32, DestVT: MVT::i32);
244
245 setOperationAction(Op: ISD::STORE, VT: MVT::v2f32, Action: Promote);
246 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
247
248 setOperationAction(Op: ISD::STORE, VT: MVT::v3f32, Action: Promote);
249 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
250
251 setOperationAction(Op: ISD::STORE, VT: MVT::v4f32, Action: Promote);
252 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
253
254 setOperationAction(Op: ISD::STORE, VT: MVT::v5f32, Action: Promote);
255 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
256
257 setOperationAction(Op: ISD::STORE, VT: MVT::v6f32, Action: Promote);
258 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
259
260 setOperationAction(Op: ISD::STORE, VT: MVT::v7f32, Action: Promote);
261 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
262
263 setOperationAction(Op: ISD::STORE, VT: MVT::v8f32, Action: Promote);
264 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f32, DestVT: MVT::v8i32);
265
266 setOperationAction(Op: ISD::STORE, VT: MVT::v9f32, Action: Promote);
267 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
268
269 setOperationAction(Op: ISD::STORE, VT: MVT::v10f32, Action: Promote);
270 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
271
272 setOperationAction(Op: ISD::STORE, VT: MVT::v11f32, Action: Promote);
273 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
274
275 setOperationAction(Op: ISD::STORE, VT: MVT::v12f32, Action: Promote);
276 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
277
278 setOperationAction(Op: ISD::STORE, VT: MVT::v16f32, Action: Promote);
279 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f32, DestVT: MVT::v16i32);
280
281 setOperationAction(Op: ISD::STORE, VT: MVT::v32f32, Action: Promote);
282 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32f32, DestVT: MVT::v32i32);
283
284 setOperationAction(Op: ISD::STORE, VT: MVT::i64, Action: Promote);
285 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::i64, DestVT: MVT::v2i32);
286
287 setOperationAction(Op: ISD::STORE, VT: MVT::v2i64, Action: Promote);
288 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2i64, DestVT: MVT::v4i32);
289
290 setOperationAction(Op: ISD::STORE, VT: MVT::f64, Action: Promote);
291 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f64, DestVT: MVT::v2i32);
292
293 setOperationAction(Op: ISD::STORE, VT: MVT::v2f64, Action: Promote);
294 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f64, DestVT: MVT::v4i32);
295
296 setOperationAction(Op: ISD::STORE, VT: MVT::v3i64, Action: Promote);
297 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3i64, DestVT: MVT::v6i32);
298
299 setOperationAction(Op: ISD::STORE, VT: MVT::v3f64, Action: Promote);
300 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3f64, DestVT: MVT::v6i32);
301
302 setOperationAction(Op: ISD::STORE, VT: MVT::v4i64, Action: Promote);
303 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i64, DestVT: MVT::v8i32);
304
305 setOperationAction(Op: ISD::STORE, VT: MVT::v4f64, Action: Promote);
306 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f64, DestVT: MVT::v8i32);
307
308 setOperationAction(Op: ISD::STORE, VT: MVT::v8i64, Action: Promote);
309 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8i64, DestVT: MVT::v16i32);
310
311 setOperationAction(Op: ISD::STORE, VT: MVT::v8f64, Action: Promote);
312 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f64, DestVT: MVT::v16i32);
313
314 setOperationAction(Op: ISD::STORE, VT: MVT::v16i64, Action: Promote);
315 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16i64, DestVT: MVT::v32i32);
316
317 setOperationAction(Op: ISD::STORE, VT: MVT::v16f64, Action: Promote);
318 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f64, DestVT: MVT::v32i32);
319
320 setOperationAction(Op: ISD::STORE, VT: MVT::i128, Action: Promote);
321 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::i128, DestVT: MVT::v4i32);
322
323 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i1, Action: Expand);
324 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i8, Action: Expand);
325 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i16, Action: Expand);
326 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i32, Action: Expand);
327
328 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i1, Action: Expand);
329 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i8, Action: Expand);
330 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i16, Action: Expand);
331 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i32, Action: Expand);
332
333 setTruncStoreAction(ValVT: MVT::f32, MemVT: MVT::bf16, Action: Expand);
334 setTruncStoreAction(ValVT: MVT::f32, MemVT: MVT::f16, Action: Expand);
335 setTruncStoreAction(ValVT: MVT::v2f32, MemVT: MVT::v2bf16, Action: Expand);
336 setTruncStoreAction(ValVT: MVT::v2f32, MemVT: MVT::v2f16, Action: Expand);
337 setTruncStoreAction(ValVT: MVT::v3f32, MemVT: MVT::v3bf16, Action: Expand);
338 setTruncStoreAction(ValVT: MVT::v3f32, MemVT: MVT::v3f16, Action: Expand);
339 setTruncStoreAction(ValVT: MVT::v4f32, MemVT: MVT::v4bf16, Action: Expand);
340 setTruncStoreAction(ValVT: MVT::v4f32, MemVT: MVT::v4f16, Action: Expand);
341 setTruncStoreAction(ValVT: MVT::v6f32, MemVT: MVT::v6f16, Action: Expand);
342 setTruncStoreAction(ValVT: MVT::v8f32, MemVT: MVT::v8bf16, Action: Expand);
343 setTruncStoreAction(ValVT: MVT::v8f32, MemVT: MVT::v8f16, Action: Expand);
344 setTruncStoreAction(ValVT: MVT::v16f32, MemVT: MVT::v16bf16, Action: Expand);
345 setTruncStoreAction(ValVT: MVT::v16f32, MemVT: MVT::v16f16, Action: Expand);
346 setTruncStoreAction(ValVT: MVT::v32f32, MemVT: MVT::v32bf16, Action: Expand);
347 setTruncStoreAction(ValVT: MVT::v32f32, MemVT: MVT::v32f16, Action: Expand);
348
349 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::bf16, Action: Expand);
350 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f16, Action: Expand);
351 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f32, Action: Expand);
352
353 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2f32, Action: Expand);
354 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2bf16, Action: Expand);
355 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2f16, Action: Expand);
356
357 setTruncStoreAction(ValVT: MVT::v3i32, MemVT: MVT::v3i8, Action: Expand);
358
359 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i32, Action: Expand);
360 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i16, Action: Expand);
361 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i8, Action: Expand);
362 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i1, Action: Expand);
363 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3f32, Action: Expand);
364 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3bf16, Action: Expand);
365 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3f16, Action: Expand);
366
367 setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i32, Action: Expand);
368 setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i16, Action: Expand);
369 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4f32, Action: Expand);
370 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4bf16, Action: Expand);
371 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4f16, Action: Expand);
372
373 setTruncStoreAction(ValVT: MVT::v5i32, MemVT: MVT::v5i1, Action: Expand);
374 setTruncStoreAction(ValVT: MVT::v5i32, MemVT: MVT::v5i8, Action: Expand);
375 setTruncStoreAction(ValVT: MVT::v5i32, MemVT: MVT::v5i16, Action: Expand);
376
377 setTruncStoreAction(ValVT: MVT::v6i32, MemVT: MVT::v6i1, Action: Expand);
378 setTruncStoreAction(ValVT: MVT::v6i32, MemVT: MVT::v6i8, Action: Expand);
379 setTruncStoreAction(ValVT: MVT::v6i32, MemVT: MVT::v6i16, Action: Expand);
380
381 setTruncStoreAction(ValVT: MVT::v7i32, MemVT: MVT::v7i1, Action: Expand);
382 setTruncStoreAction(ValVT: MVT::v7i32, MemVT: MVT::v7i8, Action: Expand);
383 setTruncStoreAction(ValVT: MVT::v7i32, MemVT: MVT::v7i16, Action: Expand);
384
385 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8f32, Action: Expand);
386 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8bf16, Action: Expand);
387 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8f16, Action: Expand);
388
389 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16f32, Action: Expand);
390 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16bf16, Action: Expand);
391 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16f16, Action: Expand);
392 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i16, Action: Expand);
393 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i8, Action: Expand);
394 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i8, Action: Expand);
395 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i1, Action: Expand);
396
397 setOperationAction(Ops: ISD::Constant, VTs: {MVT::i32, MVT::i64}, Action: Legal);
398 setOperationAction(Ops: ISD::ConstantFP, VTs: {MVT::f32, MVT::f64}, Action: Legal);
399
400 setOperationAction(Ops: {ISD::BR_JT, ISD::BRIND}, VT: MVT::Other, Action: Expand);
401
402 // For R600, this is totally unsupported, just custom lower to produce an
403 // error.
404 setOperationAction(Op: ISD::DYNAMIC_STACKALLOC, VT: MVT::i32, Action: Custom);
405
406 // Library functions. These default to Expand, but we have instructions
407 // for them.
408 setOperationAction(Ops: {ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
409 ISD::FROUNDEVEN, ISD::FTRUNC},
410 VTs: {MVT::f16, MVT::f32}, Action: Legal);
411 setOperationAction(Ops: {ISD::FMINNUM, ISD::FMAXNUM}, VT: MVT::f32, Action: Legal);
412
413 setOperationAction(Op: ISD::FLOG2, VT: MVT::f32, Action: Custom);
414 setOperationAction(Ops: ISD::FROUND, VTs: {MVT::f32, MVT::f64}, Action: Custom);
415 setOperationAction(Ops: {ISD::LROUND, ISD::LLROUND},
416 VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Expand);
417
418 setOperationAction(
419 Ops: {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, VT: MVT::f32,
420 Action: Custom);
421 setOperationAction(Ops: {ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, VT: MVT::f64, Action: Custom);
422
423 setOperationAction(Ops: ISD::FNEARBYINT, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Custom);
424
425 setOperationAction(Ops: ISD::FRINT, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Custom);
426
427 setOperationAction(Ops: {ISD::LRINT, ISD::LLRINT}, VTs: {MVT::f16, MVT::f32, MVT::f64},
428 Action: Expand);
429
430 setOperationAction(Ops: ISD::FREM, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Expand);
431 setOperationAction(Ops: ISD::IS_FPCLASS, VTs: {MVT::f32, MVT::f64}, Action: Legal);
432 setOperationAction(Ops: {ISD::FLOG2, ISD::FEXP2}, VT: MVT::f16, Action: Custom);
433
434 setOperationAction(Ops: {ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, VT: MVT::f16,
435 Action: Custom);
436
437 setOperationAction(Ops: ISD::FCANONICALIZE, VTs: {MVT::f32, MVT::f64}, Action: Legal);
438
439 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
440 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
441 // default unless marked custom/legal.
442 setOperationAction(Ops: ISD::IS_FPCLASS,
443 VTs: {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
444 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
445 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
446 MVT::v16f64},
447 Action: Custom);
448
449 // Expand to fneg + fadd.
450 setOperationAction(Op: ISD::FSUB, VT: MVT::f64, Action: Expand);
451
452 setOperationAction(Ops: ISD::CONCAT_VECTORS,
453 VTs: {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
454 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
455 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
456 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
457 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
458 Action: Custom);
459
460 setOperationAction(
461 Ops: ISD::EXTRACT_SUBVECTOR,
462 VTs: {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
463 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
464 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
465 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
466 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
467 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
468 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
469 Action: Custom);
470
471 setOperationAction(Ops: {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP}, VT: MVT::f64,
472 Action: Expand);
473 setOperationAction(Ops: ISD::FP_TO_FP16, VTs: {MVT::f64, MVT::f32}, Action: Custom);
474
475 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
476 for (MVT VT : ScalarIntVTs) {
477 // These should use [SU]DIVREM, so set them to expand
478 setOperationAction(Ops: {ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
479 Action: Expand);
480
481 // GPU does not have divrem function for signed or unsigned.
482 setOperationAction(Ops: {ISD::SDIVREM, ISD::UDIVREM}, VT, Action: Custom);
483
484 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
485 setOperationAction(Ops: {ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Action: Expand);
486
487 setOperationAction(Ops: {ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Action: Expand);
488
489 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
490 setOperationAction(Ops: {ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Action: Legal);
491 }
492
493 // The hardware supports 32-bit FSHR, but not FSHL.
494 setOperationAction(Op: ISD::FSHR, VT: MVT::i32, Action: Legal);
495
496 setOperationAction(Ops: {ISD::ROTL, ISD::ROTR}, VTs: {MVT::i32, MVT::i64}, Action: Expand);
497
498 setOperationAction(Ops: {ISD::MULHU, ISD::MULHS}, VT: MVT::i16, Action: Expand);
499
500 setOperationAction(Ops: {ISD::MUL, ISD::MULHU, ISD::MULHS}, VT: MVT::i64, Action: Expand);
501 setOperationAction(Ops: {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT,
502 ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
503 ISD::FP_TO_UINT_SAT},
504 VT: MVT::i64, Action: Custom);
505 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::i64, Action: Expand);
506
507 setOperationAction(Ops: {ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, VT: MVT::i32,
508 Action: Legal);
509
510 setOperationAction(
511 Ops: {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
512 VT: MVT::i64, Action: Custom);
513
514 for (auto VT : {MVT::i8, MVT::i16})
515 setOperationAction(Ops: {ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Action: Custom);
516
517 static const MVT::SimpleValueType VectorIntTypes[] = {
518 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
519 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
520
521 for (MVT VT : VectorIntTypes) {
522 // Expand the following operations for the current type by default.
523 // clang-format off
524 setOperationAction(Ops: {ISD::ADD, ISD::AND,
525 ISD::FP_TO_SINT, ISD::FP_TO_UINT,
526 ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT,
527 ISD::MUL, ISD::MULHU,
528 ISD::MULHS, ISD::OR,
529 ISD::SHL, ISD::SRA,
530 ISD::SRL, ISD::ROTL,
531 ISD::ROTR, ISD::SUB,
532 ISD::SINT_TO_FP, ISD::UINT_TO_FP,
533 ISD::SDIV, ISD::UDIV,
534 ISD::SREM, ISD::UREM,
535 ISD::SMUL_LOHI, ISD::UMUL_LOHI,
536 ISD::SDIVREM, ISD::UDIVREM,
537 ISD::SELECT, ISD::VSELECT,
538 ISD::SELECT_CC, ISD::XOR,
539 ISD::BSWAP, ISD::CTPOP,
540 ISD::CTTZ, ISD::CTLZ,
541 ISD::VECTOR_SHUFFLE, ISD::SETCC,
542 ISD::ADDRSPACECAST},
543 VT, Action: Expand);
544 // clang-format on
545 }
546
547 static const MVT::SimpleValueType FloatVectorTypes[] = {
548 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
549 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
550
551 for (MVT VT : FloatVectorTypes) {
552 setOperationAction(
553 Ops: {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
554 ISD::FADD, ISD::FCEIL, ISD::FCOS,
555 ISD::FDIV, ISD::FEXP2, ISD::FEXP,
556 ISD::FEXP10, ISD::FLOG2, ISD::FREM,
557 ISD::FLOG, ISD::FLOG10, ISD::FPOW,
558 ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
559 ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
560 ISD::FSQRT, ISD::FSIN, ISD::FSUB,
561 ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
562 ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC,
563 ISD::FCANONICALIZE, ISD::FROUNDEVEN},
564 VT, Action: Expand);
565 }
566
567 // This causes using an unrolled select operation rather than expansion with
568 // bit operations. This is in general better, but the alternative using BFI
569 // instructions may be better if the select sources are SGPRs.
570 setOperationAction(Op: ISD::SELECT, VT: MVT::v2f32, Action: Promote);
571 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
572
573 setOperationAction(Op: ISD::SELECT, VT: MVT::v3f32, Action: Promote);
574 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
575
576 setOperationAction(Op: ISD::SELECT, VT: MVT::v4f32, Action: Promote);
577 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
578
579 setOperationAction(Op: ISD::SELECT, VT: MVT::v5f32, Action: Promote);
580 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
581
582 setOperationAction(Op: ISD::SELECT, VT: MVT::v6f32, Action: Promote);
583 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
584
585 setOperationAction(Op: ISD::SELECT, VT: MVT::v7f32, Action: Promote);
586 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
587
588 setOperationAction(Op: ISD::SELECT, VT: MVT::v9f32, Action: Promote);
589 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
590
591 setOperationAction(Op: ISD::SELECT, VT: MVT::v10f32, Action: Promote);
592 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
593
594 setOperationAction(Op: ISD::SELECT, VT: MVT::v11f32, Action: Promote);
595 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
596
597 setOperationAction(Op: ISD::SELECT, VT: MVT::v12f32, Action: Promote);
598 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
599
600 setSchedulingPreference(Sched::RegPressure);
601 setJumpIsExpensive(true);
602
603 setMinCmpXchgSizeInBits(32);
604 setSupportsUnalignedAtomics(false);
605
606 PredictableSelectIsExpensive = false;
607
608 // We want to find all load dependencies for long chains of stores to enable
609 // merging into very wide vectors. The problem is with vectors with > 4
610 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
611 // vectors are a legal type, even though we have to split the loads
612 // usually. When we can more precisely specify load legality per address
613 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
614 // smarter so that they can figure out what to do in 2 iterations without all
615 // N > 4 stores on the same chain.
616 GatherAllAliasesMaxDepth = 16;
617
618 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
619 // about these during lowering.
620 MaxStoresPerMemcpy = 0xffffffff;
621 MaxStoresPerMemmove = 0xffffffff;
622 MaxStoresPerMemset = 0xffffffff;
623
624 // The expansion for 64-bit division is enormous.
625 if (AMDGPUBypassSlowDiv)
626 addBypassSlowDiv(SlowBitWidth: 64, FastBitWidth: 32);
627
628 setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
629 ISD::SRA, ISD::SRL,
630 ISD::TRUNCATE, ISD::MUL,
631 ISD::SMUL_LOHI, ISD::UMUL_LOHI,
632 ISD::MULHU, ISD::MULHS,
633 ISD::SELECT, ISD::SELECT_CC,
634 ISD::STORE, ISD::FADD,
635 ISD::FSUB, ISD::FNEG,
636 ISD::FABS, ISD::AssertZext,
637 ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
638
639 setMaxAtomicSizeInBitsSupported(64);
640 setMaxDivRemBitWidthSupported(64);
641 setMaxLargeFPConvertBitWidthSupported(64);
642}
643
644bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
645 const auto Flags = Op.getNode()->getFlags();
646 if (Flags.hasNoSignedZeros())
647 return true;
648
649 return false;
650}
651
652//===----------------------------------------------------------------------===//
653// Target Information
654//===----------------------------------------------------------------------===//
655
// Opcode table: returns true when an fneg of this operation's result can be
// absorbed into the operation itself (via source modifiers / a negated form).
LLVM_READNONE
static bool fnegFoldsIntoOpcode(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
  case ISD::SELECT:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  case ISD::BITCAST:
    // Bitcast depends on its source, not just the opcode; callers must handle
    // it before consulting this table (see fnegFoldsIntoOp).
    llvm_unreachable("bitcast is special cased");
  default:
    return false;
  }
}
695
696static bool fnegFoldsIntoOp(const SDNode *N) {
697 unsigned Opc = N->getOpcode();
698 if (Opc == ISD::BITCAST) {
699 // TODO: Is there a benefit to checking the conditions performFNegCombine
700 // does? We don't for the other cases.
701 SDValue BCSrc = N->getOperand(Num: 0);
702 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
703 return BCSrc.getNumOperands() == 2 &&
704 BCSrc.getOperand(i: 1).getValueSizeInBits() == 32;
705 }
706
707 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
708 }
709
710 return fnegFoldsIntoOpcode(Opc);
711}
712
713/// \p returns true if the operation will definitely need to use a 64-bit
714/// encoding, and thus will use a VOP3 encoding regardless of the source
715/// modifiers.
716LLVM_READONLY
717static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
718 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
719 VT == MVT::f64;
720}
721
/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
/// type for ISD::SELECT.
LLVM_READONLY
static bool selectSupportsSourceMods(const SDNode *N) {
  // TODO: Only applies if select will be vector
  // Only scalar f32 selects are reported as supporting source modifiers.
  return N->getValueType(ResNo: 0) == MVT::f32;
}
729
// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READONLY
static bool hasSourceMods(const SDNode *N) {
  // Memory nodes never take source modifiers.
  if (isa<MemSDNode>(Val: N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
  case ISD::INTRINSIC_W_CHAIN:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    // Operand 0 of INTRINSIC_WO_CHAIN is the intrinsic ID; the interpolation
    // intrinsics below do not accept source modifiers.
    switch (N->getConstantOperandVal(Num: 0)) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  case ISD::SELECT:
    return selectSupportsSourceMods(N);
  default:
    return true;
  }
}
769
770bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
771 unsigned CostThreshold) {
772 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
773 // it is truly free to use a source modifier in all cases. If there are
774 // multiple users but for each one will necessitate using VOP3, there will be
775 // a code size increase. Try to avoid increasing code size unless we know it
776 // will save on the instruction count.
777 unsigned NumMayIncreaseSize = 0;
778 MVT VT = N->getValueType(ResNo: 0).getScalarType().getSimpleVT();
779
780 assert(!N->use_empty());
781
782 // XXX - Should this limit number of uses to check?
783 for (const SDNode *U : N->users()) {
784 if (!hasSourceMods(N: U))
785 return false;
786
787 if (!opMustUseVOP3Encoding(N: U, VT)) {
788 if (++NumMayIncreaseSize > CostThreshold)
789 return false;
790 }
791 }
792
793 return true;
794}
795
796EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
797 ISD::NodeType ExtendKind) const {
798 assert(!VT.isVector() && "only scalar expected");
799
800 // Round to the next multiple of 32-bits.
801 unsigned Size = VT.getSizeInBits();
802 if (Size <= 32)
803 return MVT::i32;
804 return EVT::getIntegerVT(Context, BitWidth: 32 * ((Size + 31) / 32));
805}
806
// Vector element indices are always materialized as 32-bit values,
// independent of the pointer width in the data layout.
unsigned AMDGPUTargetLowering::getVectorIdxWidth(const DataLayout &) const {
  return 32;
}
810
// Every flavor of select is supported; no kind needs to be expanded away.
bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}
814
// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  // Note: the immediate's value is never inspected; legality is decided
  // purely from the scalar type.
  return isTypeLegal(VT: VT.getScalarType());
}
821
// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  // Only allow shrinking for types other than f32/f64 (f32/f64 immediates are
  // reported legal by isFPImmLegal, so shrinking gains nothing there).
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}
827
// Decide whether it is profitable to narrow a load to NewVT. Narrowing to a
// dword or larger is always fine; narrowing a uniform scalar-eligible load
// below 32 bits is rejected because the scalar unit cannot do sub-dword loads.
bool AMDGPUTargetLowering::shouldReduceLoadWidth(
    SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
    std::optional<unsigned> ByteOffset) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(Load: N, ExtTy, NewVT, ByteOffset))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(ResNo: 0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(Val: N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  // TODO: Update this for GFX12 which does have scalar sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(Val: N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
        MN->isInvariant())) &&
      AMDGPU::isUniformMMO(MMO: MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in continuing
  // to reduce the width.
  return (OldSize < 32);
}
867
// Report whether loading as CastTy and bitcasting beats loading LoadTy
// directly. i32-element loads are already optimal; casts to sub-dword scalars
// are rejected; otherwise defer to the alignment/fast-access query.
bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
                                                   const SelectionDAG &DAG,
                                                   const MachineMemOperand &MMO) const {

  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  // Never widen-cast down to scalars smaller than a dword.
  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  unsigned Fast = 0;
  return allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
                                        VT: CastTy, MMO, Fast: &Fast) &&
         Fast;
}
888
// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
// profitable with the expansion for 64-bit since it's generally good to
// speculate things.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  // Cheap for all types; see the rationale above.
  return true;
}
895
// Same rationale as isCheapToSpeculateCttz above: native 32-bit ctlz makes
// speculation cheap for every type.
bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return true;
}
899
900bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
901 switch (N->getOpcode()) {
902 case ISD::EntryToken:
903 case ISD::TokenFactor:
904 return true;
905 case ISD::INTRINSIC_WO_CHAIN: {
906 unsigned IntrID = N->getConstantOperandVal(Num: 0);
907 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
908 }
909 case ISD::INTRINSIC_W_CHAIN: {
910 unsigned IntrID = N->getConstantOperandVal(Num: 1);
911 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
912 }
913 case ISD::LOAD:
914 if (cast<LoadSDNode>(Val: N)->getMemOperand()->getAddrSpace() ==
915 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
916 return true;
917 return false;
918 case AMDGPUISD::SETCC: // ballot-style instruction
919 return true;
920 }
921 return false;
922}
923
// Try to produce a negated form of \p Op that is no more expensive than the
// original; returns an empty SDValue when negation is not profitable here and
// otherwise defers to the generic TargetLowering implementation.
SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {

  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(N: Op.getNode()))
      return SDValue();
    break;
  }
  case AMDGPUISD::RCP: {
    // -rcp(x) == rcp(-x): push the negation through to the source operand.
    SDValue Src = Op.getOperand(i: 0);
    EVT VT = Op.getValueType();
    SDLoc SL(Op);

    SDValue NegSrc = getNegatedExpression(Op: Src, DAG, LegalOperations,
                                          ForCodeSize, Cost, Depth: Depth + 1);
    if (NegSrc)
      return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: NegSrc, Flags: Op->getFlags());
    return SDValue();
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOps: LegalOperations,
                                              OptForSize: ForCodeSize, Cost, Depth);
}
954
955//===---------------------------------------------------------------------===//
956// Target Properties
957//===---------------------------------------------------------------------===//
958
// fabs is free when it can be expressed as an instruction source modifier.
bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  // Report this based on the end legalized type.
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
}
966
// fneg is free as a source modifier on scalar FP types (unlike fabs, this is
// checked on the scalar element type, so FP vectors also qualify).
bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Report this based on the end legalized type.
  VT = VT.getScalarType();
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
}
973
974bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
975 unsigned NumElem,
976 unsigned AS) const {
977 return true;
978}
979
bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any vector
  // operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}
991
992bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
993 // Truncate is just accessing a subregister.
994
995 unsigned SrcSize = Source.getSizeInBits();
996 unsigned DestSize = Dest.getSizeInBits();
997
998 return DestSize < SrcSize && DestSize % 32 == 0 ;
999}
1000
1001bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
1002 // Truncate is just accessing a subregister.
1003
1004 unsigned SrcSize = Source->getScalarSizeInBits();
1005 unsigned DestSize = Dest->getScalarSizeInBits();
1006
1007 if (DestSize== 16 && Subtarget->has16BitInsts())
1008 return SrcSize >= 32;
1009
1010 return DestSize < SrcSize && DestSize % 32 == 0;
1011}
1012
1013bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
1014 unsigned SrcSize = Src->getScalarSizeInBits();
1015 unsigned DestSize = Dest->getScalarSizeInBits();
1016
1017 if (SrcSize == 16 && Subtarget->has16BitInsts())
1018 return DestSize >= 32;
1019
1020 return SrcSize == 32 && DestSize == 64;
1021}
1022
1023bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
1024 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1025 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1026 // this will enable reducing 64-bit operations the 32-bit, which is always
1027 // good.
1028
1029 if (Src == MVT::i16)
1030 return Dest == MVT::i32 ||Dest == MVT::i64 ;
1031
1032 return Src == MVT::i32 && Dest == MVT::i64;
1033}
1034
// Decide whether shrinking \p N from SrcVT to DestVT is profitable. Avoids
// undoing i16 -> i32 promotion on targets with legal i16 but no packed (VOP3P)
// support, and only endorses 64 -> 32-bit shrinking for loads.
bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
                                                 EVT DestVT) const {
  switch (N->getOpcode()) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::MUL:
  case ISD::SETCC:
  case ISD::SELECT:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:
    if (isTypeLegal(VT: MVT::i16) &&
        (!DestVT.isVector() ||
         !isOperationLegal(Op: ISD::ADD, VT: MVT::v2i16))) { // Check if VOP3P
      // Don't narrow back down to i16 if promoted to i32 already.
      if (!N->isDivergent() && DestVT.isInteger() &&
          DestVT.getScalarSizeInBits() > 1 &&
          DestVT.getScalarSizeInBits() <= 16 &&
          SrcVT.getScalarSizeInBits() > 16) {
        return false;
      }
    }
    return true;
  default:
    break;
  }

  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  if (isa<LoadSDNode>(Val: N))
    return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;

  return true;
}
1080
// Gate the generic (shift (op x, y), c) -> (op (shift x, c), (shift y, c))
// combine so it does not destroy BFE patterns or zext-load merge patterns.
bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
    const SDNode* N, CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");

  // Commuting duplicates the shifted value into each operand, so require a
  // single use of it (and of a sign_extend's source, if present).
  SDValue ShiftLHS = N->getOperand(Num: 0);
  if (!ShiftLHS->hasOneUse())
    return false;

  if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
      !ShiftLHS.getOperand(i: 0)->hasOneUse())
    return false;

  // Always commute pre-type legalization and right shifts.
  // We're looking for shl(or(x,y),z) patterns.
  if (Level < CombineLevel::AfterLegalizeTypes ||
      N->getOpcode() != ISD::SHL || N->getOperand(Num: 0).getOpcode() != ISD::OR)
    return true;

  // If only user is a i32 right-shift, then don't destroy a BFE pattern.
  if (N->getValueType(ResNo: 0) == MVT::i32 && N->hasOneUse() &&
      (N->user_begin()->getOpcode() == ISD::SRA ||
       N->user_begin()->getOpcode() == ISD::SRL))
    return false;

  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
    if (LHS.getOpcode() != ISD::SHL)
      return false;
    auto *RHSLd = dyn_cast<LoadSDNode>(Val&: RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(Val: LHS.getOperand(i: 0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
    // The shift amount must equal the zext-load's memory width so the two
    // loads occupy disjoint bit ranges of the result.
    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
           RHSLd->getExtensionType() == ISD::ZEXTLOAD;
  };
  SDValue LHS = N->getOperand(Num: 0).getOperand(i: 0);
  SDValue RHS = N->getOperand(Num: 0).getOperand(i: 1);
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
}
1122
1123//===---------------------------------------------------------------------===//
1124// TargetLowering Callbacks
1125//===---------------------------------------------------------------------===//
1126
/// Select the calling-convention assignment function (from
/// AMDGPUGenCallingConv.inc) used for the arguments of an outgoing call with
/// convention \p CC. Kernel conventions are not callable and are rejected.
CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CC_AMDGPU_CS_CHAIN;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
  case CallingConv::AMDGPU_Gfx_WholeWave:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    reportFatalUsageError(reason: "unsupported calling convention for call");
  }
}
1154
/// Select the calling-convention assignment function used for return values of
/// convention \p CC. Kernels never return values, so reaching here with a
/// kernel convention is a bug.
CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
  case CallingConv::AMDGPU_Gfx_WholeWave:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    reportFatalUsageError(reason: "unsupported calling convention");
  }
}
1182
1183/// The SelectionDAGBuilder will automatically promote function arguments
1184/// with illegal types. However, this does not work for the AMDGPU targets
1185/// since the function arguments are stored in memory as these illegal types.
1186/// In order to handle this properly we need to get the original types sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
1188/// passing them to AnalyzeFormalArguments()
1189
1190/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1191/// input values across multiple registers. Each item in the Ins array
1192/// represents a single value that will be stored in registers. Ins[x].VT is
1193/// the value type of the value that will be stored in the register, so
1194/// whatever SDNode we lower the argument to needs to be this type.
1195///
1196/// In order to correctly lower the arguments we need to know the size of each
1197/// argument. Since Ins[x].VT gives us the size of the register that will
1198/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1199/// for the original function argument so that we can deduce the correct memory
1200/// type to use for Ins[x]. In most cases the correct memory type will be
1201/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1202/// we have a kernel argument of type v8i8, this argument will be split into
1203/// 8 parts and each part will be represented by its own item in the Ins array.
1204/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1205/// the argument before it was split. From this, we deduce that the memory type
1206/// for each individual part is i8. We pass the memory type as LocVT to the
1207/// calling convention analysis function and the register type (Ins[x].VT) as
1208/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State,
    const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getContext();
  const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset();
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getDataLayout();

  // Running index into Ins; incremented once per register-sized part.
  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    // For byref kernel arguments, the pointee type determines the in-memory
    // layout, not the pointer itself.
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        Alignment: IsByRef ? Arg.getParamAlign() : std::nullopt, Ty: MemArgTy);
    MaxAlign = std::max(a: Alignment, b: MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(Ty: MemArgTy);

    uint64_t ArgOffset = alignTo(Size: ExplicitArgOffset, A: Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(Size: ExplicitArgOffset, A: Alignment) + AllocSize;

    // We're basically throwing away everything passed into us and starting over
    // to get accurate in-memory offsets. The "PartOffset" is completely useless
    // to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(TLI: *this, DL, Ty: BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
                    FixedOffsets: &Offsets, StartingOffset: ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Context&: Ctx, CC, VT: ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Context&: Ctx, CC, VT: ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        // Scalar (or non-matching vector) split across registers: derive the
        // per-register memory width from the total store size.
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(Context&: State.getContext(), BitWidth: MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different elements size.
          EVT ScalarVT = EVT::getIntegerVT(Context&: State.getContext(),
                                           BitWidth: MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(Context&: State.getContext(), VT: ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 argument.
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        MemVT = MemVT.getPow2VectorType(Context&: State.getContext());
      } else if (!MemVT.isSimple() && !MemVT.isVector()) {
        MemVT = MemVT.getRoundIntegerType(Context&: State.getContext());
      }

      // Emit one CCValAssign per register-sized part, carrying the deduced
      // memory type (LocVT) and the register type (ValVT).
      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(V: CCValAssign::getCustomMem(ValNo: InIndex++, ValVT: RegisterVT,
                                               Offset: BasePartOffset + PartOffset,
                                               LocVT: MemVT.getSimpleVT(),
                                               HTP: CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}
1322
// Base-class return lowering: end the program. Any return values are dropped;
// derived targets override this for conventions that actually return values.
SDValue AMDGPUTargetLowering::LowerReturn(
  SDValue Chain, CallingConv::ID CallConv,
  bool isVarArg,
  const SmallVectorImpl<ISD::OutputArg> &Outs,
  const SmallVectorImpl<SDValue> &OutVals,
  const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  // "wave terminate should not have return values");
  return DAG.getNode(Opcode: AMDGPUISD::ENDPGM, DL, VT: MVT::Other, Operand: Chain);
}
1334
1335//===---------------------------------------------------------------------===//
1336// Target specific lowering
1337//===---------------------------------------------------------------------===//
1338
/// Selects the correct CCAssignFn for a given CallingConvention value.
// Thin wrapper delegating to the shared AMDGPUCallLowering implementation.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}
1344
// Thin wrapper delegating return-value CC selection to AMDGPUCallLowering.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}
1349
// Build a chain that orders a call after any loads of the stack-argument slot
// \p ClobberedFI, so the call's stores cannot clobber still-pending reads.
SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ObjectIdx: ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ObjectIdx: ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Elt: Chain);

  // Add a chain value for each stack argument corresponding
  for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(Val: U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: L->getBasePtr())) {
        // Negative frame indices are fixed objects (incoming arguments).
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(ObjectIdx: FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(ObjectIdx: FI->getIndex()) - 1;

          // Only chain loads whose byte range overlaps the clobbered slot.
          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(Elt: SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ArgChains);
}
1383
// Diagnose an unsupported call, then produce poison results and a trivially
// valid chain so lowering can continue instead of crashing.
SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  // Best-effort recovery of the callee's name for the diagnostic.
  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Val&: Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee))
    FuncName = G->getGlobal()->getName();

  DAG.getContext()->diagnose(
      DI: DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));

  // Non-tail calls still need placeholder values for their results.
  if (!CLI.IsTailCall) {
    for (ISD::InputArg &Arg : CLI.Ins)
      InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
  }

  // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
  if (getTargetMachine().getTargetTriple().getArch() == Triple::r600)
    return CLI.Chain;

  SDValue Chain = DAG.getCALLSEQ_START(Chain: CLI.Chain, InSize: 0, OutSize: 0, DL: CLI.DL);
  return DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, /*InGlue=*/Glue: SDValue(), DL: CLI.DL);
}
1414
// Base-class call lowering: calls are unsupported here; derived targets that
// support calls override this.
SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, Reason: "unsupported call to function ");
}
1419
// Dynamic allocas are unsupported in this base lowering: emit a diagnostic and
// return a zero pointer plus the incoming chain as placeholder results.
SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
      Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
  auto Ops = {DAG.getConstant(Val: 0, DL: SDLoc(), VT: Op.getValueType()), Op.getOperand(i: 0)};
  return DAG.getMergeValues(Ops, dl: SDLoc());
}
1429
// Dispatch custom lowering for operations this target marked Custom. Reaching
// the default case means an opcode was marked Custom without a handler.
SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(OS&: errs(), G: &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM:
    return LowerSDIVREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUNDEVEN:
    return LowerFROUNDEVEN(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG2:
    return LowerFLOG2(Op, DAG);
  case ISD::FLOG:
  case ISD::FLOG10:
    return LowerFLOGCommon(Op, DAG);
  case ISD::FEXP:
  case ISD::FEXP10:
    return lowerFEXP(Op, DAG);
  case ISD::FEXP2:
    return lowerFEXP2(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::FP_TO_SINT_SAT:
  case ISD::FP_TO_UINT_SAT:
    return LowerFP_TO_INT_SAT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::CTLS:
    return LowerCTLS(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}
1482
1483void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1484 SmallVectorImpl<SDValue> &Results,
1485 SelectionDAG &DAG) const {
1486 switch (N->getOpcode()) {
1487 case ISD::SIGN_EXTEND_INREG:
1488 // Different parts of legalization seem to interpret which type of
1489 // sign_extend_inreg is the one to check for custom lowering. The extended
1490 // from type is what really matters, but some places check for custom
1491 // lowering of the result type. This results in trying to use
1492 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1493 // nothing here and let the illegal result integer be handled normally.
1494 return;
1495 case ISD::FLOG2:
1496 if (SDValue Lowered = LowerFLOG2(Op: SDValue(N, 0), DAG))
1497 Results.push_back(Elt: Lowered);
1498 return;
1499 case ISD::FLOG:
1500 case ISD::FLOG10:
1501 if (SDValue Lowered = LowerFLOGCommon(Op: SDValue(N, 0), DAG))
1502 Results.push_back(Elt: Lowered);
1503 return;
1504 case ISD::FEXP2:
1505 if (SDValue Lowered = lowerFEXP2(Op: SDValue(N, 0), DAG))
1506 Results.push_back(Elt: Lowered);
1507 return;
1508 case ISD::FEXP:
1509 case ISD::FEXP10:
1510 if (SDValue Lowered = lowerFEXP(Op: SDValue(N, 0), DAG))
1511 Results.push_back(Elt: Lowered);
1512 return;
1513 case ISD::CTLZ:
1514 case ISD::CTLZ_ZERO_UNDEF:
1515 if (auto Lowered = lowerCTLZResults(Op: SDValue(N, 0u), DAG))
1516 Results.push_back(Elt: Lowered);
1517 return;
1518 default:
1519 return;
1520 }
1521}
1522
// Lower a GlobalAddress node. Handles LDS (local/region address space)
// globals specially: globals with a pre-assigned absolute LDS address (and
// named barriers) become plain constants; other LDS globals reachable only
// from kernels get an offset allocated via allocateLDSGlobal. Returns an
// empty SDValue for address spaces this routine does not handle, letting the
// caller fall back to default handling.
SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Val&: Op);
  const GlobalValue *GV = G->getGlobal();

  if (!MFI->isModuleEntryFunction()) {
    auto IsNamedBarrier = AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV));
    if (std::optional<uint32_t> Address =
            AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(GV: *GV)) {
      if (IsNamedBarrier) {
        // Each named barrier occupies 16 bytes; record how many this global
        // covers so later passes know the barrier count at this address.
        unsigned BarCnt = cast<GlobalVariable>(Val: GV)->getGlobalSize(DL) / 16;
        MFI->recordNumNamedBarriers(GVAddr: Address.value(), BarCnt);
      }
      // The LDS placement pass already assigned an absolute address; fold the
      // address directly into a constant.
      return DAG.getConstant(Val: *Address, DL: SDLoc(Op), VT: Op.getValueType());
    } else if (IsNamedBarrier) {
      llvm_unreachable("named barrier should have an assigned address");
    }
  }

  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
      G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        GV->getName() != "llvm.amdgcn.module.lds" &&
        !AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV))) {
      SDLoc DL(Op);
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
          Fn, "local memory global used by non-kernel function",
          DL.getDebugLoc(), DS_Warning));

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      SDValue Trap = DAG.getNode(Opcode: ISD::TRAP, DL, VT: MVT::Other, Operand: DAG.getEntryNode());
      SDValue OutputChain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other,
                                        N1: Trap, N2: DAG.getRoot());
      DAG.setRoot(OutputChain);
      return DAG.getPOISON(VT: Op.getValueType());
    }

    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with an non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    unsigned Offset = MFI->allocateLDSGlobal(DL, GV: *cast<GlobalVariable>(Val: GV));
    return DAG.getConstant(Val: Offset, DL: SDLoc(Op), VT: Op.getValueType());
  }
  return SDValue();
}
1580
// Lower CONCAT_VECTORS by scalarizing into a BUILD_VECTOR. When the element
// type is narrower than 32 bits but each source operand is a whole number of
// 32-bit registers, concatenate i32-sized pieces instead and bitcast back,
// which keeps the pieces in 32-bit registers.
SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;
  SDLoc SL(Op);

  EVT VT = Op.getValueType();
  if (VT.getVectorElementType().getSizeInBits() < 32) {
    unsigned OpBitSize = Op.getOperand(i: 0).getValueType().getSizeInBits();
    if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
      // Reinterpret each operand as i32 (or a vector of i32) and collect the
      // resulting 32-bit elements.
      unsigned NewNumElt = OpBitSize / 32;
      EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
                                      : EVT::getVectorVT(Context&: *DAG.getContext(),
                                                         VT: MVT::i32, NumElements: NewNumElt);
      for (const SDUse &U : Op->ops()) {
        SDValue In = U.get();
        SDValue NewIn = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewEltVT, Operand: In);
        if (NewNumElt > 1)
          DAG.ExtractVectorElements(Op: NewIn, Args);
        else
          Args.push_back(Elt: NewIn);
      }

      EVT NewVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
                                   NumElements: NewNumElt * Op.getNumOperands());
      SDValue BV = DAG.getBuildVector(VT: NewVT, DL: SL, Ops: Args);
      return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: BV);
    }
  }

  // Generic path: extract every element of every operand and rebuild.
  for (const SDUse &U : Op->ops())
    DAG.ExtractVectorElements(Op: U.get(), Args);

  return DAG.getBuildVector(VT: Op.getValueType(), DL: SL, Ops: Args);
}
1615
// Lower EXTRACT_SUBVECTOR by scalarizing into a BUILD_VECTOR. For 16-bit
// element vectors with a 32-bit-aligned start index, extract pairs of
// elements as i32 registers instead of individual 16-bit elements.
SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SmallVector<SDValue, 8> Args;
  unsigned Start = Op.getConstantOperandVal(i: 1);
  EVT VT = Op.getValueType();
  EVT SrcVT = Op.getOperand(i: 0).getValueType();

  if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
    unsigned NumElt = VT.getVectorNumElements();
    unsigned NumSrcElt = SrcVT.getVectorNumElements();
    assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");

    // Extract 32-bit registers at a time.
    EVT NewSrcVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: NumSrcElt / 2);
    EVT NewVT = NumElt == 2
                    ? MVT::i32
                    : EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: NumElt / 2);
    SDValue Tmp = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewSrcVT, Operand: Op.getOperand(i: 0));

    DAG.ExtractVectorElements(Op: Tmp, Args, Start: Start / 2, Count: NumElt / 2);
    if (NumElt == 2)
      Tmp = Args[0];
    else
      Tmp = DAG.getBuildVector(VT: NewVT, DL: SL, Ops: Args);

    return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Tmp);
  }

  // Generic path: extract the requested element range and rebuild.
  DAG.ExtractVectorElements(Op: Op.getOperand(i: 0), Args, Start,
                            Count: VT.getVectorNumElements());

  return DAG.getBuildVector(VT: Op.getValueType(), DL: SL, Ops: Args);
}
1650
1651// TODO: Handle fabs too
1652static SDValue peekFNeg(SDValue Val) {
1653 if (Val.getOpcode() == ISD::FNEG)
1654 return Val.getOperand(i: 0);
1655
1656 return Val;
1657}
1658
1659static SDValue peekFPSignOps(SDValue Val) {
1660 if (Val.getOpcode() == ISD::FNEG)
1661 Val = Val.getOperand(i: 0);
1662 if (Val.getOpcode() == ISD::FABS)
1663 Val = Val.getOperand(i: 0);
1664 if (Val.getOpcode() == ISD::FCOPYSIGN)
1665 Val = Val.getOperand(i: 0);
1666 return Val;
1667}
1668
// Try to fold select(setcc(LHS, RHS, CC), True, False) into an
// FMIN_LEGACY/FMAX_LEGACY node, assuming the caller already verified that
// {True, False} == {LHS, RHS}. The legacy min/max ops return the second
// operand when the compare with NaN fails, so operands are permuted to match
// the required NaN behavior. Returns an empty SDValue when no fold applies.
SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
    const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
    SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val&: CC)->get();
  switch (CCOpcode) {
  // Equality and ordered/unordered tests don't map onto min/max semantics.
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    break;
  case ISD::SETULE:
  case ISD::SETULT: {
    // Unordered less-than: the NaN case selects False, so swap operands when
    // the true value is LHS.
    if (LHS == True)
      return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: RHS, N2: LHS);
    return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: LHS, N2: RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: LHS, N2: RHS);
    return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: RHS, N2: LHS);
  }
  case ISD::SETUGE:
  case ISD::SETUGT: {
    // Unordered greater-than: mirror of the SETULE/SETULT case.
    if (LHS == True)
      return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: RHS, N2: LHS);
    return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: LHS, N2: RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    // Ordered greater-than: mirror of the ordered less-than case, with the
    // same legalization-stage restriction.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: LHS, N2: RHS);
    return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: RHS, N2: LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}
1736
/// Generate Min/Max node from select(setcc(LHS, RHS, CC), True, False).
/// Directly matches when {True, False} is {LHS, RHS}; otherwise tries to
/// undo a prior fneg-folding combine so the pattern matches, wrapping the
/// resulting min/max in an fneg. Returns an empty SDValue when no combine
/// applies.
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
    return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);

  SelectionDAG &DAG = DCI.DAG;

  // If we can't directly match this, try to see if we can fold an fneg to
  // match.

  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
  ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(Val&: False);
  SDValue NegTrue = peekFNeg(Val: True);

  // Undo the combine foldFreeOpFromSelect does if it helps us match the
  // fmin/fmax.
  //
  // select (fcmp olt (lhs, K)), (fneg lhs), -K
  // -> fneg (fmin_legacy lhs, K)
  //
  // TODO: Use getNegatedExpression
  if (LHS == NegTrue && CFalse && CRHS) {
    APFloat NegRHS = neg(X: CRHS->getValueAPF());
    // Only fires when False is exactly -RHS, i.e. the select arms are the
    // negations of the compared values.
    if (NegRHS == CFalse->getValueAPF()) {
      SDValue Combined =
          combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True: NegTrue, False, CC, DCI);
      if (Combined)
        return DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: Combined);
      return SDValue();
    }
  }

  return SDValue();
}
1775
1776std::pair<SDValue, SDValue>
1777AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1778 SDLoc SL(Op);
1779
1780 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1781
1782 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
1783 const SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
1784
1785 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: Zero);
1786 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: One);
1787
1788 return std::pair(Lo, Hi);
1789}
1790
1791SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1792 SDLoc SL(Op);
1793
1794 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1795 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
1796 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: Zero);
1797}
1798
1799SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1800 SDLoc SL(Op);
1801
1802 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1803 const SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
1804 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: One);
1805}
1806
1807// Split a vector type into two parts. The first part is a power of two vector.
1808// The second part is whatever is left over, and is a scalar if it would
1809// otherwise be a 1-vector.
1810std::pair<EVT, EVT>
1811AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1812 EVT LoVT, HiVT;
1813 EVT EltVT = VT.getVectorElementType();
1814 unsigned NumElts = VT.getVectorNumElements();
1815 unsigned LoNumElts = PowerOf2Ceil(A: (NumElts + 1) / 2);
1816 LoVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: LoNumElts);
1817 HiVT = NumElts - LoNumElts == 1
1818 ? EltVT
1819 : EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NumElts - LoNumElts);
1820 return std::pair(LoVT, HiVT);
1821}
1822
// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
// scalar.
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
                                  const EVT &LoVT, const EVT &HiVT,
                                  SelectionDAG &DAG) const {
  EVT VT = N.getValueType();
  assert(LoVT.getVectorNumElements() +
                 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
             VT.getVectorNumElements() &&
         "More vector elements requested than available!");
  // The low part always starts at index 0, which is a legal subvector index.
  SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: LoVT, N1: N,
                           N2: DAG.getVectorIdxConstant(Val: 0, DL));

  unsigned LoNumElts = LoVT.getVectorNumElements();

  if (HiVT.isVector()) {
    unsigned HiNumElts = HiVT.getVectorNumElements();
    if ((VT.getVectorNumElements() % HiNumElts) == 0) {
      // Avoid creating an extract_subvector with an index that isn't a multiple
      // of the result type.
      SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HiVT, N1: N,
                               N2: DAG.getConstant(Val: LoNumElts, DL, VT: MVT::i32));
      return {Lo, Hi};
    }

    // Fallback for a misaligned start index: extract elements individually
    // and rebuild the high part.
    SmallVector<SDValue, 8> Elts;
    DAG.ExtractVectorElements(Op: N, Args&: Elts, /*Start=*/LoNumElts,
                              /*Count=*/HiNumElts);
    SDValue Hi = DAG.getBuildVector(VT: HiVT, DL, Ops: Elts);
    return {Lo, Hi};
  }

  // Scalar high part: just extract the single remaining element.
  SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: HiVT, N1: N,
                           N2: DAG.getVectorIdxConstant(Val: LoNumElts, DL));
  return {Lo, Hi};
}
1860
// Split a vector load into two smaller loads (or two scalar loads for the
// 2-element case) and join the results back into the original vector type.
// Returns the merged {value, chain} pair.
SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Val: Op);
  EVT VT = Op.getValueType();
  SDLoc SL(Op);


  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2) {
    SDValue Ops[2];
    std::tie(args&: Ops[0], args&: Ops[1]) = scalarizeVectorLoad(LD: Load, DAG);
    return DAG.getMergeValues(Ops, dl: SL);
  }

  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  // Split both the register type and memory type (they may differ for
  // extending loads).
  std::tie(args&: LoVT, args&: HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(args&: LoMemVT, args&: HiMemVT) = getSplitDestVTs(VT: MemVT, DAG);
  std::tie(args&: Lo, args&: Hi) = splitVector(N: Op, DL: SL, LoVT, HiVT, DAG);

  // The high load's alignment is derived from the base alignment and the
  // byte offset of the low half.
  unsigned Size = LoMemVT.getStoreSize();
  Align BaseAlign = Load->getAlign();
  Align HiAlign = commonAlignment(A: BaseAlign, Offset: Size);

  SDValue LoLoad = DAG.getExtLoad(
      ExtType: Load->getExtensionType(), dl: SL, VT: LoVT, Chain: Load->getChain(), Ptr: BasePtr, PtrInfo: SrcValue,
      MemVT: LoMemVT, Alignment: BaseAlign, MMOFlags: Load->getMemOperand()->getFlags(), AAInfo: Load->getAAInfo());
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Size));
  SDValue HiLoad = DAG.getExtLoad(
      ExtType: Load->getExtensionType(), dl: SL, VT: HiVT, Chain: Load->getChain(), Ptr: HiPtr,
      PtrInfo: SrcValue.getWithOffset(O: LoMemVT.getStoreSize()), MemVT: HiMemVT, Alignment: HiAlign,
      MMOFlags: Load->getMemOperand()->getFlags(), AAInfo: Load->getAAInfo());

  SDValue Join;
  if (LoVT == HiVT) {
    // This is the case that the vector is power of two so was evenly split.
    Join = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT, N1: LoLoad, N2: HiLoad);
  } else {
    // Uneven split: insert the pieces into a poison vector; the high part may
    // be a subvector or a single scalar element.
    Join = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: SL, VT, N1: DAG.getPOISON(VT), N2: LoLoad,
                       N3: DAG.getVectorIdxConstant(Val: 0, DL: SL));
    Join = DAG.getNode(
        Opcode: HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, DL: SL,
        VT, N1: Join, N2: HiLoad,
        N3: DAG.getVectorIdxConstant(Val: LoVT.getVectorNumElements(), DL: SL));
  }

  // Combine the two load chains so both loads stay ordered with later users.
  SDValue Ops[] = {Join, DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other,
                                     N1: LoLoad.getValue(R: 1), N2: HiLoad.getValue(R: 1))};

  return DAG.getMergeValues(Ops, dl: SL);
}
1920
// Legalize a 3-element vector load by widening it to a 4-element load when it
// is provably safe to read the extra element (sufficient alignment or known
// dereferenceable range); otherwise fall back to splitting the load.
SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
                                                     SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
  EVT VT = Op.getValueType();
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);
  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
  Align BaseAlign = Load->getAlign();
  unsigned NumElements = MemVT.getVectorNumElements();

  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
  if (NumElements != 3 ||
      (BaseAlign < Align(8) &&
       !SrcValue.isDereferenceable(Size: 16, C&: *DAG.getContext(), DL: DAG.getDataLayout())))
    return SplitVectorLoad(Op, DAG);

  assert(NumElements == 3);

  EVT WideVT =
      EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: 4);
  EVT WideMemVT =
      EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(), NumElements: 4);
  // Load 4 elements, then drop the extra one with an extract_subvector; the
  // chain comes from the widened load.
  SDValue WideLoad = DAG.getExtLoad(
      ExtType: Load->getExtensionType(), dl: SL, VT: WideVT, Chain: Load->getChain(), Ptr: BasePtr, PtrInfo: SrcValue,
      MemVT: WideMemVT, Alignment: BaseAlign, MMOFlags: Load->getMemOperand()->getFlags());
  return DAG.getMergeValues(
      Ops: {DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT, N1: WideLoad,
                   N2: DAG.getVectorIdxConstant(Val: 0, DL: SL)),
       WideLoad.getValue(R: 1)},
      dl: SL);
}
1954
// Split a vector store into two smaller stores (or scalarize for the
// 2-element case). Returns a TokenFactor joining the two store chains.
SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
  SDValue Val = Store->getValue();
  EVT VT = Val.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return scalarizeVectorStore(ST: Store, DAG);

  EVT MemVT = Store->getMemoryVT();
  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  SDLoc SL(Op);

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  // Split both the register type and memory type (they may differ for
  // truncating stores), then split the stored value itself.
  std::tie(args&: LoVT, args&: HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(args&: LoMemVT, args&: HiMemVT) = getSplitDestVTs(VT: MemVT, DAG);
  std::tie(args&: Lo, args&: Hi) = splitVector(N: Val, DL: SL, LoVT, HiVT, DAG);

  SDValue HiPtr = DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: LoMemVT.getStoreSize());

  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
  Align BaseAlign = Store->getAlign();
  // The high store's alignment follows from the base alignment and the byte
  // offset of the low half.
  unsigned Size = LoMemVT.getStoreSize();
  Align HiAlign = commonAlignment(A: BaseAlign, Offset: Size);

  SDValue LoStore =
      DAG.getTruncStore(Chain, dl: SL, Val: Lo, Ptr: BasePtr, PtrInfo: SrcValue, SVT: LoMemVT, Alignment: BaseAlign,
                        MMOFlags: Store->getMemOperand()->getFlags(), AAInfo: Store->getAAInfo());
  SDValue HiStore = DAG.getTruncStore(
      Chain, dl: SL, Val: Hi, Ptr: HiPtr, PtrInfo: SrcValue.getWithOffset(O: Size), SVT: HiMemVT, Alignment: HiAlign,
      MMOFlags: Store->getMemOperand()->getFlags(), AAInfo: Store->getAAInfo());

  return DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: LoStore, N2: HiStore);
}
1995
// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The fractional part of a
// float is enough to accurately represent up to a 24-bit signed integer.
//
// \p Sign selects signed (true) vs. unsigned (false) division. Returns a
// merged {quotient, remainder} pair, or an empty SDValue when either operand
// may need more than 24 bits (i.e. fewer than 9 known sign bits).
SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
                                            bool Sign) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(i: 0);
  SDValue RHS = Op.getOperand(i: 1);
  MVT IntVT = MVT::i32;
  MVT FltVT = MVT::f32;

  // Require at least 9 known sign bits on both operands so the magnitudes fit
  // in 24 bits and convert exactly to f32.
  unsigned LHSSignBits = DAG.ComputeNumSignBits(Op: LHS);
  if (LHSSignBits < 9)
    return SDValue();

  unsigned RHSSignBits = DAG.ComputeNumSignBits(Op: RHS);
  if (RHSSignBits < 9)
    return SDValue();

  unsigned BitSize = VT.getSizeInBits();
  unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
  unsigned DivBits = BitSize - SignBits;
  if (Sign)
    ++DivBits;

  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;

  // jq is the +/-1 rounding-correction term added to the truncated quotient;
  // for unsigned division it is always +1 (applied conditionally below).
  SDValue jq = DAG.getConstant(Val: 1, DL, VT: IntVT);

  if (Sign) {
    // char|short jq = ia ^ ib;
    jq = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHS, N2: RHS);

    // jq = jq >> (bitsize - 2)
    jq = DAG.getNode(Opcode: ISD::SRA, DL, VT, N1: jq,
                     N2: DAG.getConstant(Val: BitSize - 2, DL, VT));

    // jq = jq | 0x1
    jq = DAG.getNode(Opcode: ISD::OR, DL, VT, N1: jq, N2: DAG.getConstant(Val: 1, DL, VT));
  }

  // int ia = (int)LHS;
  SDValue ia = LHS;

  // int ib, (int)RHS;
  SDValue ib = RHS;

  // float fa = (float)ia;
  SDValue fa = DAG.getNode(Opcode: ToFp, DL, VT: FltVT, Operand: ia);

  // float fb = (float)ib;
  SDValue fb = DAG.getNode(Opcode: ToFp, DL, VT: FltVT, Operand: ib);

  // Approximate quotient: fa * rcp(fb).
  SDValue fq = DAG.getNode(Opcode: ISD::FMUL, DL, VT: FltVT,
                           N1: fa, N2: DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT: FltVT, Operand: fb));

  // fq = trunc(fq);
  fq = DAG.getNode(Opcode: ISD::FTRUNC, DL, VT: FltVT, Operand: fq);

  // float fqneg = -fq;
  SDValue fqneg = DAG.getNode(Opcode: ISD::FNEG, DL, VT: FltVT, Operand: fq);

  MachineFunction &MF = DAG.getMachineFunction();

  // Pick the FMA-like opcode based on mad/mac availability and the f32
  // denormal mode (FMAD_FTZ flushes denormals to zero).
  bool UseFmadFtz = false;
  if (Subtarget->isGCN()) {
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    UseFmadFtz =
        MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
  }

  // float fr = mad(fqneg, fb, fa);
  unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
                    : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
                                 : (unsigned)ISD::FMAD;
  SDValue fr = DAG.getNode(Opcode: OpCode, DL, VT: FltVT, N1: fqneg, N2: fb, N3: fa);

  // int iq = (int)fq;
  SDValue iq = DAG.getNode(Opcode: ToInt, DL, VT: IntVT, Operand: fq);

  // fr = fabs(fr);
  fr = DAG.getNode(Opcode: ISD::FABS, DL, VT: FltVT, Operand: fr);

  // fb = fabs(fb);
  fb = DAG.getNode(Opcode: ISD::FABS, DL, VT: FltVT, Operand: fb);

  EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);

  // int cv = fr >= fb;
  SDValue cv = DAG.getSetCC(DL, VT: SetCCVT, LHS: fr, RHS: fb, Cond: ISD::SETOGE);

  // jq = (cv ? jq : 0);
  jq = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: cv, N2: jq, N3: DAG.getConstant(Val: 0, DL, VT));

  // dst = iq + jq;
  SDValue Div = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: iq, N2: jq);

  // Rem needs compensation, it's easier to recompute it
  SDValue Rem = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Div, N2: RHS);
  Rem = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: LHS, N2: Rem);

  // Truncate to number of bits this divide really is.
  if (Sign) {
    SDValue InRegSize
      = DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: DivBits));
    Div = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Div, N2: InRegSize);
    Rem = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Rem, N2: InRegSize);
  } else {
    SDValue TruncMask = DAG.getConstant(Val: (UINT64_C(1) << DivBits) - 1, DL, VT);
    Div = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Div, N2: TruncMask);
    Rem = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Rem, N2: TruncMask);
  }

  return DAG.getMergeValues(Ops: { Div, Rem }, dl: DL);
}
2113
2114void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2115 SelectionDAG &DAG,
2116 SmallVectorImpl<SDValue> &Results) const {
2117 SDLoc DL(Op);
2118 EVT VT = Op.getValueType();
2119
2120 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2121
2122 EVT HalfVT = VT.getHalfSizedIntegerVT(Context&: *DAG.getContext());
2123
2124 SDValue One = DAG.getConstant(Val: 1, DL, VT: HalfVT);
2125 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: HalfVT);
2126
2127 //HiLo split
2128 SDValue LHS_Lo, LHS_Hi;
2129 SDValue LHS = Op.getOperand(i: 0);
2130 std::tie(args&: LHS_Lo, args&: LHS_Hi) = DAG.SplitScalar(N: LHS, DL, LoVT: HalfVT, HiVT: HalfVT);
2131
2132 SDValue RHS_Lo, RHS_Hi;
2133 SDValue RHS = Op.getOperand(i: 1);
2134 std::tie(args&: RHS_Lo, args&: RHS_Hi) = DAG.SplitScalar(N: RHS, DL, LoVT: HalfVT, HiVT: HalfVT);
2135
2136 if (DAG.MaskedValueIsZero(Op: RHS, Mask: APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32)) &&
2137 DAG.MaskedValueIsZero(Op: LHS, Mask: APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32))) {
2138
2139 SDValue Res = DAG.getNode(Opcode: ISD::UDIVREM, DL, VTList: DAG.getVTList(VT1: HalfVT, VT2: HalfVT),
2140 N1: LHS_Lo, N2: RHS_Lo);
2141
2142 SDValue DIV = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Res.getValue(R: 0), Zero});
2143 SDValue REM = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Res.getValue(R: 1), Zero});
2144
2145 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: DIV));
2146 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: REM));
2147 return;
2148 }
2149
2150 if (isTypeLegal(VT: MVT::i64)) {
2151 // The algorithm here is based on ideas from "Software Integer Division",
2152 // Tom Rodeheffer, August 2008.
2153
2154 MachineFunction &MF = DAG.getMachineFunction();
2155 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2156
2157 // Compute denominator reciprocal.
2158 unsigned FMAD =
2159 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2160 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2161 ? (unsigned)ISD::FMAD
2162 : (unsigned)AMDGPUISD::FMAD_FTZ;
2163
2164 SDValue Cvt_Lo = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: MVT::f32, Operand: RHS_Lo);
2165 SDValue Cvt_Hi = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: MVT::f32, Operand: RHS_Hi);
2166 SDValue Mad1 = DAG.getNode(Opcode: FMAD, DL, VT: MVT::f32, N1: Cvt_Hi,
2167 N2: DAG.getConstantFP(Val: APInt(32, 0x4f800000).bitsToFloat(), DL, VT: MVT::f32),
2168 N3: Cvt_Lo);
2169 SDValue Rcp = DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT: MVT::f32, Operand: Mad1);
2170 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f32, N1: Rcp,
2171 N2: DAG.getConstantFP(Val: APInt(32, 0x5f7ffffc).bitsToFloat(), DL, VT: MVT::f32));
2172 SDValue Mul2 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f32, N1: Mul1,
2173 N2: DAG.getConstantFP(Val: APInt(32, 0x2f800000).bitsToFloat(), DL, VT: MVT::f32));
2174 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL, VT: MVT::f32, Operand: Mul2);
2175 SDValue Mad2 = DAG.getNode(Opcode: FMAD, DL, VT: MVT::f32, N1: Trunc,
2176 N2: DAG.getConstantFP(Val: APInt(32, 0xcf800000).bitsToFloat(), DL, VT: MVT::f32),
2177 N3: Mul1);
2178 SDValue Rcp_Lo = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL, VT: HalfVT, Operand: Mad2);
2179 SDValue Rcp_Hi = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL, VT: HalfVT, Operand: Trunc);
2180 SDValue Rcp64 = DAG.getBitcast(VT,
2181 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Rcp_Lo, Rcp_Hi}));
2182
2183 SDValue Zero64 = DAG.getConstant(Val: 0, DL, VT);
2184 SDValue One64 = DAG.getConstant(Val: 1, DL, VT);
2185 SDValue Zero1 = DAG.getConstant(Val: 0, DL, VT: MVT::i1);
2186 SDVTList HalfCarryVT = DAG.getVTList(VT1: HalfVT, VT2: MVT::i1);
2187
2188 // First round of UNR (Unsigned integer Newton-Raphson).
2189 SDValue Neg_RHS = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero64, N2: RHS);
2190 SDValue Mullo1 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Neg_RHS, N2: Rcp64);
2191 SDValue Mulhi1 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Rcp64, N2: Mullo1);
2192 SDValue Mulhi1_Lo, Mulhi1_Hi;
2193 std::tie(args&: Mulhi1_Lo, args&: Mulhi1_Hi) =
2194 DAG.SplitScalar(N: Mulhi1, DL, LoVT: HalfVT, HiVT: HalfVT);
2195 SDValue Add1_Lo = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Rcp_Lo,
2196 N2: Mulhi1_Lo, N3: Zero1);
2197 SDValue Add1_Hi = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Rcp_Hi,
2198 N2: Mulhi1_Hi, N3: Add1_Lo.getValue(R: 1));
2199 SDValue Add1 = DAG.getBitcast(VT,
2200 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Add1_Lo, Add1_Hi}));
2201
2202 // Second round of UNR.
2203 SDValue Mullo2 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Neg_RHS, N2: Add1);
2204 SDValue Mulhi2 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Add1, N2: Mullo2);
2205 SDValue Mulhi2_Lo, Mulhi2_Hi;
2206 std::tie(args&: Mulhi2_Lo, args&: Mulhi2_Hi) =
2207 DAG.SplitScalar(N: Mulhi2, DL, LoVT: HalfVT, HiVT: HalfVT);
2208 SDValue Add2_Lo = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Add1_Lo,
2209 N2: Mulhi2_Lo, N3: Zero1);
2210 SDValue Add2_Hi = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Add1_Hi,
2211 N2: Mulhi2_Hi, N3: Add2_Lo.getValue(R: 1));
2212 SDValue Add2 = DAG.getBitcast(VT,
2213 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Add2_Lo, Add2_Hi}));
2214
2215 SDValue Mulhi3 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: LHS, N2: Add2);
2216
2217 SDValue Mul3 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: RHS, N2: Mulhi3);
2218
2219 SDValue Mul3_Lo, Mul3_Hi;
2220 std::tie(args&: Mul3_Lo, args&: Mul3_Hi) = DAG.SplitScalar(N: Mul3, DL, LoVT: HalfVT, HiVT: HalfVT);
2221 SDValue Sub1_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: LHS_Lo,
2222 N2: Mul3_Lo, N3: Zero1);
2223 SDValue Sub1_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: LHS_Hi,
2224 N2: Mul3_Hi, N3: Sub1_Lo.getValue(R: 1));
2225 SDValue Sub1_Mi = DAG.getNode(Opcode: ISD::SUB, DL, VT: HalfVT, N1: LHS_Hi, N2: Mul3_Hi);
2226 SDValue Sub1 = DAG.getBitcast(VT,
2227 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub1_Lo, Sub1_Hi}));
2228
2229 SDValue MinusOne = DAG.getConstant(Val: 0xffffffffu, DL, VT: HalfVT);
2230 SDValue C1 = DAG.getSelectCC(DL, LHS: Sub1_Hi, RHS: RHS_Hi, True: MinusOne, False: Zero,
2231 Cond: ISD::SETUGE);
2232 SDValue C2 = DAG.getSelectCC(DL, LHS: Sub1_Lo, RHS: RHS_Lo, True: MinusOne, False: Zero,
2233 Cond: ISD::SETUGE);
2234 SDValue C3 = DAG.getSelectCC(DL, LHS: Sub1_Hi, RHS: RHS_Hi, True: C2, False: C1, Cond: ISD::SETEQ);
2235
2236 // TODO: Here and below portions of the code can be enclosed into if/endif.
2237 // Currently control flow is unconditional and we have 4 selects after
2238 // potential endif to substitute PHIs.
2239
2240 // if C3 != 0 ...
2241 SDValue Sub2_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub1_Lo,
2242 N2: RHS_Lo, N3: Zero1);
2243 SDValue Sub2_Mi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub1_Mi,
2244 N2: RHS_Hi, N3: Sub1_Lo.getValue(R: 1));
2245 SDValue Sub2_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Mi,
2246 N2: Zero, N3: Sub2_Lo.getValue(R: 1));
2247 SDValue Sub2 = DAG.getBitcast(VT,
2248 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub2_Lo, Sub2_Hi}));
2249
2250 SDValue Add3 = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Mulhi3, N2: One64);
2251
2252 SDValue C4 = DAG.getSelectCC(DL, LHS: Sub2_Hi, RHS: RHS_Hi, True: MinusOne, False: Zero,
2253 Cond: ISD::SETUGE);
2254 SDValue C5 = DAG.getSelectCC(DL, LHS: Sub2_Lo, RHS: RHS_Lo, True: MinusOne, False: Zero,
2255 Cond: ISD::SETUGE);
2256 SDValue C6 = DAG.getSelectCC(DL, LHS: Sub2_Hi, RHS: RHS_Hi, True: C5, False: C4, Cond: ISD::SETEQ);
2257
2258 // if (C6 != 0)
2259 SDValue Add4 = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Add3, N2: One64);
2260
2261 SDValue Sub3_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Lo,
2262 N2: RHS_Lo, N3: Zero1);
2263 SDValue Sub3_Mi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Mi,
2264 N2: RHS_Hi, N3: Sub2_Lo.getValue(R: 1));
2265 SDValue Sub3_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub3_Mi,
2266 N2: Zero, N3: Sub3_Lo.getValue(R: 1));
2267 SDValue Sub3 = DAG.getBitcast(VT,
2268 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub3_Lo, Sub3_Hi}));
2269
2270 // endif C6
2271 // endif C3
2272
2273 SDValue Sel1 = DAG.getSelectCC(DL, LHS: C6, RHS: Zero, True: Add4, False: Add3, Cond: ISD::SETNE);
2274 SDValue Div = DAG.getSelectCC(DL, LHS: C3, RHS: Zero, True: Sel1, False: Mulhi3, Cond: ISD::SETNE);
2275
2276 SDValue Sel2 = DAG.getSelectCC(DL, LHS: C6, RHS: Zero, True: Sub3, False: Sub2, Cond: ISD::SETNE);
2277 SDValue Rem = DAG.getSelectCC(DL, LHS: C3, RHS: Zero, True: Sel2, False: Sub1, Cond: ISD::SETNE);
2278
2279 Results.push_back(Elt: Div);
2280 Results.push_back(Elt: Rem);
2281
2282 return;
2283 }
2284
  // r600 expansion.
2286 // Get Speculative values
2287 SDValue DIV_Part = DAG.getNode(Opcode: ISD::UDIV, DL, VT: HalfVT, N1: LHS_Hi, N2: RHS_Lo);
2288 SDValue REM_Part = DAG.getNode(Opcode: ISD::UREM, DL, VT: HalfVT, N1: LHS_Hi, N2: RHS_Lo);
2289
2290 SDValue REM_Lo = DAG.getSelectCC(DL, LHS: RHS_Hi, RHS: Zero, True: REM_Part, False: LHS_Hi, Cond: ISD::SETEQ);
2291 SDValue REM = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {REM_Lo, Zero});
2292 REM = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: REM);
2293
2294 SDValue DIV_Hi = DAG.getSelectCC(DL, LHS: RHS_Hi, RHS: Zero, True: DIV_Part, False: Zero, Cond: ISD::SETEQ);
2295 SDValue DIV_Lo = Zero;
2296
2297 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2298
2299 for (unsigned i = 0; i < halfBitWidth; ++i) {
2300 const unsigned bitPos = halfBitWidth - i - 1;
2301 SDValue POS = DAG.getConstant(Val: bitPos, DL, VT: HalfVT);
2302 // Get value of high bit
2303 SDValue HBit = DAG.getNode(Opcode: ISD::SRL, DL, VT: HalfVT, N1: LHS_Lo, N2: POS);
2304 HBit = DAG.getNode(Opcode: ISD::AND, DL, VT: HalfVT, N1: HBit, N2: One);
2305 HBit = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: HBit);
2306
2307 // Shift
2308 REM = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: REM, N2: DAG.getConstant(Val: 1, DL, VT));
2309 // Add LHS high bit
2310 REM = DAG.getNode(Opcode: ISD::OR, DL, VT, N1: REM, N2: HBit);
2311
2312 SDValue BIT = DAG.getConstant(Val: 1ULL << bitPos, DL, VT: HalfVT);
2313 SDValue realBIT = DAG.getSelectCC(DL, LHS: REM, RHS, True: BIT, False: Zero, Cond: ISD::SETUGE);
2314
2315 DIV_Lo = DAG.getNode(Opcode: ISD::OR, DL, VT: HalfVT, N1: DIV_Lo, N2: realBIT);
2316
2317 // Update REM
2318 SDValue REM_sub = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: REM, N2: RHS);
2319 REM = DAG.getSelectCC(DL, LHS: REM, RHS, True: REM_sub, False: REM, Cond: ISD::SETUGE);
2320 }
2321
2322 SDValue DIV = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {DIV_Lo, DIV_Hi});
2323 DIV = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: DIV);
2324 Results.push_back(Elt: DIV);
2325 Results.push_back(Elt: REM);
2326}
2327
// Lower unsigned i32/i64 UDIVREM. i64 is delegated to LowerUDIVREM64; i32
// first tries the 24-bit fast path, then falls back to an expansion built on
// a hardware reciprocal estimate refined by one Newton-Raphson step and two
// quotient/remainder correction rounds. Returns the merged {quotient,
// remainder} pair.
SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::i64) {
    SmallVector<SDValue, 2> Results;
    LowerUDIVREM64(Op, DAG, Results);
    return DAG.getMergeValues(Ops: Results, dl: DL);
  }

  if (VT == MVT::i32) {
    // Fast path when both operands are known to fit in 24 bits.
    if (SDValue Res = LowerDIVREM24(Op, DAG, Sign: false))
      return Res;
  }

  SDValue X = Op.getOperand(i: 0);
  SDValue Y = Op.getOperand(i: 1);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  SDValue Z = DAG.getNode(Opcode: AMDGPUISD::URECIP, DL, VT, Operand: Y);

  // One round of UNR (unsigned Newton-Raphson) to sharpen the reciprocal:
  // z += mulhi(z, -y * z).
  SDValue NegY = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), N2: Y);
  SDValue NegYZ = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: NegY, N2: Z);
  Z = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Z,
                  N2: DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Z, N2: NegYZ));

  // Quotient/remainder estimate: q = mulhi(x, z), r = x - q * y.
  SDValue Q = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: X, N2: Z);
  SDValue R =
      DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: X, N2: DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Q, N2: Y));

  // First quotient/remainder refinement: while r >= y, bump q and reduce r.
  EVT CCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
  SDValue One = DAG.getConstant(Val: 1, DL, VT);
  SDValue Cond = DAG.getSetCC(DL, VT: CCVT, LHS: R, RHS: Y, Cond: ISD::SETUGE);
  Q = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
                  N2: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Q, N2: One), N3: Q);
  R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
                  N2: DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: R, N2: Y), N3: R);

  // Second quotient/remainder refinement; the estimate can be off by at most
  // two, so two correction rounds suffice.
  Cond = DAG.getSetCC(DL, VT: CCVT, LHS: R, RHS: Y, Cond: ISD::SETUGE);
  Q = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
                  N2: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Q, N2: One), N3: Q);
  R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
                  N2: DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: R, N2: Y), N3: R);

  return DAG.getMergeValues(Ops: {Q, R}, dl: DL);
}
2382
// Lower signed SDIVREM by reducing to the unsigned form: take absolute
// values, do an unsigned divrem, then restore the signs (quotient sign is
// lhs^rhs, remainder sign follows the lhs). i32 gets the 24-bit fast path;
// i64 with both operands sign-extendable from 32 bits is narrowed to a
// 32-bit SDIVREM.
SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(i: 0);
  SDValue RHS = Op.getOperand(i: 1);

  SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
  SDValue NegOne = DAG.getAllOnesConstant(DL, VT);

  if (VT == MVT::i32) {
    if (SDValue Res = LowerDIVREM24(Op, DAG, Sign: true))
      return Res;
  }

  if (VT == MVT::i64 &&
      DAG.ComputeNumSignBits(Op: LHS) > 32 &&
      DAG.ComputeNumSignBits(Op: RHS) > 32) {
    EVT HalfVT = VT.getHalfSizedIntegerVT(Context&: *DAG.getContext());

    // Both operands fit in 32 bits: divide the low halves and sign-extend the
    // results back to 64 bits.
    SDValue LHS_Lo = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL, VT: HalfVT, N1: LHS, N2: Zero);
    SDValue RHS_Lo = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL, VT: HalfVT, N1: RHS, N2: Zero);
    SDValue DIVREM = DAG.getNode(Opcode: ISD::SDIVREM, DL, VTList: DAG.getVTList(VT1: HalfVT, VT2: HalfVT),
                                 N1: LHS_Lo, N2: RHS_Lo);
    SDValue Res[2] = {
      DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: DIVREM.getValue(R: 0)),
      DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: DIVREM.getValue(R: 1))
    };
    return DAG.getMergeValues(Ops: Res, dl: DL);
  }

  // Sign masks: all-ones when the operand is negative, zero otherwise.
  SDValue LHSign = DAG.getSelectCC(DL, LHS, RHS: Zero, True: NegOne, False: Zero, Cond: ISD::SETLT);
  SDValue RHSign = DAG.getSelectCC(DL, LHS: RHS, RHS: Zero, True: NegOne, False: Zero, Cond: ISD::SETLT);
  SDValue DSign = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHSign, N2: RHSign);
  SDValue RSign = LHSign; // Remainder sign is the same as LHS

  // abs(x) computed branchlessly as (x + mask) ^ mask.
  LHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: LHS, N2: LHSign);
  RHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: RHSign);

  LHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHS, N2: LHSign);
  RHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: RHS, N2: RHSign);

  SDValue Div = DAG.getNode(Opcode: ISD::UDIVREM, DL, VTList: DAG.getVTList(VT1: VT, VT2: VT), N1: LHS, N2: RHS);
  SDValue Rem = Div.getValue(R: 1);

  // Reapply the signs with the inverse trick: (v ^ mask) - mask.
  Div = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: Div, N2: DSign);
  Rem = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: Rem, N2: RSign);

  Div = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Div, N2: DSign);
  Rem = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Rem, N2: RSign);

  SDValue Res[2] = {
    Div,
    Rem
  };
  return DAG.getMergeValues(Ops: Res, dl: DL);
}
2442
2443SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2444 SDLoc SL(Op);
2445 SDValue Src = Op.getOperand(i: 0);
2446
2447 // result = trunc(src)
2448 // if (src > 0.0 && src != result)
2449 // result += 1.0
2450
2451 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: MVT::f64, Operand: Src);
2452
2453 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT: MVT::f64);
2454 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f64);
2455
2456 EVT SetCCVT =
2457 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2458
2459 SDValue Lt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Zero, Cond: ISD::SETOGT);
2460 SDValue NeTrunc = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Trunc, Cond: ISD::SETONE);
2461 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SetCCVT, N1: Lt0, N2: NeTrunc);
2462
2463 SDValue Add = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f64, N1: And, N2: One, N3: Zero);
2464 // TODO: Should this propagate fast-math-flags?
2465 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Trunc, N2: Add);
2466}
2467
2468static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2469 SelectionDAG &DAG) {
2470 const unsigned FractBits = 52;
2471 const unsigned ExpBits = 11;
2472
2473 SDValue ExpPart = DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT: MVT::i32,
2474 N1: Hi,
2475 N2: DAG.getConstant(Val: FractBits - 32, DL: SL, VT: MVT::i32),
2476 N3: DAG.getConstant(Val: ExpBits, DL: SL, VT: MVT::i32));
2477 SDValue Exp = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: ExpPart,
2478 N2: DAG.getConstant(Val: 1023, DL: SL, VT: MVT::i32));
2479
2480 return Exp;
2481}
2482
// Lower f64 FTRUNC with integer bit manipulation: clear all mantissa bits
// below the binary point, as determined by the unbiased exponent. Exponents
// < 0 mean |src| < 1 so the result collapses to signed zero; exponents > 51
// mean the value is already integral (or inf/nan) and passes through.
SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(i: 0);

  assert(Op.getValueType() == MVT::f64);

  const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  SDValue Hi = getHiHalf64(Op: Src, DAG);

  SDValue Exp = extractF64Exponent(Hi, SL, DAG);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, DL: SL, VT: MVT::i32);
  SDValue SignBit = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: Hi, N2: SignBitMask);

  // Extend back to 64-bits.
  SDValue SignBit64 = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Zero, SignBit});
  SignBit64 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: SignBit64);

  SDValue BcInt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Src);
  const SDValue FractMask
    = DAG.getConstant(Val: (UINT64_C(1) << FractBits) - 1, DL: SL, VT: MVT::i64);

  // Arithmetic shift of the 52-bit fraction mask right by Exp leaves set bits
  // exactly over the sub-integer mantissa bits; AND with the complement
  // clears them, truncating toward zero.
  SDValue Shr = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i64, N1: FractMask, N2: Exp);
  SDValue Not = DAG.getNOT(DL: SL, Val: Shr, VT: MVT::i64);
  SDValue Tmp0 = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i64, N1: BcInt, N2: Not);

  EVT SetCCVT =
      getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::i32);

  const SDValue FiftyOne = DAG.getConstant(Val: FractBits - 1, DL: SL, VT: MVT::i32);

  SDValue ExpLt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Exp, RHS: Zero, Cond: ISD::SETLT);
  SDValue ExpGt51 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Exp, RHS: FiftyOne, Cond: ISD::SETGT);

  // Exp < 0: magnitude below 1.0, keep only the sign bit (signed zero).
  SDValue Tmp1 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: ExpLt0, N2: SignBit64, N3: Tmp0);
  // Exp > 51: no fractional bits exist, return the input unchanged.
  SDValue Tmp2 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: ExpGt51, N2: BcInt, N3: Tmp1);

  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f64, Operand: Tmp2);
}
2528
2529SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2530 SelectionDAG &DAG) const {
2531 SDLoc SL(Op);
2532 SDValue Src = Op.getOperand(i: 0);
2533
2534 assert(Op.getValueType() == MVT::f64);
2535
2536 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2537 SDValue C1 = DAG.getConstantFP(Val: C1Val, DL: SL, VT: MVT::f64);
2538 SDValue CopySign = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT: MVT::f64, N1: C1, N2: Src);
2539
2540 // TODO: Should this propagate fast-math-flags?
2541
2542 SDValue Tmp1 = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Src, N2: CopySign);
2543 SDValue Tmp2 = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT: MVT::f64, N1: Tmp1, N2: CopySign);
2544
2545 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: MVT::f64, Operand: Src);
2546
2547 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2548 SDValue C2 = DAG.getConstantFP(Val: C2Val, DL: SL, VT: MVT::f64);
2549
2550 EVT SetCCVT =
2551 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2552 SDValue Cond = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Fabs, RHS: C2, Cond: ISD::SETOGT);
2553
2554 return DAG.getSelect(DL: SL, VT: MVT::f64, Cond, LHS: Src, RHS: Tmp2);
2555}
2556
2557SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2558 SelectionDAG &DAG) const {
2559 // FNEARBYINT and FRINT are the same, except in their handling of FP
2560 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2561 // rint, so just treat them as equivalent.
2562 return DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SDLoc(Op), VT: Op.getValueType(),
2563 Operand: Op.getOperand(i: 0));
2564}
2565
2566SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2567 auto VT = Op.getValueType();
2568 auto Arg = Op.getOperand(i: 0u);
2569 return DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SDLoc(Op), VT, Operand: Arg);
2570}
2571
2572// XXX - May require not supporting f32 denormals?
2573
2574// Don't handle v2f16. The extra instructions to scalarize and repack around the
2575// compare and vselect end up producing worse code than scalarizing the whole
2576// operation.
2577SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2578 SDLoc SL(Op);
2579 SDValue X = Op.getOperand(i: 0);
2580 EVT VT = Op.getValueType();
2581
2582 SDValue T = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: X);
2583
2584 // TODO: Should this propagate fast-math-flags?
2585
2586 SDValue Diff = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: X, N2: T);
2587
2588 SDValue AbsDiff = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: Diff);
2589
2590 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2591 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2592
2593 EVT SetCCVT =
2594 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2595
2596 const SDValue Half = DAG.getConstantFP(Val: 0.5, DL: SL, VT);
2597 SDValue Cmp = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsDiff, RHS: Half, Cond: ISD::SETOGE);
2598 SDValue OneOrZeroFP = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Cmp, N2: One, N3: Zero);
2599
2600 SDValue SignedOffset = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT, N1: OneOrZeroFP, N2: X);
2601 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: T, N2: SignedOffset);
2602}
2603
2604SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2605 SDLoc SL(Op);
2606 SDValue Src = Op.getOperand(i: 0);
2607
2608 // result = trunc(src);
2609 // if (src < 0.0 && src != result)
2610 // result += -1.0.
2611
2612 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: MVT::f64, Operand: Src);
2613
2614 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT: MVT::f64);
2615 const SDValue NegOne = DAG.getConstantFP(Val: -1.0, DL: SL, VT: MVT::f64);
2616
2617 EVT SetCCVT =
2618 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2619
2620 SDValue Lt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Zero, Cond: ISD::SETOLT);
2621 SDValue NeTrunc = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Trunc, Cond: ISD::SETONE);
2622 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SetCCVT, N1: Lt0, N2: NeTrunc);
2623
2624 SDValue Add = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f64, N1: And, N2: NegOne, N3: Zero);
2625 // TODO: Should this propagate fast-math-flags?
2626 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Trunc, N2: Add);
2627}
2628
2629/// Return true if it's known that \p Src can never be an f32 denormal value.
2630static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2631 switch (Src.getOpcode()) {
2632 case ISD::FP_EXTEND:
2633 return Src.getOperand(i: 0).getValueType() == MVT::f16;
2634 case ISD::FP16_TO_FP:
2635 case ISD::FFREXP:
2636 case ISD::FSQRT:
2637 case AMDGPUISD::LOG:
2638 case AMDGPUISD::EXP:
2639 return true;
2640 case ISD::INTRINSIC_WO_CHAIN: {
2641 unsigned IntrinsicID = Src.getConstantOperandVal(i: 0);
2642 switch (IntrinsicID) {
2643 case Intrinsic::amdgcn_frexp_mant:
2644 case Intrinsic::amdgcn_log:
2645 case Intrinsic::amdgcn_log_clamp:
2646 case Intrinsic::amdgcn_exp2:
2647 case Intrinsic::amdgcn_sqrt:
2648 return true;
2649 default:
2650 return false;
2651 }
2652 }
2653 default:
2654 return false;
2655 }
2656
2657 llvm_unreachable("covered opcode switch");
2658}
2659
// Whether an approximate expansion is acceptable for a node with these
// flags. Currently keyed purely off the afn (approximate-functions)
// fast-math flag; the DAG parameter is presently unused.
bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
                                           SDNodeFlags Flags) {
  return Flags.hasApproximateFuncs();
}
2664
// Return true if the lowering must account for f32 denormal inputs from
// \p Src: the value is not known denormal-free and this function's f32
// denormal input mode is not preserve-sign.
bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
                                                  SDValue Src,
                                                  SDNodeFlags Flags) {
  return !valueIsKnownNeverF32Denorm(Src) &&
         DAG.getMachineFunction()
                 .getDenormalMode(FPType: APFloat::IEEEsingle())
                 .Input != DenormalMode::PreserveSign;
}
2673
2674SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2675 SDValue Src,
2676 SDNodeFlags Flags) const {
2677 SDLoc SL(Src);
2678 EVT VT = Src.getValueType();
2679 const fltSemantics &Semantics = VT.getFltSemantics();
2680 SDValue SmallestNormal =
2681 DAG.getConstantFP(Val: APFloat::getSmallestNormalized(Sem: Semantics), DL: SL, VT);
2682
2683 // Want to scale denormals up, but negatives and 0 work just as well on the
2684 // scaled path.
2685 SDValue IsLtSmallestNormal = DAG.getSetCC(
2686 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Src,
2687 RHS: SmallestNormal, Cond: ISD::SETOLT);
2688
2689 return IsLtSmallestNormal;
2690}
2691
2692SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2693 SDNodeFlags Flags) const {
2694 SDLoc SL(Src);
2695 EVT VT = Src.getValueType();
2696 const fltSemantics &Semantics = VT.getFltSemantics();
2697 SDValue Inf = DAG.getConstantFP(Val: APFloat::getInf(Sem: Semantics), DL: SL, VT);
2698
2699 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: Src, Flags);
2700 SDValue IsFinite = DAG.getSetCC(
2701 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Fabs,
2702 RHS: Inf, Cond: ISD::SETOLT);
2703 return IsFinite;
2704}
2705
/// If denormal handling is required return the scaled input to FLOG2, and the
/// check for denormal range. Otherwise, return null values.
///
/// \param Src the f32 log input.
/// \returns {scaled input, is-denormal condition} when f32 denormals must be
/// handled, or a null pair when no scaling is needed.
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
                                        SDValue Src, SDNodeFlags Flags) const {
  if (!needsDenormHandlingF32(DAG, Src, Flags))
    return {};

  MVT VT = MVT::f32;
  const fltSemantics &Semantics = APFloat::IEEEsingle();
  SDValue SmallestNormal =
      DAG.getConstantFP(Val: APFloat::getSmallestNormalized(Sem: Semantics), DL: SL, VT);

  // Denormal-range test; negatives and zero also pass, which is harmless on
  // the scaled path.
  SDValue IsLtSmallestNormal = DAG.getSetCC(
      DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Src,
      RHS: SmallestNormal, Cond: ISD::SETOLT);

  // Multiply denormal inputs by 2^32 so they reach the normal range; normal
  // inputs are multiplied by 1.0 and pass through unchanged.
  SDValue Scale32 = DAG.getConstantFP(Val: 0x1.0p+32, DL: SL, VT);
  SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
  SDValue ScaleFactor =
      DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLtSmallestNormal, N2: Scale32, N3: One, Flags);

  SDValue ScaledInput = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Src, N2: ScaleFactor, Flags);
  return {ScaledInput, IsLtSmallestNormal};
}
2731
// Lower flog2 with the hardware log instruction, compensating for f32
// denormal inputs by pre-scaling and subtracting the scale's log afterwards.
SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
  // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)

  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  SDValue Src = Op.getOperand(i: 0);
  SDNodeFlags Flags = Op->getFlags();

  if (VT == MVT::f16) {
    // Nothing in half is a denormal when promoted to f32.
    assert(!isTypeLegal(VT));
    SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src, Flags);
    SDValue Log = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT: MVT::f32, Operand: Ext, Flags);
    return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Log,
                       N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
  }

  auto [ScaledInput, IsLtSmallestNormal] =
      getScaledLogInput(DAG, SL, Src, Flags);
  if (!ScaledInput)
    return DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: Src, Flags);

  SDValue Log2 = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: ScaledInput, Flags);

  // Undo the 2^32 pre-scale: log2(x * 2^32) - 32 == log2(x).
  SDValue ThirtyTwo = DAG.getConstantFP(Val: 32.0, DL: SL, VT);
  SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
  SDValue ResultOffset =
      DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLtSmallestNormal, N2: ThirtyTwo, N3: Zero);
  return DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: Log2, N2: ResultOffset, Flags);
}
2766
2767static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2768 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2769 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: Y, Flags);
2770 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Mul, N2: C, Flags);
2771}
2772
// Shared lowering for flog and flog10: compute log2 with the hardware
// instruction, then rescale by ln(2) or ln(2)/ln(10) using an extended-
// precision multiply so the base conversion doesn't dominate the error.
// Fast/approximate cases go through LowerFLOGUnsafe instead.
SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDValue X = Op.getOperand(i: 0);
  EVT VT = Op.getValueType();
  SDNodeFlags Flags = Op->getFlags();
  SDLoc DL(Op);
  const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
  assert(IsLog10 || Op.getOpcode() == ISD::FLOG);

  if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
    // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
    // depending on !fpmath metadata.

    bool PromoteToF32 = VT == MVT::f16 && (!Flags.hasApproximateFuncs() ||
                                           !isTypeLegal(VT: MVT::f16));

    if (PromoteToF32) {
      // Log and multiply in f32 is always good enough for f16.
      X = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: X, Flags);
    }

    SDValue Lowered = LowerFLOGUnsafe(Op: X, SL: DL, DAG, IsLog10, Flags);
    if (PromoteToF32) {
      return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Lowered,
                         N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32), Flags);
    }

    return Lowered;
  }

  SDValue ScaledInput, IsScaled;
  if (VT == MVT::f16)
    // NOTE(review): this branch looks unreachable — every f16 op was handled
    // by the early-return path above. Confirm before relying on it.
    X = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: X, Flags);
  else {
    // Scale denormal inputs into the normal range; compensated below.
    std::tie(args&: ScaledInput, args&: IsScaled) = getScaledLogInput(DAG, SL: DL, Src: X, Flags);
    if (ScaledInput)
      X = ScaledInput;
  }

  SDValue Y = DAG.getNode(Opcode: AMDGPUISD::LOG, DL, VT, Operand: X, Flags);

  SDValue R;
  if (Subtarget->hasFastFMAF32()) {
    // c+cc are ln(2)/ln(10) to more than 49 bits
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    // c + cc is ln(2) to more than 49 bits
    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    SDValue C = DAG.getConstantFP(Val: IsLog10 ? c_log10 : c_log, DL, VT);
    SDValue CC = DAG.getConstantFP(Val: IsLog10 ? cc_log10 : cc_log, DL, VT);
    // This adds correction terms for which contraction may lead to an increase
    // in the error of the approximation, so disable it.
    Flags.setAllowContract(false);
    // Extended-precision multiply: R = Y*C rounded, then recover the rounding
    // error with FMAs and fold in the low-order constant CC.
    R = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Y, N2: C, Flags);
    SDValue NegR = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: R, Flags);
    SDValue FMA0 = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: Y, N2: C, N3: NegR, Flags);
    SDValue FMA1 = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: Y, N2: CC, N3: FMA0, Flags);
    R = DAG.getNode(Opcode: ISD::FADD, DL, VT, N1: R, N2: FMA1, Flags);
  } else {
    // ch+ct is ln(2)/ln(10) to more than 36 bits
    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    // ch + ct is ln(2) to more than 36 bits
    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    SDValue CH = DAG.getConstantFP(Val: IsLog10 ? ch_log10 : ch_log, DL, VT);
    SDValue CT = DAG.getConstantFP(Val: IsLog10 ? ct_log10 : ct_log, DL, VT);

    // Without fast FMA, split Y into a high part (low mantissa bits masked
    // off) and a low part, and accumulate the partial products Dekker-style.
    SDValue YAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Y);
    SDValue MaskConst = DAG.getConstant(Val: 0xfffff000, DL, VT: MVT::i32);
    SDValue YHInt = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: YAsInt, N2: MaskConst);
    SDValue YH = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: YHInt);
    SDValue YT = DAG.getNode(Opcode: ISD::FSUB, DL, VT, N1: Y, N2: YH, Flags);
    // This adds correction terms for which contraction may lead to an increase
    // in the error of the approximation, so disable it.
    Flags.setAllowContract(false);
    SDValue YTCT = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: YT, N2: CT, Flags);
    SDValue Mad0 = getMad(DAG, SL: DL, VT, X: YH, Y: CT, C: YTCT, Flags);
    SDValue Mad1 = getMad(DAG, SL: DL, VT, X: YT, Y: CH, C: Mad0, Flags);
    R = getMad(DAG, SL: DL, VT, X: YH, Y: CH, C: Mad1);
  }

  const bool IsFiniteOnly = Flags.hasNoNaNs() && Flags.hasNoInfs();

  // TODO: Check if known finite from source value.
  if (!IsFiniteOnly) {
    // For inf/nan inputs, pass the raw hardware log result through unchanged.
    SDValue IsFinite = getIsFinite(DAG, Src: Y, Flags);
    R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsFinite, N2: R, N3: Y, Flags);
  }

  if (IsScaled) {
    // Undo the 2^32 input pre-scale: subtract 32*log10(2) or 32*ln(2).
    SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL, VT);
    SDValue ShiftK =
        DAG.getConstantFP(Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
    SDValue Shift =
        DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsScaled, N2: ShiftK, N3: Zero, Flags);
    R = DAG.getNode(Opcode: ISD::FSUB, DL, VT, N1: R, N2: Shift, Flags);
  }

  return R;
}
2879
// flog10 shares the common log expansion; the base is derived there from the
// node's opcode.
SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
  return LowerFLOGCommon(Op, DAG);
}
2883
// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
// promote f16 operation.
//
// Computes log(x) as log2(x) * (ln(2) or ln(2)/ln(10)), with f32 denormal
// pre-scaling folded into the result offset when required.
SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
                                              SelectionDAG &DAG, bool IsLog10,
                                              SDNodeFlags Flags) const {
  EVT VT = Src.getValueType();
  // Only f32 has the target log node; other types use the generic FLOG2.
  unsigned LogOp =
      VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;

  // Conversion factor from log2 to the requested base.
  double Log2BaseInverted =
      IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;

  if (VT == MVT::f32) {
    auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
    if (ScaledInput) {
      SDValue LogSrc = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: ScaledInput, Flags);
      // Compensate for the 2^32 input pre-scale: subtract 32 * conversion
      // factor from the converted result.
      SDValue ScaledResultOffset =
          DAG.getConstantFP(Val: -32.0 * Log2BaseInverted, DL: SL, VT);

      SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL: SL, VT);

      SDValue ResultOffset = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsScaled,
                                         N2: ScaledResultOffset, N3: Zero, Flags);

      SDValue Log2Inv = DAG.getConstantFP(Val: Log2BaseInverted, DL: SL, VT);

      // Fold the multiply and offset-add into a single FMA when it's fast.
      if (Subtarget->hasFastFMAF32())
        return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: LogSrc, N2: Log2Inv, N3: ResultOffset,
                           Flags);
      SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LogSrc, N2: Log2Inv, Flags);
      return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Mul, N2: ResultOffset);
    }
  }

  SDValue Log2Operand = DAG.getNode(Opcode: LogOp, DL: SL, VT, Operand: Src, Flags);
  SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Val: Log2BaseInverted, DL: SL, VT);

  return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Log2Operand, N2: Log2BaseInvertedOperand,
                     Flags);
}
2924
// This expansion gives a result slightly better than 1ulp.
//
// Shared f64 expansion for FEXP, FEXP2 and FEXP10. The opcode only selects
// the range-reduction constants that split the input into an integer part
// `dn` and a reduced argument `t`; the polynomial approximation and the
// final ldexp scaling are common to all three.
SDValue AMDGPUTargetLowering::lowerFEXPF64(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue X = Op.getOperand(i: 0);

  // TODO: Check if reassoc is safe. There is an output change in exp2 and
  // exp10, which slightly increases ulp.
  SDNodeFlags Flags = Op->getFlags() & ~SDNodeFlags::AllowReassociation;

  // DN: integer part of the (scaled) input; F: reduced fraction; T: argument
  // fed to the polynomial below.
  SDValue DN, F, T;

  if (Op.getOpcode() == ISD::FEXP2) {
    // dn = rint(x)
    DN = DAG.getNode(Opcode: ISD::FRINT, DL, VT: MVT::f64, Operand: X, Flags);
    // f = x - dn
    F = DAG.getNode(Opcode: ISD::FSUB, DL, VT: MVT::f64, N1: X, N2: DN, Flags);
    // t = f*C1 + f*C2 (C1 + C2 is an extended-precision ln(2) split)
    SDValue C1 = DAG.getConstantFP(Val: 0x1.62e42fefa39efp-1, DL, VT: MVT::f64);
    SDValue C2 = DAG.getConstantFP(Val: 0x1.abc9e3b39803fp-56, DL, VT: MVT::f64);
    SDValue Mul2 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: F, N2: C2, Flags);
    T = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: F, N2: C1, N3: Mul2, Flags);
  } else if (Op.getOpcode() == ISD::FEXP10) {
    // dn = rint(x * C1)
    SDValue C1 = DAG.getConstantFP(Val: 0x1.a934f0979a371p+1, DL, VT: MVT::f64);
    SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: X, N2: C1, Flags);
    DN = DAG.getNode(Opcode: ISD::FRINT, DL, VT: MVT::f64, Operand: Mul, Flags);

    // f = FMA(-dn, C2, FMA(-dn, C3, x))
    SDValue NegDN = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: DN, Flags);
    SDValue C2 = DAG.getConstantFP(Val: -0x1.9dc1da994fd21p-59, DL, VT: MVT::f64);
    SDValue C3 = DAG.getConstantFP(Val: 0x1.34413509f79ffp-2, DL, VT: MVT::f64);
    SDValue Inner = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegDN, N2: C3, N3: X, Flags);
    F = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegDN, N2: C2, N3: Inner, Flags);

    // t = FMA(f, C4, f*C5)
    SDValue C4 = DAG.getConstantFP(Val: 0x1.26bb1bbb55516p+1, DL, VT: MVT::f64);
    SDValue C5 = DAG.getConstantFP(Val: -0x1.f48ad494ea3e9p-53, DL, VT: MVT::f64);
    SDValue MulF = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: F, N2: C5, Flags);
    T = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: F, N2: C4, N3: MulF, Flags);
  } else { // ISD::FEXP
    // dn = rint(x * C1)
    SDValue C1 = DAG.getConstantFP(Val: 0x1.71547652b82fep+0, DL, VT: MVT::f64);
    SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: X, N2: C1, Flags);
    DN = DAG.getNode(Opcode: ISD::FRINT, DL, VT: MVT::f64, Operand: Mul, Flags);

    // t = FMA(-dn, C2, FMA(-dn, C3, x))
    SDValue NegDN = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: DN, Flags);
    SDValue C2 = DAG.getConstantFP(Val: 0x1.abc9e3b39803fp-56, DL, VT: MVT::f64);
    SDValue C3 = DAG.getConstantFP(Val: 0x1.62e42fefa39efp-1, DL, VT: MVT::f64);
    SDValue Inner = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegDN, N2: C3, N3: X, Flags);
    T = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegDN, N2: C2, N3: Inner, Flags);
  }

  // Polynomial expansion for p, evaluated in t with a Horner-style FMA chain.
  SDValue P = DAG.getConstantFP(Val: 0x1.ade156a5dcb37p-26, DL, VT: MVT::f64);
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
                  N3: DAG.getConstantFP(Val: 0x1.28af3fca7ab0cp-22, DL, VT: MVT::f64),
                  Flags);
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
                  N3: DAG.getConstantFP(Val: 0x1.71dee623fde64p-19, DL, VT: MVT::f64),
                  Flags);
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
                  N3: DAG.getConstantFP(Val: 0x1.a01997c89e6b0p-16, DL, VT: MVT::f64),
                  Flags);
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
                  N3: DAG.getConstantFP(Val: 0x1.a01a014761f6ep-13, DL, VT: MVT::f64),
                  Flags);
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
                  N3: DAG.getConstantFP(Val: 0x1.6c16c1852b7b0p-10, DL, VT: MVT::f64),
                  Flags);
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
                  N3: DAG.getConstantFP(Val: 0x1.1111111122322p-7, DL, VT: MVT::f64), Flags);
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
                  N3: DAG.getConstantFP(Val: 0x1.55555555502a1p-5, DL, VT: MVT::f64), Flags);
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
                  N3: DAG.getConstantFP(Val: 0x1.5555555555511p-3, DL, VT: MVT::f64), Flags);
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
                  N3: DAG.getConstantFP(Val: 0x1.000000000000bp-1, DL, VT: MVT::f64), Flags);

  SDValue One = DAG.getConstantFP(Val: 1.0, DL, VT: MVT::f64);

  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P, N3: One, Flags);
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P, N3: One, Flags);

  // z = ldexp(p, (int)dn)
  SDValue DNInt = DAG.getNode(Opcode: ISD::FP_TO_SINT, DL, VT: MVT::i32, Operand: DN);
  SDValue Z = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: P, N2: DNInt, Flags);

  // Overflow/underflow guards. Both compares are unordered (SETULE/SETUGE),
  // so a NaN input keeps the computed value Z rather than being replaced.
  SDValue CondHi = DAG.getSetCC(
      DL, VT: MVT::i1, LHS: X, RHS: DAG.getConstantFP(Val: 1024.0, DL, VT: MVT::f64), Cond: ISD::SETULE);

  if (!Flags.hasNoInfs()) {
    // Inputs ordered-greater-than 1024 overflow to +inf.
    SDValue PInf = DAG.getConstantFP(Val: std::numeric_limits<double>::infinity(),
                                     DL, VT: MVT::f64);
    Z = DAG.getSelect(DL, VT: MVT::f64, Cond: CondHi, LHS: Z, RHS: PInf, Flags);
  }

  // Inputs ordered-less-than -1075 underflow to +0.0.
  SDValue CondLo = DAG.getSetCC(
      DL, VT: MVT::i1, LHS: X, RHS: DAG.getConstantFP(Val: -1075.0, DL, VT: MVT::f64), Cond: ISD::SETUGE);
  SDValue Zero = DAG.getConstantFP(Val: 0.0, DL, VT: MVT::f64);
  Z = DAG.getSelect(DL, VT: MVT::f64, Cond: CondLo, LHS: Z, RHS: Zero, Flags);

  return Z;
}
3031
3032SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
3033 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3034 // If we have to handle denormals, scale up the input and adjust the result.
3035
3036 EVT VT = Op.getValueType();
3037 if (VT == MVT::f64)
3038 return lowerFEXPF64(Op, DAG);
3039
3040 SDLoc SL(Op);
3041 SDValue Src = Op.getOperand(i: 0);
3042 SDNodeFlags Flags = Op->getFlags();
3043
3044 if (VT == MVT::f16) {
3045 // Nothing in half is a denormal when promoted to f32.
3046 assert(!isTypeLegal(MVT::f16));
3047 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src, Flags);
3048 SDValue Log = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT: MVT::f32, Operand: Ext, Flags);
3049 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Log,
3050 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
3051 }
3052
3053 assert(VT == MVT::f32);
3054
3055 if (!needsDenormHandlingF32(DAG, Src, Flags))
3056 return DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT: MVT::f32, Operand: Src, Flags);
3057
3058 // bool needs_scaling = x < -0x1.f80000p+6f;
3059 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3060
3061 // -nextafter(128.0, -1)
3062 SDValue RangeCheckConst = DAG.getConstantFP(Val: -0x1.f80000p+6f, DL: SL, VT);
3063
3064 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
3065
3066 SDValue NeedsScaling =
3067 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: RangeCheckConst, Cond: ISD::SETOLT);
3068
3069 SDValue SixtyFour = DAG.getConstantFP(Val: 0x1.0p+6f, DL: SL, VT);
3070 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
3071
3072 SDValue AddOffset =
3073 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: SixtyFour, N3: Zero);
3074
3075 SDValue AddInput = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Src, N2: AddOffset, Flags);
3076 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: AddInput, Flags);
3077
3078 SDValue TwoExpNeg64 = DAG.getConstantFP(Val: 0x1.0p-64f, DL: SL, VT);
3079 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
3080 SDValue ResultScale =
3081 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: TwoExpNeg64, N3: One);
3082
3083 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2, N2: ResultScale, Flags);
3084}
3085
3086SDValue AMDGPUTargetLowering::lowerFEXPUnsafeImpl(SDValue X, const SDLoc &SL,
3087 SelectionDAG &DAG,
3088 SDNodeFlags Flags,
3089 bool IsExp10) const {
3090 // exp(x) -> exp2(M_LOG2E_F * x);
3091 // exp10(x) -> exp2(log2(10) * x);
3092 EVT VT = X.getValueType();
3093 SDValue Const =
3094 DAG.getConstantFP(Val: IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, DL: SL, VT);
3095
3096 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: Const, Flags);
3097 return DAG.getNode(Opcode: VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
3098 : (unsigned)ISD::FEXP2,
3099 DL: SL, VT, Operand: Mul, Flags);
3100}
3101
/// Fast-math lowering of FEXP. For f32 where denormal results must be
/// produced, large-magnitude negative inputs are offset by +64 before the
/// hardware exp2 and the result is scaled back afterwards, keeping the
/// intermediate out of the flushed-denormal range.
SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
                                              SelectionDAG &DAG,
                                              SDNodeFlags Flags) const {
  EVT VT = X.getValueType();
  if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, Src: X, Flags))
    return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);

  EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);

  // Inputs below this threshold would produce a denormal/flushed result.
  SDValue Threshold = DAG.getConstantFP(Val: -0x1.5d58a0p+6f, DL: SL, VT);
  SDValue NeedsScaling = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: Threshold, Cond: ISD::SETOLT);

  // Offset the input by +64 on the scaled path.
  SDValue ScaleOffset = DAG.getConstantFP(Val: 0x1.0p+6f, DL: SL, VT);

  SDValue ScaledX = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: X, N2: ScaleOffset, Flags);

  SDValue AdjustedX =
      DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: ScaledX, N3: X);

  // exp(x) = exp2(x * log2(e)).
  const SDValue Log2E = DAG.getConstantFP(Val: numbers::log2e, DL: SL, VT);
  SDValue ExpInput = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: Log2E, Flags);

  SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: ExpInput, Flags);

  // 0x1.969d48p-93f ~= e^-64: undoes the +64 input offset on the scaled path.
  SDValue ResultScaleFactor = DAG.getConstantFP(Val: 0x1.969d48p-93f, DL: SL, VT);
  SDValue AdjustedResult =
      DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2, N2: ResultScaleFactor, Flags);

  return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: AdjustedResult, N3: Exp2,
                     Flags);
}
3133
3134/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
3135/// handled correctly.
3136SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
3137 SelectionDAG &DAG,
3138 SDNodeFlags Flags) const {
3139 const EVT VT = X.getValueType();
3140
3141 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
3142 : static_cast<unsigned>(ISD::FEXP2);
3143
3144 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, Src: X, Flags)) {
3145 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3146 SDValue K0 = DAG.getConstantFP(Val: 0x1.a92000p+1f, DL: SL, VT);
3147 SDValue K1 = DAG.getConstantFP(Val: 0x1.4f0978p-11f, DL: SL, VT);
3148
3149 SDValue Mul0 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: K0, Flags);
3150 SDValue Exp2_0 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul0, Flags);
3151 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: K1, Flags);
3152 SDValue Exp2_1 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul1, Flags);
3153 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2_0, N2: Exp2_1);
3154 }
3155
3156 // bool s = x < -0x1.2f7030p+5f;
3157 // x += s ? 0x1.0p+5f : 0.0f;
3158 // exp10 = exp2(x * 0x1.a92000p+1f) *
3159 // exp2(x * 0x1.4f0978p-11f) *
3160 // (s ? 0x1.9f623ep-107f : 1.0f);
3161
3162 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
3163
3164 SDValue Threshold = DAG.getConstantFP(Val: -0x1.2f7030p+5f, DL: SL, VT);
3165 SDValue NeedsScaling = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: Threshold, Cond: ISD::SETOLT);
3166
3167 SDValue ScaleOffset = DAG.getConstantFP(Val: 0x1.0p+5f, DL: SL, VT);
3168 SDValue ScaledX = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: X, N2: ScaleOffset, Flags);
3169 SDValue AdjustedX =
3170 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: ScaledX, N3: X);
3171
3172 SDValue K0 = DAG.getConstantFP(Val: 0x1.a92000p+1f, DL: SL, VT);
3173 SDValue K1 = DAG.getConstantFP(Val: 0x1.4f0978p-11f, DL: SL, VT);
3174
3175 SDValue Mul0 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: K0, Flags);
3176 SDValue Exp2_0 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul0, Flags);
3177 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: K1, Flags);
3178 SDValue Exp2_1 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul1, Flags);
3179
3180 SDValue MulExps = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2_0, N2: Exp2_1, Flags);
3181
3182 SDValue ResultScaleFactor = DAG.getConstantFP(Val: 0x1.9f623ep-107f, DL: SL, VT);
3183 SDValue AdjustedResult =
3184 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: MulExps, N2: ResultScaleFactor, Flags);
3185
3186 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: AdjustedResult, N3: MulExps,
3187 Flags);
3188}
3189
/// Lower FEXP / FEXP10. f64 dispatches to lowerFEXPF64; fast-math inputs use
/// the unsafe single-exp2 expansions; otherwise f16 is promoted and f32 uses
/// an extended-precision range reduction (see algorithm comment below).
SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT == MVT::f64)
    return lowerFEXPF64(Op, DAG);

  SDLoc SL(Op);
  SDValue X = Op.getOperand(i: 0);
  SDNodeFlags Flags = Op->getFlags();
  const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;

  // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
  // library behavior. Also, is known-not-daz source sufficient?
  if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
    return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
                   : lowerFEXPUnsafe(X, SL, DAG, Flags);
  }

  if (VT.getScalarType() == MVT::f16) {
    // Vector f16 is left for generic legalization.
    if (VT.isVector())
      return SDValue();

    // Nothing in half is a denormal when promoted to f32.
    //
    // exp(f16 x) ->
    //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))
    //
    // exp10(f16 x) ->
    //   fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
    SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: X, Flags);
    SDValue Lowered = lowerFEXPUnsafeImpl(X: Ext, SL, DAG, Flags, IsExp10);
    return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Lowered,
                       N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
  }

  assert(VT == MVT::f32);

  // Algorithm:
  //
  // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
  //
  // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
  // n = 64*m + j,   0 <= j < 64
  //
  // e^x = 2^((64*m + j + f)/64)
  //     = (2^m) * (2^(j/64)) * 2^(f/64)
  //     = (2^m) * (2^(j/64)) *   e^(f*(ln(2)/64))
  //
  // f = x*(64/ln(2)) - n
  // r = f*(ln(2)/64) = x - n*(ln(2)/64)
  //
  // e^x = (2^m) * (2^(j/64)) * e^r
  //
  // (2^(j/64)) is precomputed
  //
  // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  // e^r = 1 + q
  //
  // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  //
  // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
  SDNodeFlags FlagsNoContract = Flags;
  FlagsNoContract.setAllowContract(false);

  // PH + PL approximates x*log2(base) in extended precision: PH is the high
  // product, PL the low-order correction term.
  SDValue PH, PL;
  if (Subtarget->hasFastFMAF32()) {
    const float c_exp = numbers::log2ef;
    const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    SDValue C = DAG.getConstantFP(Val: IsExp10 ? c_exp10 : c_exp, DL: SL, VT);
    SDValue CC = DAG.getConstantFP(Val: IsExp10 ? cc_exp10 : cc_exp, DL: SL, VT);

    PH = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: C, Flags);
    SDValue NegPH = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: PH, Flags);
    // FMA0 = x*C - PH recovers the rounding error of the PH product.
    SDValue FMA0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: X, N2: C, N3: NegPH, Flags);
    PL = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: X, N2: CC, N3: FMA0, Flags);
  } else {
    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    SDValue CH = DAG.getConstantFP(Val: IsExp10 ? ch_exp10 : ch_exp, DL: SL, VT);
    SDValue CL = DAG.getConstantFP(Val: IsExp10 ? cl_exp10 : cl_exp, DL: SL, VT);

    // Without fast FMA, split x itself into a high part (low 12 mantissa
    // bits masked off) and a low remainder so the partial products are exact.
    SDValue XAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: X);
    SDValue MaskConst = DAG.getConstant(Val: 0xfffff000, DL: SL, VT: MVT::i32);
    SDValue XHAsInt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: XAsInt, N2: MaskConst);
    SDValue XH = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: XHAsInt);
    SDValue XL = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: X, N2: XH, Flags);

    PH = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: XH, N2: CH, Flags);

    SDValue XLCL = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: XL, N2: CL, Flags);
    SDValue Mad0 = getMad(DAG, SL, VT, X: XL, Y: CH, C: XLCL, Flags);
    PL = getMad(DAG, SL, VT, X: XH, Y: CL, C: Mad0, Flags);
  }

  // E = integer part of the scaled input.
  SDValue E = DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SL, VT, Operand: PH, Flags);

  // It is unsafe to contract this fsub into the PH multiply.
  SDValue PHSubE = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: PH, N2: E, Flags: FlagsNoContract);

  // A = reduced fraction fed to the hardware exp2.
  SDValue A = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: PHSubE, N2: PL, Flags);
  SDValue IntE = DAG.getNode(Opcode: ISD::FP_TO_SINT, DL: SL, VT: MVT::i32, Operand: E);
  SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: A, Flags);

  // Reapply the integer exponent: result = exp2(A) * 2^E.
  SDValue R = DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT, N1: Exp2, N2: IntE, Flags);

  SDValue UnderflowCheckConst =
      DAG.getConstantFP(Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, DL: SL, VT);

  EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
  SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
  SDValue Underflow =
      DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: UnderflowCheckConst, Cond: ISD::SETOLT);

  // Inputs below the threshold flush to +0.0.
  R = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Underflow, N2: Zero, N3: R);

  if (!Flags.hasNoInfs()) {
    // Inputs above the threshold overflow to +inf.
    SDValue OverflowCheckConst =
        DAG.getConstantFP(Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, DL: SL, VT);
    SDValue Overflow =
        DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: OverflowCheckConst, Cond: ISD::SETOGT);
    SDValue Inf =
        DAG.getConstantFP(Val: APFloat::getInf(Sem: APFloat::IEEEsingle()), DL: SL, VT);
    R = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Overflow, N2: Inf, N3: R);
  }

  return R;
}
3324
3325static bool isCtlzOpc(unsigned Opc) {
3326 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3327}
3328
3329static bool isCttzOpc(unsigned Opc) {
3330 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3331}
3332
3333SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3334 SelectionDAG &DAG) const {
3335 auto SL = SDLoc(Op);
3336 auto Opc = Op.getOpcode();
3337 auto Arg = Op.getOperand(i: 0u);
3338 auto ResultVT = Op.getValueType();
3339
3340 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3341 return {};
3342
3343 assert(isCtlzOpc(Opc));
3344 assert(ResultVT == Arg.getValueType());
3345
3346 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3347 SDValue NumExtBits = DAG.getConstant(Val: 32u - NumBits, DL: SL, VT: MVT::i32);
3348 SDValue NewOp;
3349
3350 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3351 NewOp = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Arg);
3352 NewOp = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: NewOp, N2: NumExtBits);
3353 NewOp = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, Operand: NewOp);
3354 } else {
3355 NewOp = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Arg);
3356 NewOp = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, Operand: NewOp);
3357 NewOp = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: NewOp, N2: NumExtBits);
3358 }
3359
3360 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ResultVT, Operand: NewOp);
3361}
3362
/// Lower CTLZ/CTTZ (and *_ZERO_UNDEF) to the target ffbh/ffbl nodes.
/// i32, and uniform i64, map to a single instruction (clamped by umin for
/// the defined-at-zero forms); divergent i64 combines counts of the two
/// 32-bit halves.
SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(i: 0);

  assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
  bool Ctlz = isCtlzOpc(Opc: Op.getOpcode());
  unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;

  bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
                   Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
  // Uniform 64-bit sources can use the scalar 64-bit count instructions.
  bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;

  if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
    //(ctlz hi:lo) -> (umin (ffbh src), 32)
    //(cttz hi:lo) -> (umin (ffbl src), 32)
    //(ctlz_zero_undef src) -> (ffbh src)
    //(cttz_zero_undef src) -> (ffbl src)

    //  64-bit scalar version produce 32-bit result
    //(ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
    //(cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
    //(ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
    //(cttz_zero_undef src) -> (S_FF1_I32_B64 src)
    SDValue NewOpr = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Src);
    if (!ZeroUndef) {
      const SDValue ConstVal = DAG.getConstant(
          Val: Op.getValueType().getScalarSizeInBits(), DL: SL, VT: MVT::i32);
      NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewOpr, N2: ConstVal);
    }
    return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: Src.getValueType(), Operand: NewOpr);
  }

  SDValue Lo, Hi;
  std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);

  SDValue OprLo = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Lo);
  SDValue OprHi = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Hi);

  // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
  // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
  // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
  // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))

  // NOTE(review): the non-zero-undef forms use UADDSAT so the +32 cannot
  // wrap when the half's count comes back as an all-ones "not found" value —
  // presumably the hw ffbh/ffbl result; confirm against the ISA docs.
  unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
  const SDValue Const32 = DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32);
  if (Ctlz)
    OprLo = DAG.getNode(Opcode: AddOpc, DL: SL, VT: MVT::i32, N1: OprLo, N2: Const32);
  else
    OprHi = DAG.getNode(Opcode: AddOpc, DL: SL, VT: MVT::i32, N1: OprHi, N2: Const32);

  SDValue NewOpr;
  NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: OprLo, N2: OprHi);
  if (!ZeroUndef) {
    const SDValue Const64 = DAG.getConstant(Val: 64, DL: SL, VT: MVT::i32);
    NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewOpr, N2: Const64);
  }

  return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i64, Operand: NewOpr);
}
3422
3423SDValue AMDGPUTargetLowering::LowerCTLS(SDValue Op, SelectionDAG &DAG) const {
3424 SDLoc SL(Op);
3425 SDValue Src = Op.getOperand(i: 0);
3426 assert(Src.getValueType() == MVT::i32 && "LowerCTLS only supports i32");
3427 SDValue Ffbh = DAG.getNode(
3428 Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
3429 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_sffbh, DL: SL, VT: MVT::i32), N2: Src);
3430 SDValue Clamped = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: Ffbh,
3431 N2: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32));
3432 return DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: Clamped,
3433 N2: DAG.getAllOnesConstant(DL: SL, VT: MVT::i32));
3434}
3435
/// Lower an i64 -> f32 conversion (\p Signed selects sitofp vs. uitofp) by
/// normalizing the input into 32 bits, converting natively, and scaling the
/// result back — see the algorithm comment below.
SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  // The regular method converting a 64-bit integer to float roughly consists of
  // 2 steps: normalization and rounding. In fact, after normalization, the
  // conversion from a 64-bit integer to a float is essentially the same as the
  // one from a 32-bit integer. The only difference is that it has more
  // trailing bits to be rounded. To leverage the native 32-bit conversion, a
  // 64-bit integer could be preprocessed and fit into a 32-bit integer then
  // converted into the correct float number. The basic steps for the unsigned
  // conversion are illustrated in the following pseudo code:
  //
  // f32 uitofp(i64 u) {
  //   i32 hi, lo = split(u);
  //   // Only count the leading zeros in hi as we have native support of the
  //   // conversion from i32 to f32. If hi is all 0s, the conversion is
  //   // reduced to a 32-bit one automatically.
  //   i32 shamt = clz(hi); // Return 32 if hi is all 0s.
  //   u <<= shamt;
  //   hi, lo = split(u);
  //   hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
  //   // convert it as a 32-bit integer and scale the result back.
  //   return uitofp(hi) * 2^(32 - shamt);
  // }
  //
  // The signed one follows the same principle but uses 'ffbh_i32' to count its
  // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
  // converted instead followed by negation based its sign bit.

  SDLoc SL(Op);
  SDValue Src = Op.getOperand(i: 0);

  SDValue Lo, Hi;
  std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
  SDValue Sign;
  SDValue ShAmt;
  if (Signed && Subtarget->isGCN()) {
    // We also need to consider the sign bit in Lo if Hi has just sign bits,
    // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
    // account. That is, the maximal shift is
    // - 32 if Lo and Hi have opposite signs;
    // - 33 if Lo and Hi have the same sign.
    //
    // Or, MaxShAmt = 33 + OppositeSign, where
    //
    // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
    // - -1 if Lo and Hi have opposite signs; and
    // -  0 otherwise.
    //
    // All in all, ShAmt is calculated as
    //
    //  umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
    //
    // or
    //
    //  umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
    //
    // to reduce the critical path.
    SDValue OppositeSign = DAG.getNode(
        Opcode: ISD::SRA, DL: SL, VT: MVT::i32, N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: Lo, N2: Hi),
        N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
    SDValue MaxShAmt =
        DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32),
                    N2: OppositeSign);
    // Count the leading sign bits.
    ShAmt = DAG.getNode(
        Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
        N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_sffbh, DL: SL, VT: MVT::i32), N2: Hi);
    // Different from unsigned conversion, the shift should be one bit less to
    // preserve the sign bit.
    ShAmt = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: ShAmt,
                        N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
    ShAmt = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: ShAmt, N2: MaxShAmt);
  } else {
    if (Signed) {
      // Without 'ffbh_i32', only leading zeros could be counted. Take the
      // absolute value first.
      Sign = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i64, N1: Src,
                         N2: DAG.getConstant(Val: 63, DL: SL, VT: MVT::i64));
      // abs(x) = (x + sign) ^ sign, where sign is 0 or all-ones.
      SDValue Abs =
          DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i64,
                      N1: DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i64, N1: Src, N2: Sign), N2: Sign);
      std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Abs, DAG);
    }
    // Count the leading zeros.
    ShAmt = DAG.getNode(Opcode: ISD::CTLZ, DL: SL, VT: MVT::i32, Operand: Hi);
    // The shift amount for signed integers is [0, 32].
  }
  // Normalize the given 64-bit integer.
  SDValue Norm = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i64, N1: Src, N2: ShAmt);
  // Split it again.
  std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Norm, DAG);
  // Calculate the adjust bit for rounding.
  // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
  SDValue Adjust = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32,
                               N1: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32), N2: Lo);
  // Get the 32-bit normalized integer.
  Norm = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Hi, N2: Adjust);
  // Convert the normalized 32-bit integer into f32.

  bool UseLDEXP = isOperationLegal(Op: ISD::FLDEXP, VT: MVT::f32);
  unsigned Opc = Signed && UseLDEXP ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  SDValue FVal = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::f32, Operand: Norm);

  // Finally, need to scale back the converted floating number as the original
  // 64-bit integer is converted as a 32-bit one.
  ShAmt = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32),
                      N2: ShAmt);
  // On GCN, use LDEXP directly.
  if (UseLDEXP)
    return DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT: MVT::f32, N1: FVal, N2: ShAmt);

  // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
  // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
  // exponent is enough to avoid overflowing into the sign bit.
  SDValue Exp = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: ShAmt,
                            N2: DAG.getConstant(Val: 23, DL: SL, VT: MVT::i32));
  SDValue IVal =
      DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32,
                  N1: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: FVal), N2: Exp);
  if (Signed) {
    // Set the sign bit.
    Sign = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32,
                       N1: DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Sign),
                       N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
    IVal = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: IVal, N2: Sign);
  }
  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: IVal);
}
3564
3565SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3566 bool Signed) const {
3567 SDLoc SL(Op);
3568 SDValue Src = Op.getOperand(i: 0);
3569
3570 SDValue Lo, Hi;
3571 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3572
3573 SDValue CvtHi = DAG.getNode(Opcode: Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3574 DL: SL, VT: MVT::f64, Operand: Hi);
3575
3576 SDValue CvtLo = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL: SL, VT: MVT::f64, Operand: Lo);
3577
3578 SDValue LdExp = DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT: MVT::f64, N1: CvtHi,
3579 N2: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32));
3580 // TODO: Should this propagate fast-math-flags?
3581 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: LdExp, N2: CvtLo);
3582}
3583
3584SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3585 SelectionDAG &DAG) const {
3586 // TODO: Factor out code common with LowerSINT_TO_FP.
3587 EVT DestVT = Op.getValueType();
3588 SDValue Src = Op.getOperand(i: 0);
3589 EVT SrcVT = Src.getValueType();
3590
3591 if (SrcVT == MVT::i16) {
3592 if (DestVT == MVT::f16)
3593 return Op;
3594 SDLoc DL(Op);
3595
3596 // Promote src to i32
3597 SDValue Ext = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: Src);
3598 return DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: DestVT, Operand: Ext);
3599 }
3600
3601 if (DestVT == MVT::bf16) {
3602 SDLoc SL(Op);
3603 SDValue ToF32 = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL: SL, VT: MVT::f32, Operand: Src);
3604 SDValue FPRoundFlag = DAG.getIntPtrConstant(Val: 0, DL: SL, /*isTarget=*/true);
3605 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::bf16, N1: ToF32, N2: FPRoundFlag);
3606 }
3607
3608 if (SrcVT != MVT::i64)
3609 return Op;
3610
3611 if (DestVT == MVT::f16 && isTypeLegal(VT: MVT::f16)) {
3612 SDLoc DL(Op);
3613
3614 SDValue IntToFp32 = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: MVT::f32, Operand: Src);
3615 SDValue FPRoundFlag =
3616 DAG.getIntPtrConstant(Val: 0, DL: SDLoc(Op), /*isTarget=*/true);
3617 SDValue FPRound =
3618 DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: IntToFp32, N2: FPRoundFlag);
3619
3620 return FPRound;
3621 }
3622
3623 if (DestVT == MVT::f32)
3624 return LowerINT_TO_FP32(Op, DAG, Signed: false);
3625
3626 assert(DestVT == MVT::f64);
3627 return LowerINT_TO_FP64(Op, DAG, Signed: false);
3628}
3629
3630SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3631 SelectionDAG &DAG) const {
3632 EVT DestVT = Op.getValueType();
3633
3634 SDValue Src = Op.getOperand(i: 0);
3635 EVT SrcVT = Src.getValueType();
3636
3637 if (SrcVT == MVT::i16) {
3638 if (DestVT == MVT::f16)
3639 return Op;
3640
3641 SDLoc DL(Op);
3642 // Promote src to i32
3643 SDValue Ext = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: MVT::i32, Operand: Src);
3644 return DAG.getNode(Opcode: ISD::SINT_TO_FP, DL, VT: DestVT, Operand: Ext);
3645 }
3646
3647 if (DestVT == MVT::bf16) {
3648 SDLoc SL(Op);
3649 SDValue ToF32 = DAG.getNode(Opcode: ISD::SINT_TO_FP, DL: SL, VT: MVT::f32, Operand: Src);
3650 SDValue FPRoundFlag = DAG.getIntPtrConstant(Val: 0, DL: SL, /*isTarget=*/true);
3651 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::bf16, N1: ToF32, N2: FPRoundFlag);
3652 }
3653
3654 if (SrcVT != MVT::i64)
3655 return Op;
3656
3657 // TODO: Factor out code common with LowerUINT_TO_FP.
3658
3659 if (DestVT == MVT::f16 && isTypeLegal(VT: MVT::f16)) {
3660 SDLoc DL(Op);
3661 SDValue Src = Op.getOperand(i: 0);
3662
3663 SDValue IntToFp32 = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: MVT::f32, Operand: Src);
3664 SDValue FPRoundFlag =
3665 DAG.getIntPtrConstant(Val: 0, DL: SDLoc(Op), /*isTarget=*/true);
3666 SDValue FPRound =
3667 DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: IntToFp32, N2: FPRoundFlag);
3668
3669 return FPRound;
3670 }
3671
3672 if (DestVT == MVT::f32)
3673 return LowerINT_TO_FP32(Op, DAG, Signed: true);
3674
3675 assert(DestVT == MVT::f64);
3676 return LowerINT_TO_FP64(Op, DAG, Signed: true);
3677}
3678
/// Lower f32/f64 -> i64 FP_TO_SINT/FP_TO_UINT by splitting the result into
/// two 32-bit halves computed with floating-point arithmetic, then packing
/// them into an i64 via a v2i32 build_vector.
///
/// \p Signed selects signed conversion semantics; for signed f32 the input is
/// converted as unsigned-of-abs and the sign is reapplied at the end.
SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  SDLoc SL(Op);

  SDValue Src = Op.getOperand(i: 0);
  EVT SrcVT = Src.getValueType();

  assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //     tf := trunc(val);
  //    hif := floor(tf * 2^-32);
  //    lof := tf - hif * 2^32; // lof is always positive due to floor.
  //     hi := fptoi(hif);
  //     lo := fptoi(lof);
  //
  SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: SrcVT, Operand: Src);
  SDValue Sign;
  if (Signed && SrcVT == MVT::f32) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, We need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    // Sign becomes all-ones (for negative input) or all-zeros via an
    // arithmetic shift of the sign bit across all 32 bits.
    Sign = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i32,
                       N1: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Trunc),
                       N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
    Trunc = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: SrcVT, Operand: Trunc);
  }

  // K0 = 2^-32 and K1 = -2^32, materialized in the source float type.
  SDValue K0, K1;
  if (SrcVT == MVT::f64) {
    K0 = DAG.getConstantFP(
        Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), DL: SL,
        VT: SrcVT);
    K1 = DAG.getConstantFP(
        Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), DL: SL,
        VT: SrcVT);
  } else {
    K0 = DAG.getConstantFP(
        Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), DL: SL, VT: SrcVT);
    K1 = DAG.getConstantFP(
        Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), DL: SL, VT: SrcVT);
  }
  // TODO: Should this propagate fast-math-flags?
  SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: SrcVT, N1: Trunc, N2: K0);

  SDValue FloorMul = DAG.getNode(Opcode: ISD::FFLOOR, DL: SL, VT: SrcVT, Operand: Mul);

  // Fma = FloorMul * -2^32 + Trunc, i.e. the `lof` term above.
  SDValue Fma = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: SrcVT, N1: FloorMul, N2: K1, N3: Trunc);

  // For signed f64 the high half keeps the sign; for f32 the sign was
  // stripped above so an unsigned conversion is used for both halves.
  SDValue Hi = DAG.getNode(Opcode: (Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
                                                              : ISD::FP_TO_UINT,
                           DL: SL, VT: MVT::i32, Operand: FloorMul);
  SDValue Lo = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL: SL, VT: MVT::i32, Operand: Fma);

  SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64,
                               Operand: DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Lo, Hi}));

  if (Signed && SrcVT == MVT::f32) {
    assert(Sign);
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64,
                       Operand: DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Sign, Sign}));
    // r := xor(r, sign) - sign;
    Result =
        DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i64,
                    N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i64, N1: Result, N2: Sign), N2: Sign);
  }

  return Result;
}
3753
/// Custom lowering for ISD::FP_TO_FP16.
///
/// f32 sources are converted to the target AMDGPUISD::FP_TO_FP16 node (which
/// participates in known-bits analysis). For f64 sources, fast-math builds
/// defer to the generic expansion; otherwise a correctly-rounded software
/// sequence is emitted.
SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue N0 = Op.getOperand(i: 0);

  // Convert to target node to get known bits
  if (N0.getValueType() == MVT::f32)
    return DAG.getNode(Opcode: AMDGPUISD::FP_TO_FP16, DL, VT: Op.getValueType(), Operand: N0);

  if (Op->getFlags().hasApproximateFuncs()) {
    // There is a generic expand for FP_TO_FP16 with unsafe fast math.
    return SDValue();
  }

  // f64 source without approximate-funcs: use the round-to-nearest-even
  // software conversion.
  return LowerF64ToF16Safe(Src: N0, DL, DAG);
}
3769
3770// return node in i32
3771SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL,
3772 SelectionDAG &DAG) const {
3773 assert(Src.getSimpleValueType() == MVT::f64);
3774
3775 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3776 // TODO: We can generate better code for True16.
3777 const unsigned ExpMask = 0x7ff;
3778 const unsigned ExpBiasf64 = 1023;
3779 const unsigned ExpBiasf16 = 15;
3780 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
3781 SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
3782 SDValue U = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: Src);
3783 SDValue UH = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64, N1: U,
3784 N2: DAG.getConstant(Val: 32, DL, VT: MVT::i64));
3785 UH = DAG.getZExtOrTrunc(Op: UH, DL, VT: MVT::i32);
3786 U = DAG.getZExtOrTrunc(Op: U, DL, VT: MVT::i32);
3787 SDValue E = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3788 N2: DAG.getConstant(Val: 20, DL, VT: MVT::i64));
3789 E = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: E,
3790 N2: DAG.getConstant(Val: ExpMask, DL, VT: MVT::i32));
3791 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3792 // add the f16 bias (15) to get the biased exponent for the f16 format.
3793 E = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: E,
3794 N2: DAG.getConstant(Val: -ExpBiasf64 + ExpBiasf16, DL, VT: MVT::i32));
3795
3796 SDValue M = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3797 N2: DAG.getConstant(Val: 8, DL, VT: MVT::i32));
3798 M = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: M,
3799 N2: DAG.getConstant(Val: 0xffe, DL, VT: MVT::i32));
3800
3801 SDValue MaskedSig = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: UH,
3802 N2: DAG.getConstant(Val: 0x1ff, DL, VT: MVT::i32));
3803 MaskedSig = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: MaskedSig, N2: U);
3804
3805 SDValue Lo40Set = DAG.getSelectCC(DL, LHS: MaskedSig, RHS: Zero, True: Zero, False: One, Cond: ISD::SETEQ);
3806 M = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M, N2: Lo40Set);
3807
3808 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3809 SDValue I = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32,
3810 N1: DAG.getSelectCC(DL, LHS: M, RHS: Zero, True: DAG.getConstant(Val: 0x0200, DL, VT: MVT::i32),
3811 False: Zero, Cond: ISD::SETNE), N2: DAG.getConstant(Val: 0x7c00, DL, VT: MVT::i32));
3812
3813 // N = M | (E << 12);
3814 SDValue N = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M,
3815 N2: DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: E,
3816 N2: DAG.getConstant(Val: 12, DL, VT: MVT::i32)));
3817
3818 // B = clamp(1-E, 0, 13);
3819 SDValue OneSubExp = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i32,
3820 N1: One, N2: E);
3821 SDValue B = DAG.getNode(Opcode: ISD::SMAX, DL, VT: MVT::i32, N1: OneSubExp, N2: Zero);
3822 B = DAG.getNode(Opcode: ISD::SMIN, DL, VT: MVT::i32, N1: B,
3823 N2: DAG.getConstant(Val: 13, DL, VT: MVT::i32));
3824
3825 SDValue SigSetHigh = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M,
3826 N2: DAG.getConstant(Val: 0x1000, DL, VT: MVT::i32));
3827
3828 SDValue D = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: SigSetHigh, N2: B);
3829 SDValue D0 = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: D, N2: B);
3830 SDValue D1 = DAG.getSelectCC(DL, LHS: D0, RHS: SigSetHigh, True: One, False: Zero, Cond: ISD::SETNE);
3831 D = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: D, N2: D1);
3832
3833 SDValue V = DAG.getSelectCC(DL, LHS: E, RHS: One, True: D, False: N, Cond: ISD::SETLT);
3834 SDValue VLow3 = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: V,
3835 N2: DAG.getConstant(Val: 0x7, DL, VT: MVT::i32));
3836 V = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: V,
3837 N2: DAG.getConstant(Val: 2, DL, VT: MVT::i32));
3838 SDValue V0 = DAG.getSelectCC(DL, LHS: VLow3, RHS: DAG.getConstant(Val: 3, DL, VT: MVT::i32),
3839 True: One, False: Zero, Cond: ISD::SETEQ);
3840 SDValue V1 = DAG.getSelectCC(DL, LHS: VLow3, RHS: DAG.getConstant(Val: 5, DL, VT: MVT::i32),
3841 True: One, False: Zero, Cond: ISD::SETGT);
3842 V1 = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: V0, N2: V1);
3843 V = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: V, N2: V1);
3844
3845 V = DAG.getSelectCC(DL, LHS: E, RHS: DAG.getConstant(Val: 30, DL, VT: MVT::i32),
3846 True: DAG.getConstant(Val: 0x7c00, DL, VT: MVT::i32), False: V, Cond: ISD::SETGT);
3847 V = DAG.getSelectCC(DL, LHS: E, RHS: DAG.getConstant(Val: 1039, DL, VT: MVT::i32),
3848 True: I, False: V, Cond: ISD::SETEQ);
3849
3850 // Extract the sign bit.
3851 SDValue Sign = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3852 N2: DAG.getConstant(Val: 16, DL, VT: MVT::i32));
3853 Sign = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: Sign,
3854 N2: DAG.getConstant(Val: 0x8000, DL, VT: MVT::i32));
3855
3856 return DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Sign, N2: V);
3857}
3858
/// Custom lowering for FP_TO_SINT/FP_TO_UINT.
///
/// Handles the type combinations that need promotion or expansion: bf16
/// sources go through f32, sub-i32 integer results are computed at i32 and
/// truncated, and i64 results are either extended from an i32 conversion
/// (for f16-range sources) or expanded via LowerFP_TO_INT64.
SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
                                             SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(i: 0);
  unsigned OpOpcode = Op.getOpcode();
  EVT SrcVT = Src.getValueType();
  EVT DestVT = Op.getValueType();

  // Will be selected natively
  if (SrcVT == MVT::f16 && DestVT == MVT::i16)
    return Op;

  // bf16 has no direct conversion; extend to f32 first.
  if (SrcVT == MVT::bf16) {
    SDLoc DL(Op);
    SDValue PromotedSrc = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: Src);
    return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DestVT, Operand: PromotedSrc);
  }

  // Promote i16 to i32
  if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
    SDLoc DL(Op);

    SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, Operand: Src);
    return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToInt32);
  }

  if (DestVT != MVT::i64)
    return Op;

  // An f16 value (or an f32 known to come from f16) cannot hold anything
  // beyond the i32 range, so convert at i32 and extend.
  if (SrcVT == MVT::f16 ||
      (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
    SDLoc DL(Op);

    SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, Operand: Src);
    unsigned Ext =
        OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    return DAG.getNode(Opcode: Ext, DL, VT: MVT::i64, Operand: FpToInt32);
  }

  if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
    return LowerFP_TO_INT64(Op, DAG, Signed: OpOpcode == ISD::FP_TO_SINT);

  return SDValue();
}
3902
/// Custom lowering for FP_TO_SINT_SAT/FP_TO_UINT_SAT.
///
/// Operand 1 is a VTSDNode giving the saturation width. Conversions whose
/// destination/saturation widths are natively supported are returned as-is;
/// other combinations are promoted (src f16/bf16 -> f32, dst < 32 bits ->
/// i16/i32) or clamped manually with SMIN/SMAX (signed) or UMIN (unsigned).
SDValue AMDGPUTargetLowering::LowerFP_TO_INT_SAT(const SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(i: 0);
  unsigned OpOpcode = Op.getOpcode();
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();
  SDValue SatVTOp = Op.getNode()->getOperand(Num: 1);
  EVT SatVT = cast<VTSDNode>(Val&: SatVTOp)->getVT();
  SDLoc DL(Op);

  uint64_t DstWidth = DstVT.getScalarSizeInBits();
  uint64_t SatWidth = SatVT.getScalarSizeInBits();
  assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");

  // Will be selected natively
  if (DstVT == MVT::i32 && SatWidth == DstWidth &&
      (SrcVT == MVT::f32 || SrcVT == MVT::f64))
    return Op;

  if (DstVT == MVT::i16 && SatWidth == DstWidth && SrcVT == MVT::f16)
    return Op;

  // Perform all saturation at selected width (i16 or i32) and truncate
  if (SatWidth < DstWidth && SatWidth <= 32) {
    // For f16 conversion with sub-i16 saturation perform saturation
    // at i16, if available in the target. This removes the need for extra f16
    // to f32 conversion. For all the others use i32.
    MVT ResultVT =
        Subtarget->has16BitInsts() && SrcVT == MVT::f16 && SatWidth < 16
            ? MVT::i16
            : MVT::i32;

    const SDValue ResultVTOp = DAG.getValueType(ResultVT);
    const uint64_t ResultWidth = ResultVT.getScalarSizeInBits();

    // First, convert input float into selected integer (i16 or i32)
    SDValue FpToInt = DAG.getNode(Opcode: OpOpcode, DL, VT: ResultVT, N1: Src, N2: ResultVTOp);
    SDValue IntSatVal;

    // Then, clamp at the saturation width using either i16 or i32 instructions
    if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
      // Signed: clamp to [signed-min, signed-max] of the saturation width.
      SDValue MinConst = DAG.getConstant(
          Val: APInt::getSignedMaxValue(numBits: SatWidth).sext(width: ResultWidth), DL, VT: ResultVT);
      SDValue MaxConst = DAG.getConstant(
          Val: APInt::getSignedMinValue(numBits: SatWidth).sext(width: ResultWidth), DL, VT: ResultVT);
      SDValue MinVal = DAG.getNode(Opcode: ISD::SMIN, DL, VT: ResultVT, N1: FpToInt, N2: MinConst);
      IntSatVal = DAG.getNode(Opcode: ISD::SMAX, DL, VT: ResultVT, N1: MinVal, N2: MaxConst);
    } else {
      // Unsigned: clamp to the max value of the saturation width.
      SDValue MinConst = DAG.getConstant(
          Val: APInt::getMaxValue(numBits: SatWidth).zext(width: ResultWidth), DL, VT: ResultVT);
      IntSatVal = DAG.getNode(Opcode: ISD::UMIN, DL, VT: ResultVT, N1: FpToInt, N2: MinConst);
    }

    // Finally, after saturating at i16 or i32 fit into the destination type
    return DAG.getExtOrTrunc(IsSigned: OpOpcode == ISD::FP_TO_SINT_SAT, Op: IntSatVal, DL,
                             VT: DstVT);
  }

  // SatWidth == DstWidth

  // Saturate at i32 for i64 dst and f16/bf16 src (will invoke f16 promotion
  // below)
  if (DstVT == MVT::i64 &&
      (SrcVT == MVT::f16 || SrcVT == MVT::bf16 ||
       (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP))) {
    const SDValue Int32VTOp = DAG.getValueType(MVT::i32);
    return DAG.getNode(Opcode: OpOpcode, DL, VT: DstVT, N1: Src, N2: Int32VTOp);
  }

  // Promote f16/bf16 src to f32 for i32 conversion
  if (DstVT == MVT::i32 && (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
    SDValue PromotedSrc = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: Src);
    return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: PromotedSrc, N2: SatVTOp);
  }

  // For DstWidth < 16, promote i1 and i8 dst to i16 (if legal) with sub-i16
  // saturation. For DstWidth == 16, promote i16 dst to i32 with sub-i32
  // saturation; this covers i16.f32 and i16.f64
  if (DstWidth < 32) {
    // Note: this triggers SatWidth < DstWidth above to generate saturated
    // truncate by requesting MVT::i16/i32 destination with SatWidth < 16/32.
    MVT PromoteVT =
        (DstWidth < 16 && Subtarget->has16BitInsts()) ? MVT::i16 : MVT::i32;
    SDValue FpToInt = DAG.getNode(Opcode: OpOpcode, DL, VT: PromoteVT, N1: Src, N2: SatVTOp);
    return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: FpToInt);
  }

  // TODO: can we implement i64 dst for f32/f64?

  return SDValue();
}
3994
/// Custom lowering for vector SIGN_EXTEND_INREG: scalarize into per-element
/// sign extensions and rebuild the vector.
SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
                                                     SelectionDAG &DAG) const {
  // Operand 1 carries the original (narrower) element type being extended.
  EVT ExtraVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
  MVT VT = Op.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();

  assert(VT.isVector());

  SDValue Src = Op.getOperand(i: 0);
  SDLoc DL(Op);

  // TODO: Don't scalarize on Evergreen?
  unsigned NElts = VT.getVectorNumElements();
  SmallVector<SDValue, 8> Args;
  DAG.ExtractVectorElements(Op: Src, Args, Start: 0, Count: NElts);

  // Apply the scalar in-register extension to every element.
  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
  for (unsigned I = 0; I < NElts; ++I)
    Args[I] = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: ScalarVT, N1: Args[I], N2: VTOp);

  return DAG.getBuildVector(VT, DL, Ops: Args);
}
4017
4018//===----------------------------------------------------------------------===//
4019// Custom DAG optimizations
4020//===----------------------------------------------------------------------===//
4021
4022static bool isU24(SDValue Op, SelectionDAG &DAG) {
4023 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
4024}
4025
4026static bool isI24(SDValue Op, SelectionDAG &DAG) {
4027 EVT VT = Op.getValueType();
4028 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
4029 // as unsigned 24-bit values.
4030 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
4031}
4032
/// Try to simplify the operands of a 24-bit multiply node (either an
/// AMDGPUISD MUL/MULHI node or the corresponding amdgcn intrinsic) using the
/// fact that only the low 24 bits of each operand are demanded.
static SDValue simplifyMul24(SDNode *Node24,
                             TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;

  // Intrinsic nodes carry the intrinsic ID as operand 0, shifting the actual
  // multiplication operands to positions 1 and 2.
  SDValue LHS = IsIntrin ? Node24->getOperand(Num: 1) : Node24->getOperand(Num: 0);
  SDValue RHS = IsIntrin ? Node24->getOperand(Num: 2) : Node24->getOperand(Num: 1);
  unsigned NewOpcode = Node24->getOpcode();
  if (IsIntrin) {
    // Map the intrinsic onto the equivalent target DAG node so the rebuilt
    // node below no longer carries the intrinsic-ID operand.
    unsigned IID = Node24->getConstantOperandVal(Num: 0);
    switch (IID) {
    case Intrinsic::amdgcn_mul_i24:
      NewOpcode = AMDGPUISD::MUL_I24;
      break;
    case Intrinsic::amdgcn_mul_u24:
      NewOpcode = AMDGPUISD::MUL_U24;
      break;
    case Intrinsic::amdgcn_mulhi_i24:
      NewOpcode = AMDGPUISD::MULHI_I24;
      break;
    case Intrinsic::amdgcn_mulhi_u24:
      NewOpcode = AMDGPUISD::MULHI_U24;
      break;
    default:
      llvm_unreachable("Expected 24-bit mul intrinsic");
    }
  }

  APInt Demanded = APInt::getLowBitsSet(numBits: LHS.getValueSizeInBits(), loBitsSet: 24);

  // First try to simplify using SimplifyMultipleUseDemandedBits which allows
  // the operands to have other uses, but will only perform simplifications that
  // involve bypassing some nodes for this user.
  SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(Op: LHS, DemandedBits: Demanded, DAG);
  SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(Op: RHS, DemandedBits: Demanded, DAG);
  if (DemandedLHS || DemandedRHS)
    return DAG.getNode(Opcode: NewOpcode, DL: SDLoc(Node24), VTList: Node24->getVTList(),
                       N1: DemandedLHS ? DemandedLHS : LHS,
                       N2: DemandedRHS ? DemandedRHS : RHS);

  // Now try SimplifyDemandedBits which can simplify the nodes used by our
  // operands if this node is the only user.
  if (TLI.SimplifyDemandedBits(Op: LHS, DemandedBits: Demanded, DCI))
    return SDValue(Node24, 0);
  if (TLI.SimplifyDemandedBits(Op: RHS, DemandedBits: Demanded, DCI))
    return SDValue(Node24, 0);

  return SDValue();
}
4083
/// Constant-fold a bitfield extract of \p Width bits starting at bit
/// \p Offset of \p Src0. \p IntTy selects signed (arithmetic-shift) or
/// unsigned extraction semantics; the result is an i32 constant.
/// NOTE(review): when Width + Offset < 32, the shift by (32 - Width) assumes
/// Width != 0 — presumably callers guard against a zero width; verify at the
/// call sites.
template <typename IntTy>
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
                               uint32_t Width, const SDLoc &DL) {
  if (Width + Offset < 32) {
    // Shift the field to the top, then shift back down so the sign (or zero)
    // bits fill in from the left according to IntTy's signedness.
    uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
    IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
    if constexpr (std::is_signed_v<IntTy>) {
      return DAG.getSignedConstant(Val: Result, DL, VT: MVT::i32);
    } else {
      return DAG.getConstant(Result, DL, MVT::i32);
    }
  }

  // Field extends to (or past) the top bit: a plain shift suffices.
  return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
}
4099
4100static bool hasVolatileUser(SDNode *Val) {
4101 for (SDNode *U : Val->users()) {
4102 if (MemSDNode *M = dyn_cast<MemSDNode>(Val: U)) {
4103 if (M->isVolatile())
4104 return true;
4105 }
4106 }
4107
4108 return false;
4109}
4110
4111bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
4112 // i32 vectors are the canonical memory type.
4113 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
4114 return false;
4115
4116 if (!VT.isByteSized())
4117 return false;
4118
4119 unsigned Size = VT.getStoreSize();
4120
4121 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
4122 return false;
4123
4124 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
4125 return false;
4126
4127 return true;
4128}
4129
4130// Replace load of an illegal type with a bitcast from a load of a friendlier
4131// type.
// Replace load of an illegal type with a bitcast from a load of a friendlier
// type.
SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  // This rewrite must run before legalization (see the comments below about
  // visitation-order problems).
  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Only simple, normal (non-extending, non-indexed) loads without volatile
  // users are candidates.
  LoadSDNode *LN = cast<LoadSDNode>(Val: N);
  if (!LN->isSimple() || !ISD::isNormalLoad(N: LN) || hasVolatileUser(Val: LN))
    return SDValue();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = LN->getMemoryVT();

  unsigned Size = VT.getStoreSize();
  Align Alignment = LN->getAlign();
  if (Alignment < Size && isTypeLegal(VT)) {
    unsigned IsFast;
    unsigned AS = LN->getAddressSpace();

    // Expand unaligned loads earlier than legalization. Due to visitation order
    // problems during legalization, the emitted instructions to pack and unpack
    // the bytes again are not eliminated in the case of an unaligned copy.
    if (!allowsMisalignedMemoryAccesses(
            VT, AddrSpace: AS, Alignment, Flags: LN->getMemOperand()->getFlags(), &IsFast)) {
      if (VT.isVector())
        return SplitVectorLoad(Op: SDValue(LN, 0), DAG);

      SDValue Ops[2];
      std::tie(args&: Ops[0], args&: Ops[1]) = expandUnalignedLoad(LD: LN, DAG);

      return DAG.getMergeValues(Ops, dl: SDLoc(N));
    }

    // Misaligned access is allowed but slow: leave the load alone.
    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  EVT NewVT = getEquivalentMemType(Ctx&: *DAG.getContext(), VT);

  // Load with the i32-based equivalent type, then bitcast back to the
  // original value type. Both the value and the chain result are replaced.
  SDValue NewLoad
    = DAG.getLoad(VT: NewVT, dl: SL, Chain: LN->getChain(),
                  Ptr: LN->getBasePtr(), MMO: LN->getMemOperand());

  SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewLoad);
  DCI.CombineTo(N, Res0: BC, Res1: NewLoad.getValue(R: 1));
  return SDValue(N, 0);
}
4182
4183// Replace store of an illegal type with a store of a bitcast to a friendlier
4184// type.
// Replace store of an illegal type with a store of a bitcast to a friendlier
// type.
SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  // Must run before legalization, mirroring performLoadCombine.
  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Only simple, normal (non-truncating, non-indexed) stores qualify.
  StoreSDNode *SN = cast<StoreSDNode>(Val: N);
  if (!SN->isSimple() || !ISD::isNormalStore(N: SN))
    return SDValue();

  EVT VT = SN->getMemoryVT();
  unsigned Size = VT.getStoreSize();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  Align Alignment = SN->getAlign();
  if (Alignment < Size && isTypeLegal(VT)) {
    unsigned IsFast;
    unsigned AS = SN->getAddressSpace();

    // Expand unaligned stores earlier than legalization. Due to visitation
    // order problems during legalization, the emitted instructions to pack and
    // unpack the bytes again are not eliminated in the case of an unaligned
    // copy.
    if (!allowsMisalignedMemoryAccesses(
            VT, AddrSpace: AS, Alignment, Flags: SN->getMemOperand()->getFlags(), &IsFast)) {
      if (VT.isVector())
        return SplitVectorStore(Op: SDValue(SN, 0), DAG);

      return expandUnalignedStore(ST: SN, DAG);
    }

    // Misaligned access is allowed but slow: leave the store alone.
    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  EVT NewVT = getEquivalentMemType(Ctx&: *DAG.getContext(), VT);
  SDValue Val = SN->getValue();

  //DCI.AddToWorklist(Val.getNode());

  // If the stored value has other uses, rewrite those uses through a
  // round-trip bitcast so they share the cast value being stored.
  bool OtherUses = !Val.hasOneUse();
  SDValue CastVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: Val);
  if (OtherUses) {
    SDValue CastBack = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: CastVal);
    DAG.ReplaceAllUsesOfValueWith(From: Val, To: CastBack);
  }

  return DAG.getStore(Chain: SN->getChain(), dl: SL, Val: CastVal,
                      Ptr: SN->getBasePtr(), MMO: SN->getMemOperand());
}
4238
4239// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
4240// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
4241// issues.
4242SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
4243 DAGCombinerInfo &DCI) const {
4244 SelectionDAG &DAG = DCI.DAG;
4245 SDValue N0 = N->getOperand(Num: 0);
4246
4247 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
4248 // (vt2 (truncate (assertzext vt0:x, vt1)))
4249 if (N0.getOpcode() == ISD::TRUNCATE) {
4250 SDValue N1 = N->getOperand(Num: 1);
4251 EVT ExtVT = cast<VTSDNode>(Val&: N1)->getVT();
4252 SDLoc SL(N);
4253
4254 SDValue Src = N0.getOperand(i: 0);
4255 EVT SrcVT = Src.getValueType();
4256 if (SrcVT.bitsGE(VT: ExtVT)) {
4257 SDValue NewInReg = DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: SrcVT, N1: Src, N2: N1);
4258 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: N->getValueType(ResNo: 0), Operand: NewInReg);
4259 }
4260 }
4261
4262 return SDValue();
4263}
4264
/// DAG combine for ISD::INTRINSIC_WO_CHAIN nodes: simplify 24-bit multiply
/// intrinsics, fold undef-propagating math intrinsics, and strip sign
/// operations under frexp_exp.
SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
  SDNode *N, DAGCombinerInfo &DCI) const {
  unsigned IID = N->getConstantOperandVal(Num: 0);
  switch (IID) {
  case Intrinsic::amdgcn_mul_i24:
  case Intrinsic::amdgcn_mul_u24:
  case Intrinsic::amdgcn_mulhi_i24:
  case Intrinsic::amdgcn_mulhi_u24:
    return simplifyMul24(Node24: N, DCI);
  case Intrinsic::amdgcn_fract:
  case Intrinsic::amdgcn_rsq:
  case Intrinsic::amdgcn_rcp_legacy:
  case Intrinsic::amdgcn_rsq_legacy:
  case Intrinsic::amdgcn_rsq_clamp:
  case Intrinsic::amdgcn_tanh:
  case Intrinsic::amdgcn_prng_b32: {
    // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
    // Fold the intrinsic to undef when its input is undef.
    SDValue Src = N->getOperand(Num: 1);
    return Src.isUndef() ? Src : SDValue();
  }
  case Intrinsic::amdgcn_frexp_exp: {
    // frexp_exp (fneg x) -> frexp_exp x
    // frexp_exp (fabs x) -> frexp_exp x
    // frexp_exp (fneg (fabs x)) -> frexp_exp x
    SDValue Src = N->getOperand(Num: 1);
    SDValue PeekSign = peekFPSignOps(Val: Src);
    if (PeekSign == Src)
      return SDValue();
    // Update the operand in place rather than building a new node.
    return SDValue(DCI.DAG.UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), Op2: PeekSign),
                   0);
  }
  default:
    return SDValue();
  }
}
4300
4301/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4302/// binary operation \p Opc to it with the corresponding constant operands.
/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
/// binary operation \p Opc to it with the corresponding constant operands.
/// The halves are recombined through a v2i32 build_vector bitcast back to
/// i64, so the bitwise op runs as two 32-bit operations.
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
  DAGCombinerInfo &DCI, const SDLoc &SL,
  unsigned Opc, SDValue LHS,
  uint32_t ValLo, uint32_t ValHi) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Lo, Hi;
  std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: LHS, DAG);

  SDValue LoRHS = DAG.getConstant(Val: ValLo, DL: SL, VT: MVT::i32);
  SDValue HiRHS = DAG.getConstant(Val: ValHi, DL: SL, VT: MVT::i32);

  SDValue LoAnd = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, N1: Lo, N2: LoRHS);
  SDValue HiAnd = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, N1: Hi, N2: HiRHS);

  // Re-visit the ands. It's possible we eliminated one of them and it could
  // simplify the vector.
  DCI.AddToWorklist(N: Lo.getNode());
  DCI.AddToWorklist(N: Hi.getNode());

  SDValue Vec = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {LoAnd, HiAnd});
  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
}
4325
/// DAG combine for ISD::SHL.
///
/// Folds (shl x, 0) -> x, turns (shl ([asz]ext i16:x), 16) into a v2i16
/// build_vector when packed types are legal, narrows (shl (ext x), C) to a
/// zero-extended 32-bit shift when no bits overflow, and rewrites 64-bit
/// shifts by >= 32 as a 32-bit shift placed in the high half (low half zero).
SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(ResNo: 0);
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);
  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;

  unsigned RHSVal;
  if (CRHS) {
    RHSVal = CRHS->getZExtValue();
    // shl x, 0 -> x
    if (!RHSVal)
      return LHS;

    switch (LHS->getOpcode()) {
    default:
      break;
    case ISD::ZERO_EXTEND:
    case ISD::SIGN_EXTEND:
    case ISD::ANY_EXTEND: {
      SDValue X = LHS->getOperand(Num: 0);

      if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
          isOperationLegal(Op: ISD::BUILD_VECTOR, VT: MVT::v2i16)) {
        // Prefer build_vector as the canonical form if packed types are legal.
        // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
        SDValue Vec = DAG.getBuildVector(
            VT: MVT::v2i16, DL: SL,
            Ops: {DAG.getConstant(Val: 0, DL: SL, VT: MVT::i16), LHS->getOperand(Num: 0)});
        return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Vec);
      }

      // shl (ext x) => zext (shl x), if shift does not overflow int
      if (VT != MVT::i64)
        break;
      // Only safe when x has at least RHSVal known leading zeros, so no set
      // bit is shifted out of the narrow type.
      KnownBits Known = DAG.computeKnownBits(Op: X);
      unsigned LZ = Known.countMinLeadingZeros();
      if (LZ < RHSVal)
        break;
      EVT XVT = X.getValueType();
      SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: XVT, N1: X, N2: SDValue(CRHS, 0));
      return DAG.getZExtOrTrunc(Op: Shl, DL: SL, VT);
    }
    }
  }

  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  KnownBits Known = DAG.computeKnownBits(Op: RHS);

  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
  EVT TargetType = VT.changeElementType(Context&: *DAG.getContext(), EltVT: TargetScalarType);

  // Narrowing is only valid when the shift amount is known to be >= 32: the
  // low half of the result is then guaranteed zero.
  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();
  SDValue ShiftAmt;

  if (CRHS) {
    ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
                               VT: TargetType);
  } else {
    SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
    const SDValue ShiftMask =
        DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
    ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
  }

  // Perform the shift on the low halves; the result becomes the high halves
  // of the final value, with zeros in the low positions.
  SDValue Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: LHS);
  SDValue NewShift =
      DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: TargetType, N1: Lo, N2: ShiftAmt, Flags: N->getFlags());

  const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: TargetScalarType);
  SDValue Vec;

  if (VT.isVector()) {
    // Interleave {zero, shifted} pairs: even lanes are the (zero) low halves,
    // odd lanes the shifted high halves.
    EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
    unsigned NElts = TargetType.getVectorNumElements();
    SmallVector<SDValue, 8> HiOps;
    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);

    DAG.ExtractVectorElements(Op: NewShift, Args&: HiOps, Start: 0, Count: NElts);
    for (unsigned I = 0; I != NElts; ++I)
      HiAndLoOps[2 * I + 1] = HiOps[I];
    Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
  } else {
    EVT ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
    Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {Zero, NewShift});
  }
  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
}
4424
// Split a 64-bit (scalar or vector-of-i64) arithmetic shift right whose shift
// amount is known to be >= 32 into 32-bit operations on the high half of the
// source. Returns SDValue() when the combine does not apply.
SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue RHS = N->getOperand(Num: 1);
  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
  EVT VT = N->getValueType(ResNo: 0);
  SDValue LHS = N->getOperand(Num: 0);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // For C >= 32
  // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), sra hi_32(x), 31))

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  KnownBits Known = DAG.computeKnownBits(Op: RHS);

  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
  EVT TargetType = VT.changeElementType(Context&: *DAG.getContext(), EltVT: TargetScalarType);

  // Bail unless the shift amount is provably >= 32; otherwise the low half of
  // the source would contribute to the result.
  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();

  SDValue ShiftFullAmt =
      DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
  SDValue ShiftAmt;
  if (CRHS) {
    // Constant amount: the new 32-bit shift amount is C - 32.
    unsigned RHSVal = CRHS->getZExtValue();
    ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
                               VT: TargetType);
  } else if (Known.getMinValue().getZExtValue() ==
             (ElementType.getSizeInBits() - 1)) {
    // Amount is known to be exactly 63: the whole result is the sign fill.
    ShiftAmt = ShiftFullAmt;
  } else {
    SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
    const SDValue ShiftMask =
        DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
    ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
  }

  EVT ConcatType;
  SDValue Hi;
  SDLoc LHSSL(LHS);
  // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
    SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
    SmallVector<SDValue, 8> HiOps(NElts);
    SmallVector<SDValue, 16> HiAndLoOps;

    // Odd elements of the bitcast vector hold the high halves.
    DAG.ExtractVectorElements(Op: SplitLHS, Args&: HiAndLoOps, Start: 0, Count: NElts * 2);
    for (unsigned I = 0; I != NElts; ++I) {
      HiOps[I] = HiAndLoOps[2 * I + 1];
    }
    Hi = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: LHSSL, VT: TargetType, Ops: HiOps);
  } else {
    const SDValue One = DAG.getConstant(Val: 1, DL: LHSSL, VT: TargetScalarType);
    ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
    SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
    Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: LHSSL, VT: TargetType, N1: SplitLHS, N2: One);
  }

  KnownBits KnownLHS = DAG.computeKnownBits(Op: LHS);
  SDValue HiShift;
  // The result's high half is the sign fill hi_32(x) >> 31; if the source is
  // known negative it is simply all ones.
  if (KnownLHS.isNegative()) {
    HiShift = DAG.getAllOnesConstant(DL: SL, VT: TargetType);
  } else {
    // Hi now feeds two shifts; freeze it so both observe one consistent value.
    Hi = DAG.getFreeze(V: Hi);
    HiShift = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: TargetType, N1: Hi, N2: ShiftFullAmt);
  }
  SDValue NewShift =
      DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: TargetType, N1: Hi, N2: ShiftAmt, Flags: N->getFlags());

  SDValue Vec;
  // Reassemble the 64-bit result(s) from the 32-bit halves and bitcast back.
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    SmallVector<SDValue, 8> HiOps;
    SmallVector<SDValue, 8> LoOps;
    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);

    DAG.ExtractVectorElements(Op: HiShift, Args&: HiOps, Start: 0, Count: NElts);
    DAG.ExtractVectorElements(Op: NewShift, Args&: LoOps, Start: 0, Count: NElts);
    for (unsigned I = 0; I != NElts; ++I) {
      HiAndLoOps[2 * I + 1] = HiOps[I];
      HiAndLoOps[2 * I] = LoOps[I];
    }
    Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
  } else {
    Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {NewShift, HiShift});
  }
  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
}
4524
// Combines on logical shift right:
//  * (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1) to expose BFE
//    patterns in isel.
//  * A 64-bit shift by a provably >= 32 amount is split into a 32-bit shift
//    of the source's high half, with a zero high result.
SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue RHS = N->getOperand(Num: 1);
  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
  EVT VT = N->getValueType(ResNo: 0);
  SDValue LHS = N->getOperand(Num: 0);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  unsigned RHSVal; // Only assigned/meaningful when CRHS is non-null.

  if (CRHS) {
    RHSVal = CRHS->getZExtValue();

    // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
    // this improves the ability to match BFE patterns in isel.
    if (LHS.getOpcode() == ISD::AND) {
      if (auto *Mask = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1))) {
        unsigned MaskIdx, MaskLen;
        // Only fires when the mask's low zero run exactly matches the shift
        // amount, so the shifted mask becomes a plain low-bit mask.
        if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
            MaskIdx == RHSVal) {
          return DAG.getNode(Opcode: ISD::AND, DL: SL, VT,
                             N1: DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: LHS.getOperand(i: 0),
                                             N2: N->getOperand(Num: 1)),
                             N2: DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: LHS.getOperand(i: 1),
                                             N2: N->getOperand(Num: 1)));
        }
      }
    }
  }

  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // for C >= 32
  // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  KnownBits Known = DAG.computeKnownBits(Op: RHS);

  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
  EVT TargetType = VT.changeElementType(Context&: *DAG.getContext(), EltVT: TargetScalarType);

  // Bail unless the shift amount is provably >= 32.
  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();

  SDValue ShiftAmt;
  if (CRHS) {
    // Constant amount: the new 32-bit shift amount is C - 32.
    ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
                               VT: TargetType);
  } else {
    SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
    const SDValue ShiftMask =
        DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
    ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
  }

  const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: TargetScalarType);
  EVT ConcatType;
  SDValue Hi;
  SDLoc LHSSL(LHS);
  // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
    SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
    SmallVector<SDValue, 8> HiOps(NElts);
    SmallVector<SDValue, 16> HiAndLoOps;

    // Odd elements of the bitcast vector hold the high halves.
    DAG.ExtractVectorElements(Op: SplitLHS, Args&: HiAndLoOps, /*Start=*/0, Count: NElts * 2);
    for (unsigned I = 0; I != NElts; ++I)
      HiOps[I] = HiAndLoOps[2 * I + 1];
    Hi = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: LHSSL, VT: TargetType, Ops: HiOps);
  } else {
    const SDValue One = DAG.getConstant(Val: 1, DL: LHSSL, VT: TargetScalarType);
    ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
    SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
    Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: LHSSL, VT: TargetType, N1: SplitLHS, N2: One);
  }

  SDValue NewShift =
      DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: TargetType, N1: Hi, N2: ShiftAmt, Flags: N->getFlags());

  SDValue Vec;
  // Rebuild the 64-bit result: the low half is the shifted high half of the
  // source, the high half is zero.
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    SmallVector<SDValue, 8> LoOps;
    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);

    DAG.ExtractVectorElements(Op: NewShift, Args&: LoOps, Start: 0, Count: NElts);
    for (unsigned I = 0; I != NElts; ++I)
      HiAndLoOps[2 * I] = LoOps[I];
    Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
  } else {
    Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {NewShift, Zero});
  }
  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
}
4627
// Combines on ISD::TRUNCATE:
//  * trunc (bitcast (build_vector x, ...)) -> use element 0 directly.
//  * trunc (srl (bitcast (build_vector ...)), K) -> use a higher element
//    directly when K lands on an element boundary.
//  * Shrink a 64-bit shift feeding a sub-32-bit truncate to a 32-bit shift
//    when the shift amount allows it.
SDValue AMDGPUTargetLowering::performTruncateCombine(
  SDNode *N, DAGCombinerInfo &DCI) const {
  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);
  SDValue Src = N->getOperand(Num: 0);

  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
  if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
    SDValue Vec = Src.getOperand(i: 0);
    if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
      SDValue Elt0 = Vec.getOperand(i: 0);
      EVT EltVT = Elt0.getValueType();
      if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
        // FP elements must be reinterpreted as integers before truncating.
        if (EltVT.isFloatingPoint()) {
          Elt0 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL,
                             VT: EltVT.changeTypeToInteger(), Operand: Elt0);
        }

        return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Elt0);
      }
    }
  }

  // Equivalent of above for accessing the high element of a vector as an
  // integer operation.
  // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
  if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
    if (auto *K = isConstOrConstSplat(N: Src.getOperand(i: 1))) {
      SDValue BV = stripBitcast(Val: Src.getOperand(i: 0));
      if (BV.getOpcode() == ISD::BUILD_VECTOR) {
        EVT SrcEltVT = BV.getOperand(i: 0).getValueType();
        unsigned SrcEltSize = SrcEltVT.getSizeInBits();
        unsigned BitIndex = K->getZExtValue();
        unsigned PartIndex = BitIndex / SrcEltSize;

        // The shift amount must select a whole element, and that element must
        // actually exist in the build_vector.
        if (PartIndex * SrcEltSize == BitIndex &&
            PartIndex < BV.getNumOperands()) {
          if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
            SDValue SrcElt =
                DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: SrcEltVT.changeTypeToInteger(),
                            Operand: BV.getOperand(i: PartIndex));
            return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: SrcElt);
          }
        }
      }
    }
  }

  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
  //
  // i16 (trunc (srl i64:x, K)), K <= 16 ->
  // i16 (trunc (srl (i32 (trunc x), K)))
  if (VT.getScalarSizeInBits() < 32) {
    EVT SrcVT = Src.getValueType();
    if (SrcVT.getScalarSizeInBits() > 32 &&
        (Src.getOpcode() == ISD::SRL ||
         Src.getOpcode() == ISD::SRA ||
         Src.getOpcode() == ISD::SHL)) {
      SDValue Amt = Src.getOperand(i: 1);
      KnownBits Known = DAG.computeKnownBits(Op: Amt);

      // - For left shifts, do the transform as long as the shift
      // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
      // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
      // losing information stored in the high bits when truncating.
      const unsigned MaxCstSize =
        (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
      if (Known.getMaxValue().ule(RHS: MaxCstSize)) {
        EVT MidVT = VT.isVector() ?
          EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
                           NumElements: VT.getVectorNumElements()) : MVT::i32;

        EVT NewShiftVT = getShiftAmountTy(LHSTy: MidVT, DL: DAG.getDataLayout());
        SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MidVT,
                                    Operand: Src.getOperand(i: 0));
        DCI.AddToWorklist(N: Trunc.getNode());

        // Legalize the shift-amount type for the narrower shift if needed.
        if (Amt.getValueType() != NewShiftVT) {
          Amt = DAG.getZExtOrTrunc(Op: Amt, DL: SL, VT: NewShiftVT);
          DCI.AddToWorklist(N: Amt.getNode());
        }

        SDValue ShrunkShift = DAG.getNode(Opcode: Src.getOpcode(), DL: SL, VT: MidVT,
                                          N1: Trunc, N2: Amt);
        return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: ShrunkShift);
      }
    }
  }

  return SDValue();
}
4720
4721// We need to specifically handle i64 mul here to avoid unnecessary conversion
4722// instructions. If we only match on the legalized i64 mul expansion,
4723// SimplifyDemandedBits will be unable to remove them because there will be
4724// multiple uses due to the separate mul + mulh[su].
4725static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4726 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4727 if (Size <= 32) {
4728 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4729 return DAG.getNode(Opcode: MulOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4730 }
4731
4732 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4733 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4734
4735 SDValue MulLo = DAG.getNode(Opcode: MulLoOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4736 SDValue MulHi = DAG.getNode(Opcode: MulHiOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4737
4738 return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SL, VT: MVT::i64, N1: MulLo, N2: MulHi);
4739}
4740
4741/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4742/// return SDValue().
4743static SDValue getAddOneOp(const SDNode *V) {
4744 if (V->getOpcode() != ISD::ADD)
4745 return SDValue();
4746
4747 return isOneConstant(V: V->getOperand(Num: 1)) ? V->getOperand(Num: 0) : SDValue();
4748}
4749
// Combine on ISD::MUL: undo the InstCombine X * (Y + 1) canonicalization to
// enable mad matching, and select 24-bit multiplies for divergent scalar
// values known to fit in 24 bits.
SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::MUL);
  EVT VT = N->getValueType(ResNo: 0);

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  if (!N->isDivergent())
    return SDValue();

  unsigned Size = VT.getSizeInBits();
  if (VT.isVector() || Size > 64)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue N0 = N->getOperand(Num: 0);
  SDValue N1 = N->getOperand(Num: 1);

  // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
  // matching.

  // mul x, (add y, 1) -> add (mul x, y), x
  auto IsFoldableAdd = [](SDValue V) -> SDValue {
    SDValue AddOp = getAddOneOp(V: V.getNode());
    if (!AddOp)
      return SDValue();

    // Only fold when the add has one use or feeds nothing but multiplies, so
    // the original add does not stay alive next to the rewritten form.
    if (V.hasOneUse() || all_of(Range: V->users(), P: [](const SDNode *U) -> bool {
          return U->getOpcode() == ISD::MUL;
        }))
      return AddOp;

    return SDValue();
  };

  // FIXME: The selection pattern is not properly checking for commuted
  // operands, so we have to place the mul in the LHS
  if (SDValue MulOper = IsFoldableAdd(N0)) {
    SDValue MulVal = DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1, N2: MulOper);
    return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: MulVal, N2: N1);
  }

  if (SDValue MulOper = IsFoldableAdd(N1)) {
    SDValue MulVal = DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1: N0, N2: MulOper);
    return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: MulVal, N2: N0);
  }

  // There are i16 integer mul/mad.
  if (isTypeLegal(VT: MVT::i16) && VT.getScalarType().bitsLE(VT: MVT::i16))
    return SDValue();

  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
  // in the source into any_extends if the result of the mul is truncated. Since
  // we can assume the high bits are whatever we want, use the underlying value
  // to avoid the unknown high bits from interfering.
  if (N0.getOpcode() == ISD::ANY_EXTEND)
    N0 = N0.getOperand(i: 0);

  if (N1.getOpcode() == ISD::ANY_EXTEND)
    N1 = N1.getOperand(i: 0);

  SDValue Mul;

  // Prefer the unsigned form when both operands qualify for either.
  if (Subtarget->hasMulU24() && isU24(Op: N0, DAG) && isU24(Op: N1, DAG)) {
    N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
    N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
    Mul = getMul24(DAG, SL: DL, N0, N1, Size, Signed: false);
  } else if (Subtarget->hasMulI24() && isI24(Op: N0, DAG) && isI24(Op: N1, DAG)) {
    N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
    N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
    Mul = getMul24(DAG, SL: DL, N0, N1, Size, Signed: true);
  } else {
    return SDValue();
  }

  // We need to use sext even for MUL_U24, because MUL_U24 is used
  // for signed multiply of 8 and 16-bit types.
  return DAG.getSExtOrTrunc(Op: Mul, DL, VT);
}
4833
4834SDValue
4835AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4836 DAGCombinerInfo &DCI) const {
4837 if (N->getValueType(ResNo: 0) != MVT::i32)
4838 return SDValue();
4839
4840 SelectionDAG &DAG = DCI.DAG;
4841 SDLoc DL(N);
4842
4843 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4844 SDValue N0 = N->getOperand(Num: 0);
4845 SDValue N1 = N->getOperand(Num: 1);
4846
4847 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4848 // in the source into any_extends if the result of the mul is truncated. Since
4849 // we can assume the high bits are whatever we want, use the underlying value
4850 // to avoid the unknown high bits from interfering.
4851 if (N0.getOpcode() == ISD::ANY_EXTEND)
4852 N0 = N0.getOperand(i: 0);
4853 if (N1.getOpcode() == ISD::ANY_EXTEND)
4854 N1 = N1.getOperand(i: 0);
4855
4856 // Try to use two fast 24-bit multiplies (one for each half of the result)
4857 // instead of one slow extending multiply.
4858 unsigned LoOpcode = 0;
4859 unsigned HiOpcode = 0;
4860 if (Signed) {
4861 if (Subtarget->hasMulI24() && isI24(Op: N0, DAG) && isI24(Op: N1, DAG)) {
4862 N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4863 N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4864 LoOpcode = AMDGPUISD::MUL_I24;
4865 HiOpcode = AMDGPUISD::MULHI_I24;
4866 }
4867 } else {
4868 if (Subtarget->hasMulU24() && isU24(Op: N0, DAG) && isU24(Op: N1, DAG)) {
4869 N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4870 N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4871 LoOpcode = AMDGPUISD::MUL_U24;
4872 HiOpcode = AMDGPUISD::MULHI_U24;
4873 }
4874 }
4875 if (!LoOpcode)
4876 return SDValue();
4877
4878 SDValue Lo = DAG.getNode(Opcode: LoOpcode, DL, VT: MVT::i32, N1: N0, N2: N1);
4879 SDValue Hi = DAG.getNode(Opcode: HiOpcode, DL, VT: MVT::i32, N1: N0, N2: N1);
4880 DCI.CombineTo(N, Res0: Lo, Res1: Hi);
4881 return SDValue(N, 0);
4882}
4883
4884SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4885 DAGCombinerInfo &DCI) const {
4886 EVT VT = N->getValueType(ResNo: 0);
4887
4888 if (!Subtarget->hasMulI24() || VT.isVector())
4889 return SDValue();
4890
4891 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4892 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4893 // unnecessarily). isDivergent() is used as an approximation of whether the
4894 // value is in an SGPR.
4895 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4896 // valu op anyway)
4897 if (Subtarget->hasSMulHi() && !N->isDivergent())
4898 return SDValue();
4899
4900 SelectionDAG &DAG = DCI.DAG;
4901 SDLoc DL(N);
4902
4903 SDValue N0 = N->getOperand(Num: 0);
4904 SDValue N1 = N->getOperand(Num: 1);
4905
4906 if (!isI24(Op: N0, DAG) || !isI24(Op: N1, DAG))
4907 return SDValue();
4908
4909 N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4910 N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4911
4912 SDValue Mulhi = DAG.getNode(Opcode: AMDGPUISD::MULHI_I24, DL, VT: MVT::i32, N1: N0, N2: N1);
4913 DCI.AddToWorklist(N: Mulhi.getNode());
4914 return DAG.getSExtOrTrunc(Op: Mulhi, DL, VT);
4915}
4916
4917SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4918 DAGCombinerInfo &DCI) const {
4919 EVT VT = N->getValueType(ResNo: 0);
4920
4921 if (VT.isVector() || VT.getSizeInBits() > 32 || !Subtarget->hasMulU24())
4922 return SDValue();
4923
4924 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4925 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4926 // unnecessarily). isDivergent() is used as an approximation of whether the
4927 // value is in an SGPR.
4928 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4929 // valu op anyway)
4930 if (!N->isDivergent() && Subtarget->hasSMulHi())
4931 return SDValue();
4932
4933 SelectionDAG &DAG = DCI.DAG;
4934 SDLoc DL(N);
4935
4936 SDValue N0 = N->getOperand(Num: 0);
4937 SDValue N1 = N->getOperand(Num: 1);
4938
4939 if (!isU24(Op: N0, DAG) || !isU24(Op: N1, DAG))
4940 return SDValue();
4941
4942 N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4943 N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4944
4945 SDValue Mulhi = DAG.getNode(Opcode: AMDGPUISD::MULHI_U24, DL, VT: MVT::i32, N1: N0, N2: N1);
4946 DCI.AddToWorklist(N: Mulhi.getNode());
4947 return DAG.getZExtOrTrunc(Op: Mulhi, DL, VT);
4948}
4949
4950SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4951 SDValue Op,
4952 const SDLoc &DL,
4953 unsigned Opc) const {
4954 EVT VT = Op.getValueType();
4955 if (VT.bitsGT(VT: MVT::i32))
4956 return SDValue();
4957
4958 if (VT != MVT::i32)
4959 Op = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: Op);
4960
4961 SDValue FFBX = DAG.getNode(Opcode: Opc, DL, VT: MVT::i32, Operand: Op);
4962 if (VT != MVT::i32)
4963 FFBX = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: FFBX);
4964
4965 return FFBX;
4966}
4967
4968// The native instructions return -1 on 0 input. Optimize out a select that
4969// produces -1 on 0.
4970//
4971// TODO: If zero is not undef, we could also do this if the output is compared
4972// against the bitwidth.
4973//
4974// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4975SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4976 SDValue LHS, SDValue RHS,
4977 DAGCombinerInfo &DCI) const {
4978 if (!isNullConstant(V: Cond.getOperand(i: 1)))
4979 return SDValue();
4980
4981 SelectionDAG &DAG = DCI.DAG;
4982 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val: Cond.getOperand(i: 2))->get();
4983 SDValue CmpLHS = Cond.getOperand(i: 0);
4984
4985 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4986 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4987 if (CCOpcode == ISD::SETEQ &&
4988 (isCtlzOpc(Opc: RHS.getOpcode()) || isCttzOpc(Opc: RHS.getOpcode())) &&
4989 RHS.getOperand(i: 0) == CmpLHS && isAllOnesConstant(V: LHS)) {
4990 unsigned Opc =
4991 isCttzOpc(Opc: RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4992 return getFFBX_U32(DAG, Op: CmpLHS, DL: SL, Opc);
4993 }
4994
4995 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4996 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4997 if (CCOpcode == ISD::SETNE &&
4998 (isCtlzOpc(Opc: LHS.getOpcode()) || isCttzOpc(Opc: LHS.getOpcode())) &&
4999 LHS.getOperand(i: 0) == CmpLHS && isAllOnesConstant(V: RHS)) {
5000 unsigned Opc =
5001 isCttzOpc(Opc: LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
5002
5003 return getFFBX_U32(DAG, Op: CmpLHS, DL: SL, Opc);
5004 }
5005
5006 return SDValue();
5007}
5008
5009static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
5010 unsigned Op,
5011 const SDLoc &SL,
5012 SDValue Cond,
5013 SDValue N1,
5014 SDValue N2) {
5015 SelectionDAG &DAG = DCI.DAG;
5016 EVT VT = N1.getValueType();
5017
5018 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Cond,
5019 N2: N1.getOperand(i: 0), N3: N2.getOperand(i: 0));
5020 DCI.AddToWorklist(N: NewSelect.getNode());
5021 return DAG.getNode(Opcode: Op, DL: SL, VT, Operand: NewSelect);
5022}
5023
// Pull a free FP operation out of a select so it may fold into uses.
//
// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
// select c, (fneg x), k -> fneg (select c, x, (fneg k))
//
// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
// select c, (fabs x), +k -> fabs (select c, x, k)
SDValue
AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
                                           SDValue N) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Cond = N.getOperand(i: 0);
  SDValue LHS = N.getOperand(i: 1);
  SDValue RHS = N.getOperand(i: 2);

  EVT VT = N.getValueType();
  // Case 1: both arms carry the same free op. Hoist it above the select, but
  // only if every user of the select can absorb it as a source modifier.
  if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
      (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
    if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N: N.getNode()))
      return SDValue();

    return distributeOpThroughSelect(DCI, Op: LHS.getOpcode(),
                                     SL: SDLoc(N), Cond, N1: LHS, N2: RHS);
  }

  // Case 2: only one arm has the op. Canonicalize it into LHS, remembering
  // that the arms were swapped so they can be restored later.
  bool Inv = false;
  if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
    std::swap(a&: LHS, b&: RHS);
    Inv = true;
  }

  // TODO: Support vector constants.
  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
      !selectSupportsSourceMods(N: N.getNode())) {
    SDLoc SL(N);
    // If one side is an fneg/fabs and the other is a constant, we can push the
    // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
    SDValue NewLHS = LHS.getOperand(i: 0);
    SDValue NewRHS = RHS;

    // Careful: if the neg can be folded up, don't try to pull it back down.
    bool ShouldFoldNeg = true;

    if (NewLHS.hasOneUse()) {
      unsigned Opc = NewLHS.getOpcode();
      if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(N: NewLHS.getNode()))
        ShouldFoldNeg = false;
      if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
        ShouldFoldNeg = false;
    }

    if (ShouldFoldNeg) {
      // fabs over a negative constant cannot be pushed down unchanged.
      if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
        return SDValue();

      // We're going to be forced to use a source modifier anyway, there's no
      // point to pulling the negate out unless we can get a size reduction by
      // negating the constant.
      //
      // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
      // about cheaper constants.
      if (NewLHS.getOpcode() == ISD::FABS &&
          getConstantNegateCost(C: CRHS) != NegatibleCost::Cheaper)
        return SDValue();

      if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N: N.getNode()))
        return SDValue();

      // Compensate the constant arm for the hoisted fneg.
      if (LHS.getOpcode() == ISD::FNEG)
        NewRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);

      // Restore the original arm order if it was swapped above.
      if (Inv)
        std::swap(a&: NewLHS, b&: NewRHS);

      SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT,
                                      N1: Cond, N2: NewLHS, N3: NewRHS);
      DCI.AddToWorklist(N: NewSelect.getNode());
      return DAG.getNode(Opcode: LHS.getOpcode(), DL: SL, VT, Operand: NewSelect);
    }
  }

  return SDValue();
}
5108
// Combine on ISD::SELECT: hoist free fneg/fabs out of the select, swap the
// compare + select arms to keep constants in the false input, form legacy
// fmin/fmax, and fold ctlz/cttz selects into ffbh/ffbl.
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  if (SDValue Folded = foldFreeOpFromSelect(DCI, N: SDValue(N, 0)))
    return Folded;

  SDValue Cond = N->getOperand(Num: 0);
  if (Cond.getOpcode() != ISD::SETCC)
    return SDValue();

  EVT VT = N->getValueType(ResNo: 0);
  SDValue LHS = Cond.getOperand(i: 0);
  SDValue RHS = Cond.getOperand(i: 1);
  SDValue CC = Cond.getOperand(i: 2);

  SDValue True = N->getOperand(Num: 1);
  SDValue False = N->getOperand(Num: 2);

  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
    SelectionDAG &DAG = DCI.DAG;
    if (DAG.isConstantValueOfAnyType(N: True) &&
        !DAG.isConstantValueOfAnyType(N: False)) {
      // Swap cmp + select pair to move constant to false input.
      // This will allow using VOPC cndmasks more often.
      // select (setcc x, y), k, x -> select (setccinv x, y), x, k

      SDLoc SL(N);
      ISD::CondCode NewCC =
          getSetCCInverse(Operation: cast<CondCodeSDNode>(Val&: CC)->get(), Type: LHS.getValueType());

      SDValue NewCond = DAG.getSetCC(DL: SL, VT: Cond.getValueType(), LHS, RHS, Cond: NewCC);
      return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NewCond, N2: False, N3: True);
    }

    if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
      SDValue MinMax
        = combineFMinMaxLegacy(DL: SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
      // Revisit this node so we can catch min3/max3/med3 patterns.
      //DCI.AddToWorklist(MinMax.getNode());
      return MinMax;
    }
  }

  // There's no reason to not do this if the condition has other uses.
  return performCtlz_CttzCombine(SL: SDLoc(N), Cond, LHS: True, RHS: False, DCI);
}
5154
5155static bool isInv2Pi(const APFloat &APF) {
5156 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
5157 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
5158 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
5159
5160 return APF.bitwiseIsEqual(RHS: KF16) ||
5161 APF.bitwiseIsEqual(RHS: KF32) ||
5162 APF.bitwiseIsEqual(RHS: KF64);
5163}
5164
5165// 0 and 1.0 / (0.5 * pi) do not have inline immmediates, so there is an
5166// additional cost to negate them.
5167TargetLowering::NegatibleCost
5168AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
5169 if (C->isZero())
5170 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5171
5172 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(APF: C->getValueAPF()))
5173 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5174
5175 return NegatibleCost::Neutral;
5176}
5177
5178bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
5179 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
5180 return getConstantNegateCost(C) == NegatibleCost::Expensive;
5181 return false;
5182}
5183
5184bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
5185 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
5186 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
5187 return false;
5188}
5189
5190static unsigned inverseMinMax(unsigned Opc) {
5191 switch (Opc) {
5192 case ISD::FMAXNUM:
5193 return ISD::FMINNUM;
5194 case ISD::FMINNUM:
5195 return ISD::FMAXNUM;
5196 case ISD::FMAXNUM_IEEE:
5197 return ISD::FMINNUM_IEEE;
5198 case ISD::FMINNUM_IEEE:
5199 return ISD::FMAXNUM_IEEE;
5200 case ISD::FMAXIMUM:
5201 return ISD::FMINIMUM;
5202 case ISD::FMINIMUM:
5203 return ISD::FMAXIMUM;
5204 case ISD::FMAXIMUMNUM:
5205 return ISD::FMINIMUMNUM;
5206 case ISD::FMINIMUMNUM:
5207 return ISD::FMAXIMUMNUM;
5208 case AMDGPUISD::FMAX_LEGACY:
5209 return AMDGPUISD::FMIN_LEGACY;
5210 case AMDGPUISD::FMIN_LEGACY:
5211 return AMDGPUISD::FMAX_LEGACY;
5212 default:
5213 llvm_unreachable("invalid min/max opcode");
5214 }
5215}
5216
5217/// \return true if it's profitable to try to push an fneg into its source
5218/// instruction.
5219bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
5220 // If the input has multiple uses and we can either fold the negate down, or
5221 // the other uses cannot, give up. This both prevents unprofitable
5222 // transformations and infinite loops: we won't repeatedly try to fold around
5223 // a negate that has no 'good' form.
5224 if (N0.hasOneUse()) {
5225 // This may be able to fold into the source, but at a code size cost. Don't
5226 // fold if the fold into the user is free.
5227 if (allUsesHaveSourceMods(N, CostThreshold: 0))
5228 return false;
5229 } else {
5230 if (fnegFoldsIntoOp(N: N0.getNode()) &&
5231 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N: N0.getNode())))
5232 return false;
5233 }
5234
5235 return true;
5236}
5237
// Combine (fneg (op ...)) by pushing the negation into the operands of the
// source operation when shouldFoldFNegIntoSrc deems it profitable. Each
// handled opcode re-emits the source with negated operand(s) (or the inverted
// opcode for min/max) so the negations can later match as source modifiers.
SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(Num: 0);
  EVT VT = N->getValueType(ResNo: 0);

  unsigned Opc = N0.getOpcode();

  if (!shouldFoldFNegIntoSrc(N, N0))
    return SDValue();

  SDLoc SL(N);
  switch (Opc) {
  case ISD::FADD: {
    // Distributing the negation over an add is only safe when signed zeros
    // may be ignored.
    if (!mayIgnoreSignedZero(Op: N0) && !N->getFlags().hasNoSignedZeros())
      return SDValue();

    // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
    SDValue LHS = N0.getOperand(i: 0);
    SDValue RHS = N0.getOperand(i: 1);

    // Strip an existing fneg on an operand instead of stacking a second one.
    if (LHS.getOpcode() != ISD::FNEG)
      LHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS);
    else
      LHS = LHS.getOperand(i: 0);

    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
    else
      RHS = RHS.getOperand(i: 0);

    SDValue Res = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: LHS, N2: RHS, Flags: N0->getFlags());
    if (Res.getOpcode() != ISD::FADD)
      return SDValue(); // Op got folded away.
    // Other users of N0 still need its original (un-negated) value; hand
    // them a re-negated copy of the new node.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
    return Res;
  }
  case ISD::FMUL:
  case AMDGPUISD::FMUL_LEGACY: {
    // (fneg (fmul x, y)) -> (fmul x, (fneg y))
    // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
    // Negating a single operand suffices for a multiply; prefer stripping an
    // existing fneg over creating a new one.
    SDValue LHS = N0.getOperand(i: 0);
    SDValue RHS = N0.getOperand(i: 1);

    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(i: 0);
    else if (RHS.getOpcode() == ISD::FNEG)
      RHS = RHS.getOperand(i: 0);
    else
      RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);

    SDValue Res = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: LHS, N2: RHS, Flags: N0->getFlags());
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
    return Res;
  }
  case ISD::FMA:
  case ISD::FMAD: {
    // TODO: handle llvm.amdgcn.fma.legacy
    // The add half of the fma requires the same signed-zero freedom as FADD.
    if (!mayIgnoreSignedZero(Op: N0) && !N->getFlags().hasNoSignedZeros())
      return SDValue();

    // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
    SDValue LHS = N0.getOperand(i: 0);
    SDValue MHS = N0.getOperand(i: 1);
    SDValue RHS = N0.getOperand(i: 2);

    // Negate one multiply operand (stripping an existing fneg if possible)...
    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(i: 0);
    else if (MHS.getOpcode() == ISD::FNEG)
      MHS = MHS.getOperand(i: 0);
    else
      MHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: MHS);

    // ...and always negate the addend.
    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
    else
      RHS = RHS.getOperand(i: 0);

    SDValue Res = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: LHS, N2: MHS, N3: RHS);
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
    return Res;
  }
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMIN_LEGACY: {
    // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
    // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
    // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
    // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)

    SDValue LHS = N0.getOperand(i: 0);
    SDValue RHS = N0.getOperand(i: 1);

    // 0 doesn't have a negated inline immediate.
    // TODO: This constant check should be generalized to other operations.
    if (isConstantCostlierToNegate(N: RHS))
      return SDValue();

    SDValue NegLHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS);
    SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
    unsigned Opposite = inverseMinMax(Opc);

    SDValue Res = DAG.getNode(Opcode: Opposite, DL: SL, VT, N1: NegLHS, N2: NegRHS, Flags: N0->getFlags());
    if (Res.getOpcode() != Opposite)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
    return Res;
  }
  case AMDGPUISD::FMED3: {
    // fneg (fmed3 x, y, z) -> fmed3 (fneg x), (fneg y), (fneg z)
    SDValue Ops[3];
    for (unsigned I = 0; I < 3; ++I)
      Ops[I] = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: N0->getOperand(Num: I), Flags: N0->getFlags());

    SDValue Res = DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT, Ops, Flags: N0->getFlags());
    if (Res.getOpcode() != AMDGPUISD::FMED3)
      return SDValue(); // Op got folded away.

    if (!N0.hasOneUse()) {
      // Re-queue the new fneg's users so they get revisited with the
      // replacement in place.
      SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res);
      DAG.ReplaceAllUsesWith(From: N0, To: Neg);

      for (SDNode *U : Neg->users())
        DCI.AddToWorklist(N: U);
    }

    return Res;
  }
  case ISD::FP_EXTEND:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT: // XXX - Should fround be handled?
  case ISD::FROUNDEVEN:
  case ISD::FSIN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW: {
    SDValue CvtSrc = N0.getOperand(i: 0);
    if (CvtSrc.getOpcode() == ISD::FNEG) {
      // (fneg (fp_extend (fneg x))) -> (fp_extend x)
      // (fneg (rcp (fneg x))) -> (rcp x)
      return DAG.getNode(Opcode: Opc, DL: SL, VT, Operand: CvtSrc.getOperand(i: 0));
    }

    if (!N0.hasOneUse())
      return SDValue();

    // (fneg (fp_extend x)) -> (fp_extend (fneg x))
    // (fneg (rcp x)) -> (rcp (fneg x))
    SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: CvtSrc.getValueType(), Operand: CvtSrc);
    return DAG.getNode(Opcode: Opc, DL: SL, VT, Operand: Neg, Flags: N0->getFlags());
  }
  case ISD::FP_ROUND: {
    // Like the group above, but fp_round carries an extra truncation-flag
    // operand that must be preserved.
    SDValue CvtSrc = N0.getOperand(i: 0);

    if (CvtSrc.getOpcode() == ISD::FNEG) {
      // (fneg (fp_round (fneg x))) -> (fp_round x)
      return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT,
                         N1: CvtSrc.getOperand(i: 0), N2: N0.getOperand(i: 1));
    }

    if (!N0.hasOneUse())
      return SDValue();

    // (fneg (fp_round x)) -> (fp_round (fneg x))
    SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: CvtSrc.getValueType(), Operand: CvtSrc);
    return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Neg, N2: N0.getOperand(i: 1));
  }
  case ISD::FP16_TO_FP: {
    // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
    // f16, but legalization of f16 fneg ends up pulling it out of the source.
    // Put the fneg back as a legal source operation that can be matched later.
    SDLoc SL(N);

    SDValue Src = N0.getOperand(i: 0);
    EVT SrcVT = Src.getValueType();

    // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
    SDValue IntFNeg = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: SrcVT, N1: Src,
                                  N2: DAG.getConstant(Val: 0x8000, DL: SL, VT: SrcVT));
    return DAG.getNode(Opcode: ISD::FP16_TO_FP, DL: SL, VT: N->getValueType(ResNo: 0), Operand: IntFNeg);
  }
  case ISD::SELECT: {
    // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
    // TODO: Invert conditions of foldFreeOpFromSelect
    return SDValue();
  }
  case ISD::BITCAST: {
    SDLoc SL(N);
    SDValue BCSrc = N0.getOperand(i: 0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      // Only the element holding the sign bit (the last build_vector operand)
      // needs to be negated, and only if the fneg can fold into it.
      SDValue HighBits = BCSrc.getOperand(i: BCSrc.getNumOperands() - 1);
      if (HighBits.getValueType().getSizeInBits() != 32 ||
          !fnegFoldsIntoOp(N: HighBits.getNode()))
        return SDValue();

      // f64 fneg only really needs to operate on the high half of the
      // register, so try to force it to an f32 operation to help make use of
      // source modifiers.
      //
      //
      // fneg (f64 (bitcast (build_vector x, y))) ->
      // f64 (bitcast (build_vector (bitcast i32:x to f32),
      //                            (fneg (bitcast i32:y to f32)))

      SDValue CastHi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: HighBits);
      SDValue NegHi = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: CastHi);
      SDValue CastBack =
          DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: HighBits.getValueType(), Operand: NegHi);

      SmallVector<SDValue, 8> Ops(BCSrc->ops());
      Ops.back() = CastBack;
      DCI.AddToWorklist(N: NegHi.getNode());
      SDValue Build =
          DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: BCSrc.getValueType(), Ops);
      SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Build);

      if (!N0.hasOneUse())
        DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Result));
      return Result;
    }

    if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
        BCSrc.hasOneUse()) {
      // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
      //   select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)

      // TODO: Cast back result for multiple uses is beneficial in some cases.

      SDValue LHS =
          DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: BCSrc.getOperand(i: 1));
      SDValue RHS =
          DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: BCSrc.getOperand(i: 2));

      SDValue NegLHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: LHS);
      SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: RHS);

      return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f32, N1: BCSrc.getOperand(i: 0), N2: NegLHS,
                         N3: NegRHS);
    }

    return SDValue();
  }
  default:
    return SDValue();
  }
}
5501
5502SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5503 DAGCombinerInfo &DCI) const {
5504 SelectionDAG &DAG = DCI.DAG;
5505 SDValue N0 = N->getOperand(Num: 0);
5506
5507 if (!N0.hasOneUse())
5508 return SDValue();
5509
5510 switch (N0.getOpcode()) {
5511 case ISD::FP16_TO_FP: {
5512 assert(!isTypeLegal(MVT::f16) && "should only see if f16 is illegal");
5513 SDLoc SL(N);
5514 SDValue Src = N0.getOperand(i: 0);
5515 EVT SrcVT = Src.getValueType();
5516
5517 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5518 SDValue IntFAbs = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SrcVT, N1: Src,
5519 N2: DAG.getConstant(Val: 0x7fff, DL: SL, VT: SrcVT));
5520 return DAG.getNode(Opcode: ISD::FP16_TO_FP, DL: SL, VT: N->getValueType(ResNo: 0), Operand: IntFAbs);
5521 }
5522 default:
5523 return SDValue();
5524 }
5525}
5526
5527SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5528 DAGCombinerInfo &DCI) const {
5529 const auto *CFP = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: 0));
5530 if (!CFP)
5531 return SDValue();
5532
5533 // XXX - Should this flush denormals?
5534 const APFloat &Val = CFP->getValueAPF();
5535 APFloat One(Val.getSemantics(), "1.0");
5536 return DCI.DAG.getConstantFP(Val: One / Val, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
5537}
5538
5539bool AMDGPUTargetLowering::isInt64ImmLegal(SDNode *N, SelectionDAG &DAG) const {
5540 if (!Subtarget->isGCN())
5541 return false;
5542
5543 ConstantSDNode *SDConstant = dyn_cast<ConstantSDNode>(Val: N);
5544 ConstantFPSDNode *SDFPConstant = dyn_cast<ConstantFPSDNode>(Val: N);
5545 auto &ST = DAG.getSubtarget<GCNSubtarget>();
5546 const auto *TII = ST.getInstrInfo();
5547
5548 if (!ST.hasMovB64() || (!SDConstant && !SDFPConstant))
5549 return false;
5550
5551 if (ST.has64BitLiterals())
5552 return true;
5553
5554 if (SDConstant) {
5555 const APInt &APVal = SDConstant->getAPIntValue();
5556 return isUInt<32>(x: APVal.getZExtValue()) || TII->isInlineConstant(Imm: APVal);
5557 }
5558
5559 APInt Val = SDFPConstant->getValueAPF().bitcastToAPInt();
5560 return isUInt<32>(x: Val.getZExtValue()) || TII->isInlineConstant(Imm: Val);
5561}
5562
// Target hook: dispatch AMDGPU-specific DAG combines by opcode. Returns the
// replacement value, or an empty SDValue when no combine applies.
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  switch(N->getOpcode()) {
  default:
    break;
  case ISD::BITCAST: {
    EVT DestVT = N->getValueType(ResNo: 0);

    // Push casts through vector builds. This helps avoid emitting a large
    // number of copies when materializing floating point vector constants.
    //
    // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
    //  vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
    if (DestVT.isVector()) {
      SDValue Src = N->getOperand(Num: 0);
      if (Src.getOpcode() == ISD::BUILD_VECTOR &&
          (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
           isOperationLegal(Op: ISD::BUILD_VECTOR, VT: DestVT))) {
        EVT SrcVT = Src.getValueType();
        unsigned NElts = DestVT.getVectorNumElements();

        // Only handle same-element-count casts; each element is bitcast 1:1.
        if (SrcVT.getVectorNumElements() == NElts) {
          EVT DestEltVT = DestVT.getVectorElementType();

          SmallVector<SDValue, 8> CastedElts;
          SDLoc SL(N);
          for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
            SDValue Elt = Src.getOperand(i: I);
            CastedElts.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: DestEltVT, Operand: Elt));
          }

          return DAG.getBuildVector(VT: DestVT, DL: SL, Ops: CastedElts);
        }
      }
    }

    if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
      break;

    // Fold bitcasts of constants.
    //
    // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
    // TODO: Generalize and move to DAGCombiner
    SDValue Src = N->getOperand(Num: 0);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Src)) {
      SDLoc SL(N);
      // Don't split a constant the target can materialize directly.
      if (isInt64ImmLegal(N: C, DAG))
        break;
      uint64_t CVal = C->getZExtValue();
      SDValue BV = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32,
                               N1: DAG.getConstant(Val: Lo_32(Value: CVal), DL: SL, VT: MVT::i32),
                               N2: DAG.getConstant(Val: Hi_32(Value: CVal), DL: SL, VT: MVT::i32));
      return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: DestVT, Operand: BV);
    }

    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Src)) {
      // Same as above for FP constants, splitting the raw bit pattern.
      const APInt &Val = C->getValueAPF().bitcastToAPInt();
      SDLoc SL(N);
      if (isInt64ImmLegal(N: C, DAG))
        break;
      uint64_t CVal = Val.getZExtValue();
      SDValue Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32,
                                N1: DAG.getConstant(Val: Lo_32(Value: CVal), DL: SL, VT: MVT::i32),
                                N2: DAG.getConstant(Val: Hi_32(Value: CVal), DL: SL, VT: MVT::i32));

      return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: DestVT, Operand: Vec);
    }

    break;
  }
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL: {
    // Range metadata can be invalidated when loads are converted to legal types
    // (e.g. v2i64 -> v4i32).
    // Try to convert vector shl/sra/srl before type legalization so that range
    // metadata can be utilized.
    if (!(N->getValueType(ResNo: 0).isVector() &&
          DCI.getDAGCombineLevel() == BeforeLegalizeTypes) &&
        DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;
    if (N->getOpcode() == ISD::SHL)
      return performShlCombine(N, DCI);
    if (N->getOpcode() == ISD::SRA)
      return performSraCombine(N, DCI);
    return performSrlCombine(N, DCI);
  }
  case ISD::TRUNCATE:
    return performTruncateCombine(N, DCI);
  case ISD::MUL:
    return performMulCombine(N, DCI);
  case AMDGPUISD::MUL_U24:
  case AMDGPUISD::MUL_I24: {
    if (SDValue Simplified = simplifyMul24(Node24: N, DCI))
      return Simplified;
    break;
  }
  case AMDGPUISD::MULHI_I24:
  case AMDGPUISD::MULHI_U24:
    return simplifyMul24(Node24: N, DCI);
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return performMulLoHiCombine(N, DCI);
  case ISD::MULHS:
    return performMulhsCombine(N, DCI);
  case ISD::MULHU:
    return performMulhuCombine(N, DCI);
  case ISD::SELECT:
    return performSelectCombine(N, DCI);
  case ISD::FNEG:
    return performFNegCombine(N, DCI);
  case ISD::FABS:
    return performFAbsCombine(N, DCI);
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    assert(!N->getValueType(0).isVector() &&
           "Vector handling of BFE not implemented");
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 2));
    if (!Width)
      break;

    // Width and offset operands are taken modulo 32 (only 5 bits are used).
    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
    if (WidthVal == 0)
      return DAG.getConstant(Val: 0, DL, VT: MVT::i32);

    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
    if (!Offset)
      break;

    SDValue BitsFrom = N->getOperand(Num: 0);
    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;

    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;

    if (OffsetVal == 0) {
      // This is already sign / zero extended, so try to fold away extra BFEs.
      unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);

      unsigned OpSignBits = DAG.ComputeNumSignBits(Op: BitsFrom);
      if (OpSignBits >= SignBits)
        return BitsFrom;

      EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WidthVal);
      if (Signed) {
        // This is a sign_extend_inreg. Replace it to take advantage of existing
        // DAG Combines. If not eliminated, we will match back to BFE during
        // selection.

        // TODO: The sext_inreg of extended types ends, although we could
        // handle them in a single BFE.
        return DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: MVT::i32, N1: BitsFrom,
                           N2: DAG.getValueType(SmallVT));
      }

      return DAG.getZeroExtendInReg(Op: BitsFrom, DL, VT: SmallVT);
    }

    // Fully constant input: evaluate the bitfield extract at compile time.
    if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(Val&: BitsFrom)) {
      if (Signed) {
        return constantFoldBFE<int32_t>(DAG,
                                        Src0: CVal->getSExtValue(),
                                        Offset: OffsetVal,
                                        Width: WidthVal,
                                        DL);
      }

      return constantFoldBFE<uint32_t>(DAG,
                                       Src0: CVal->getZExtValue(),
                                       Offset: OffsetVal,
                                       Width: WidthVal,
                                       DL);
    }

    // Extract reaching the top bit is just a shift (unless SDWA can do the
    // 16/16 case directly).
    if ((OffsetVal + WidthVal) >= 32 &&
        !(OffsetVal == 16 && WidthVal == 16 && Subtarget->hasSDWA())) {
      SDValue ShiftVal = DAG.getConstant(Val: OffsetVal, DL, VT: MVT::i32);
      return DAG.getNode(Opcode: Signed ? ISD::SRA : ISD::SRL, DL, VT: MVT::i32,
                         N1: BitsFrom, N2: ShiftVal);
    }

    // Last resort: simplify the input based on the bits the BFE demands.
    if (BitsFrom.hasOneUse()) {
      APInt Demanded = APInt::getBitsSet(numBits: 32,
                                         loBit: OffsetVal,
                                         hiBit: OffsetVal + WidthVal);

      KnownBits Known;
      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                            !DCI.isBeforeLegalizeOps());
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.ShrinkDemandedConstant(Op: BitsFrom, DemandedBits: Demanded, TLO) ||
          TLI.SimplifyDemandedBits(Op: BitsFrom, DemandedBits: Demanded, Known, TLO)) {
        DCI.CommitTargetLoweringOpt(TLO);
      }
    }

    break;
  }
  case ISD::LOAD:
    return performLoadCombine(N, DCI);
  case ISD::STORE:
    return performStoreCombine(N, DCI);
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_IFLAG:
    return performRcpCombine(N, DCI);
  case ISD::AssertZext:
  case ISD::AssertSext:
    return performAssertSZExtCombine(N, DCI);
  case ISD::INTRINSIC_WO_CHAIN:
    return performIntrinsicWOChainCombine(N, DCI);
  case AMDGPUISD::FMAD_FTZ: {
    SDValue N0 = N->getOperand(Num: 0);
    SDValue N1 = N->getOperand(Num: 1);
    SDValue N2 = N->getOperand(Num: 2);
    EVT VT = N->getValueType(ResNo: 0);

    // FMAD_FTZ is a FMAD + flush denormals to zero.
    // We flush the inputs, the intermediate step, and the output.
    ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(Val&: N0);
    ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(Val&: N1);
    ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(Val&: N2);
    if (N0CFP && N1CFP && N2CFP) {
      // Flush a denormal to a zero of the same sign.
      const auto FTZ = [](const APFloat &V) {
        if (V.isDenormal()) {
          APFloat Zero(V.getSemantics(), 0);
          return V.isNegative() ? -Zero : Zero;
        }
        return V;
      };

      APFloat V0 = FTZ(N0CFP->getValueAPF());
      APFloat V1 = FTZ(N1CFP->getValueAPF());
      APFloat V2 = FTZ(N2CFP->getValueAPF());
      V0.multiply(RHS: V1, RM: APFloat::rmNearestTiesToEven);
      V0 = FTZ(V0);
      V0.add(RHS: V2, RM: APFloat::rmNearestTiesToEven);
      return DAG.getConstantFP(Val: FTZ(V0), DL, VT);
    }
    break;
  }
  }
  return SDValue();
}
5808
5809bool AMDGPUTargetLowering::SimplifyDemandedBitsForTargetNode(
5810 SDValue Op, const APInt &OriginalDemandedBits,
5811 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
5812 unsigned Depth) const {
5813 switch (Op.getOpcode()) {
5814 case ISD::INTRINSIC_WO_CHAIN: {
5815 switch (Op.getConstantOperandVal(i: 0)) {
5816 case Intrinsic::amdgcn_readfirstlane: {
5817 if (SimplifyDemandedBits(Op: Op.getOperand(i: 1), DemandedBits: OriginalDemandedBits,
5818 DemandedElts: OriginalDemandedElts, Known, TLO, Depth: Depth + 1))
5819 return true;
5820 break;
5821 }
5822 default:
5823 break;
5824 }
5825 break;
5826 }
5827 default:
5828 break;
5829 }
5830
5831 return false;
5832}
5833
5834//===----------------------------------------------------------------------===//
5835// Helper functions
5836//===----------------------------------------------------------------------===//
5837
5838SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5839 const TargetRegisterClass *RC,
5840 Register Reg, EVT VT,
5841 const SDLoc &SL,
5842 bool RawReg) const {
5843 MachineFunction &MF = DAG.getMachineFunction();
5844 MachineRegisterInfo &MRI = MF.getRegInfo();
5845 Register VReg;
5846
5847 if (!MRI.isLiveIn(Reg)) {
5848 VReg = MRI.createVirtualRegister(RegClass: RC);
5849 MRI.addLiveIn(Reg, vreg: VReg);
5850 } else {
5851 VReg = MRI.getLiveInVirtReg(PReg: Reg);
5852 }
5853
5854 if (RawReg)
5855 return DAG.getRegister(Reg: VReg, VT);
5856
5857 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: VReg, VT);
5858}
5859
5860// This may be called multiple times, and nothing prevents creating multiple
5861// objects at the same offset. See if we already defined this object.
5862static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5863 int64_t Offset) {
5864 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5865 if (MFI.getObjectOffset(ObjectIdx: I) == Offset) {
5866 assert(MFI.getObjectSize(I) == Size);
5867 return I;
5868 }
5869 }
5870
5871 return MFI.CreateFixedObject(Size, SPOffset: Offset, IsImmutable: true);
5872}
5873
5874SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5875 EVT VT,
5876 const SDLoc &SL,
5877 int64_t Offset) const {
5878 MachineFunction &MF = DAG.getMachineFunction();
5879 MachineFrameInfo &MFI = MF.getFrameInfo();
5880 int FI = getOrCreateFixedStackObject(MFI, Size: VT.getStoreSize(), Offset);
5881
5882 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5883 SDValue Ptr = DAG.getFrameIndex(FI, VT: MVT::i32);
5884
5885 return DAG.getLoad(VT, dl: SL, Chain: DAG.getEntryNode(), Ptr, PtrInfo: SrcPtrInfo, Alignment: Align(4),
5886 MMOFlags: MachineMemOperand::MODereferenceable |
5887 MachineMemOperand::MOInvariant);
5888}
5889
5890SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5891 const SDLoc &SL,
5892 SDValue Chain,
5893 SDValue ArgVal,
5894 int64_t Offset) const {
5895 MachineFunction &MF = DAG.getMachineFunction();
5896 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5897 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5898
5899 SDValue Ptr = DAG.getConstant(Val: Offset, DL: SL, VT: MVT::i32);
5900 // Stores to the argument stack area are relative to the stack pointer.
5901 SDValue SP =
5902 DAG.getCopyFromReg(Chain, dl: SL, Reg: Info->getStackPtrOffsetReg(), VT: MVT::i32);
5903 Ptr = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: SP, N2: Ptr);
5904 SDValue Store = DAG.getStore(Chain, dl: SL, Val: ArgVal, Ptr, PtrInfo: DstInfo, Alignment: Align(4),
5905 MMOFlags: MachineMemOperand::MODereferenceable);
5906 return Store;
5907}
5908
5909SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5910 const TargetRegisterClass *RC,
5911 EVT VT, const SDLoc &SL,
5912 const ArgDescriptor &Arg) const {
5913 assert(Arg && "Attempting to load missing argument");
5914
5915 SDValue V = Arg.isRegister() ?
5916 CreateLiveInRegister(DAG, RC, Reg: Arg.getRegister(), VT, SL) :
5917 loadStackInputValue(DAG, VT, SL, Offset: Arg.getStackOffset());
5918
5919 if (!Arg.isMasked())
5920 return V;
5921
5922 unsigned Mask = Arg.getMask();
5923 unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);
5924 V = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: V,
5925 N2: DAG.getShiftAmountConstant(Val: Shift, VT, DL: SL));
5926 return DAG.getNode(Opcode: ISD::AND, DL: SL, VT, N1: V,
5927 N2: DAG.getConstant(Val: Mask >> Shift, DL: SL, VT));
5928}
5929
5930uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5931 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5932 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5933 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5934 uint64_t ArgOffset =
5935 alignTo(Size: ExplicitKernArgSize, A: Alignment) + ExplicitArgOffset;
5936 switch (Param) {
5937 case FIRST_IMPLICIT:
5938 return ArgOffset;
5939 case PRIVATE_BASE:
5940 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5941 case SHARED_BASE:
5942 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5943 case QUEUE_PTR:
5944 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5945 }
5946 llvm_unreachable("unexpected implicit parameter type");
5947}
5948
5949uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5950 const MachineFunction &MF, const ImplicitParameter Param) const {
5951 const AMDGPUMachineFunctionInfo *MFI =
5952 MF.getInfo<AMDGPUMachineFunctionInfo>();
5953 return getImplicitParameterOffset(ExplicitKernArgSize: MFI->getExplicitKernArgSize(), Param);
5954}
5955
5956SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5957 SelectionDAG &DAG, int Enabled,
5958 int &RefinementSteps,
5959 bool &UseOneConstNR,
5960 bool Reciprocal) const {
5961 EVT VT = Operand.getValueType();
5962
5963 if (VT == MVT::f32) {
5964 RefinementSteps = 0;
5965 return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc(Operand), VT, Operand);
5966 }
5967
5968 // TODO: There is also f64 rsq instruction, but the documentation is less
5969 // clear on its precision.
5970
5971 return SDValue();
5972}
5973
5974SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5975 SelectionDAG &DAG, int Enabled,
5976 int &RefinementSteps) const {
5977 EVT VT = Operand.getValueType();
5978
5979 if (VT == MVT::f32) {
5980 // Reciprocal, < 1 ulp error.
5981 //
5982 // This reciprocal approximation converges to < 0.5 ulp error with one
5983 // newton rhapson performed with two fused multiple adds (FMAs).
5984
5985 RefinementSteps = 0;
5986 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SDLoc(Operand), VT, Operand);
5987 }
5988
5989 // TODO: There is also f64 rcp instruction, but the documentation is less
5990 // clear on its precision.
5991
5992 return SDValue();
5993}
5994
5995static unsigned workitemIntrinsicDim(unsigned ID) {
5996 switch (ID) {
5997 case Intrinsic::amdgcn_workitem_id_x:
5998 return 0;
5999 case Intrinsic::amdgcn_workitem_id_y:
6000 return 1;
6001 case Intrinsic::amdgcn_workitem_id_z:
6002 return 2;
6003 default:
6004 llvm_unreachable("not a workitem intrinsic");
6005 }
6006}
6007
// Target hook: compute known-zero/known-one bits for AMDGPU-specific nodes
// (and a few workitem intrinsics).
void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, KnownBits &Known,
    const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {

  Known.resetAll(); // Don't know anything.

  unsigned Opc = Op.getOpcode();

  switch (Opc) {
  default:
    break;
  case AMDGPUISD::CARRY:
  case AMDGPUISD::BORROW: {
    // Result is 0 or 1: all bits except bit 0 are known zero.
    Known.Zero = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 31);
    break;
  }

  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
    if (!CWidth)
      return;

    // Width operand is taken modulo 32 (only 5 bits are used).
    uint32_t Width = CWidth->getZExtValue() & 0x1f;

    // Unsigned extract zero-fills above the extracted field; the signed form
    // sign-extends, so nothing is known there.
    if (Opc == AMDGPUISD::BFE_U32)
      Known.Zero = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 32 - Width);

    break;
  }
  case AMDGPUISD::FP_TO_FP16: {
    unsigned BitWidth = Known.getBitWidth();

    // High bits are zero.
    Known.Zero = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - 16);
    break;
  }
  case AMDGPUISD::MUL_U24:
  case AMDGPUISD::MUL_I24: {
    KnownBits LHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
    KnownBits RHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
    // Trailing zeros of the factors add up in the product.
    unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
                      RHSKnown.countMinTrailingZeros();
    Known.Zero.setLowBits(std::min(a: TrailZ, b: 32u));
    // Skip extra check if all bits are known zeros.
    if (TrailZ >= 32)
      break;

    // Truncate to 24 bits.
    LHSKnown = LHSKnown.trunc(BitWidth: 24);
    RHSKnown = RHSKnown.trunc(BitWidth: 24);

    if (Opc == AMDGPUISD::MUL_I24) {
      unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
      unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
      unsigned MaxValBits = LHSValBits + RHSValBits;
      if (MaxValBits > 32)
        break;
      unsigned SignBits = 32 - MaxValBits + 1;
      bool LHSNegative = LHSKnown.isNegative();
      bool LHSNonNegative = LHSKnown.isNonNegative();
      bool LHSPositive = LHSKnown.isStrictlyPositive();
      bool RHSNegative = RHSKnown.isNegative();
      bool RHSNonNegative = RHSKnown.isNonNegative();
      bool RHSPositive = RHSKnown.isStrictlyPositive();

      // Same-signedness factors give a non-negative product (sign bits zero);
      // strictly opposite signs give a negative product (sign bits one).
      if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
        Known.Zero.setHighBits(SignBits);
      else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
        Known.One.setHighBits(SignBits);
    } else {
      // Unsigned: the product needs at most LHSValBits + RHSValBits bits.
      unsigned LHSValBits = LHSKnown.countMaxActiveBits();
      unsigned RHSValBits = RHSKnown.countMaxActiveBits();
      unsigned MaxValBits = LHSValBits + RHSValBits;
      if (MaxValBits >= 32)
        break;
      Known.Zero.setBitsFrom(MaxValBits);
    }
    break;
  }
  case AMDGPUISD::PERM: {
    ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
    if (!CMask)
      return;

    KnownBits LHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
    KnownBits RHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
    unsigned Sel = CMask->getZExtValue();

    // Each selector byte picks one result byte: 0-3 select a byte of
    // operand 1, 4-6 select a byte of operand 0, 0x0c yields 0x00, and
    // values above 0x0c yield 0xff.
    for (unsigned I = 0; I < 32; I += 8) {
      unsigned SelBits = Sel & 0xff;
      if (SelBits < 4) {
        SelBits *= 8;
        Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
        Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
      } else if (SelBits < 7) {
        SelBits = (SelBits & 3) * 8;
        Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
        Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
      } else if (SelBits == 0x0c) {
        Known.Zero |= 0xFFull << I;
      } else if (SelBits > 0x0c) {
        Known.One |= 0xFFull << I;
      }
      Sel >>= 8;
    }
    break;
  }
  case AMDGPUISD::BUFFER_LOAD_UBYTE:  {
    // Zero-extended byte load.
    Known.Zero.setHighBits(24);
    break;
  }
  case AMDGPUISD::BUFFER_LOAD_USHORT: {
    // Zero-extended short load.
    Known.Zero.setHighBits(16);
    break;
  }
  case AMDGPUISD::LDS: {
    auto *GA = cast<GlobalAddressSDNode>(Val: Op.getOperand(i: 0).getNode());
    Align Alignment = GA->getGlobal()->getPointerAlignment(DL: DAG.getDataLayout());

    // High 16 bits are zero, and the low bits follow the global's alignment.
    Known.Zero.setHighBits(16);
    Known.Zero.setLowBits(Log2(A: Alignment));
    break;
  }
  case AMDGPUISD::SMIN3:
  case AMDGPUISD::SMAX3:
  case AMDGPUISD::SMED3:
  case AMDGPUISD::UMIN3:
  case AMDGPUISD::UMAX3:
  case AMDGPUISD::UMED3: {
    // The result is one of the three operands, so only bits known in common
    // across all of them are known in the result. Query in reverse operand
    // order, bailing early on a fully-unknown operand.
    KnownBits Known2 = DAG.computeKnownBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
    if (Known2.isUnknown())
      break;

    KnownBits Known1 = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
    if (Known1.isUnknown())
      break;

    KnownBits Known0 = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
    if (Known0.isUnknown())
      break;

    // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
    Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
    Known.One = Known0.One & Known1.One & Known2.One;
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = Op.getConstantOperandVal(i: 0);
    switch (IID) {
    case Intrinsic::amdgcn_workitem_id_x:
    case Intrinsic::amdgcn_workitem_id_y:
    case Intrinsic::amdgcn_workitem_id_z: {
      // Workitem IDs are bounded by the subtarget's maximum for the dimension.
      unsigned MaxValue = Subtarget->getMaxWorkitemID(
          Kernel: DAG.getMachineFunction().getFunction(), Dimension: workitemIntrinsicDim(ID: IID));
      Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue));
      break;
    }
    default:
      break;
    }
  }
  }
}
6172
6173unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
6174 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6175 unsigned Depth) const {
6176 switch (Op.getOpcode()) {
6177 case AMDGPUISD::BFE_I32: {
6178 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
6179 if (!Width)
6180 return 1;
6181
6182 unsigned SignBits = 32 - (Width->getZExtValue() & 0x1f) + 1;
6183 if (!isNullConstant(V: Op.getOperand(i: 1)))
6184 return SignBits;
6185
6186 // TODO: Could probably figure something out with non-0 offsets.
6187 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
6188 return std::max(a: SignBits, b: Op0SignBits);
6189 }
6190
6191 case AMDGPUISD::BFE_U32: {
6192 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
6193 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
6194 }
6195
6196 case AMDGPUISD::CARRY:
6197 case AMDGPUISD::BORROW:
6198 return 31;
6199 case AMDGPUISD::BUFFER_LOAD_BYTE:
6200 return 25;
6201 case AMDGPUISD::BUFFER_LOAD_SHORT:
6202 return 17;
6203 case AMDGPUISD::BUFFER_LOAD_UBYTE:
6204 return 24;
6205 case AMDGPUISD::BUFFER_LOAD_USHORT:
6206 return 16;
6207 case AMDGPUISD::FP_TO_FP16:
6208 return 16;
6209 case AMDGPUISD::SMIN3:
6210 case AMDGPUISD::SMAX3:
6211 case AMDGPUISD::SMED3:
6212 case AMDGPUISD::UMIN3:
6213 case AMDGPUISD::UMAX3:
6214 case AMDGPUISD::UMED3: {
6215 unsigned Tmp2 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
6216 if (Tmp2 == 1)
6217 return 1; // Early out.
6218
6219 unsigned Tmp1 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
6220 if (Tmp1 == 1)
6221 return 1; // Early out.
6222
6223 unsigned Tmp0 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
6224 if (Tmp0 == 1)
6225 return 1; // Early out.
6226
6227 return std::min(l: {Tmp0, Tmp1, Tmp2});
6228 }
6229 default:
6230 return 1;
6231 }
6232}
6233
6234unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
6235 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
6236 const MachineRegisterInfo &MRI, unsigned Depth) const {
6237 const MachineInstr *MI = MRI.getVRegDef(Reg: R);
6238 if (!MI)
6239 return 1;
6240
6241 // TODO: Check range metadata on MMO.
6242 switch (MI->getOpcode()) {
6243 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
6244 return 25;
6245 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
6246 return 17;
6247 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
6248 return 24;
6249 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
6250 return 16;
6251 case AMDGPU::G_AMDGPU_SMED3:
6252 case AMDGPU::G_AMDGPU_UMED3: {
6253 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
6254 unsigned Tmp2 = Analysis.computeNumSignBits(R: Src2, DemandedElts, Depth: Depth + 1);
6255 if (Tmp2 == 1)
6256 return 1;
6257 unsigned Tmp1 = Analysis.computeNumSignBits(R: Src1, DemandedElts, Depth: Depth + 1);
6258 if (Tmp1 == 1)
6259 return 1;
6260 unsigned Tmp0 = Analysis.computeNumSignBits(R: Src0, DemandedElts, Depth: Depth + 1);
6261 if (Tmp0 == 1)
6262 return 1;
6263 return std::min(l: {Tmp0, Tmp1, Tmp2});
6264 }
6265 default:
6266 return 1;
6267 }
6268}
6269
6270bool AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode(
6271 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6272 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
6273 unsigned Opcode = Op.getOpcode();
6274 switch (Opcode) {
6275 case AMDGPUISD::BFE_I32:
6276 case AMDGPUISD::BFE_U32:
6277 return false;
6278 }
6279 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
6280 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
6281}
6282
6283bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
6284 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
6285 unsigned Depth) const {
6286 unsigned Opcode = Op.getOpcode();
6287 switch (Opcode) {
6288 case AMDGPUISD::FMIN_LEGACY:
6289 case AMDGPUISD::FMAX_LEGACY: {
6290 if (SNaN)
6291 return true;
6292
6293 // TODO: Can check no nans on one of the operands for each one, but which
6294 // one?
6295 return false;
6296 }
6297 case AMDGPUISD::FMUL_LEGACY:
6298 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
6299 if (SNaN)
6300 return true;
6301 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1) &&
6302 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1);
6303 }
6304 case AMDGPUISD::FMED3:
6305 case AMDGPUISD::FMIN3:
6306 case AMDGPUISD::FMAX3:
6307 case AMDGPUISD::FMINIMUM3:
6308 case AMDGPUISD::FMAXIMUM3:
6309 case AMDGPUISD::FMAD_FTZ: {
6310 if (SNaN)
6311 return true;
6312 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1) &&
6313 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
6314 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1);
6315 }
6316 case AMDGPUISD::CVT_F32_UBYTE0:
6317 case AMDGPUISD::CVT_F32_UBYTE1:
6318 case AMDGPUISD::CVT_F32_UBYTE2:
6319 case AMDGPUISD::CVT_F32_UBYTE3:
6320 return true;
6321
6322 case AMDGPUISD::RCP:
6323 case AMDGPUISD::RSQ:
6324 case AMDGPUISD::RCP_LEGACY:
6325 case AMDGPUISD::RSQ_CLAMP: {
6326 if (SNaN)
6327 return true;
6328
6329 // TODO: Need is known positive check.
6330 return false;
6331 }
6332 case ISD::FLDEXP:
6333 case AMDGPUISD::FRACT: {
6334 if (SNaN)
6335 return true;
6336 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1);
6337 }
6338 case AMDGPUISD::DIV_SCALE:
6339 case AMDGPUISD::DIV_FMAS:
6340 case AMDGPUISD::DIV_FIXUP:
6341 // TODO: Refine on operands.
6342 return SNaN;
6343 case AMDGPUISD::SIN_HW:
6344 case AMDGPUISD::COS_HW: {
6345 // TODO: Need check for infinity
6346 return SNaN;
6347 }
6348 case ISD::INTRINSIC_WO_CHAIN: {
6349 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
6350 // TODO: Handle more intrinsics
6351 switch (IntrinsicID) {
6352 case Intrinsic::amdgcn_cubeid:
6353 case Intrinsic::amdgcn_cvt_off_f32_i4:
6354 return true;
6355
6356 case Intrinsic::amdgcn_frexp_mant: {
6357 if (SNaN)
6358 return true;
6359 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1);
6360 }
6361 case Intrinsic::amdgcn_cvt_pkrtz: {
6362 if (SNaN)
6363 return true;
6364 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
6365 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1);
6366 }
6367 case Intrinsic::amdgcn_rcp:
6368 case Intrinsic::amdgcn_rsq:
6369 case Intrinsic::amdgcn_rcp_legacy:
6370 case Intrinsic::amdgcn_rsq_legacy:
6371 case Intrinsic::amdgcn_rsq_clamp:
6372 case Intrinsic::amdgcn_tanh: {
6373 if (SNaN)
6374 return true;
6375
6376 // TODO: Need is known positive check.
6377 return false;
6378 }
6379 case Intrinsic::amdgcn_trig_preop:
6380 case Intrinsic::amdgcn_fdot2:
6381 // TODO: Refine on operand
6382 return SNaN;
6383 case Intrinsic::amdgcn_fma_legacy:
6384 if (SNaN)
6385 return true;
6386 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
6387 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1) &&
6388 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 3), SNaN, Depth: Depth + 1);
6389 default:
6390 return false;
6391 }
6392 }
6393 default:
6394 return false;
6395 }
6396}
6397
6398bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
6399 Register N0, Register N1) const {
6400 return MRI.hasOneNonDBGUse(RegNo: N0); // FIXME: handle regbanks
6401}
6402