1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUMachineFunctionInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUSelectionDAGInfo.h"
21#include "SIMachineFunctionInfo.h"
22#include "llvm/CodeGen/Analysis.h"
23#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
24#include "llvm/CodeGen/MachineFrameInfo.h"
25#include "llvm/IR/DiagnosticInfo.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27#include "llvm/Support/CommandLine.h"
28#include "llvm/Support/KnownBits.h"
29#include "llvm/Target/TargetMachine.h"
30
31using namespace llvm;
32
33#include "AMDGPUGenCallingConv.inc"
34
35static cl::opt<bool> AMDGPUBypassSlowDiv(
36 "amdgpu-bypass-slow-div",
37 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
38 cl::init(Val: true));
39
40// Find a larger type to do a load / store of a vector with.
41EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
42 unsigned StoreSize = VT.getStoreSizeInBits();
43 if (StoreSize <= 32)
44 return EVT::getIntegerVT(Context&: Ctx, BitWidth: StoreSize);
45
46 if (StoreSize % 32 == 0)
47 return EVT::getVectorVT(Context&: Ctx, VT: MVT::i32, NumElements: StoreSize / 32);
48
49 return VT;
50}
51
52unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
53 return DAG.computeKnownBits(Op).countMaxActiveBits();
54}
55
56unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
57 // In order for this to be a signed 24-bit value, bit 23, must
58 // be a sign bit.
59 return DAG.ComputeMaxSignificantBits(Op);
60}
61
62AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
63 const TargetSubtargetInfo &STI,
64 const AMDGPUSubtarget &AMDGPUSTI)
65 : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
66 // Always lower memset, memcpy, and memmove intrinsics to load/store
67 // instructions, rather then generating calls to memset, mempcy or memmove.
68 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
69 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
70 MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;
71
72 // Enable ganging up loads and stores in the memcpy DAG lowering.
73 MaxGluedStoresPerMemcpy = 16;
74
75 // Lower floating point store/load to integer store/load to reduce the number
76 // of patterns in tablegen.
77 setOperationAction(Op: ISD::LOAD, VT: MVT::f32, Action: Promote);
78 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f32, DestVT: MVT::i32);
79
80 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f32, Action: Promote);
81 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
82
83 setOperationAction(Op: ISD::LOAD, VT: MVT::v3f32, Action: Promote);
84 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
85
86 setOperationAction(Op: ISD::LOAD, VT: MVT::v4f32, Action: Promote);
87 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
88
89 setOperationAction(Op: ISD::LOAD, VT: MVT::v5f32, Action: Promote);
90 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
91
92 setOperationAction(Op: ISD::LOAD, VT: MVT::v6f32, Action: Promote);
93 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
94
95 setOperationAction(Op: ISD::LOAD, VT: MVT::v7f32, Action: Promote);
96 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
97
98 setOperationAction(Op: ISD::LOAD, VT: MVT::v8f32, Action: Promote);
99 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f32, DestVT: MVT::v8i32);
100
101 setOperationAction(Op: ISD::LOAD, VT: MVT::v9f32, Action: Promote);
102 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
103
104 setOperationAction(Op: ISD::LOAD, VT: MVT::v10f32, Action: Promote);
105 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
106
107 setOperationAction(Op: ISD::LOAD, VT: MVT::v11f32, Action: Promote);
108 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
109
110 setOperationAction(Op: ISD::LOAD, VT: MVT::v12f32, Action: Promote);
111 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
112
113 setOperationAction(Op: ISD::LOAD, VT: MVT::v16f32, Action: Promote);
114 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f32, DestVT: MVT::v16i32);
115
116 setOperationAction(Op: ISD::LOAD, VT: MVT::v32f32, Action: Promote);
117 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32f32, DestVT: MVT::v32i32);
118
119 setOperationAction(Op: ISD::LOAD, VT: MVT::i64, Action: Promote);
120 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::i64, DestVT: MVT::v2i32);
121
122 setOperationAction(Op: ISD::LOAD, VT: MVT::v2i64, Action: Promote);
123 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2i64, DestVT: MVT::v4i32);
124
125 setOperationAction(Op: ISD::LOAD, VT: MVT::f64, Action: Promote);
126 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f64, DestVT: MVT::v2i32);
127
128 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f64, Action: Promote);
129 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f64, DestVT: MVT::v4i32);
130
131 setOperationAction(Op: ISD::LOAD, VT: MVT::v3i64, Action: Promote);
132 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3i64, DestVT: MVT::v6i32);
133
134 setOperationAction(Op: ISD::LOAD, VT: MVT::v4i64, Action: Promote);
135 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4i64, DestVT: MVT::v8i32);
136
137 setOperationAction(Op: ISD::LOAD, VT: MVT::v3f64, Action: Promote);
138 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3f64, DestVT: MVT::v6i32);
139
140 setOperationAction(Op: ISD::LOAD, VT: MVT::v4f64, Action: Promote);
141 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f64, DestVT: MVT::v8i32);
142
143 setOperationAction(Op: ISD::LOAD, VT: MVT::v8i64, Action: Promote);
144 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8i64, DestVT: MVT::v16i32);
145
146 setOperationAction(Op: ISD::LOAD, VT: MVT::v8f64, Action: Promote);
147 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f64, DestVT: MVT::v16i32);
148
149 setOperationAction(Op: ISD::LOAD, VT: MVT::v16i64, Action: Promote);
150 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16i64, DestVT: MVT::v32i32);
151
152 setOperationAction(Op: ISD::LOAD, VT: MVT::v16f64, Action: Promote);
153 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f64, DestVT: MVT::v32i32);
154
155 setOperationAction(Op: ISD::LOAD, VT: MVT::i128, Action: Promote);
156 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::i128, DestVT: MVT::v4i32);
157
158 // TODO: Would be better to consume as directly legal
159 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f32, Action: Promote);
160 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f32, DestVT: MVT::i32);
161
162 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f64, Action: Promote);
163 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f64, DestVT: MVT::i64);
164
165 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f16, Action: Promote);
166 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f16, DestVT: MVT::i16);
167
168 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::bf16, Action: Promote);
169 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::bf16, DestVT: MVT::i16);
170
171 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::v2f32, Action: Promote);
172 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::v2f32, DestVT: MVT::i64);
173
174 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f32, Action: Promote);
175 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f32, DestVT: MVT::i32);
176
177 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f64, Action: Promote);
178 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f64, DestVT: MVT::i64);
179
180 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f16, Action: Promote);
181 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f16, DestVT: MVT::i16);
182
183 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::bf16, Action: Promote);
184 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::bf16, DestVT: MVT::i16);
185
186 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::v2f32, Action: Promote);
187 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::v2f32, DestVT: MVT::i64);
188
189 // There are no 64-bit extloads. These should be done as a 32-bit extload and
190 // an extension to 64-bit.
191 for (MVT VT : MVT::integer_valuetypes())
192 setLoadExtAction(ExtTypes: {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, ValVT: MVT::i64, MemVT: VT,
193 Action: Expand);
194
195 for (MVT VT : MVT::integer_valuetypes()) {
196 if (VT == MVT::i64)
197 continue;
198
199 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
200 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i1, Action: Promote);
201 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i8, Action: Legal);
202 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i16, Action: Legal);
203 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i32, Action: Expand);
204 }
205 }
206
207 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
208 for (auto MemVT :
209 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
210 setLoadExtAction(ExtTypes: {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, ValVT: VT, MemVT,
211 Action: Expand);
212
213 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f32, MemVT: MVT::f16, Action: Expand);
214 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f32, MemVT: MVT::bf16, Action: Expand);
215 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f32, MemVT: MVT::v2f16, Action: Expand);
216 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f32, MemVT: MVT::v2bf16, Action: Expand);
217 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f32, MemVT: MVT::v3f16, Action: Expand);
218 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f32, MemVT: MVT::v3bf16, Action: Expand);
219 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f32, MemVT: MVT::v4f16, Action: Expand);
220 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f32, MemVT: MVT::v4bf16, Action: Expand);
221 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f32, MemVT: MVT::v8f16, Action: Expand);
222 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f32, MemVT: MVT::v8bf16, Action: Expand);
223 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f32, MemVT: MVT::v16f16, Action: Expand);
224 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f32, MemVT: MVT::v16bf16, Action: Expand);
225 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v32f32, MemVT: MVT::v32f16, Action: Expand);
226 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v32f32, MemVT: MVT::v32bf16, Action: Expand);
227
228 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f32, Action: Expand);
229 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2f32, Action: Expand);
230 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3f32, Action: Expand);
231 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4f32, Action: Expand);
232 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8f32, Action: Expand);
233 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16f32, Action: Expand);
234
235 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f16, Action: Expand);
236 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::bf16, Action: Expand);
237 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2f16, Action: Expand);
238 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2bf16, Action: Expand);
239 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3f16, Action: Expand);
240 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3bf16, Action: Expand);
241 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4f16, Action: Expand);
242 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4bf16, Action: Expand);
243 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8f16, Action: Expand);
244 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8bf16, Action: Expand);
245 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16f16, Action: Expand);
246 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16bf16, Action: Expand);
247
248 setOperationAction(Op: ISD::STORE, VT: MVT::f32, Action: Promote);
249 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f32, DestVT: MVT::i32);
250
251 setOperationAction(Op: ISD::STORE, VT: MVT::v2f32, Action: Promote);
252 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
253
254 setOperationAction(Op: ISD::STORE, VT: MVT::v3f32, Action: Promote);
255 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
256
257 setOperationAction(Op: ISD::STORE, VT: MVT::v4f32, Action: Promote);
258 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
259
260 setOperationAction(Op: ISD::STORE, VT: MVT::v5f32, Action: Promote);
261 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
262
263 setOperationAction(Op: ISD::STORE, VT: MVT::v6f32, Action: Promote);
264 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
265
266 setOperationAction(Op: ISD::STORE, VT: MVT::v7f32, Action: Promote);
267 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
268
269 setOperationAction(Op: ISD::STORE, VT: MVT::v8f32, Action: Promote);
270 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f32, DestVT: MVT::v8i32);
271
272 setOperationAction(Op: ISD::STORE, VT: MVT::v9f32, Action: Promote);
273 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
274
275 setOperationAction(Op: ISD::STORE, VT: MVT::v10f32, Action: Promote);
276 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
277
278 setOperationAction(Op: ISD::STORE, VT: MVT::v11f32, Action: Promote);
279 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
280
281 setOperationAction(Op: ISD::STORE, VT: MVT::v12f32, Action: Promote);
282 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
283
284 setOperationAction(Op: ISD::STORE, VT: MVT::v16f32, Action: Promote);
285 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f32, DestVT: MVT::v16i32);
286
287 setOperationAction(Op: ISD::STORE, VT: MVT::v32f32, Action: Promote);
288 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32f32, DestVT: MVT::v32i32);
289
290 setOperationAction(Op: ISD::STORE, VT: MVT::i64, Action: Promote);
291 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::i64, DestVT: MVT::v2i32);
292
293 setOperationAction(Op: ISD::STORE, VT: MVT::v2i64, Action: Promote);
294 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2i64, DestVT: MVT::v4i32);
295
296 setOperationAction(Op: ISD::STORE, VT: MVT::f64, Action: Promote);
297 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f64, DestVT: MVT::v2i32);
298
299 setOperationAction(Op: ISD::STORE, VT: MVT::v2f64, Action: Promote);
300 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f64, DestVT: MVT::v4i32);
301
302 setOperationAction(Op: ISD::STORE, VT: MVT::v3i64, Action: Promote);
303 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3i64, DestVT: MVT::v6i32);
304
305 setOperationAction(Op: ISD::STORE, VT: MVT::v3f64, Action: Promote);
306 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3f64, DestVT: MVT::v6i32);
307
308 setOperationAction(Op: ISD::STORE, VT: MVT::v4i64, Action: Promote);
309 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i64, DestVT: MVT::v8i32);
310
311 setOperationAction(Op: ISD::STORE, VT: MVT::v4f64, Action: Promote);
312 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f64, DestVT: MVT::v8i32);
313
314 setOperationAction(Op: ISD::STORE, VT: MVT::v8i64, Action: Promote);
315 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8i64, DestVT: MVT::v16i32);
316
317 setOperationAction(Op: ISD::STORE, VT: MVT::v8f64, Action: Promote);
318 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f64, DestVT: MVT::v16i32);
319
320 setOperationAction(Op: ISD::STORE, VT: MVT::v16i64, Action: Promote);
321 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16i64, DestVT: MVT::v32i32);
322
323 setOperationAction(Op: ISD::STORE, VT: MVT::v16f64, Action: Promote);
324 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f64, DestVT: MVT::v32i32);
325
326 setOperationAction(Op: ISD::STORE, VT: MVT::i128, Action: Promote);
327 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::i128, DestVT: MVT::v4i32);
328
329 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i1, Action: Expand);
330 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i8, Action: Expand);
331 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i16, Action: Expand);
332 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i32, Action: Expand);
333
334 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i1, Action: Expand);
335 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i8, Action: Expand);
336 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i16, Action: Expand);
337 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i32, Action: Expand);
338
339 setTruncStoreAction(ValVT: MVT::f32, MemVT: MVT::bf16, Action: Expand);
340 setTruncStoreAction(ValVT: MVT::f32, MemVT: MVT::f16, Action: Expand);
341 setTruncStoreAction(ValVT: MVT::v2f32, MemVT: MVT::v2bf16, Action: Expand);
342 setTruncStoreAction(ValVT: MVT::v2f32, MemVT: MVT::v2f16, Action: Expand);
343 setTruncStoreAction(ValVT: MVT::v3f32, MemVT: MVT::v3bf16, Action: Expand);
344 setTruncStoreAction(ValVT: MVT::v3f32, MemVT: MVT::v3f16, Action: Expand);
345 setTruncStoreAction(ValVT: MVT::v4f32, MemVT: MVT::v4bf16, Action: Expand);
346 setTruncStoreAction(ValVT: MVT::v4f32, MemVT: MVT::v4f16, Action: Expand);
347 setTruncStoreAction(ValVT: MVT::v6f32, MemVT: MVT::v6f16, Action: Expand);
348 setTruncStoreAction(ValVT: MVT::v8f32, MemVT: MVT::v8bf16, Action: Expand);
349 setTruncStoreAction(ValVT: MVT::v8f32, MemVT: MVT::v8f16, Action: Expand);
350 setTruncStoreAction(ValVT: MVT::v16f32, MemVT: MVT::v16bf16, Action: Expand);
351 setTruncStoreAction(ValVT: MVT::v16f32, MemVT: MVT::v16f16, Action: Expand);
352 setTruncStoreAction(ValVT: MVT::v32f32, MemVT: MVT::v32bf16, Action: Expand);
353 setTruncStoreAction(ValVT: MVT::v32f32, MemVT: MVT::v32f16, Action: Expand);
354
355 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::bf16, Action: Expand);
356 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f16, Action: Expand);
357 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f32, Action: Expand);
358
359 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2f32, Action: Expand);
360 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2bf16, Action: Expand);
361 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2f16, Action: Expand);
362
363 setTruncStoreAction(ValVT: MVT::v3i32, MemVT: MVT::v3i8, Action: Expand);
364
365 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i32, Action: Expand);
366 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i16, Action: Expand);
367 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i8, Action: Expand);
368 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i1, Action: Expand);
369 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3f32, Action: Expand);
370 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3bf16, Action: Expand);
371 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3f16, Action: Expand);
372
373 setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i32, Action: Expand);
374 setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i16, Action: Expand);
375 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4f32, Action: Expand);
376 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4bf16, Action: Expand);
377 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4f16, Action: Expand);
378
379 setTruncStoreAction(ValVT: MVT::v5i32, MemVT: MVT::v5i1, Action: Expand);
380 setTruncStoreAction(ValVT: MVT::v5i32, MemVT: MVT::v5i8, Action: Expand);
381 setTruncStoreAction(ValVT: MVT::v5i32, MemVT: MVT::v5i16, Action: Expand);
382
383 setTruncStoreAction(ValVT: MVT::v6i32, MemVT: MVT::v6i1, Action: Expand);
384 setTruncStoreAction(ValVT: MVT::v6i32, MemVT: MVT::v6i8, Action: Expand);
385 setTruncStoreAction(ValVT: MVT::v6i32, MemVT: MVT::v6i16, Action: Expand);
386
387 setTruncStoreAction(ValVT: MVT::v7i32, MemVT: MVT::v7i1, Action: Expand);
388 setTruncStoreAction(ValVT: MVT::v7i32, MemVT: MVT::v7i8, Action: Expand);
389 setTruncStoreAction(ValVT: MVT::v7i32, MemVT: MVT::v7i16, Action: Expand);
390
391 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8f32, Action: Expand);
392 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8bf16, Action: Expand);
393 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8f16, Action: Expand);
394
395 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16f32, Action: Expand);
396 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16bf16, Action: Expand);
397 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16f16, Action: Expand);
398 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i16, Action: Expand);
399 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i8, Action: Expand);
400 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i8, Action: Expand);
401 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i1, Action: Expand);
402
403 setOperationAction(Ops: ISD::Constant, VTs: {MVT::i32, MVT::i64}, Action: Legal);
404 setOperationAction(Ops: ISD::ConstantFP, VTs: {MVT::f32, MVT::f64}, Action: Legal);
405
406 setOperationAction(Ops: {ISD::BR_JT, ISD::BRIND}, VT: MVT::Other, Action: Expand);
407
408 // For R600, this is totally unsupported, just custom lower to produce an
409 // error.
410 setOperationAction(Op: ISD::DYNAMIC_STACKALLOC, VT: MVT::i32, Action: Custom);
411
412 // Library functions. These default to Expand, but we have instructions
413 // for them.
414 setOperationAction(Ops: {ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
415 ISD::FROUNDEVEN, ISD::FTRUNC},
416 VTs: {MVT::f16, MVT::f32}, Action: Legal);
417 setOperationAction(Ops: {ISD::FMINNUM, ISD::FMAXNUM}, VT: MVT::f32, Action: Legal);
418
419 setOperationAction(Op: ISD::FLOG2, VT: MVT::f32, Action: Custom);
420 setOperationAction(Ops: ISD::FROUND, VTs: {MVT::f32, MVT::f64}, Action: Custom);
421 setOperationAction(Ops: {ISD::LROUND, ISD::LLROUND},
422 VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Expand);
423
424 setOperationAction(
425 Ops: {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, VT: MVT::f32,
426 Action: Custom);
427 setOperationAction(Ops: {ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, VT: MVT::f64, Action: Custom);
428
429 setOperationAction(Ops: ISD::FNEARBYINT, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Custom);
430
431 setOperationAction(Ops: ISD::FRINT, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Custom);
432
433 setOperationAction(Ops: {ISD::LRINT, ISD::LLRINT}, VTs: {MVT::f16, MVT::f32, MVT::f64},
434 Action: Expand);
435
436 setOperationAction(Ops: ISD::FREM, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Expand);
437 setOperationAction(Ops: ISD::IS_FPCLASS, VTs: {MVT::f32, MVT::f64}, Action: Legal);
438 setOperationAction(Ops: {ISD::FLOG2, ISD::FEXP2}, VT: MVT::f16, Action: Custom);
439
440 setOperationAction(Ops: {ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, VT: MVT::f16,
441 Action: Custom);
442
443 setOperationAction(Ops: ISD::FCANONICALIZE, VTs: {MVT::f32, MVT::f64}, Action: Legal);
444
445 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
446 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
447 // default unless marked custom/legal.
448 setOperationAction(Ops: ISD::IS_FPCLASS,
449 VTs: {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
450 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
451 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
452 MVT::v16f64},
453 Action: Custom);
454
455 // Expand to fneg + fadd.
456 setOperationAction(Op: ISD::FSUB, VT: MVT::f64, Action: Expand);
457
458 setOperationAction(Ops: ISD::CONCAT_VECTORS,
459 VTs: {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
460 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
461 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
462 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
463 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
464 Action: Custom);
465
466 setOperationAction(
467 Ops: ISD::EXTRACT_SUBVECTOR,
468 VTs: {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
469 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
470 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
471 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
472 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
473 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
474 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
475 Action: Custom);
476
477 setOperationAction(Ops: {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP}, VT: MVT::f64,
478 Action: Expand);
479 setOperationAction(Ops: ISD::FP_TO_FP16, VTs: {MVT::f64, MVT::f32}, Action: Custom);
480
481 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
482 for (MVT VT : ScalarIntVTs) {
483 // These should use [SU]DIVREM, so set them to expand
484 setOperationAction(Ops: {ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
485 Action: Expand);
486
487 // GPU does not have divrem function for signed or unsigned.
488 setOperationAction(Ops: {ISD::SDIVREM, ISD::UDIVREM}, VT, Action: Custom);
489
490 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
491 setOperationAction(Ops: {ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Action: Expand);
492
493 setOperationAction(Ops: {ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Action: Expand);
494
495 setOperationAction(Ops: {ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT,
496 Action: Expand);
497 }
498
499 // The hardware supports 32-bit FSHR, but not FSHL.
500 setOperationAction(Op: ISD::FSHR, VT: MVT::i32, Action: Legal);
501
502 setOperationAction(Ops: {ISD::ROTL, ISD::ROTR}, VTs: {MVT::i32, MVT::i64}, Action: Expand);
503
504 setOperationAction(Ops: {ISD::MULHU, ISD::MULHS}, VT: MVT::i16, Action: Expand);
505
506 setOperationAction(Ops: {ISD::MUL, ISD::MULHU, ISD::MULHS}, VT: MVT::i64, Action: Expand);
507 setOperationAction(Ops: {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT,
508 ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
509 ISD::FP_TO_UINT_SAT},
510 VT: MVT::i64, Action: Custom);
511 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::i64, Action: Expand);
512
513 setOperationAction(Ops: {ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, VT: MVT::i32,
514 Action: Legal);
515
516 setOperationAction(
517 Ops: {ISD::CTTZ, ISD::CTTZ_ZERO_POISON, ISD::CTLZ, ISD::CTLZ_ZERO_POISON},
518 VT: MVT::i64, Action: Custom);
519
520 for (auto VT : {MVT::i8, MVT::i16})
521 setOperationAction(Ops: {ISD::CTLZ, ISD::CTLZ_ZERO_POISON}, VT, Action: Custom);
522
523 static const MVT::SimpleValueType VectorIntTypes[] = {
524 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
525 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
526
527 for (MVT VT : VectorIntTypes) {
528 // Expand the following operations for the current type by default.
529 // clang-format off
530 setOperationAction(Ops: {ISD::ADD, ISD::AND,
531 ISD::FP_TO_SINT, ISD::FP_TO_UINT,
532 ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT,
533 ISD::MUL, ISD::MULHU,
534 ISD::MULHS, ISD::OR,
535 ISD::SHL, ISD::SRA,
536 ISD::SRL, ISD::ROTL,
537 ISD::ROTR, ISD::SUB,
538 ISD::SINT_TO_FP, ISD::UINT_TO_FP,
539 ISD::SDIV, ISD::UDIV,
540 ISD::SREM, ISD::UREM,
541 ISD::SMUL_LOHI, ISD::UMUL_LOHI,
542 ISD::SDIVREM, ISD::UDIVREM,
543 ISD::SELECT, ISD::VSELECT,
544 ISD::SELECT_CC, ISD::XOR,
545 ISD::BSWAP, ISD::CTPOP,
546 ISD::CTTZ, ISD::CTLZ,
547 ISD::VECTOR_SHUFFLE, ISD::SETCC,
548 ISD::ADDRSPACECAST},
549 VT, Action: Expand);
550 // clang-format on
551 }
552
553 static const MVT::SimpleValueType FloatVectorTypes[] = {
554 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
555 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
556
557 for (MVT VT : FloatVectorTypes) {
558 setOperationAction(
559 Ops: {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
560 ISD::FADD, ISD::FCEIL, ISD::FCOS,
561 ISD::FDIV, ISD::FEXP2, ISD::FEXP,
562 ISD::FEXP10, ISD::FLOG2, ISD::FREM,
563 ISD::FLOG, ISD::FLOG10, ISD::FPOW,
564 ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
565 ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
566 ISD::FSQRT, ISD::FSIN, ISD::FSUB,
567 ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
568 ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC,
569 ISD::FCANONICALIZE, ISD::FROUNDEVEN},
570 VT, Action: Expand);
571 }
572
573 // This causes using an unrolled select operation rather than expansion with
574 // bit operations. This is in general better, but the alternative using BFI
575 // instructions may be better if the select sources are SGPRs.
576 setOperationAction(Op: ISD::SELECT, VT: MVT::v2f32, Action: Promote);
577 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
578
579 setOperationAction(Op: ISD::SELECT, VT: MVT::v3f32, Action: Promote);
580 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
581
582 setOperationAction(Op: ISD::SELECT, VT: MVT::v4f32, Action: Promote);
583 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
584
585 setOperationAction(Op: ISD::SELECT, VT: MVT::v5f32, Action: Promote);
586 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
587
588 setOperationAction(Op: ISD::SELECT, VT: MVT::v6f32, Action: Promote);
589 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
590
591 setOperationAction(Op: ISD::SELECT, VT: MVT::v7f32, Action: Promote);
592 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
593
594 setOperationAction(Op: ISD::SELECT, VT: MVT::v9f32, Action: Promote);
595 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
596
597 setOperationAction(Op: ISD::SELECT, VT: MVT::v10f32, Action: Promote);
598 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
599
600 setOperationAction(Op: ISD::SELECT, VT: MVT::v11f32, Action: Promote);
601 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
602
603 setOperationAction(Op: ISD::SELECT, VT: MVT::v12f32, Action: Promote);
604 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
605
606 setSchedulingPreference(Sched::RegPressure);
607 setJumpIsExpensive(true);
608
609 setMinCmpXchgSizeInBits(32);
610 setSupportsUnalignedAtomics(false);
611
612 PredictableSelectIsExpensive = false;
613
614 // We want to find all load dependencies for long chains of stores to enable
615 // merging into very wide vectors. The problem is with vectors with > 4
616 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
617 // vectors are a legal type, even though we have to split the loads
618 // usually. When we can more precisely specify load legality per address
619 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
620 // smarter so that they can figure out what to do in 2 iterations without all
621 // N > 4 stores on the same chain.
622 GatherAllAliasesMaxDepth = 16;
623
624 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
625 // about these during lowering.
626 MaxStoresPerMemcpy = 0xffffffff;
627 MaxStoresPerMemmove = 0xffffffff;
628 MaxStoresPerMemset = 0xffffffff;
629
630 // The expansion for 64-bit division is enormous.
631 if (AMDGPUBypassSlowDiv)
632 addBypassSlowDiv(SlowBitWidth: 64, FastBitWidth: 32);
633
634 setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
635 ISD::SRA, ISD::SRL,
636 ISD::TRUNCATE, ISD::MUL,
637 ISD::SMUL_LOHI, ISD::UMUL_LOHI,
638 ISD::MULHU, ISD::MULHS,
639 ISD::SELECT, ISD::SELECT_CC,
640 ISD::STORE, ISD::FADD,
641 ISD::FSUB, ISD::FNEG,
642 ISD::FABS, ISD::AssertZext,
643 ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
644
645 setMaxAtomicSizeInBitsSupported(64);
646 setMaxDivRemBitWidthSupported(64);
647 setMaxLargeFPConvertBitWidthSupported(64);
648}
649
650bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
651 const auto Flags = Op.getNode()->getFlags();
652 if (Flags.hasNoSignedZeros())
653 return true;
654
655 return false;
656}
657
658//===----------------------------------------------------------------------===//
659// Target Information
660//===----------------------------------------------------------------------===//
661
662LLVM_READNONE
663static bool fnegFoldsIntoOpcode(unsigned Opc) {
664 switch (Opc) {
665 case ISD::FADD:
666 case ISD::FSUB:
667 case ISD::FMUL:
668 case ISD::FMA:
669 case ISD::FMAD:
670 case ISD::FMINNUM:
671 case ISD::FMAXNUM:
672 case ISD::FMINNUM_IEEE:
673 case ISD::FMAXNUM_IEEE:
674 case ISD::FMINIMUM:
675 case ISD::FMAXIMUM:
676 case ISD::FMINIMUMNUM:
677 case ISD::FMAXIMUMNUM:
678 case ISD::SELECT:
679 case ISD::FSIN:
680 case ISD::FTRUNC:
681 case ISD::FRINT:
682 case ISD::FNEARBYINT:
683 case ISD::FROUNDEVEN:
684 case ISD::FCANONICALIZE:
685 case AMDGPUISD::RCP:
686 case AMDGPUISD::RCP_LEGACY:
687 case AMDGPUISD::RCP_IFLAG:
688 case AMDGPUISD::SIN_HW:
689 case AMDGPUISD::FMUL_LEGACY:
690 case AMDGPUISD::FMIN_LEGACY:
691 case AMDGPUISD::FMAX_LEGACY:
692 case AMDGPUISD::FMED3:
693 // TODO: handle llvm.amdgcn.fma.legacy
694 return true;
695 case ISD::BITCAST:
696 llvm_unreachable("bitcast is special cased");
697 default:
698 return false;
699 }
700}
701
702static bool fnegFoldsIntoOp(const SDNode *N) {
703 unsigned Opc = N->getOpcode();
704 if (Opc == ISD::BITCAST) {
705 // TODO: Is there a benefit to checking the conditions performFNegCombine
706 // does? We don't for the other cases.
707 SDValue BCSrc = N->getOperand(Num: 0);
708 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
709 return BCSrc.getNumOperands() == 2 &&
710 BCSrc.getOperand(i: 1).getValueSizeInBits() == 32;
711 }
712
713 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
714 }
715
716 return fnegFoldsIntoOpcode(Opc);
717}
718
719/// \p returns true if the operation will definitely need to use a 64-bit
720/// encoding, and thus will use a VOP3 encoding regardless of the source
721/// modifiers.
722LLVM_READONLY
723static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
724 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
725 VT == MVT::f64;
726}
727
728/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
729/// type for ISD::SELECT.
730LLVM_READONLY
731static bool selectSupportsSourceMods(const SDNode *N) {
732 // TODO: Only applies if select will be vector
733 return N->getValueType(ResNo: 0) == MVT::f32;
734}
735
736// Most FP instructions support source modifiers, but this could be refined
737// slightly.
738LLVM_READONLY
739static bool hasSourceMods(const SDNode *N) {
740 if (isa<MemSDNode>(Val: N))
741 return false;
742
743 switch (N->getOpcode()) {
744 case ISD::CopyToReg:
745 case ISD::FDIV:
746 case ISD::FREM:
747 case ISD::INLINEASM:
748 case ISD::INLINEASM_BR:
749 case AMDGPUISD::DIV_SCALE:
750 case ISD::INTRINSIC_W_CHAIN:
751
752 // TODO: Should really be looking at the users of the bitcast. These are
753 // problematic because bitcasts are used to legalize all stores to integer
754 // types.
755 case ISD::BITCAST:
756 return false;
757 case ISD::INTRINSIC_WO_CHAIN: {
758 switch (N->getConstantOperandVal(Num: 0)) {
759 case Intrinsic::amdgcn_interp_p1:
760 case Intrinsic::amdgcn_interp_p2:
761 case Intrinsic::amdgcn_interp_mov:
762 case Intrinsic::amdgcn_interp_p1_f16:
763 case Intrinsic::amdgcn_interp_p2_f16:
764 return false;
765 default:
766 return true;
767 }
768 }
769 case ISD::SELECT:
770 return selectSupportsSourceMods(N);
771 default:
772 return true;
773 }
774}
775
776bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
777 unsigned CostThreshold) {
778 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
779 // it is truly free to use a source modifier in all cases. If there are
780 // multiple users but for each one will necessitate using VOP3, there will be
781 // a code size increase. Try to avoid increasing code size unless we know it
782 // will save on the instruction count.
783 unsigned NumMayIncreaseSize = 0;
784 MVT VT = N->getValueType(ResNo: 0).getScalarType().getSimpleVT();
785
786 assert(!N->use_empty());
787
788 // XXX - Should this limit number of uses to check?
789 for (const SDNode *U : N->users()) {
790 if (!hasSourceMods(N: U))
791 return false;
792
793 if (!opMustUseVOP3Encoding(N: U, VT)) {
794 if (++NumMayIncreaseSize > CostThreshold)
795 return false;
796 }
797 }
798
799 return true;
800}
801
802EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
803 ISD::NodeType ExtendKind) const {
804 assert(!VT.isVector() && "only scalar expected");
805
806 // Round to the next multiple of 32-bits.
807 unsigned Size = VT.getSizeInBits();
808 if (Size <= 32)
809 return MVT::i32;
810 return EVT::getIntegerVT(Context, BitWidth: 32 * ((Size + 31) / 32));
811}
812
813unsigned AMDGPUTargetLowering::getVectorIdxWidth(const DataLayout &) const {
814 return 32;
815}
816
817bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
818 return true;
819}
820
821// The backend supports 32 and 64 bit floating point immediates.
822// FIXME: Why are we reporting vectors of FP immediates as legal?
823bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
824 bool ForCodeSize) const {
825 return isTypeLegal(VT: VT.getScalarType());
826}
827
828// We don't want to shrink f64 / f32 constants.
829bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
830 EVT ScalarVT = VT.getScalarType();
831 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
832}
833
834bool AMDGPUTargetLowering::shouldReduceLoadWidth(
835 SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
836 std::optional<unsigned> ByteOffset) const {
837 // TODO: This may be worth removing. Check regression tests for diffs.
838 if (!TargetLoweringBase::shouldReduceLoadWidth(Load: N, ExtTy, NewVT, ByteOffset))
839 return false;
840
841 unsigned NewSize = NewVT.getStoreSizeInBits();
842
843 // If we are reducing to a 32-bit load or a smaller multi-dword load,
844 // this is always better.
845 if (NewSize >= 32)
846 return true;
847
848 EVT OldVT = N->getValueType(ResNo: 0);
849 unsigned OldSize = OldVT.getStoreSizeInBits();
850
851 MemSDNode *MN = cast<MemSDNode>(Val: N);
852 unsigned AS = MN->getAddressSpace();
853 // Do not shrink an aligned scalar load to sub-dword.
854 // Scalar engine cannot do sub-dword loads.
855 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
856 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
857 (AS == AMDGPUAS::CONSTANT_ADDRESS ||
858 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
859 (isa<LoadSDNode>(Val: N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
860 MN->isInvariant())) &&
861 AMDGPU::isUniformMMO(MMO: MN->getMemOperand()))
862 return false;
863
864 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
865 // extloads, so doing one requires using a buffer_load. In cases where we
866 // still couldn't use a scalar load, using the wider load shouldn't really
867 // hurt anything.
868
869 // If the old size already had to be an extload, there's no harm in continuing
870 // to reduce the width.
871 return (OldSize < 32);
872}
873
874bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
875 const SelectionDAG &DAG,
876 const MachineMemOperand &MMO) const {
877
878 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
879
880 if (LoadTy.getScalarType() == MVT::i32)
881 return false;
882
883 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
884 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
885
886 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
887 return false;
888
889 unsigned Fast = 0;
890 return allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
891 VT: CastTy, MMO, Fast: &Fast) &&
892 Fast;
893}
894
895// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
896// profitable with the expansion for 64-bit since it's generally good to
897// speculate things.
898bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
899 return true;
900}
901
902bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
903 return true;
904}
905
906bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
907 switch (N->getOpcode()) {
908 case ISD::EntryToken:
909 case ISD::TokenFactor:
910 return true;
911 case ISD::INTRINSIC_WO_CHAIN: {
912 unsigned IntrID = N->getConstantOperandVal(Num: 0);
913 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
914 }
915 case ISD::INTRINSIC_W_CHAIN: {
916 unsigned IntrID = N->getConstantOperandVal(Num: 1);
917 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
918 }
919 case ISD::LOAD:
920 if (cast<LoadSDNode>(Val: N)->getMemOperand()->getAddrSpace() ==
921 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
922 return true;
923 return false;
924 case AMDGPUISD::SETCC: // ballot-style instruction
925 return true;
926 }
927 return false;
928}
929
930SDValue AMDGPUTargetLowering::getNegatedExpression(
931 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
932 NegatibleCost &Cost, unsigned Depth) const {
933
934 switch (Op.getOpcode()) {
935 case ISD::FMA:
936 case ISD::FMAD: {
937 // Negating a fma is not free if it has users without source mods.
938 if (!allUsesHaveSourceMods(N: Op.getNode()))
939 return SDValue();
940 break;
941 }
942 case AMDGPUISD::RCP: {
943 SDValue Src = Op.getOperand(i: 0);
944 EVT VT = Op.getValueType();
945 SDLoc SL(Op);
946
947 SDValue NegSrc = getNegatedExpression(Op: Src, DAG, LegalOperations,
948 ForCodeSize, Cost, Depth: Depth + 1);
949 if (NegSrc)
950 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: NegSrc, Flags: Op->getFlags());
951 return SDValue();
952 }
953 default:
954 break;
955 }
956
957 return TargetLowering::getNegatedExpression(Op, DAG, LegalOps: LegalOperations,
958 OptForSize: ForCodeSize, Cost, Depth);
959}
960
961//===---------------------------------------------------------------------===//
962// Target Properties
963//===---------------------------------------------------------------------===//
964
965bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
966 assert(VT.isFloatingPoint());
967
968 // Packed operations do not have a fabs modifier.
969 // Report this based on the end legalized type.
970 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
971}
972
973bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
974 assert(VT.isFloatingPoint());
975 // Report this based on the end legalized type.
976 VT = VT.getScalarType();
977 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
978}
979
980bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
981 unsigned NumElem,
982 unsigned AS) const {
983 return true;
984}
985
986bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
987 // There are few operations which truly have vector input operands. Any vector
988 // operation is going to involve operations on each component, and a
989 // build_vector will be a copy per element, so it always makes sense to use a
990 // build_vector input in place of the extracted element to avoid a copy into a
991 // super register.
992 //
993 // We should probably only do this if all users are extracts only, but this
994 // should be the common case.
995 return true;
996}
997
998bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
999 // Truncate is just accessing a subregister.
1000
1001 unsigned SrcSize = Source.getSizeInBits();
1002 unsigned DestSize = Dest.getSizeInBits();
1003
1004 return DestSize < SrcSize && DestSize % 32 == 0 ;
1005}
1006
1007bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
1008 // Truncate is just accessing a subregister.
1009
1010 unsigned SrcSize = Source->getScalarSizeInBits();
1011 unsigned DestSize = Dest->getScalarSizeInBits();
1012
1013 if (DestSize== 16 && Subtarget->has16BitInsts())
1014 return SrcSize >= 32;
1015
1016 return DestSize < SrcSize && DestSize % 32 == 0;
1017}
1018
1019bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
1020 unsigned SrcSize = Src->getScalarSizeInBits();
1021 unsigned DestSize = Dest->getScalarSizeInBits();
1022
1023 if (SrcSize == 16 && Subtarget->has16BitInsts())
1024 return DestSize >= 32;
1025
1026 return SrcSize == 32 && DestSize == 64;
1027}
1028
1029bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
1030 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1031 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1032 // this will enable reducing 64-bit operations the 32-bit, which is always
1033 // good.
1034
1035 if (Src == MVT::i16)
1036 return Dest == MVT::i32 ||Dest == MVT::i64 ;
1037
1038 return Src == MVT::i32 && Dest == MVT::i64;
1039}
1040
1041bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
1042 EVT DestVT) const {
1043 switch (N->getOpcode()) {
1044 case ISD::ABS:
1045 case ISD::ADD:
1046 case ISD::SUB:
1047 case ISD::SHL:
1048 case ISD::SRL:
1049 case ISD::SRA:
1050 case ISD::AND:
1051 case ISD::OR:
1052 case ISD::XOR:
1053 case ISD::MUL:
1054 case ISD::SETCC:
1055 case ISD::SELECT:
1056 case ISD::SMIN:
1057 case ISD::SMAX:
1058 case ISD::UMIN:
1059 case ISD::UMAX:
1060 case ISD::USUBSAT:
1061 if (isTypeLegal(VT: MVT::i16) &&
1062 (!DestVT.isVector() ||
1063 !isOperationLegal(Op: ISD::ADD, VT: MVT::v2i16))) { // Check if VOP3P
1064 // Don't narrow back down to i16 if promoted to i32 already.
1065 if (!N->isDivergent() && DestVT.isInteger() &&
1066 DestVT.getScalarSizeInBits() > 1 &&
1067 DestVT.getScalarSizeInBits() <= 16 &&
1068 SrcVT.getScalarSizeInBits() > 16) {
1069 return false;
1070 }
1071 }
1072 return true;
1073 default:
1074 break;
1075 }
1076
1077 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1078 // limited number of native 64-bit operations. Shrinking an operation to fit
1079 // in a single 32-bit register should always be helpful. As currently used,
1080 // this is much less general than the name suggests, and is only used in
1081 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1082 // not profitable, and may actually be harmful.
1083 if (isa<LoadSDNode>(Val: N))
1084 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1085
1086 return true;
1087}
1088
1089bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
1090 const SDNode* N, CombineLevel Level) const {
1091 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1092 N->getOpcode() == ISD::SRL) &&
1093 "Expected shift op");
1094
1095 SDValue ShiftLHS = N->getOperand(Num: 0);
1096 if (!ShiftLHS->hasOneUse())
1097 return false;
1098
1099 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1100 !ShiftLHS.getOperand(i: 0)->hasOneUse())
1101 return false;
1102
1103 // Always commute pre-type legalization and right shifts.
1104 // We're looking for shl(or(x,y),z) patterns.
1105 if (Level < CombineLevel::AfterLegalizeTypes ||
1106 N->getOpcode() != ISD::SHL || N->getOperand(Num: 0).getOpcode() != ISD::OR)
1107 return true;
1108
1109 // If only user is a i32 right-shift, then don't destroy a BFE pattern.
1110 if (N->getValueType(ResNo: 0) == MVT::i32 && N->hasOneUse() &&
1111 (N->user_begin()->getOpcode() == ISD::SRA ||
1112 N->user_begin()->getOpcode() == ISD::SRL))
1113 return false;
1114
1115 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1116 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1117 if (LHS.getOpcode() != ISD::SHL)
1118 return false;
1119 auto *RHSLd = dyn_cast<LoadSDNode>(Val&: RHS);
1120 auto *LHS0 = dyn_cast<LoadSDNode>(Val: LHS.getOperand(i: 0));
1121 auto *LHS1 = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
1122 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1123 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1124 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1125 };
1126 SDValue LHS = N->getOperand(Num: 0).getOperand(i: 0);
1127 SDValue RHS = N->getOperand(Num: 0).getOperand(i: 1);
1128 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1129}
1130
1131//===---------------------------------------------------------------------===//
1132// TargetLowering Callbacks
1133//===---------------------------------------------------------------------===//
1134
1135CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
1136 bool IsVarArg) {
1137 switch (CC) {
1138 case CallingConv::AMDGPU_VS:
1139 case CallingConv::AMDGPU_GS:
1140 case CallingConv::AMDGPU_PS:
1141 case CallingConv::AMDGPU_CS:
1142 case CallingConv::AMDGPU_HS:
1143 case CallingConv::AMDGPU_ES:
1144 case CallingConv::AMDGPU_LS:
1145 return CC_AMDGPU;
1146 case CallingConv::AMDGPU_CS_Chain:
1147 case CallingConv::AMDGPU_CS_ChainPreserve:
1148 return CC_AMDGPU_CS_CHAIN;
1149 case CallingConv::C:
1150 case CallingConv::Fast:
1151 case CallingConv::Cold:
1152 return CC_AMDGPU_Func;
1153 case CallingConv::AMDGPU_Gfx:
1154 case CallingConv::AMDGPU_Gfx_WholeWave:
1155 return CC_SI_Gfx;
1156 case CallingConv::AMDGPU_KERNEL:
1157 case CallingConv::SPIR_KERNEL:
1158 default:
1159 reportFatalUsageError(reason: "unsupported calling convention for call");
1160 }
1161}
1162
1163CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
1164 bool IsVarArg) {
1165 switch (CC) {
1166 case CallingConv::AMDGPU_KERNEL:
1167 case CallingConv::SPIR_KERNEL:
1168 llvm_unreachable("kernels should not be handled here");
1169 case CallingConv::AMDGPU_VS:
1170 case CallingConv::AMDGPU_GS:
1171 case CallingConv::AMDGPU_PS:
1172 case CallingConv::AMDGPU_CS:
1173 case CallingConv::AMDGPU_CS_Chain:
1174 case CallingConv::AMDGPU_CS_ChainPreserve:
1175 case CallingConv::AMDGPU_HS:
1176 case CallingConv::AMDGPU_ES:
1177 case CallingConv::AMDGPU_LS:
1178 return RetCC_SI_Shader;
1179 case CallingConv::AMDGPU_Gfx:
1180 case CallingConv::AMDGPU_Gfx_WholeWave:
1181 return RetCC_SI_Gfx;
1182 case CallingConv::C:
1183 case CallingConv::Fast:
1184 case CallingConv::Cold:
1185 return RetCC_AMDGPU_Func;
1186 default:
1187 reportFatalUsageError(reason: "unsupported calling convention");
1188 }
1189}
1190
1191/// The SelectionDAGBuilder will automatically promote function arguments
1192/// with illegal types. However, this does not work for the AMDGPU targets
1193/// since the function arguments are stored in memory as these illegal types.
1194/// In order to handle this properly we need to get the original types sizes
1195/// from the LLVM IR Function and fixup the ISD:InputArg values before
1196/// passing them to AnalyzeFormalArguments()
1197
1198/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1199/// input values across multiple registers. Each item in the Ins array
1200/// represents a single value that will be stored in registers. Ins[x].VT is
1201/// the value type of the value that will be stored in the register, so
1202/// whatever SDNode we lower the argument to needs to be this type.
1203///
1204/// In order to correctly lower the arguments we need to know the size of each
1205/// argument. Since Ins[x].VT gives us the size of the register that will
1206/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1207/// for the original function argument so that we can deduce the correct memory
1208/// type to use for Ins[x]. In most cases the correct memory type will be
1209/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1210/// we have a kernel argument of type v8i8, this argument will be split into
1211/// 8 parts and each part will be represented by its own item in the Ins array.
1212/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1213/// the argument before it was split. From this, we deduce that the memory type
1214/// for each individual part is i8. We pass the memory type as LocVT to the
1215/// calling convention analysis function and the register type (Ins[x].VT) as
1216/// the ValVT.
1217void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
1218 CCState &State,
1219 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1220 const MachineFunction &MF = State.getMachineFunction();
1221 const Function &Fn = MF.getFunction();
1222 LLVMContext &Ctx = Fn.getContext();
1223 const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset();
1224 CallingConv::ID CC = Fn.getCallingConv();
1225
1226 Align MaxAlign = Align(1);
1227 uint64_t ExplicitArgOffset = 0;
1228 const DataLayout &DL = Fn.getDataLayout();
1229
1230 unsigned InIndex = 0;
1231
1232 for (const Argument &Arg : Fn.args()) {
1233 const bool IsByRef = Arg.hasByRefAttr();
1234 Type *BaseArgTy = Arg.getType();
1235 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1236 Align Alignment = DL.getValueOrABITypeAlignment(
1237 Alignment: IsByRef ? Arg.getParamAlign() : std::nullopt, Ty: MemArgTy);
1238 MaxAlign = std::max(a: Alignment, b: MaxAlign);
1239 uint64_t AllocSize = DL.getTypeAllocSize(Ty: MemArgTy);
1240
1241 uint64_t ArgOffset = alignTo(Size: ExplicitArgOffset, A: Alignment) + ExplicitOffset;
1242 ExplicitArgOffset = alignTo(Size: ExplicitArgOffset, A: Alignment) + AllocSize;
1243
1244 // We're basically throwing away everything passed into us and starting over
1245 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1246 // to us as computed in Ins.
1247 //
1248 // We also need to figure out what type legalization is trying to do to get
1249 // the correct memory offsets.
1250
1251 SmallVector<EVT, 16> ValueVTs;
1252 SmallVector<uint64_t, 16> Offsets;
1253 ComputeValueVTs(TLI: *this, DL, Ty: BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
1254 FixedOffsets: &Offsets, StartingOffset: ArgOffset);
1255
1256 for (unsigned Value = 0, NumValues = ValueVTs.size();
1257 Value != NumValues; ++Value) {
1258 uint64_t BasePartOffset = Offsets[Value];
1259
1260 EVT ArgVT = ValueVTs[Value];
1261 EVT MemVT = ArgVT;
1262 MVT RegisterVT = getRegisterTypeForCallingConv(Context&: Ctx, CC, VT: ArgVT);
1263 unsigned NumRegs = getNumRegistersForCallingConv(Context&: Ctx, CC, VT: ArgVT);
1264
1265 if (NumRegs == 1) {
1266 // This argument is not split, so the IR type is the memory type.
1267 if (ArgVT.isExtended()) {
1268 // We have an extended type, like i24, so we should just use the
1269 // register type.
1270 MemVT = RegisterVT;
1271 } else {
1272 MemVT = ArgVT;
1273 }
1274 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1275 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1276 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1277 // We have a vector value which has been split into a vector with
1278 // the same scalar type, but fewer elements. This should handle
1279 // all the floating-point vector types.
1280 MemVT = RegisterVT;
1281 } else if (ArgVT.isVector() &&
1282 ArgVT.getVectorNumElements() == NumRegs) {
1283 // This arg has been split so that each element is stored in a separate
1284 // register.
1285 MemVT = ArgVT.getScalarType();
1286 } else if (ArgVT.isExtended()) {
1287 // We have an extended type, like i65.
1288 MemVT = RegisterVT;
1289 } else {
1290 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1291 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1292 if (RegisterVT.isInteger()) {
1293 MemVT = EVT::getIntegerVT(Context&: State.getContext(), BitWidth: MemoryBits);
1294 } else if (RegisterVT.isVector()) {
1295 assert(!RegisterVT.getScalarType().isFloatingPoint());
1296 unsigned NumElements = RegisterVT.getVectorNumElements();
1297 assert(MemoryBits % NumElements == 0);
1298 // This vector type has been split into another vector type with
1299 // a different elements size.
1300 EVT ScalarVT = EVT::getIntegerVT(Context&: State.getContext(),
1301 BitWidth: MemoryBits / NumElements);
1302 MemVT = EVT::getVectorVT(Context&: State.getContext(), VT: ScalarVT, NumElements);
1303 } else {
1304 llvm_unreachable("cannot deduce memory type.");
1305 }
1306 }
1307
1308 // Convert one element vectors to scalar.
1309 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1310 MemVT = MemVT.getScalarType();
1311
1312 // Round up vec3/vec5 argument.
1313 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1314 MemVT = MemVT.getPow2VectorType(Context&: State.getContext());
1315 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1316 MemVT = MemVT.getRoundIntegerType(Context&: State.getContext());
1317 }
1318
1319 unsigned PartOffset = 0;
1320 for (unsigned i = 0; i != NumRegs; ++i) {
1321 State.addLoc(V: CCValAssign::getCustomMem(ValNo: InIndex++, ValVT: RegisterVT,
1322 Offset: BasePartOffset + PartOffset,
1323 LocVT: MemVT.getSimpleVT(),
1324 HTP: CCValAssign::Full));
1325 PartOffset += MemVT.getStoreSize();
1326 }
1327 }
1328 }
1329}
1330
1331SDValue AMDGPUTargetLowering::LowerReturn(
1332 SDValue Chain, CallingConv::ID CallConv,
1333 bool isVarArg,
1334 const SmallVectorImpl<ISD::OutputArg> &Outs,
1335 const SmallVectorImpl<SDValue> &OutVals,
1336 const SDLoc &DL, SelectionDAG &DAG) const {
1337 // FIXME: Fails for r600 tests
1338 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1339 // "wave terminate should not have return values");
1340 return DAG.getNode(Opcode: AMDGPUISD::ENDPGM, DL, VT: MVT::Other, Operand: Chain);
1341}
1342
1343//===---------------------------------------------------------------------===//
1344// Target specific lowering
1345//===---------------------------------------------------------------------===//
1346
1347/// Selects the correct CCAssignFn for a given CallingConvention value.
1348CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1349 bool IsVarArg) {
1350 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1351}
1352
1353CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1354 bool IsVarArg) {
1355 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1356}
1357
1358SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1359 SelectionDAG &DAG,
1360 MachineFrameInfo &MFI,
1361 int ClobberedFI) const {
1362 SmallVector<SDValue, 8> ArgChains;
1363 int64_t FirstByte = MFI.getObjectOffset(ObjectIdx: ClobberedFI);
1364 int64_t LastByte = FirstByte + MFI.getObjectSize(ObjectIdx: ClobberedFI) - 1;
1365
1366 // Include the original chain at the beginning of the list. When this is
1367 // used by target LowerCall hooks, this helps legalize find the
1368 // CALLSEQ_BEGIN node.
1369 ArgChains.push_back(Elt: Chain);
1370
1371 // Add a chain value for each stack argument corresponding
1372 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1373 if (LoadSDNode *L = dyn_cast<LoadSDNode>(Val: U)) {
1374 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: L->getBasePtr())) {
1375 if (FI->getIndex() < 0) {
1376 int64_t InFirstByte = MFI.getObjectOffset(ObjectIdx: FI->getIndex());
1377 int64_t InLastByte = InFirstByte;
1378 InLastByte += MFI.getObjectSize(ObjectIdx: FI->getIndex()) - 1;
1379
1380 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1381 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1382 ArgChains.push_back(Elt: SDValue(L, 1));
1383 }
1384 }
1385 }
1386 }
1387
1388 // Build a tokenfactor for all the chains.
1389 return DAG.getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ArgChains);
1390}
1391
1392SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1393 SmallVectorImpl<SDValue> &InVals,
1394 StringRef Reason) const {
1395 SDValue Callee = CLI.Callee;
1396 SelectionDAG &DAG = CLI.DAG;
1397
1398 const Function &Fn = DAG.getMachineFunction().getFunction();
1399
1400 StringRef FuncName("<unknown>");
1401
1402 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Val&: Callee))
1403 FuncName = G->getSymbol();
1404 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee))
1405 FuncName = G->getGlobal()->getName();
1406
1407 DAG.getContext()->diagnose(
1408 DI: DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1409
1410 if (!CLI.IsTailCall) {
1411 for (ISD::InputArg &Arg : CLI.Ins)
1412 InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
1413 }
1414
1415 // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
1416 if (getTargetMachine().getTargetTriple().getArch() == Triple::r600)
1417 return CLI.Chain;
1418
1419 SDValue Chain = DAG.getCALLSEQ_START(Chain: CLI.Chain, InSize: 0, OutSize: 0, DL: CLI.DL);
1420 return DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, /*InGlue=*/Glue: SDValue(), DL: CLI.DL);
1421}
1422
1423SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1424 SmallVectorImpl<SDValue> &InVals) const {
1425 return lowerUnhandledCall(CLI, InVals, Reason: "unsupported call to function ");
1426}
1427
1428SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1429 SelectionDAG &DAG) const {
1430 const Function &Fn = DAG.getMachineFunction().getFunction();
1431
1432 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
1433 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
1434 auto Ops = {DAG.getConstant(Val: 0, DL: SDLoc(), VT: Op.getValueType()), Op.getOperand(i: 0)};
1435 return DAG.getMergeValues(Ops, dl: SDLoc());
1436}
1437
1438SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1439 SelectionDAG &DAG) const {
1440 switch (Op.getOpcode()) {
1441 default:
1442 Op->print(OS&: errs(), G: &DAG);
1443 llvm_unreachable("Custom lowering code for this "
1444 "instruction is not implemented yet!");
1445 break;
1446 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1447 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1448 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1449 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1450 case ISD::SDIVREM:
1451 return LowerSDIVREM(Op, DAG);
1452 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1453 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1454 case ISD::FRINT: return LowerFRINT(Op, DAG);
1455 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1456 case ISD::FROUNDEVEN:
1457 return LowerFROUNDEVEN(Op, DAG);
1458 case ISD::FROUND: return LowerFROUND(Op, DAG);
1459 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1460 case ISD::FLOG2:
1461 return LowerFLOG2(Op, DAG);
1462 case ISD::FLOG:
1463 case ISD::FLOG10:
1464 return LowerFLOGCommon(Op, DAG);
1465 case ISD::FEXP:
1466 case ISD::FEXP10:
1467 return lowerFEXP(Op, DAG);
1468 case ISD::FEXP2:
1469 return lowerFEXP2(Op, DAG);
1470 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1471 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1472 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1473 case ISD::FP_TO_SINT:
1474 case ISD::FP_TO_UINT:
1475 return LowerFP_TO_INT(Op, DAG);
1476 case ISD::FP_TO_SINT_SAT:
1477 case ISD::FP_TO_UINT_SAT:
1478 return LowerFP_TO_INT_SAT(Op, DAG);
1479 case ISD::CTTZ:
1480 case ISD::CTTZ_ZERO_POISON:
1481 case ISD::CTLZ:
1482 case ISD::CTLZ_ZERO_POISON:
1483 return LowerCTLZ_CTTZ(Op, DAG);
1484 case ISD::CTLS:
1485 return LowerCTLS(Op, DAG);
1486 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1487 }
1488 return Op;
1489}
1490
1491void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1492 SmallVectorImpl<SDValue> &Results,
1493 SelectionDAG &DAG) const {
1494 switch (N->getOpcode()) {
1495 case ISD::SIGN_EXTEND_INREG:
1496 // Different parts of legalization seem to interpret which type of
1497 // sign_extend_inreg is the one to check for custom lowering. The extended
1498 // from type is what really matters, but some places check for custom
1499 // lowering of the result type. This results in trying to use
1500 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1501 // nothing here and let the illegal result integer be handled normally.
1502 return;
1503 case ISD::FLOG2:
1504 if (SDValue Lowered = LowerFLOG2(Op: SDValue(N, 0), DAG))
1505 Results.push_back(Elt: Lowered);
1506 return;
1507 case ISD::FLOG:
1508 case ISD::FLOG10:
1509 if (SDValue Lowered = LowerFLOGCommon(Op: SDValue(N, 0), DAG))
1510 Results.push_back(Elt: Lowered);
1511 return;
1512 case ISD::FEXP2:
1513 if (SDValue Lowered = lowerFEXP2(Op: SDValue(N, 0), DAG))
1514 Results.push_back(Elt: Lowered);
1515 return;
1516 case ISD::FEXP:
1517 case ISD::FEXP10:
1518 if (SDValue Lowered = lowerFEXP(Op: SDValue(N, 0), DAG))
1519 Results.push_back(Elt: Lowered);
1520 return;
1521 case ISD::CTLZ:
1522 case ISD::CTLZ_ZERO_POISON:
1523 if (auto Lowered = lowerCTLZResults(Op: SDValue(N, 0u), DAG))
1524 Results.push_back(Elt: Lowered);
1525 return;
1526 default:
1527 return;
1528 }
1529}
1530
1531SDValue AMDGPUTargetLowering::LowerBlockAddress(SDValue Op,
1532 SelectionDAG &DAG) const {
1533 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Val&: Op);
1534 SDLoc SL(Op);
1535 EVT VT = Op.getValueType();
1536 return DAG.getTargetBlockAddress(BA: BA->getBlockAddress(), VT, Offset: BA->getOffset(),
1537 TargetFlags: BA->getTargetFlags());
1538}
1539
1540SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
1541 SDValue Op,
1542 SelectionDAG &DAG) const {
1543
1544 const DataLayout &DL = DAG.getDataLayout();
1545 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Val&: Op);
1546 const GlobalValue *GV = G->getGlobal();
1547
1548 if (!MFI->isModuleEntryFunction()) {
1549 auto IsNamedBarrier = AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV));
1550 if (std::optional<uint32_t> Address =
1551 AMDGPUMachineFunctionInfo::getLDSAbsoluteAddress(GV: *GV)) {
1552 if (IsNamedBarrier) {
1553 unsigned BarCnt = cast<GlobalVariable>(Val: GV)->getGlobalSize(DL) / 16;
1554 MFI->recordNumNamedBarriers(GVAddr: Address.value(), BarCnt);
1555 }
1556 return DAG.getConstant(Val: *Address, DL: SDLoc(Op), VT: Op.getValueType());
1557 } else if (IsNamedBarrier) {
1558 llvm_unreachable("named barrier should have an assigned address");
1559 }
1560 }
1561
1562 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1563 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1564 if (!MFI->isModuleEntryFunction() &&
1565 GV->getName() != "llvm.amdgcn.module.lds" &&
1566 !AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV))) {
1567 SDLoc DL(Op);
1568 const Function &Fn = DAG.getMachineFunction().getFunction();
1569 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
1570 Fn, "local memory global used by non-kernel function",
1571 DL.getDebugLoc(), DS_Warning));
1572
1573 // We currently don't have a way to correctly allocate LDS objects that
1574 // aren't directly associated with a kernel. We do force inlining of
1575 // functions that use local objects. However, if these dead functions are
1576 // not eliminated, we don't want a compile time error. Just emit a warning
1577 // and a trap, since there should be no callable path here.
1578 SDValue Trap = DAG.getNode(Opcode: ISD::TRAP, DL, VT: MVT::Other, Operand: DAG.getEntryNode());
1579 SDValue OutputChain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other,
1580 N1: Trap, N2: DAG.getRoot());
1581 DAG.setRoot(OutputChain);
1582 return DAG.getPOISON(VT: Op.getValueType());
1583 }
1584
1585 // XXX: What does the value of G->getOffset() mean?
1586 assert(G->getOffset() == 0 &&
1587 "Do not know what to do with an non-zero offset");
1588
1589 // TODO: We could emit code to handle the initialization somewhere.
1590 // We ignore the initializer for now and legalize it to allow selection.
1591 // The initializer will anyway get errored out during assembly emission.
1592 unsigned Offset = MFI->allocateLDSGlobal(DL, GV: *cast<GlobalVariable>(Val: GV));
1593 return DAG.getConstant(Val: Offset, DL: SDLoc(Op), VT: Op.getValueType());
1594 }
1595 return SDValue();
1596}
1597
1598SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1599 SelectionDAG &DAG) const {
1600 SmallVector<SDValue, 8> Args;
1601 SDLoc SL(Op);
1602
1603 EVT VT = Op.getValueType();
1604 if (VT.getVectorElementType().getSizeInBits() < 32) {
1605 unsigned OpBitSize = Op.getOperand(i: 0).getValueType().getSizeInBits();
1606 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1607 unsigned NewNumElt = OpBitSize / 32;
1608 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1609 : EVT::getVectorVT(Context&: *DAG.getContext(),
1610 VT: MVT::i32, NumElements: NewNumElt);
1611 for (const SDUse &U : Op->ops()) {
1612 SDValue In = U.get();
1613 SDValue NewIn = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewEltVT, Operand: In);
1614 if (NewNumElt > 1)
1615 DAG.ExtractVectorElements(Op: NewIn, Args);
1616 else
1617 Args.push_back(Elt: NewIn);
1618 }
1619
1620 EVT NewVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
1621 NumElements: NewNumElt * Op.getNumOperands());
1622 SDValue BV = DAG.getBuildVector(VT: NewVT, DL: SL, Ops: Args);
1623 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: BV);
1624 }
1625 }
1626
1627 for (const SDUse &U : Op->ops())
1628 DAG.ExtractVectorElements(Op: U.get(), Args);
1629
1630 return DAG.getBuildVector(VT: Op.getValueType(), DL: SL, Ops: Args);
1631}
1632
1633SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1634 SelectionDAG &DAG) const {
1635 SDLoc SL(Op);
1636 SmallVector<SDValue, 8> Args;
1637 unsigned Start = Op.getConstantOperandVal(i: 1);
1638 EVT VT = Op.getValueType();
1639 EVT SrcVT = Op.getOperand(i: 0).getValueType();
1640
1641 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1642 unsigned NumElt = VT.getVectorNumElements();
1643 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1644 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1645
1646 // Extract 32-bit registers at a time.
1647 EVT NewSrcVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: NumSrcElt / 2);
1648 EVT NewVT = NumElt == 2
1649 ? MVT::i32
1650 : EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: NumElt / 2);
1651 SDValue Tmp = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewSrcVT, Operand: Op.getOperand(i: 0));
1652
1653 DAG.ExtractVectorElements(Op: Tmp, Args, Start: Start / 2, Count: NumElt / 2);
1654 if (NumElt == 2)
1655 Tmp = Args[0];
1656 else
1657 Tmp = DAG.getBuildVector(VT: NewVT, DL: SL, Ops: Args);
1658
1659 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Tmp);
1660 }
1661
1662 DAG.ExtractVectorElements(Op: Op.getOperand(i: 0), Args, Start,
1663 Count: VT.getVectorNumElements());
1664
1665 return DAG.getBuildVector(VT: Op.getValueType(), DL: SL, Ops: Args);
1666}
1667
1668// TODO: Handle fabs too
1669static SDValue peekFNeg(SDValue Val) {
1670 if (Val.getOpcode() == ISD::FNEG)
1671 return Val.getOperand(i: 0);
1672
1673 return Val;
1674}
1675
1676static SDValue peekFPSignOps(SDValue Val) {
1677 if (Val.getOpcode() == ISD::FNEG)
1678 Val = Val.getOperand(i: 0);
1679 if (Val.getOpcode() == ISD::FABS)
1680 Val = Val.getOperand(i: 0);
1681 if (Val.getOpcode() == ISD::FCOPYSIGN)
1682 Val = Val.getOperand(i: 0);
1683 return Val;
1684}
1685
1686SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
1687 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1688 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1689 SelectionDAG &DAG = DCI.DAG;
1690 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val&: CC)->get();
1691 switch (CCOpcode) {
1692 case ISD::SETOEQ:
1693 case ISD::SETONE:
1694 case ISD::SETUNE:
1695 case ISD::SETNE:
1696 case ISD::SETUEQ:
1697 case ISD::SETEQ:
1698 case ISD::SETFALSE:
1699 case ISD::SETFALSE2:
1700 case ISD::SETTRUE:
1701 case ISD::SETTRUE2:
1702 case ISD::SETUO:
1703 case ISD::SETO:
1704 break;
1705 case ISD::SETULE:
1706 case ISD::SETULT: {
1707 if (LHS == True)
1708 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: RHS, N2: LHS);
1709 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: LHS, N2: RHS);
1710 }
1711 case ISD::SETOLE:
1712 case ISD::SETOLT:
1713 case ISD::SETLE:
1714 case ISD::SETLT: {
1715 // Ordered. Assume ordered for undefined.
1716
1717 // Only do this after legalization to avoid interfering with other combines
1718 // which might occur.
1719 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1720 !DCI.isCalledByLegalizer())
1721 return SDValue();
1722
1723 // We need to permute the operands to get the correct NaN behavior. The
1724 // selected operand is the second one based on the failing compare with NaN,
1725 // so permute it based on the compare type the hardware uses.
1726 if (LHS == True)
1727 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: LHS, N2: RHS);
1728 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: RHS, N2: LHS);
1729 }
1730 case ISD::SETUGE:
1731 case ISD::SETUGT: {
1732 if (LHS == True)
1733 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: RHS, N2: LHS);
1734 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: LHS, N2: RHS);
1735 }
1736 case ISD::SETGT:
1737 case ISD::SETGE:
1738 case ISD::SETOGE:
1739 case ISD::SETOGT: {
1740 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1741 !DCI.isCalledByLegalizer())
1742 return SDValue();
1743
1744 if (LHS == True)
1745 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: LHS, N2: RHS);
1746 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: RHS, N2: LHS);
1747 }
1748 case ISD::SETCC_INVALID:
1749 llvm_unreachable("Invalid setcc condcode!");
1750 }
1751 return SDValue();
1752}
1753
1754/// Generate Min/Max node
1755SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1756 SDValue LHS, SDValue RHS,
1757 SDValue True, SDValue False,
1758 SDValue CC,
1759 DAGCombinerInfo &DCI) const {
1760 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1761 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1762
1763 SelectionDAG &DAG = DCI.DAG;
1764
1765 // If we can't directly match this, try to see if we can fold an fneg to
1766 // match.
1767
1768 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
1769 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(Val&: False);
1770 SDValue NegTrue = peekFNeg(Val: True);
1771
1772 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1773 // fmin/fmax.
1774 //
1775 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1776 // -> fneg (fmin_legacy lhs, K)
1777 //
1778 // TODO: Use getNegatedExpression
1779 if (LHS == NegTrue && CFalse && CRHS) {
1780 APFloat NegRHS = neg(X: CRHS->getValueAPF());
1781 if (NegRHS == CFalse->getValueAPF()) {
1782 SDValue Combined =
1783 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True: NegTrue, False, CC, DCI);
1784 if (Combined)
1785 return DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: Combined);
1786 return SDValue();
1787 }
1788 }
1789
1790 return SDValue();
1791}
1792
1793std::pair<SDValue, SDValue>
1794AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1795 SDLoc SL(Op);
1796
1797 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1798
1799 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
1800 const SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
1801
1802 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: Zero);
1803 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: One);
1804
1805 return std::pair(Lo, Hi);
1806}
1807
1808SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1809 SDLoc SL(Op);
1810
1811 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1812 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
1813 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: Zero);
1814}
1815
1816SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1817 SDLoc SL(Op);
1818
1819 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1820 const SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
1821 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: One);
1822}
1823
1824// Split a vector type into two parts. The first part is a power of two vector.
1825// The second part is whatever is left over, and is a scalar if it would
1826// otherwise be a 1-vector.
1827std::pair<EVT, EVT>
1828AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1829 EVT LoVT, HiVT;
1830 EVT EltVT = VT.getVectorElementType();
1831 unsigned NumElts = VT.getVectorNumElements();
1832 unsigned LoNumElts = PowerOf2Ceil(A: (NumElts + 1) / 2);
1833 LoVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: LoNumElts);
1834 HiVT = NumElts - LoNumElts == 1
1835 ? EltVT
1836 : EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NumElts - LoNumElts);
1837 return std::pair(LoVT, HiVT);
1838}
1839
1840// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1841// scalar.
1842std::pair<SDValue, SDValue>
1843AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1844 const EVT &LoVT, const EVT &HiVT,
1845 SelectionDAG &DAG) const {
1846 EVT VT = N.getValueType();
1847 assert(LoVT.getVectorNumElements() +
1848 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1849 VT.getVectorNumElements() &&
1850 "More vector elements requested than available!");
1851 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: LoVT, N1: N,
1852 N2: DAG.getVectorIdxConstant(Val: 0, DL));
1853
1854 unsigned LoNumElts = LoVT.getVectorNumElements();
1855
1856 if (HiVT.isVector()) {
1857 unsigned HiNumElts = HiVT.getVectorNumElements();
1858 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1859 // Avoid creating an extract_subvector with an index that isn't a multiple
1860 // of the result type.
1861 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HiVT, N1: N,
1862 N2: DAG.getConstant(Val: LoNumElts, DL, VT: MVT::i32));
1863 return {Lo, Hi};
1864 }
1865
1866 SmallVector<SDValue, 8> Elts;
1867 DAG.ExtractVectorElements(Op: N, Args&: Elts, /*Start=*/LoNumElts,
1868 /*Count=*/HiNumElts);
1869 SDValue Hi = DAG.getBuildVector(VT: HiVT, DL, Ops: Elts);
1870 return {Lo, Hi};
1871 }
1872
1873 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: HiVT, N1: N,
1874 N2: DAG.getVectorIdxConstant(Val: LoNumElts, DL));
1875 return {Lo, Hi};
1876}
1877
1878SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1879 SelectionDAG &DAG) const {
1880 LoadSDNode *Load = cast<LoadSDNode>(Val: Op);
1881 EVT VT = Op.getValueType();
1882 SDLoc SL(Op);
1883
1884
1885 // If this is a 2 element vector, we really want to scalarize and not create
1886 // weird 1 element vectors.
1887 if (VT.getVectorNumElements() == 2) {
1888 SDValue Ops[2];
1889 std::tie(args&: Ops[0], args&: Ops[1]) = scalarizeVectorLoad(LD: Load, DAG);
1890 return DAG.getMergeValues(Ops, dl: SL);
1891 }
1892
1893 SDValue BasePtr = Load->getBasePtr();
1894 EVT MemVT = Load->getMemoryVT();
1895
1896 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1897
1898 EVT LoVT, HiVT;
1899 EVT LoMemVT, HiMemVT;
1900 SDValue Lo, Hi;
1901
1902 std::tie(args&: LoVT, args&: HiVT) = getSplitDestVTs(VT, DAG);
1903 std::tie(args&: LoMemVT, args&: HiMemVT) = getSplitDestVTs(VT: MemVT, DAG);
1904 std::tie(args&: Lo, args&: Hi) = splitVector(N: Op, DL: SL, LoVT, HiVT, DAG);
1905
1906 unsigned Size = LoMemVT.getStoreSize();
1907 Align BaseAlign = Load->getAlign();
1908 Align HiAlign = commonAlignment(A: BaseAlign, Offset: Size);
1909
1910 SDValue LoLoad = DAG.getExtLoad(
1911 ExtType: Load->getExtensionType(), dl: SL, VT: LoVT, Chain: Load->getChain(), Ptr: BasePtr, PtrInfo: SrcValue,
1912 MemVT: LoMemVT, Alignment: BaseAlign, MMOFlags: Load->getMemOperand()->getFlags(), AAInfo: Load->getAAInfo());
1913 SDValue HiPtr = DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Size));
1914 SDValue HiLoad = DAG.getExtLoad(
1915 ExtType: Load->getExtensionType(), dl: SL, VT: HiVT, Chain: Load->getChain(), Ptr: HiPtr,
1916 PtrInfo: SrcValue.getWithOffset(O: LoMemVT.getStoreSize()), MemVT: HiMemVT, Alignment: HiAlign,
1917 MMOFlags: Load->getMemOperand()->getFlags(), AAInfo: Load->getAAInfo());
1918
1919 SDValue Join;
1920 if (LoVT == HiVT) {
1921 // This is the case that the vector is power of two so was evenly split.
1922 Join = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT, N1: LoLoad, N2: HiLoad);
1923 } else {
1924 Join = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: SL, VT, N1: DAG.getPOISON(VT), N2: LoLoad,
1925 N3: DAG.getVectorIdxConstant(Val: 0, DL: SL));
1926 Join = DAG.getNode(
1927 Opcode: HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, DL: SL,
1928 VT, N1: Join, N2: HiLoad,
1929 N3: DAG.getVectorIdxConstant(Val: LoVT.getVectorNumElements(), DL: SL));
1930 }
1931
1932 SDValue Ops[] = {Join, DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other,
1933 N1: LoLoad.getValue(R: 1), N2: HiLoad.getValue(R: 1))};
1934
1935 return DAG.getMergeValues(Ops, dl: SL);
1936}
1937
1938SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1939 SelectionDAG &DAG) const {
1940 LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
1941 EVT VT = Op.getValueType();
1942 SDValue BasePtr = Load->getBasePtr();
1943 EVT MemVT = Load->getMemoryVT();
1944 SDLoc SL(Op);
1945 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1946 Align BaseAlign = Load->getAlign();
1947 unsigned NumElements = MemVT.getVectorNumElements();
1948
1949 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1950 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1951 if (NumElements != 3 ||
1952 (BaseAlign < Align(8) &&
1953 !SrcValue.isDereferenceable(Size: 16, C&: *DAG.getContext(), DL: DAG.getDataLayout())))
1954 return SplitVectorLoad(Op, DAG);
1955
1956 assert(NumElements == 3);
1957
1958 EVT WideVT =
1959 EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: 4);
1960 EVT WideMemVT =
1961 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(), NumElements: 4);
1962 SDValue WideLoad = DAG.getExtLoad(
1963 ExtType: Load->getExtensionType(), dl: SL, VT: WideVT, Chain: Load->getChain(), Ptr: BasePtr, PtrInfo: SrcValue,
1964 MemVT: WideMemVT, Alignment: BaseAlign, MMOFlags: Load->getMemOperand()->getFlags());
1965 return DAG.getMergeValues(
1966 Ops: {DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT, N1: WideLoad,
1967 N2: DAG.getVectorIdxConstant(Val: 0, DL: SL)),
1968 WideLoad.getValue(R: 1)},
1969 dl: SL);
1970}
1971
1972SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1973 SelectionDAG &DAG) const {
1974 StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
1975 SDValue Val = Store->getValue();
1976 EVT VT = Val.getValueType();
1977
1978 // If this is a 2 element vector, we really want to scalarize and not create
1979 // weird 1 element vectors.
1980 if (VT.getVectorNumElements() == 2)
1981 return scalarizeVectorStore(ST: Store, DAG);
1982
1983 EVT MemVT = Store->getMemoryVT();
1984 SDValue Chain = Store->getChain();
1985 SDValue BasePtr = Store->getBasePtr();
1986 SDLoc SL(Op);
1987
1988 EVT LoVT, HiVT;
1989 EVT LoMemVT, HiMemVT;
1990 SDValue Lo, Hi;
1991
1992 std::tie(args&: LoVT, args&: HiVT) = getSplitDestVTs(VT, DAG);
1993 std::tie(args&: LoMemVT, args&: HiMemVT) = getSplitDestVTs(VT: MemVT, DAG);
1994 std::tie(args&: Lo, args&: Hi) = splitVector(N: Val, DL: SL, LoVT, HiVT, DAG);
1995
1996 SDValue HiPtr = DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: LoMemVT.getStoreSize());
1997
1998 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1999 Align BaseAlign = Store->getAlign();
2000 unsigned Size = LoMemVT.getStoreSize();
2001 Align HiAlign = commonAlignment(A: BaseAlign, Offset: Size);
2002
2003 SDValue LoStore =
2004 DAG.getTruncStore(Chain, dl: SL, Val: Lo, Ptr: BasePtr, PtrInfo: SrcValue, SVT: LoMemVT, Alignment: BaseAlign,
2005 MMOFlags: Store->getMemOperand()->getFlags(), AAInfo: Store->getAAInfo());
2006 SDValue HiStore = DAG.getTruncStore(
2007 Chain, dl: SL, Val: Hi, Ptr: HiPtr, PtrInfo: SrcValue.getWithOffset(O: Size), SVT: HiMemVT, Alignment: HiAlign,
2008 MMOFlags: Store->getMemOperand()->getFlags(), AAInfo: Store->getAAInfo());
2009
2010 return DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: LoStore, N2: HiStore);
2011}
2012
2013// This is a shortcut for integer division because we have fast i32<->f32
2014// conversions, and fast f32 reciprocal instructions. The fractional part of a
2015// float is enough to accurately represent up to a 24-bit integer.
2016SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
2017 bool Sign) const {
2018 SDLoc DL(Op);
2019 EVT VT = Op.getValueType();
2020 assert(VT == MVT::i32 && "LowerDIVREM24 expects an i32");
2021
2022 SDValue LHS = Op.getOperand(i: 0);
2023 SDValue RHS = Op.getOperand(i: 1);
2024 MVT IntVT = MVT::i32;
2025 MVT FltVT = MVT::f32;
2026
2027 unsigned LHSSignBits;
2028 unsigned RHSSignBits;
2029 if (Sign) {
2030 LHSSignBits = DAG.ComputeNumSignBits(Op: LHS);
2031 RHSSignBits = DAG.ComputeNumSignBits(Op: RHS);
2032 if (LHSSignBits < 9 || RHSSignBits < 9)
2033 return SDValue();
2034 } else {
2035 KnownBits LHSKnown = DAG.computeKnownBits(Op: LHS);
2036 KnownBits RHSKnown = DAG.computeKnownBits(Op: RHS);
2037 APInt U24Max = APInt::getLowBitsSet(numBits: 32, loBitsSet: 24);
2038 if (LHSKnown.getMaxValue().ugt(RHS: U24Max) ||
2039 RHSKnown.getMaxValue().ugt(RHS: U24Max))
2040 return SDValue();
2041 LHSSignBits = LHSKnown.countMinLeadingZeros();
2042 RHSSignBits = RHSKnown.countMinLeadingZeros();
2043 }
2044
2045 unsigned BitSize = VT.getSizeInBits();
2046 unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
2047 unsigned DivBits = BitSize - SignBits;
2048 if (Sign)
2049 ++DivBits;
2050
2051 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2052 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
2053
2054 SDValue jq = DAG.getConstant(Val: 1, DL, VT: IntVT);
2055
2056 if (Sign) {
2057 // char|short jq = ia ^ ib;
2058 jq = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHS, N2: RHS);
2059
2060 // jq = jq >> (bitsize - 2)
2061 jq = DAG.getNode(Opcode: ISD::SRA, DL, VT, N1: jq,
2062 N2: DAG.getConstant(Val: BitSize - 2, DL, VT));
2063
2064 // jq = jq | 0x1
2065 jq = DAG.getNode(Opcode: ISD::OR, DL, VT, N1: jq, N2: DAG.getConstant(Val: 1, DL, VT));
2066 }
2067
2068 // int ia = (int)LHS;
2069 SDValue ia = LHS;
2070
2071 // int ib, (int)RHS;
2072 SDValue ib = RHS;
2073
2074 // float fa = (float)ia;
2075 SDValue fa = DAG.getNode(Opcode: ToFp, DL, VT: FltVT, Operand: ia);
2076
2077 // float fb = (float)ib;
2078 SDValue fb = DAG.getNode(Opcode: ToFp, DL, VT: FltVT, Operand: ib);
2079
2080 SDValue fq = DAG.getNode(Opcode: ISD::FMUL, DL, VT: FltVT,
2081 N1: fa, N2: DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT: FltVT, Operand: fb));
2082
2083 // fq = trunc(fq);
2084 fq = DAG.getNode(Opcode: ISD::FTRUNC, DL, VT: FltVT, Operand: fq);
2085
2086 // float fqneg = -fq;
2087 SDValue fqneg = DAG.getNode(Opcode: ISD::FNEG, DL, VT: FltVT, Operand: fq);
2088
2089 MachineFunction &MF = DAG.getMachineFunction();
2090
2091 bool UseFmadFtz = false;
2092 if (Subtarget->isGCN()) {
2093 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2094 UseFmadFtz =
2095 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
2096 }
2097
2098 // float fr = mad(fqneg, fb, fa);
2099 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2100 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2101 : (unsigned)ISD::FMAD;
2102 SDValue fr = DAG.getNode(Opcode: OpCode, DL, VT: FltVT, N1: fqneg, N2: fb, N3: fa);
2103
2104 // int iq = (int)fq;
2105 SDValue iq = DAG.getNode(Opcode: ToInt, DL, VT: IntVT, Operand: fq);
2106
2107 // fr = fabs(fr);
2108 fr = DAG.getNode(Opcode: ISD::FABS, DL, VT: FltVT, Operand: fr);
2109
2110 // fb = fabs(fb);
2111 fb = DAG.getNode(Opcode: ISD::FABS, DL, VT: FltVT, Operand: fb);
2112
2113 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2114
2115 // int cv = fr >= fb;
2116 SDValue cv = DAG.getSetCC(DL, VT: SetCCVT, LHS: fr, RHS: fb, Cond: ISD::SETOGE);
2117
2118 // jq = (cv ? jq : 0);
2119 jq = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: cv, N2: jq, N3: DAG.getConstant(Val: 0, DL, VT));
2120
2121 // dst = iq + jq;
2122 SDValue Div = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: iq, N2: jq);
2123
2124 // Rem needs compensation, it's easier to recompute it
2125 SDValue Rem = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Div, N2: RHS);
2126 Rem = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: LHS, N2: Rem);
2127
2128 // Truncate to number of bits this divide really is.
2129 if (Sign) {
2130 SDValue InRegSize
2131 = DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: DivBits));
2132 Div = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Div, N2: InRegSize);
2133 Rem = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Rem, N2: InRegSize);
2134 } else {
2135 SDValue TruncMask = DAG.getConstant(Val: (UINT64_C(1) << DivBits) - 1, DL, VT);
2136 Div = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Div, N2: TruncMask);
2137 Rem = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Rem, N2: TruncMask);
2138 }
2139
2140 return DAG.getMergeValues(Ops: { Div, Rem }, dl: DL);
2141}
2142
2143void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2144 SelectionDAG &DAG,
2145 SmallVectorImpl<SDValue> &Results) const {
2146 SDLoc DL(Op);
2147 EVT VT = Op.getValueType();
2148
2149 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2150
2151 EVT HalfVT = VT.getHalfSizedIntegerVT(Context&: *DAG.getContext());
2152
2153 SDValue One = DAG.getConstant(Val: 1, DL, VT: HalfVT);
2154 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: HalfVT);
2155
2156 //HiLo split
2157 SDValue LHS_Lo, LHS_Hi;
2158 SDValue LHS = Op.getOperand(i: 0);
2159 std::tie(args&: LHS_Lo, args&: LHS_Hi) = DAG.SplitScalar(N: LHS, DL, LoVT: HalfVT, HiVT: HalfVT);
2160
2161 SDValue RHS_Lo, RHS_Hi;
2162 SDValue RHS = Op.getOperand(i: 1);
2163 std::tie(args&: RHS_Lo, args&: RHS_Hi) = DAG.SplitScalar(N: RHS, DL, LoVT: HalfVT, HiVT: HalfVT);
2164
2165 if (DAG.MaskedValueIsZero(Op: RHS, Mask: APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32)) &&
2166 DAG.MaskedValueIsZero(Op: LHS, Mask: APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32))) {
2167
2168 SDValue Res = DAG.getNode(Opcode: ISD::UDIVREM, DL, VTList: DAG.getVTList(VT1: HalfVT, VT2: HalfVT),
2169 N1: LHS_Lo, N2: RHS_Lo);
2170
2171 SDValue DIV = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Res.getValue(R: 0), Zero});
2172 SDValue REM = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Res.getValue(R: 1), Zero});
2173
2174 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: DIV));
2175 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: REM));
2176 return;
2177 }
2178
2179 if (isTypeLegal(VT: MVT::i64)) {
2180 // The algorithm here is based on ideas from "Software Integer Division",
2181 // Tom Rodeheffer, August 2008.
2182
2183 MachineFunction &MF = DAG.getMachineFunction();
2184 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2185
2186 // Compute denominator reciprocal.
2187 unsigned FMAD =
2188 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2189 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2190 ? (unsigned)ISD::FMAD
2191 : (unsigned)AMDGPUISD::FMAD_FTZ;
2192
2193 SDValue Cvt_Lo = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: MVT::f32, Operand: RHS_Lo);
2194 SDValue Cvt_Hi = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: MVT::f32, Operand: RHS_Hi);
2195 SDValue Mad1 = DAG.getNode(Opcode: FMAD, DL, VT: MVT::f32, N1: Cvt_Hi,
2196 N2: DAG.getConstantFP(Val: APInt(32, 0x4f800000).bitsToFloat(), DL, VT: MVT::f32),
2197 N3: Cvt_Lo);
2198 SDValue Rcp = DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT: MVT::f32, Operand: Mad1);
2199 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f32, N1: Rcp,
2200 N2: DAG.getConstantFP(Val: APInt(32, 0x5f7ffffc).bitsToFloat(), DL, VT: MVT::f32));
2201 SDValue Mul2 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f32, N1: Mul1,
2202 N2: DAG.getConstantFP(Val: APInt(32, 0x2f800000).bitsToFloat(), DL, VT: MVT::f32));
2203 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL, VT: MVT::f32, Operand: Mul2);
2204 SDValue Mad2 = DAG.getNode(Opcode: FMAD, DL, VT: MVT::f32, N1: Trunc,
2205 N2: DAG.getConstantFP(Val: APInt(32, 0xcf800000).bitsToFloat(), DL, VT: MVT::f32),
2206 N3: Mul1);
2207 SDValue Rcp_Lo = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL, VT: HalfVT, Operand: Mad2);
2208 SDValue Rcp_Hi = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL, VT: HalfVT, Operand: Trunc);
2209 SDValue Rcp64 = DAG.getBitcast(VT,
2210 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Rcp_Lo, Rcp_Hi}));
2211
2212 SDValue Zero64 = DAG.getConstant(Val: 0, DL, VT);
2213 SDValue One64 = DAG.getConstant(Val: 1, DL, VT);
2214 SDValue Zero1 = DAG.getConstant(Val: 0, DL, VT: MVT::i1);
2215 SDVTList HalfCarryVT = DAG.getVTList(VT1: HalfVT, VT2: MVT::i1);
2216
2217 // First round of UNR (Unsigned integer Newton-Raphson).
2218 SDValue Neg_RHS = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero64, N2: RHS);
2219 SDValue Mullo1 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Neg_RHS, N2: Rcp64);
2220 SDValue Mulhi1 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Rcp64, N2: Mullo1);
2221 SDValue Mulhi1_Lo, Mulhi1_Hi;
2222 std::tie(args&: Mulhi1_Lo, args&: Mulhi1_Hi) =
2223 DAG.SplitScalar(N: Mulhi1, DL, LoVT: HalfVT, HiVT: HalfVT);
2224 SDValue Add1_Lo = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Rcp_Lo,
2225 N2: Mulhi1_Lo, N3: Zero1);
2226 SDValue Add1_Hi = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Rcp_Hi,
2227 N2: Mulhi1_Hi, N3: Add1_Lo.getValue(R: 1));
2228 SDValue Add1 = DAG.getBitcast(VT,
2229 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Add1_Lo, Add1_Hi}));
2230
2231 // Second round of UNR.
2232 SDValue Mullo2 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Neg_RHS, N2: Add1);
2233 SDValue Mulhi2 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Add1, N2: Mullo2);
2234 SDValue Mulhi2_Lo, Mulhi2_Hi;
2235 std::tie(args&: Mulhi2_Lo, args&: Mulhi2_Hi) =
2236 DAG.SplitScalar(N: Mulhi2, DL, LoVT: HalfVT, HiVT: HalfVT);
2237 SDValue Add2_Lo = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Add1_Lo,
2238 N2: Mulhi2_Lo, N3: Zero1);
2239 SDValue Add2_Hi = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Add1_Hi,
2240 N2: Mulhi2_Hi, N3: Add2_Lo.getValue(R: 1));
2241 SDValue Add2 = DAG.getBitcast(VT,
2242 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Add2_Lo, Add2_Hi}));
2243
2244 SDValue Mulhi3 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: LHS, N2: Add2);
2245
2246 SDValue Mul3 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: RHS, N2: Mulhi3);
2247
2248 SDValue Mul3_Lo, Mul3_Hi;
2249 std::tie(args&: Mul3_Lo, args&: Mul3_Hi) = DAG.SplitScalar(N: Mul3, DL, LoVT: HalfVT, HiVT: HalfVT);
2250 SDValue Sub1_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: LHS_Lo,
2251 N2: Mul3_Lo, N3: Zero1);
2252 SDValue Sub1_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: LHS_Hi,
2253 N2: Mul3_Hi, N3: Sub1_Lo.getValue(R: 1));
2254 SDValue Sub1_Mi = DAG.getNode(Opcode: ISD::SUB, DL, VT: HalfVT, N1: LHS_Hi, N2: Mul3_Hi);
2255 SDValue Sub1 = DAG.getBitcast(VT,
2256 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub1_Lo, Sub1_Hi}));
2257
2258 SDValue MinusOne = DAG.getConstant(Val: 0xffffffffu, DL, VT: HalfVT);
2259 SDValue C1 = DAG.getSelectCC(DL, LHS: Sub1_Hi, RHS: RHS_Hi, True: MinusOne, False: Zero,
2260 Cond: ISD::SETUGE);
2261 SDValue C2 = DAG.getSelectCC(DL, LHS: Sub1_Lo, RHS: RHS_Lo, True: MinusOne, False: Zero,
2262 Cond: ISD::SETUGE);
2263 SDValue C3 = DAG.getSelectCC(DL, LHS: Sub1_Hi, RHS: RHS_Hi, True: C2, False: C1, Cond: ISD::SETEQ);
2264
2265 // TODO: Here and below portions of the code can be enclosed into if/endif.
2266 // Currently control flow is unconditional and we have 4 selects after
2267 // potential endif to substitute PHIs.
2268
2269 // if C3 != 0 ...
2270 SDValue Sub2_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub1_Lo,
2271 N2: RHS_Lo, N3: Zero1);
2272 SDValue Sub2_Mi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub1_Mi,
2273 N2: RHS_Hi, N3: Sub1_Lo.getValue(R: 1));
2274 SDValue Sub2_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Mi,
2275 N2: Zero, N3: Sub2_Lo.getValue(R: 1));
2276 SDValue Sub2 = DAG.getBitcast(VT,
2277 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub2_Lo, Sub2_Hi}));
2278
2279 SDValue Add3 = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Mulhi3, N2: One64);
2280
2281 SDValue C4 = DAG.getSelectCC(DL, LHS: Sub2_Hi, RHS: RHS_Hi, True: MinusOne, False: Zero,
2282 Cond: ISD::SETUGE);
2283 SDValue C5 = DAG.getSelectCC(DL, LHS: Sub2_Lo, RHS: RHS_Lo, True: MinusOne, False: Zero,
2284 Cond: ISD::SETUGE);
2285 SDValue C6 = DAG.getSelectCC(DL, LHS: Sub2_Hi, RHS: RHS_Hi, True: C5, False: C4, Cond: ISD::SETEQ);
2286
2287 // if (C6 != 0)
2288 SDValue Add4 = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Add3, N2: One64);
2289
2290 SDValue Sub3_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Lo,
2291 N2: RHS_Lo, N3: Zero1);
2292 SDValue Sub3_Mi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Mi,
2293 N2: RHS_Hi, N3: Sub2_Lo.getValue(R: 1));
2294 SDValue Sub3_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub3_Mi,
2295 N2: Zero, N3: Sub3_Lo.getValue(R: 1));
2296 SDValue Sub3 = DAG.getBitcast(VT,
2297 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub3_Lo, Sub3_Hi}));
2298
2299 // endif C6
2300 // endif C3
2301
2302 SDValue Sel1 = DAG.getSelectCC(DL, LHS: C6, RHS: Zero, True: Add4, False: Add3, Cond: ISD::SETNE);
2303 SDValue Div = DAG.getSelectCC(DL, LHS: C3, RHS: Zero, True: Sel1, False: Mulhi3, Cond: ISD::SETNE);
2304
2305 SDValue Sel2 = DAG.getSelectCC(DL, LHS: C6, RHS: Zero, True: Sub3, False: Sub2, Cond: ISD::SETNE);
2306 SDValue Rem = DAG.getSelectCC(DL, LHS: C3, RHS: Zero, True: Sel2, False: Sub1, Cond: ISD::SETNE);
2307
2308 Results.push_back(Elt: Div);
2309 Results.push_back(Elt: Rem);
2310
2311 return;
2312 }
2313
  // r600 expansion.
2315 // Get Speculative values
2316 SDValue DIV_Part = DAG.getNode(Opcode: ISD::UDIV, DL, VT: HalfVT, N1: LHS_Hi, N2: RHS_Lo);
2317 SDValue REM_Part = DAG.getNode(Opcode: ISD::UREM, DL, VT: HalfVT, N1: LHS_Hi, N2: RHS_Lo);
2318
2319 SDValue REM_Lo = DAG.getSelectCC(DL, LHS: RHS_Hi, RHS: Zero, True: REM_Part, False: LHS_Hi, Cond: ISD::SETEQ);
2320 SDValue REM = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {REM_Lo, Zero});
2321 REM = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: REM);
2322
2323 SDValue DIV_Hi = DAG.getSelectCC(DL, LHS: RHS_Hi, RHS: Zero, True: DIV_Part, False: Zero, Cond: ISD::SETEQ);
2324 SDValue DIV_Lo = Zero;
2325
2326 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2327
2328 for (unsigned i = 0; i < halfBitWidth; ++i) {
2329 const unsigned bitPos = halfBitWidth - i - 1;
2330 SDValue POS = DAG.getConstant(Val: bitPos, DL, VT: HalfVT);
2331 // Get value of high bit
2332 SDValue HBit = DAG.getNode(Opcode: ISD::SRL, DL, VT: HalfVT, N1: LHS_Lo, N2: POS);
2333 HBit = DAG.getNode(Opcode: ISD::AND, DL, VT: HalfVT, N1: HBit, N2: One);
2334 HBit = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: HBit);
2335
2336 // Shift
2337 REM = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: REM, N2: DAG.getConstant(Val: 1, DL, VT));
2338 // Add LHS high bit
2339 REM = DAG.getNode(Opcode: ISD::OR, DL, VT, N1: REM, N2: HBit);
2340
2341 SDValue BIT = DAG.getConstant(Val: 1ULL << bitPos, DL, VT: HalfVT);
2342 SDValue realBIT = DAG.getSelectCC(DL, LHS: REM, RHS, True: BIT, False: Zero, Cond: ISD::SETUGE);
2343
2344 DIV_Lo = DAG.getNode(Opcode: ISD::OR, DL, VT: HalfVT, N1: DIV_Lo, N2: realBIT);
2345
2346 // Update REM
2347 SDValue REM_sub = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: REM, N2: RHS);
2348 REM = DAG.getSelectCC(DL, LHS: REM, RHS, True: REM_sub, False: REM, Cond: ISD::SETUGE);
2349 }
2350
2351 SDValue DIV = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {DIV_Lo, DIV_Hi});
2352 DIV = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: DIV);
2353 Results.push_back(Elt: DIV);
2354 Results.push_back(Elt: REM);
2355}
2356
2357SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2358 SelectionDAG &DAG) const {
2359 SDLoc DL(Op);
2360 EVT VT = Op.getValueType();
2361
2362 if (VT == MVT::i64) {
2363 SmallVector<SDValue, 2> Results;
2364 LowerUDIVREM64(Op, DAG, Results);
2365 return DAG.getMergeValues(Ops: Results, dl: DL);
2366 }
2367
2368 if (VT == MVT::i32) {
2369 if (SDValue Res = LowerDIVREM24(Op, DAG, Sign: false))
2370 return Res;
2371 }
2372
2373 SDValue X = Op.getOperand(i: 0);
2374 SDValue Y = Op.getOperand(i: 1);
2375
2376 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2377 // algorithm used here.
2378
2379 // Initial estimate of inv(y).
2380 SDValue Z = DAG.getNode(Opcode: AMDGPUISD::URECIP, DL, VT, Operand: Y);
2381
2382 // One round of UNR.
2383 SDValue NegY = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), N2: Y);
2384 SDValue NegYZ = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: NegY, N2: Z);
2385 Z = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Z,
2386 N2: DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Z, N2: NegYZ));
2387
2388 // Quotient/remainder estimate.
2389 SDValue Q = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: X, N2: Z);
2390 SDValue R =
2391 DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: X, N2: DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Q, N2: Y));
2392
2393 // First quotient/remainder refinement.
2394 EVT CCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2395 SDValue One = DAG.getConstant(Val: 1, DL, VT);
2396 SDValue Cond = DAG.getSetCC(DL, VT: CCVT, LHS: R, RHS: Y, Cond: ISD::SETUGE);
2397 Q = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2398 N2: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Q, N2: One), N3: Q);
2399 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2400 N2: DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: R, N2: Y), N3: R);
2401
2402 // Second quotient/remainder refinement.
2403 Cond = DAG.getSetCC(DL, VT: CCVT, LHS: R, RHS: Y, Cond: ISD::SETUGE);
2404 Q = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2405 N2: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Q, N2: One), N3: Q);
2406 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2407 N2: DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: R, N2: Y), N3: R);
2408
2409 return DAG.getMergeValues(Ops: {Q, R}, dl: DL);
2410}
2411
2412SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2413 SelectionDAG &DAG) const {
2414 SDLoc DL(Op);
2415 EVT VT = Op.getValueType();
2416
2417 SDValue LHS = Op.getOperand(i: 0);
2418 SDValue RHS = Op.getOperand(i: 1);
2419
2420 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
2421 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2422
2423 if (VT == MVT::i32) {
2424 if (SDValue Res = LowerDIVREM24(Op, DAG, Sign: true))
2425 return Res;
2426 }
2427
2428 // LHS must have > 33 sign-bits to ensure that LHS != -2147483648
2429 // Otherwise 32-bit division cannot be used safely.
2430 // -2147483648/1 and -2147483648/-1 are not equal,
2431 // but they produce the same lower 32-bit result.
2432 if (VT == MVT::i64 && DAG.ComputeNumSignBits(Op: LHS) > 33 &&
2433 DAG.ComputeNumSignBits(Op: RHS) > 32) {
2434 EVT HalfVT = VT.getHalfSizedIntegerVT(Context&: *DAG.getContext());
2435
2436 //HiLo split
2437 SDValue LHS_Lo = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL, VT: HalfVT, N1: LHS, N2: Zero);
2438 SDValue RHS_Lo = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL, VT: HalfVT, N1: RHS, N2: Zero);
2439 SDValue DIVREM = DAG.getNode(Opcode: ISD::SDIVREM, DL, VTList: DAG.getVTList(VT1: HalfVT, VT2: HalfVT),
2440 N1: LHS_Lo, N2: RHS_Lo);
2441 SDValue Res[2] = {
2442 DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: DIVREM.getValue(R: 0)),
2443 DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: DIVREM.getValue(R: 1))
2444 };
2445 return DAG.getMergeValues(Ops: Res, dl: DL);
2446 }
2447
2448 SDValue LHSign = DAG.getSelectCC(DL, LHS, RHS: Zero, True: NegOne, False: Zero, Cond: ISD::SETLT);
2449 SDValue RHSign = DAG.getSelectCC(DL, LHS: RHS, RHS: Zero, True: NegOne, False: Zero, Cond: ISD::SETLT);
2450 SDValue DSign = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHSign, N2: RHSign);
2451 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2452
2453 LHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: LHS, N2: LHSign);
2454 RHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: RHSign);
2455
2456 LHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHS, N2: LHSign);
2457 RHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: RHS, N2: RHSign);
2458
2459 SDValue Div = DAG.getNode(Opcode: ISD::UDIVREM, DL, VTList: DAG.getVTList(VT1: VT, VT2: VT), N1: LHS, N2: RHS);
2460 SDValue Rem = Div.getValue(R: 1);
2461
2462 Div = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: Div, N2: DSign);
2463 Rem = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: Rem, N2: RSign);
2464
2465 Div = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Div, N2: DSign);
2466 Rem = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Rem, N2: RSign);
2467
2468 SDValue Res[2] = {
2469 Div,
2470 Rem
2471 };
2472 return DAG.getMergeValues(Ops: Res, dl: DL);
2473}
2474
2475SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2476 SDLoc SL(Op);
2477 SDValue Src = Op.getOperand(i: 0);
2478
2479 // result = trunc(src)
2480 // if (src > 0.0 && src != result)
2481 // result += 1.0
2482
2483 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: MVT::f64, Operand: Src);
2484
2485 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT: MVT::f64);
2486 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f64);
2487
2488 EVT SetCCVT =
2489 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2490
2491 SDValue Lt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Zero, Cond: ISD::SETOGT);
2492 SDValue NeTrunc = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Trunc, Cond: ISD::SETONE);
2493 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SetCCVT, N1: Lt0, N2: NeTrunc);
2494
2495 SDValue Add = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f64, N1: And, N2: One, N3: Zero);
2496 // TODO: Should this propagate fast-math-flags?
2497 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Trunc, N2: Add);
2498}
2499
2500static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2501 SelectionDAG &DAG) {
2502 const unsigned FractBits = 52;
2503 const unsigned ExpBits = 11;
2504
2505 SDValue ExpPart = DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT: MVT::i32,
2506 N1: Hi,
2507 N2: DAG.getConstant(Val: FractBits - 32, DL: SL, VT: MVT::i32),
2508 N3: DAG.getConstant(Val: ExpBits, DL: SL, VT: MVT::i32));
2509 SDValue Exp = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: ExpPart,
2510 N2: DAG.getConstant(Val: 1023, DL: SL, VT: MVT::i32));
2511
2512 return Exp;
2513}
2514
2515SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2516 SDLoc SL(Op);
2517 SDValue Src = Op.getOperand(i: 0);
2518
2519 assert(Op.getValueType() == MVT::f64);
2520
2521 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
2522
2523 // Extract the upper half, since this is where we will find the sign and
2524 // exponent.
2525 SDValue Hi = getHiHalf64(Op: Src, DAG);
2526
2527 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2528
2529 const unsigned FractBits = 52;
2530
2531 // Extract the sign bit.
2532 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, DL: SL, VT: MVT::i32);
2533 SDValue SignBit = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: Hi, N2: SignBitMask);
2534
2535 // Extend back to 64-bits.
2536 SDValue SignBit64 = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Zero, SignBit});
2537 SignBit64 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: SignBit64);
2538
2539 SDValue BcInt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Src);
2540 const SDValue FractMask
2541 = DAG.getConstant(Val: (UINT64_C(1) << FractBits) - 1, DL: SL, VT: MVT::i64);
2542
2543 SDValue Shr = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i64, N1: FractMask, N2: Exp);
2544 SDValue Not = DAG.getNOT(DL: SL, Val: Shr, VT: MVT::i64);
2545 SDValue Tmp0 = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i64, N1: BcInt, N2: Not);
2546
2547 EVT SetCCVT =
2548 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::i32);
2549
2550 const SDValue FiftyOne = DAG.getConstant(Val: FractBits - 1, DL: SL, VT: MVT::i32);
2551
2552 SDValue ExpLt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Exp, RHS: Zero, Cond: ISD::SETLT);
2553 SDValue ExpGt51 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Exp, RHS: FiftyOne, Cond: ISD::SETGT);
2554
2555 SDValue Tmp1 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: ExpLt0, N2: SignBit64, N3: Tmp0);
2556 SDValue Tmp2 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: ExpGt51, N2: BcInt, N3: Tmp1);
2557
2558 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f64, Operand: Tmp2);
2559}
2560
2561SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2562 SelectionDAG &DAG) const {
2563 SDLoc SL(Op);
2564 SDValue Src = Op.getOperand(i: 0);
2565
2566 assert(Op.getValueType() == MVT::f64);
2567
2568 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2569 SDValue C1 = DAG.getConstantFP(Val: C1Val, DL: SL, VT: MVT::f64);
2570 SDValue CopySign = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT: MVT::f64, N1: C1, N2: Src);
2571
2572 // TODO: Should this propagate fast-math-flags?
2573
2574 SDValue Tmp1 = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Src, N2: CopySign);
2575 SDValue Tmp2 = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT: MVT::f64, N1: Tmp1, N2: CopySign);
2576
2577 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: MVT::f64, Operand: Src);
2578
2579 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2580 SDValue C2 = DAG.getConstantFP(Val: C2Val, DL: SL, VT: MVT::f64);
2581
2582 EVT SetCCVT =
2583 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2584 SDValue Cond = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Fabs, RHS: C2, Cond: ISD::SETOGT);
2585
2586 return DAG.getSelect(DL: SL, VT: MVT::f64, Cond, LHS: Src, RHS: Tmp2);
2587}
2588
2589SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2590 SelectionDAG &DAG) const {
2591 // FNEARBYINT and FRINT are the same, except in their handling of FP
2592 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2593 // rint, so just treat them as equivalent.
2594 return DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SDLoc(Op), VT: Op.getValueType(),
2595 Operand: Op.getOperand(i: 0));
2596}
2597
2598SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2599 auto VT = Op.getValueType();
2600 auto Arg = Op.getOperand(i: 0u);
2601 return DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SDLoc(Op), VT, Operand: Arg);
2602}
2603
2604// XXX - May require not supporting f32 denormals?
2605
2606// Don't handle v2f16. The extra instructions to scalarize and repack around the
2607// compare and vselect end up producing worse code than scalarizing the whole
2608// operation.
2609SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2610 SDLoc SL(Op);
2611 SDValue X = Op.getOperand(i: 0);
2612 EVT VT = Op.getValueType();
2613
2614 SDValue T = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: X);
2615
2616 // TODO: Should this propagate fast-math-flags?
2617
2618 SDValue Diff = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: X, N2: T);
2619
2620 SDValue AbsDiff = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: Diff);
2621
2622 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2623 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2624
2625 EVT SetCCVT =
2626 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2627
2628 const SDValue Half = DAG.getConstantFP(Val: 0.5, DL: SL, VT);
2629 SDValue Cmp = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsDiff, RHS: Half, Cond: ISD::SETOGE);
2630 SDValue OneOrZeroFP = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Cmp, N2: One, N3: Zero);
2631
2632 SDValue SignedOffset = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT, N1: OneOrZeroFP, N2: X);
2633 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: T, N2: SignedOffset);
2634}
2635
2636SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2637 SDLoc SL(Op);
2638 SDValue Src = Op.getOperand(i: 0);
2639
2640 // result = trunc(src);
2641 // if (src < 0.0 && src != result)
2642 // result += -1.0.
2643
2644 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: MVT::f64, Operand: Src);
2645
2646 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT: MVT::f64);
2647 const SDValue NegOne = DAG.getConstantFP(Val: -1.0, DL: SL, VT: MVT::f64);
2648
2649 EVT SetCCVT =
2650 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2651
2652 SDValue Lt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Zero, Cond: ISD::SETOLT);
2653 SDValue NeTrunc = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Trunc, Cond: ISD::SETONE);
2654 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SetCCVT, N1: Lt0, N2: NeTrunc);
2655
2656 SDValue Add = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f64, N1: And, N2: NegOne, N3: Zero);
2657 // TODO: Should this propagate fast-math-flags?
2658 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Trunc, N2: Add);
2659}
2660
2661/// Return true if it's known that \p Src can never be an f32 denormal value.
2662static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2663 switch (Src.getOpcode()) {
2664 case ISD::FP_EXTEND:
2665 return Src.getOperand(i: 0).getValueType() == MVT::f16;
2666 case ISD::FP16_TO_FP:
2667 case ISD::FFREXP:
2668 case ISD::FSQRT:
2669 case AMDGPUISD::LOG:
2670 case AMDGPUISD::EXP:
2671 return true;
2672 case ISD::INTRINSIC_WO_CHAIN: {
2673 unsigned IntrinsicID = Src.getConstantOperandVal(i: 0);
2674 switch (IntrinsicID) {
2675 case Intrinsic::amdgcn_frexp_mant:
2676 case Intrinsic::amdgcn_log:
2677 case Intrinsic::amdgcn_log_clamp:
2678 case Intrinsic::amdgcn_exp2:
2679 case Intrinsic::amdgcn_sqrt:
2680 return true;
2681 default:
2682 return false;
2683 }
2684 }
2685 default:
2686 return false;
2687 }
2688
2689 llvm_unreachable("covered opcode switch");
2690}
2691
2692bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
2693 SDNodeFlags Flags) {
2694 return Flags.hasApproximateFuncs();
2695}
2696
2697bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
2698 SDValue Src,
2699 SDNodeFlags Flags) {
2700 return !valueIsKnownNeverF32Denorm(Src) &&
2701 DAG.getMachineFunction()
2702 .getDenormalMode(FPType: APFloat::IEEEsingle())
2703 .Input != DenormalMode::PreserveSign;
2704}
2705
2706SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2707 SDValue Src,
2708 SDNodeFlags Flags) const {
2709 SDLoc SL(Src);
2710 EVT VT = Src.getValueType();
2711 const fltSemantics &Semantics = VT.getFltSemantics();
2712 SDValue SmallestNormal =
2713 DAG.getConstantFP(Val: APFloat::getSmallestNormalized(Sem: Semantics), DL: SL, VT);
2714
2715 // Want to scale denormals up, but negatives and 0 work just as well on the
2716 // scaled path.
2717 SDValue IsLtSmallestNormal = DAG.getSetCC(
2718 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Src,
2719 RHS: SmallestNormal, Cond: ISD::SETOLT);
2720
2721 return IsLtSmallestNormal;
2722}
2723
2724SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2725 SDNodeFlags Flags) const {
2726 SDLoc SL(Src);
2727 EVT VT = Src.getValueType();
2728 const fltSemantics &Semantics = VT.getFltSemantics();
2729 SDValue Inf = DAG.getConstantFP(Val: APFloat::getInf(Sem: Semantics), DL: SL, VT);
2730
2731 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: Src, Flags);
2732 SDValue IsFinite = DAG.getSetCC(
2733 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Fabs,
2734 RHS: Inf, Cond: ISD::SETOLT);
2735 return IsFinite;
2736}
2737
2738/// If denormal handling is required return the scaled input to FLOG2, and the
2739/// check for denormal range. Otherwise, return null values.
2740std::pair<SDValue, SDValue>
2741AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
2742 SDValue Src, SDNodeFlags Flags) const {
2743 if (!needsDenormHandlingF32(DAG, Src, Flags))
2744 return {};
2745
2746 MVT VT = MVT::f32;
2747 const fltSemantics &Semantics = APFloat::IEEEsingle();
2748 SDValue SmallestNormal =
2749 DAG.getConstantFP(Val: APFloat::getSmallestNormalized(Sem: Semantics), DL: SL, VT);
2750
2751 SDValue IsLtSmallestNormal = DAG.getSetCC(
2752 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Src,
2753 RHS: SmallestNormal, Cond: ISD::SETOLT);
2754
2755 SDValue Scale32 = DAG.getConstantFP(Val: 0x1.0p+32, DL: SL, VT);
2756 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2757 SDValue ScaleFactor =
2758 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLtSmallestNormal, N2: Scale32, N3: One, Flags);
2759
2760 SDValue ScaledInput = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Src, N2: ScaleFactor, Flags);
2761 return {ScaledInput, IsLtSmallestNormal};
2762}
2763
2764SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2765 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2766 // If we have to handle denormals, scale up the input and adjust the result.
2767
2768 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2769 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2770
2771 SDLoc SL(Op);
2772 EVT VT = Op.getValueType();
2773 SDValue Src = Op.getOperand(i: 0);
2774 SDNodeFlags Flags = Op->getFlags();
2775
2776 if (VT == MVT::f16) {
2777 // Nothing in half is a denormal when promoted to f32.
2778 assert(!isTypeLegal(VT));
2779 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src, Flags);
2780 SDValue Log = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT: MVT::f32, Operand: Ext, Flags);
2781 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Log,
2782 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
2783 }
2784
2785 auto [ScaledInput, IsLtSmallestNormal] =
2786 getScaledLogInput(DAG, SL, Src, Flags);
2787 if (!ScaledInput)
2788 return DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: Src, Flags);
2789
2790 SDValue Log2 = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: ScaledInput, Flags);
2791
2792 SDValue ThirtyTwo = DAG.getConstantFP(Val: 32.0, DL: SL, VT);
2793 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2794 SDValue ResultOffset =
2795 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLtSmallestNormal, N2: ThirtyTwo, N3: Zero);
2796 return DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: Log2, N2: ResultOffset, Flags);
2797}
2798
2799static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2800 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2801 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: Y, Flags);
2802 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Mul, N2: C, Flags);
2803}
2804
2805SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2806 SelectionDAG &DAG) const {
2807 SDValue X = Op.getOperand(i: 0);
2808 EVT VT = Op.getValueType();
2809 SDNodeFlags Flags = Op->getFlags();
2810 SDLoc DL(Op);
2811 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2812 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2813
2814 if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
2815 // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
2816 // depending on !fpmath metadata.
2817
2818 bool PromoteToF32 = VT == MVT::f16 && (!Flags.hasApproximateFuncs() ||
2819 !isTypeLegal(VT: MVT::f16));
2820
2821 if (PromoteToF32) {
2822 // Log and multiply in f32 is always good enough for f16.
2823 X = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: X, Flags);
2824 }
2825
2826 SDValue Lowered = LowerFLOGUnsafe(Op: X, SL: DL, DAG, IsLog10, Flags);
2827 if (PromoteToF32) {
2828 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Lowered,
2829 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32), Flags);
2830 }
2831
2832 return Lowered;
2833 }
2834
2835 SDValue ScaledInput, IsScaled;
2836 if (VT == MVT::f16)
2837 X = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: X, Flags);
2838 else {
2839 std::tie(args&: ScaledInput, args&: IsScaled) = getScaledLogInput(DAG, SL: DL, Src: X, Flags);
2840 if (ScaledInput)
2841 X = ScaledInput;
2842 }
2843
2844 SDValue Y = DAG.getNode(Opcode: AMDGPUISD::LOG, DL, VT, Operand: X, Flags);
2845
2846 SDValue R;
2847 if (Subtarget->hasFastFMAF32()) {
2848 // c+cc are ln(2)/ln(10) to more than 49 bits
2849 const float c_log10 = 0x1.344134p-2f;
2850 const float cc_log10 = 0x1.09f79ep-26f;
2851
2852 // c + cc is ln(2) to more than 49 bits
2853 const float c_log = 0x1.62e42ep-1f;
2854 const float cc_log = 0x1.efa39ep-25f;
2855
2856 SDValue C = DAG.getConstantFP(Val: IsLog10 ? c_log10 : c_log, DL, VT);
2857 SDValue CC = DAG.getConstantFP(Val: IsLog10 ? cc_log10 : cc_log, DL, VT);
2858 // This adds correction terms for which contraction may lead to an increase
2859 // in the error of the approximation, so disable it.
2860 Flags.setAllowContract(false);
2861 R = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Y, N2: C, Flags);
2862 SDValue NegR = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: R, Flags);
2863 SDValue FMA0 = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: Y, N2: C, N3: NegR, Flags);
2864 SDValue FMA1 = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: Y, N2: CC, N3: FMA0, Flags);
2865 R = DAG.getNode(Opcode: ISD::FADD, DL, VT, N1: R, N2: FMA1, Flags);
2866 } else {
2867 // ch+ct is ln(2)/ln(10) to more than 36 bits
2868 const float ch_log10 = 0x1.344000p-2f;
2869 const float ct_log10 = 0x1.3509f6p-18f;
2870
2871 // ch + ct is ln(2) to more than 36 bits
2872 const float ch_log = 0x1.62e000p-1f;
2873 const float ct_log = 0x1.0bfbe8p-15f;
2874
2875 SDValue CH = DAG.getConstantFP(Val: IsLog10 ? ch_log10 : ch_log, DL, VT);
2876 SDValue CT = DAG.getConstantFP(Val: IsLog10 ? ct_log10 : ct_log, DL, VT);
2877
2878 SDValue YAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Y);
2879 SDValue MaskConst = DAG.getConstant(Val: 0xfffff000, DL, VT: MVT::i32);
2880 SDValue YHInt = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: YAsInt, N2: MaskConst);
2881 SDValue YH = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: YHInt);
2882 SDValue YT = DAG.getNode(Opcode: ISD::FSUB, DL, VT, N1: Y, N2: YH, Flags);
2883 // This adds correction terms for which contraction may lead to an increase
2884 // in the error of the approximation, so disable it.
2885 Flags.setAllowContract(false);
2886 SDValue YTCT = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: YT, N2: CT, Flags);
2887 SDValue Mad0 = getMad(DAG, SL: DL, VT, X: YH, Y: CT, C: YTCT, Flags);
2888 SDValue Mad1 = getMad(DAG, SL: DL, VT, X: YT, Y: CH, C: Mad0, Flags);
2889 R = getMad(DAG, SL: DL, VT, X: YH, Y: CH, C: Mad1);
2890 }
2891
2892 const bool IsFiniteOnly = Flags.hasNoNaNs() && Flags.hasNoInfs();
2893
2894 // TODO: Check if known finite from source value.
2895 if (!IsFiniteOnly) {
2896 SDValue IsFinite = getIsFinite(DAG, Src: Y, Flags);
2897 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsFinite, N2: R, N3: Y, Flags);
2898 }
2899
2900 if (IsScaled) {
2901 SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL, VT);
2902 SDValue ShiftK =
2903 DAG.getConstantFP(Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2904 SDValue Shift =
2905 DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsScaled, N2: ShiftK, N3: Zero, Flags);
2906 R = DAG.getNode(Opcode: ISD::FSUB, DL, VT, N1: R, N2: Shift, Flags);
2907 }
2908
2909 return R;
2910}
2911
2912SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2913 return LowerFLOGCommon(Op, DAG);
2914}
2915
2916// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2917// promote f16 operation.
2918SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2919 SelectionDAG &DAG, bool IsLog10,
2920 SDNodeFlags Flags) const {
2921 EVT VT = Src.getValueType();
2922 unsigned LogOp =
2923 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2924
2925 double Log2BaseInverted =
2926 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2927
2928 if (VT == MVT::f32) {
2929 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2930 if (ScaledInput) {
2931 SDValue LogSrc = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: ScaledInput, Flags);
2932 SDValue ScaledResultOffset =
2933 DAG.getConstantFP(Val: -32.0 * Log2BaseInverted, DL: SL, VT);
2934
2935 SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL: SL, VT);
2936
2937 SDValue ResultOffset = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsScaled,
2938 N2: ScaledResultOffset, N3: Zero, Flags);
2939
2940 SDValue Log2Inv = DAG.getConstantFP(Val: Log2BaseInverted, DL: SL, VT);
2941
2942 if (Subtarget->hasFastFMAF32())
2943 return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: LogSrc, N2: Log2Inv, N3: ResultOffset,
2944 Flags);
2945 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LogSrc, N2: Log2Inv, Flags);
2946 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Mul, N2: ResultOffset);
2947 }
2948 }
2949
2950 SDValue Log2Operand = DAG.getNode(Opcode: LogOp, DL: SL, VT, Operand: Src, Flags);
2951 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Val: Log2BaseInverted, DL: SL, VT);
2952
2953 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Log2Operand, N2: Log2BaseInvertedOperand,
2954 Flags);
2955}
2956
2957// This expansion gives a result slightly better than 1ulp.
2958SDValue AMDGPUTargetLowering::lowerFEXPF64(SDValue Op,
2959 SelectionDAG &DAG) const {
2960 SDLoc DL(Op);
2961 SDValue X = Op.getOperand(i: 0);
2962
2963 // TODO: Check if reassoc is safe. There is an output change in exp2 and
2964 // exp10, which slightly increases ulp.
2965 SDNodeFlags Flags = Op->getFlags() & ~SDNodeFlags::AllowReassociation;
2966
2967 SDValue DN, F, T;
2968
2969 if (Op.getOpcode() == ISD::FEXP2) {
2970 // dn = rint(x)
2971 DN = DAG.getNode(Opcode: ISD::FRINT, DL, VT: MVT::f64, Operand: X, Flags);
2972 // f = x - dn
2973 F = DAG.getNode(Opcode: ISD::FSUB, DL, VT: MVT::f64, N1: X, N2: DN, Flags);
2974 // t = f*C1 + f*C2
2975 SDValue C1 = DAG.getConstantFP(Val: 0x1.62e42fefa39efp-1, DL, VT: MVT::f64);
2976 SDValue C2 = DAG.getConstantFP(Val: 0x1.abc9e3b39803fp-56, DL, VT: MVT::f64);
2977 SDValue Mul2 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: F, N2: C2, Flags);
2978 T = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: F, N2: C1, N3: Mul2, Flags);
2979 } else if (Op.getOpcode() == ISD::FEXP10) {
2980 // dn = rint(x * C1)
2981 SDValue C1 = DAG.getConstantFP(Val: 0x1.a934f0979a371p+1, DL, VT: MVT::f64);
2982 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: X, N2: C1, Flags);
2983 DN = DAG.getNode(Opcode: ISD::FRINT, DL, VT: MVT::f64, Operand: Mul, Flags);
2984
2985 // f = FMA(-dn, C2, FMA(-dn, C3, x))
2986 SDValue NegDN = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: DN, Flags);
2987 SDValue C2 = DAG.getConstantFP(Val: -0x1.9dc1da994fd21p-59, DL, VT: MVT::f64);
2988 SDValue C3 = DAG.getConstantFP(Val: 0x1.34413509f79ffp-2, DL, VT: MVT::f64);
2989 SDValue Inner = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegDN, N2: C3, N3: X, Flags);
2990 F = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegDN, N2: C2, N3: Inner, Flags);
2991
2992 // t = FMA(f, C4, f*C5)
2993 SDValue C4 = DAG.getConstantFP(Val: 0x1.26bb1bbb55516p+1, DL, VT: MVT::f64);
2994 SDValue C5 = DAG.getConstantFP(Val: -0x1.f48ad494ea3e9p-53, DL, VT: MVT::f64);
2995 SDValue MulF = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: F, N2: C5, Flags);
2996 T = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: F, N2: C4, N3: MulF, Flags);
2997 } else { // ISD::FEXP
2998 // dn = rint(x * C1)
2999 SDValue C1 = DAG.getConstantFP(Val: 0x1.71547652b82fep+0, DL, VT: MVT::f64);
3000 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: X, N2: C1, Flags);
3001 DN = DAG.getNode(Opcode: ISD::FRINT, DL, VT: MVT::f64, Operand: Mul, Flags);
3002
3003 // t = FMA(-dn, C2, FMA(-dn, C3, x))
3004 SDValue NegDN = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: DN, Flags);
3005 SDValue C2 = DAG.getConstantFP(Val: 0x1.abc9e3b39803fp-56, DL, VT: MVT::f64);
3006 SDValue C3 = DAG.getConstantFP(Val: 0x1.62e42fefa39efp-1, DL, VT: MVT::f64);
3007 SDValue Inner = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegDN, N2: C3, N3: X, Flags);
3008 T = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegDN, N2: C2, N3: Inner, Flags);
3009 }
3010
3011 // Polynomial expansion for p
3012 SDValue P = DAG.getConstantFP(Val: 0x1.ade156a5dcb37p-26, DL, VT: MVT::f64);
3013 P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
3014 N3: DAG.getConstantFP(Val: 0x1.28af3fca7ab0cp-22, DL, VT: MVT::f64),
3015 Flags);
3016 P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
3017 N3: DAG.getConstantFP(Val: 0x1.71dee623fde64p-19, DL, VT: MVT::f64),
3018 Flags);
3019 P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
3020 N3: DAG.getConstantFP(Val: 0x1.a01997c89e6b0p-16, DL, VT: MVT::f64),
3021 Flags);
3022 P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
3023 N3: DAG.getConstantFP(Val: 0x1.a01a014761f6ep-13, DL, VT: MVT::f64),
3024 Flags);
3025 P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
3026 N3: DAG.getConstantFP(Val: 0x1.6c16c1852b7b0p-10, DL, VT: MVT::f64),
3027 Flags);
3028 P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
3029 N3: DAG.getConstantFP(Val: 0x1.1111111122322p-7, DL, VT: MVT::f64), Flags);
3030 P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
3031 N3: DAG.getConstantFP(Val: 0x1.55555555502a1p-5, DL, VT: MVT::f64), Flags);
3032 P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
3033 N3: DAG.getConstantFP(Val: 0x1.5555555555511p-3, DL, VT: MVT::f64), Flags);
3034 P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
3035 N3: DAG.getConstantFP(Val: 0x1.000000000000bp-1, DL, VT: MVT::f64), Flags);
3036
3037 SDValue One = DAG.getConstantFP(Val: 1.0, DL, VT: MVT::f64);
3038
3039 P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P, N3: One, Flags);
3040 P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P, N3: One, Flags);
3041
3042 // z = ldexp(p, (int)dn)
3043 SDValue DNInt = DAG.getNode(Opcode: ISD::FP_TO_SINT, DL, VT: MVT::i32, Operand: DN);
3044 SDValue Z = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: P, N2: DNInt, Flags);
3045
3046 // Overflow/underflow guards
3047 SDValue CondHi = DAG.getSetCC(
3048 DL, VT: MVT::i1, LHS: X, RHS: DAG.getConstantFP(Val: 1024.0, DL, VT: MVT::f64), Cond: ISD::SETULE);
3049
3050 if (!Flags.hasNoInfs()) {
3051 SDValue PInf = DAG.getConstantFP(Val: std::numeric_limits<double>::infinity(),
3052 DL, VT: MVT::f64);
3053 Z = DAG.getSelect(DL, VT: MVT::f64, Cond: CondHi, LHS: Z, RHS: PInf, Flags);
3054 }
3055
3056 SDValue CondLo = DAG.getSetCC(
3057 DL, VT: MVT::i1, LHS: X, RHS: DAG.getConstantFP(Val: -1075.0, DL, VT: MVT::f64), Cond: ISD::SETUGE);
3058 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL, VT: MVT::f64);
3059 Z = DAG.getSelect(DL, VT: MVT::f64, Cond: CondLo, LHS: Z, RHS: Zero, Flags);
3060
3061 return Z;
3062}
3063
3064SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
3065 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3066 // If we have to handle denormals, scale up the input and adjust the result.
3067
3068 EVT VT = Op.getValueType();
3069 if (VT == MVT::f64)
3070 return lowerFEXPF64(Op, DAG);
3071
3072 SDLoc SL(Op);
3073 SDValue Src = Op.getOperand(i: 0);
3074 SDNodeFlags Flags = Op->getFlags();
3075
3076 if (VT == MVT::f16) {
3077 // Nothing in half is a denormal when promoted to f32.
3078 assert(!isTypeLegal(MVT::f16));
3079 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src, Flags);
3080 SDValue Log = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT: MVT::f32, Operand: Ext, Flags);
3081 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Log,
3082 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
3083 }
3084
3085 assert(VT == MVT::f32);
3086
3087 if (!needsDenormHandlingF32(DAG, Src, Flags))
3088 return DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT: MVT::f32, Operand: Src, Flags);
3089
3090 // bool needs_scaling = x < -0x1.f80000p+6f;
3091 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3092
3093 // -nextafter(128.0, -1)
3094 SDValue RangeCheckConst = DAG.getConstantFP(Val: -0x1.f80000p+6f, DL: SL, VT);
3095
3096 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
3097
3098 SDValue NeedsScaling =
3099 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: RangeCheckConst, Cond: ISD::SETOLT);
3100
3101 SDValue SixtyFour = DAG.getConstantFP(Val: 0x1.0p+6f, DL: SL, VT);
3102 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
3103
3104 SDValue AddOffset =
3105 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: SixtyFour, N3: Zero);
3106
3107 SDValue AddInput = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Src, N2: AddOffset, Flags);
3108 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: AddInput, Flags);
3109
3110 SDValue TwoExpNeg64 = DAG.getConstantFP(Val: 0x1.0p-64f, DL: SL, VT);
3111 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
3112 SDValue ResultScale =
3113 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: TwoExpNeg64, N3: One);
3114
3115 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2, N2: ResultScale, Flags);
3116}
3117
3118SDValue AMDGPUTargetLowering::lowerFEXPUnsafeImpl(SDValue X, const SDLoc &SL,
3119 SelectionDAG &DAG,
3120 SDNodeFlags Flags,
3121 bool IsExp10) const {
3122 // exp(x) -> exp2(M_LOG2E_F * x);
3123 // exp10(x) -> exp2(log2(10) * x);
3124 EVT VT = X.getValueType();
3125 SDValue Const =
3126 DAG.getConstantFP(Val: IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, DL: SL, VT);
3127
3128 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: Const, Flags);
3129 return DAG.getNode(Opcode: VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
3130 : (unsigned)ISD::FEXP2,
3131 DL: SL, VT, Operand: Mul, Flags);
3132}
3133
3134SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
3135 SelectionDAG &DAG,
3136 SDNodeFlags Flags) const {
3137 EVT VT = X.getValueType();
3138 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, Src: X, Flags))
3139 return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);
3140
3141 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
3142
3143 SDValue Threshold = DAG.getConstantFP(Val: -0x1.5d58a0p+6f, DL: SL, VT);
3144 SDValue NeedsScaling = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: Threshold, Cond: ISD::SETOLT);
3145
3146 SDValue ScaleOffset = DAG.getConstantFP(Val: 0x1.0p+6f, DL: SL, VT);
3147
3148 SDValue ScaledX = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: X, N2: ScaleOffset, Flags);
3149
3150 SDValue AdjustedX =
3151 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: ScaledX, N3: X);
3152
3153 const SDValue Log2E = DAG.getConstantFP(Val: numbers::log2e, DL: SL, VT);
3154 SDValue ExpInput = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: Log2E, Flags);
3155
3156 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: ExpInput, Flags);
3157
3158 SDValue ResultScaleFactor = DAG.getConstantFP(Val: 0x1.969d48p-93f, DL: SL, VT);
3159 SDValue AdjustedResult =
3160 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2, N2: ResultScaleFactor, Flags);
3161
3162 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: AdjustedResult, N3: Exp2,
3163 Flags);
3164}
3165
3166/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
3167/// handled correctly.
3168SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
3169 SelectionDAG &DAG,
3170 SDNodeFlags Flags) const {
3171 const EVT VT = X.getValueType();
3172
3173 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
3174 : static_cast<unsigned>(ISD::FEXP2);
3175
3176 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, Src: X, Flags)) {
3177 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3178 SDValue K0 = DAG.getConstantFP(Val: 0x1.a92000p+1f, DL: SL, VT);
3179 SDValue K1 = DAG.getConstantFP(Val: 0x1.4f0978p-11f, DL: SL, VT);
3180
3181 SDValue Mul0 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: K0, Flags);
3182 SDValue Exp2_0 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul0, Flags);
3183 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: K1, Flags);
3184 SDValue Exp2_1 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul1, Flags);
3185 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2_0, N2: Exp2_1);
3186 }
3187
3188 // bool s = x < -0x1.2f7030p+5f;
3189 // x += s ? 0x1.0p+5f : 0.0f;
3190 // exp10 = exp2(x * 0x1.a92000p+1f) *
3191 // exp2(x * 0x1.4f0978p-11f) *
3192 // (s ? 0x1.9f623ep-107f : 1.0f);
3193
3194 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
3195
3196 SDValue Threshold = DAG.getConstantFP(Val: -0x1.2f7030p+5f, DL: SL, VT);
3197 SDValue NeedsScaling = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: Threshold, Cond: ISD::SETOLT);
3198
3199 SDValue ScaleOffset = DAG.getConstantFP(Val: 0x1.0p+5f, DL: SL, VT);
3200 SDValue ScaledX = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: X, N2: ScaleOffset, Flags);
3201 SDValue AdjustedX =
3202 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: ScaledX, N3: X);
3203
3204 SDValue K0 = DAG.getConstantFP(Val: 0x1.a92000p+1f, DL: SL, VT);
3205 SDValue K1 = DAG.getConstantFP(Val: 0x1.4f0978p-11f, DL: SL, VT);
3206
3207 SDValue Mul0 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: K0, Flags);
3208 SDValue Exp2_0 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul0, Flags);
3209 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: K1, Flags);
3210 SDValue Exp2_1 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul1, Flags);
3211
3212 SDValue MulExps = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2_0, N2: Exp2_1, Flags);
3213
3214 SDValue ResultScaleFactor = DAG.getConstantFP(Val: 0x1.9f623ep-107f, DL: SL, VT);
3215 SDValue AdjustedResult =
3216 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: MulExps, N2: ResultScaleFactor, Flags);
3217
3218 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: AdjustedResult, N3: MulExps,
3219 Flags);
3220}
3221
3222SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
3223 EVT VT = Op.getValueType();
3224
3225 if (VT == MVT::f64)
3226 return lowerFEXPF64(Op, DAG);
3227
3228 SDLoc SL(Op);
3229 SDValue X = Op.getOperand(i: 0);
3230 SDNodeFlags Flags = Op->getFlags();
3231 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3232
3233 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3234 // library behavior. Also, is known-not-daz source sufficient?
3235 if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
3236 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3237 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3238 }
3239
3240 if (VT.getScalarType() == MVT::f16) {
3241 if (VT.isVector())
3242 return SDValue();
3243
3244 // Nothing in half is a denormal when promoted to f32.
3245 //
3246 // exp(f16 x) ->
3247 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3248 //
3249 // exp10(f16 x) ->
3250 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
3251 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: X, Flags);
3252 SDValue Lowered = lowerFEXPUnsafeImpl(X: Ext, SL, DAG, Flags, IsExp10);
3253 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Lowered,
3254 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
3255 }
3256
3257 assert(VT == MVT::f32);
3258
3259 // Algorithm:
3260 //
3261 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3262 //
3263 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3264 // n = 64*m + j, 0 <= j < 64
3265 //
3266 // e^x = 2^((64*m + j + f)/64)
3267 // = (2^m) * (2^(j/64)) * 2^(f/64)
3268 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3269 //
3270 // f = x*(64/ln(2)) - n
3271 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3272 //
3273 // e^x = (2^m) * (2^(j/64)) * e^r
3274 //
3275 // (2^(j/64)) is precomputed
3276 //
3277 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3278 // e^r = 1 + q
3279 //
3280 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3281 //
3282 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3283 SDNodeFlags FlagsNoContract = Flags;
3284 FlagsNoContract.setAllowContract(false);
3285
3286 SDValue PH, PL;
3287 if (Subtarget->hasFastFMAF32()) {
3288 const float c_exp = numbers::log2ef;
3289 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3290 const float c_exp10 = 0x1.a934f0p+1f;
3291 const float cc_exp10 = 0x1.2f346ep-24f;
3292
3293 SDValue C = DAG.getConstantFP(Val: IsExp10 ? c_exp10 : c_exp, DL: SL, VT);
3294 SDValue CC = DAG.getConstantFP(Val: IsExp10 ? cc_exp10 : cc_exp, DL: SL, VT);
3295
3296 PH = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: C, Flags);
3297 SDValue NegPH = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: PH, Flags);
3298 SDValue FMA0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: X, N2: C, N3: NegPH, Flags);
3299 PL = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: X, N2: CC, N3: FMA0, Flags);
3300 } else {
3301 const float ch_exp = 0x1.714000p+0f;
3302 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3303
3304 const float ch_exp10 = 0x1.a92000p+1f;
3305 const float cl_exp10 = 0x1.4f0978p-11f;
3306
3307 SDValue CH = DAG.getConstantFP(Val: IsExp10 ? ch_exp10 : ch_exp, DL: SL, VT);
3308 SDValue CL = DAG.getConstantFP(Val: IsExp10 ? cl_exp10 : cl_exp, DL: SL, VT);
3309
3310 SDValue XAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: X);
3311 SDValue MaskConst = DAG.getConstant(Val: 0xfffff000, DL: SL, VT: MVT::i32);
3312 SDValue XHAsInt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: XAsInt, N2: MaskConst);
3313 SDValue XH = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: XHAsInt);
3314 SDValue XL = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: X, N2: XH, Flags);
3315
3316 PH = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: XH, N2: CH, Flags);
3317
3318 SDValue XLCL = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: XL, N2: CL, Flags);
3319 SDValue Mad0 = getMad(DAG, SL, VT, X: XL, Y: CH, C: XLCL, Flags);
3320 PL = getMad(DAG, SL, VT, X: XH, Y: CL, C: Mad0, Flags);
3321 }
3322
3323 SDValue E = DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SL, VT, Operand: PH, Flags);
3324
3325 // It is unsafe to contract this fsub into the PH multiply.
3326 SDValue PHSubE = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: PH, N2: E, Flags: FlagsNoContract);
3327
3328 SDValue A = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: PHSubE, N2: PL, Flags);
3329 SDValue IntE = DAG.getNode(Opcode: ISD::FP_TO_SINT, DL: SL, VT: MVT::i32, Operand: E);
3330 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: A, Flags);
3331
3332 SDValue R = DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT, N1: Exp2, N2: IntE, Flags);
3333
3334 SDValue UnderflowCheckConst =
3335 DAG.getConstantFP(Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, DL: SL, VT);
3336
3337 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
3338 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
3339 SDValue Underflow =
3340 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: UnderflowCheckConst, Cond: ISD::SETOLT);
3341
3342 R = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Underflow, N2: Zero, N3: R);
3343
3344 if (!Flags.hasNoInfs()) {
3345 SDValue OverflowCheckConst =
3346 DAG.getConstantFP(Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, DL: SL, VT);
3347 SDValue Overflow =
3348 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: OverflowCheckConst, Cond: ISD::SETOGT);
3349 SDValue Inf =
3350 DAG.getConstantFP(Val: APFloat::getInf(Sem: APFloat::IEEEsingle()), DL: SL, VT);
3351 R = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Overflow, N2: Inf, N3: R);
3352 }
3353
3354 return R;
3355}
3356
3357static bool isCtlzOpc(unsigned Opc) {
3358 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_POISON;
3359}
3360
3361static bool isCttzOpc(unsigned Opc) {
3362 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_POISON;
3363}
3364
3365SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3366 SelectionDAG &DAG) const {
3367 auto SL = SDLoc(Op);
3368 auto Opc = Op.getOpcode();
3369 auto Arg = Op.getOperand(i: 0u);
3370 auto ResultVT = Op.getValueType();
3371
3372 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3373 return {};
3374
3375 assert(isCtlzOpc(Opc));
3376 assert(ResultVT == Arg.getValueType());
3377
3378 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3379 SDValue NumExtBits = DAG.getConstant(Val: 32u - NumBits, DL: SL, VT: MVT::i32);
3380 SDValue NewOp;
3381
3382 if (Opc == ISD::CTLZ_ZERO_POISON) {
3383 NewOp = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Arg);
3384 NewOp = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: NewOp, N2: NumExtBits);
3385 NewOp = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, Operand: NewOp);
3386 } else {
3387 NewOp = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Arg);
3388 NewOp = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, Operand: NewOp);
3389 NewOp = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: NewOp, N2: NumExtBits);
3390 }
3391
3392 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ResultVT, Operand: NewOp);
3393}
3394
3395SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3396 SDLoc SL(Op);
3397 SDValue Src = Op.getOperand(i: 0);
3398
3399 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3400 bool Ctlz = isCtlzOpc(Opc: Op.getOpcode());
3401 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3402
3403 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_POISON ||
3404 Op.getOpcode() == ISD::CTTZ_ZERO_POISON;
3405 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3406
3407 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3408 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3409 // (cttz hi:lo) -> (umin (ffbl src), 32)
3410 // (ctlz_zero_poison src) -> (ffbh src)
3411 // (cttz_zero_poison src) -> (ffbl src)
3412
3413 // 64-bit scalar version produce 32-bit result
3414 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3415 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3416 // (ctlz_zero_poison src) -> (S_FLBIT_I32_B64 src)
3417 // (cttz_zero_poison src) -> (S_FF1_I32_B64 src)
3418 SDValue NewOpr = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Src);
3419 if (!ZeroUndef) {
3420 const SDValue ConstVal = DAG.getConstant(
3421 Val: Op.getValueType().getScalarSizeInBits(), DL: SL, VT: MVT::i32);
3422 NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewOpr, N2: ConstVal);
3423 }
3424 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: Src.getValueType(), Operand: NewOpr);
3425 }
3426
3427 SDValue Lo, Hi;
3428 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3429
3430 SDValue OprLo = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Lo);
3431 SDValue OprHi = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Hi);
3432
3433 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3434 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3435 // (ctlz_zero_poison hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3436 // (cttz_zero_poison hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3437
3438 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3439 const SDValue Const32 = DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32);
3440 if (Ctlz)
3441 OprLo = DAG.getNode(Opcode: AddOpc, DL: SL, VT: MVT::i32, N1: OprLo, N2: Const32);
3442 else
3443 OprHi = DAG.getNode(Opcode: AddOpc, DL: SL, VT: MVT::i32, N1: OprHi, N2: Const32);
3444
3445 SDValue NewOpr;
3446 NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: OprLo, N2: OprHi);
3447 if (!ZeroUndef) {
3448 const SDValue Const64 = DAG.getConstant(Val: 64, DL: SL, VT: MVT::i32);
3449 NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewOpr, N2: Const64);
3450 }
3451
3452 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i64, Operand: NewOpr);
3453}
3454
3455SDValue AMDGPUTargetLowering::LowerCTLS(SDValue Op, SelectionDAG &DAG) const {
3456 SDLoc SL(Op);
3457 SDValue Src = Op.getOperand(i: 0);
3458 assert(Src.getValueType() == MVT::i32 && "LowerCTLS only supports i32");
3459 SDValue Ffbh = DAG.getNode(
3460 Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
3461 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_sffbh, DL: SL, VT: MVT::i32), N2: Src);
3462 SDValue Clamped = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: Ffbh,
3463 N2: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32));
3464 return DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: Clamped,
3465 N2: DAG.getAllOnesConstant(DL: SL, VT: MVT::i32));
3466}
3467
3468SDValue AMDGPUTargetLowering::LowerINT_TO_FP16(SDValue Op, SelectionDAG &DAG,
3469 EVT FP16Ty) const {
3470 assert(FP16Ty == MVT::f16 || FP16Ty == MVT::bf16);
3471 SDLoc SL(Op);
3472 SDValue Src = Op.getOperand(i: 0);
3473 SDValue ToF32 = DAG.getNode(Opcode: Op.getOpcode(), DL: SL, VT: MVT::f32, Operand: Src);
3474 SDValue FPRoundFlag = DAG.getIntPtrConstant(Val: 0, DL: SL, /*isTarget=*/true);
3475 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: FP16Ty, N1: ToF32, N2: FPRoundFlag);
3476}
3477
3478SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3479 bool Signed) const {
3480 // The regular method converting a 64-bit integer to float roughly consists of
3481 // 2 steps: normalization and rounding. In fact, after normalization, the
3482 // conversion from a 64-bit integer to a float is essentially the same as the
3483 // one from a 32-bit integer. The only difference is that it has more
3484 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3485 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3486 // converted into the correct float number. The basic steps for the unsigned
3487 // conversion are illustrated in the following pseudo code:
3488 //
3489 // f32 uitofp(i64 u) {
3490 // i32 hi, lo = split(u);
3491 // // Only count the leading zeros in hi as we have native support of the
3492 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3493 // // reduced to a 32-bit one automatically.
3494 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3495 // u <<= shamt;
3496 // hi, lo = split(u);
3497 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3498 // // convert it as a 32-bit integer and scale the result back.
3499 // return uitofp(hi) * 2^(32 - shamt);
3500 // }
3501 //
3502 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3503 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3504 // converted instead followed by negation based its sign bit.
3505
3506 SDLoc SL(Op);
3507 SDValue Src = Op.getOperand(i: 0);
3508
3509 SDValue Lo, Hi;
3510 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3511 SDValue Sign;
3512 SDValue ShAmt;
3513 if (Signed && Subtarget->isGCN()) {
3514 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3515 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3516 // account. That is, the maximal shift is
3517 // - 32 if Lo and Hi have opposite signs;
3518 // - 33 if Lo and Hi have the same sign.
3519 //
3520 // Or, MaxShAmt = 33 + OppositeSign, where
3521 //
3522 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3523 // - -1 if Lo and Hi have opposite signs; and
3524 // - 0 otherwise.
3525 //
3526 // All in all, ShAmt is calculated as
3527 //
3528 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3529 //
3530 // or
3531 //
3532 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3533 //
3534 // to reduce the critical path.
3535 SDValue OppositeSign = DAG.getNode(
3536 Opcode: ISD::SRA, DL: SL, VT: MVT::i32, N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: Lo, N2: Hi),
3537 N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
3538 SDValue MaxShAmt =
3539 DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32),
3540 N2: OppositeSign);
3541 // Count the leading sign bits.
3542 ShAmt = DAG.getNode(
3543 Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
3544 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_sffbh, DL: SL, VT: MVT::i32), N2: Hi);
3545 // Different from unsigned conversion, the shift should be one bit less to
3546 // preserve the sign bit.
3547 ShAmt = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: ShAmt,
3548 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
3549 ShAmt = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: ShAmt, N2: MaxShAmt);
3550 } else {
3551 if (Signed) {
3552 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3553 // absolute value first.
3554 Sign = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i64, N1: Src,
3555 N2: DAG.getConstant(Val: 63, DL: SL, VT: MVT::i64));
3556 SDValue Abs =
3557 DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i64,
3558 N1: DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i64, N1: Src, N2: Sign), N2: Sign);
3559 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Abs, DAG);
3560 }
3561 // Count the leading zeros.
3562 ShAmt = DAG.getNode(Opcode: ISD::CTLZ, DL: SL, VT: MVT::i32, Operand: Hi);
3563 // The shift amount for signed integers is [0, 32].
3564 }
3565 // Normalize the given 64-bit integer.
3566 SDValue Norm = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i64, N1: Src, N2: ShAmt);
3567 // Split it again.
3568 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Norm, DAG);
3569 // Calculate the adjust bit for rounding.
3570 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3571 SDValue Adjust = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32,
3572 N1: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32), N2: Lo);
3573 // Get the 32-bit normalized integer.
3574 Norm = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Hi, N2: Adjust);
3575 // Convert the normalized 32-bit integer into f32.
3576
3577 bool UseLDEXP = isOperationLegal(Op: ISD::FLDEXP, VT: MVT::f32);
3578 unsigned Opc = Signed && UseLDEXP ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3579 SDValue FVal = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::f32, Operand: Norm);
3580
3581 // Finally, need to scale back the converted floating number as the original
3582 // 64-bit integer is converted as a 32-bit one.
3583 ShAmt = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32),
3584 N2: ShAmt);
3585 // On GCN, use LDEXP directly.
3586 if (UseLDEXP)
3587 return DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT: MVT::f32, N1: FVal, N2: ShAmt);
3588
3589 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3590 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3591 // exponent is enough to avoid overflowing into the sign bit.
3592 SDValue Exp = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: ShAmt,
3593 N2: DAG.getConstant(Val: 23, DL: SL, VT: MVT::i32));
3594 SDValue IVal =
3595 DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32,
3596 N1: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: FVal), N2: Exp);
3597 if (Signed) {
3598 // Set the sign bit.
3599 Sign = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32,
3600 N1: DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Sign),
3601 N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
3602 IVal = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: IVal, N2: Sign);
3603 }
3604 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: IVal);
3605}
3606
3607SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3608 bool Signed) const {
3609 SDLoc SL(Op);
3610 SDValue Src = Op.getOperand(i: 0);
3611
3612 SDValue Lo, Hi;
3613 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3614
3615 SDValue CvtHi = DAG.getNode(Opcode: Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3616 DL: SL, VT: MVT::f64, Operand: Hi);
3617
3618 SDValue CvtLo = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL: SL, VT: MVT::f64, Operand: Lo);
3619
3620 SDValue LdExp = DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT: MVT::f64, N1: CvtHi,
3621 N2: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32));
3622 // TODO: Should this propagate fast-math-flags?
3623 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: LdExp, N2: CvtLo);
3624}
3625
3626SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3627 SelectionDAG &DAG) const {
3628 // TODO: Factor out code common with LowerSINT_TO_FP.
3629 EVT DestVT = Op.getValueType();
3630 SDValue Src = Op.getOperand(i: 0);
3631 EVT SrcVT = Src.getValueType();
3632
3633 if (SrcVT == MVT::i16) {
3634 if (DestVT == MVT::f16)
3635 return Op;
3636 SDLoc DL(Op);
3637
3638 // Promote src to i32
3639 SDValue Ext = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: Src);
3640 return DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: DestVT, Operand: Ext);
3641 }
3642
3643 if (DestVT == MVT::bf16 || DestVT == MVT::f16)
3644 return LowerINT_TO_FP16(Op, DAG, FP16Ty: DestVT);
3645
3646 if (SrcVT != MVT::i64)
3647 return Op;
3648
3649 if (DestVT == MVT::f32)
3650 return LowerINT_TO_FP32(Op, DAG, Signed: false);
3651
3652 assert(DestVT == MVT::f64);
3653 return LowerINT_TO_FP64(Op, DAG, Signed: false);
3654}
3655
3656SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3657 SelectionDAG &DAG) const {
3658 EVT DestVT = Op.getValueType();
3659
3660 SDValue Src = Op.getOperand(i: 0);
3661 EVT SrcVT = Src.getValueType();
3662
3663 if (SrcVT == MVT::i16) {
3664 if (DestVT == MVT::f16)
3665 return Op;
3666
3667 SDLoc DL(Op);
3668 // Promote src to i32
3669 SDValue Ext = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: MVT::i32, Operand: Src);
3670 return DAG.getNode(Opcode: ISD::SINT_TO_FP, DL, VT: DestVT, Operand: Ext);
3671 }
3672
3673 if (DestVT == MVT::bf16 || DestVT == MVT::f16)
3674 return LowerINT_TO_FP16(Op, DAG, FP16Ty: DestVT);
3675
3676 if (SrcVT != MVT::i64)
3677 return Op;
3678
3679 // TODO: Factor out code common with LowerUINT_TO_FP.
3680
3681 if (DestVT == MVT::f32)
3682 return LowerINT_TO_FP32(Op, DAG, Signed: true);
3683
3684 assert(DestVT == MVT::f64);
3685 return LowerINT_TO_FP64(Op, DAG, Signed: true);
3686}
3687
3688SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3689 bool Signed) const {
3690 SDLoc SL(Op);
3691
3692 SDValue Src = Op.getOperand(i: 0);
3693 EVT SrcVT = Src.getValueType();
3694
3695 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3696
3697 // The basic idea of converting a floating point number into a pair of 32-bit
3698 // integers is illustrated as follows:
3699 //
3700 // tf := trunc(val);
3701 // hif := floor(tf * 2^-32);
3702 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3703 // hi := fptoi(hif);
3704 // lo := fptoi(lof);
3705 //
3706 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: SrcVT, Operand: Src);
3707 SDValue Sign;
3708 if (Signed && SrcVT == MVT::f32) {
3709 // However, a 32-bit floating point number has only 23 bits mantissa and
3710 // it's not enough to hold all the significant bits of `lof` if val is
3711 // negative. To avoid the loss of precision, We need to take the absolute
3712 // value after truncating and flip the result back based on the original
3713 // signedness.
3714 Sign = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i32,
3715 N1: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Trunc),
3716 N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
3717 Trunc = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: SrcVT, Operand: Trunc);
3718 }
3719
3720 SDValue K0, K1;
3721 if (SrcVT == MVT::f64) {
3722 K0 = DAG.getConstantFP(
3723 Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), DL: SL,
3724 VT: SrcVT);
3725 K1 = DAG.getConstantFP(
3726 Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), DL: SL,
3727 VT: SrcVT);
3728 } else {
3729 K0 = DAG.getConstantFP(
3730 Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), DL: SL, VT: SrcVT);
3731 K1 = DAG.getConstantFP(
3732 Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), DL: SL, VT: SrcVT);
3733 }
3734 // TODO: Should this propagate fast-math-flags?
3735 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: SrcVT, N1: Trunc, N2: K0);
3736
3737 SDValue FloorMul = DAG.getNode(Opcode: ISD::FFLOOR, DL: SL, VT: SrcVT, Operand: Mul);
3738
3739 SDValue Fma = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: SrcVT, N1: FloorMul, N2: K1, N3: Trunc);
3740
3741 SDValue Hi = DAG.getNode(Opcode: (Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3742 : ISD::FP_TO_UINT,
3743 DL: SL, VT: MVT::i32, Operand: FloorMul);
3744 SDValue Lo = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL: SL, VT: MVT::i32, Operand: Fma);
3745
3746 SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64,
3747 Operand: DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Lo, Hi}));
3748
3749 if (Signed && SrcVT == MVT::f32) {
3750 assert(Sign);
3751 // Flip the result based on the signedness, which is either all 0s or 1s.
3752 Sign = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64,
3753 Operand: DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Sign, Sign}));
3754 // r := xor(r, sign) - sign;
3755 Result =
3756 DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i64,
3757 N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i64, N1: Result, N2: Sign), N2: Sign);
3758 }
3759
3760 return Result;
3761}
3762
3763SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3764 SDLoc DL(Op);
3765 SDValue N0 = Op.getOperand(i: 0);
3766
3767 // Convert to target node to get known bits
3768 if (N0.getValueType() == MVT::f32)
3769 return DAG.getNode(Opcode: AMDGPUISD::FP_TO_FP16, DL, VT: Op.getValueType(), Operand: N0);
3770
3771 if (Op->getFlags().hasApproximateFuncs()) {
3772 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3773 return SDValue();
3774 }
3775
3776 return LowerF64ToF16Safe(Src: N0, DL, DAG);
3777}
3778
3779// return node in i32
3780SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL,
3781 SelectionDAG &DAG) const {
3782 assert(Src.getSimpleValueType() == MVT::f64);
3783
3784 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3785 // TODO: We can generate better code for True16.
3786 const unsigned ExpMask = 0x7ff;
3787 const unsigned ExpBiasf64 = 1023;
3788 const unsigned ExpBiasf16 = 15;
3789 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
3790 SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
3791 SDValue U = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: Src);
3792 SDValue UH = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64, N1: U,
3793 N2: DAG.getConstant(Val: 32, DL, VT: MVT::i64));
3794 UH = DAG.getZExtOrTrunc(Op: UH, DL, VT: MVT::i32);
3795 U = DAG.getZExtOrTrunc(Op: U, DL, VT: MVT::i32);
3796 SDValue E = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3797 N2: DAG.getConstant(Val: 20, DL, VT: MVT::i64));
3798 E = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: E,
3799 N2: DAG.getConstant(Val: ExpMask, DL, VT: MVT::i32));
3800 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3801 // add the f16 bias (15) to get the biased exponent for the f16 format.
3802 E = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: E,
3803 N2: DAG.getConstant(Val: -ExpBiasf64 + ExpBiasf16, DL, VT: MVT::i32));
3804
3805 SDValue M = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3806 N2: DAG.getConstant(Val: 8, DL, VT: MVT::i32));
3807 M = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: M,
3808 N2: DAG.getConstant(Val: 0xffe, DL, VT: MVT::i32));
3809
3810 SDValue MaskedSig = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: UH,
3811 N2: DAG.getConstant(Val: 0x1ff, DL, VT: MVT::i32));
3812 MaskedSig = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: MaskedSig, N2: U);
3813
3814 SDValue Lo40Set = DAG.getSelectCC(DL, LHS: MaskedSig, RHS: Zero, True: Zero, False: One, Cond: ISD::SETEQ);
3815 M = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M, N2: Lo40Set);
3816
3817 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3818 SDValue I = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32,
3819 N1: DAG.getSelectCC(DL, LHS: M, RHS: Zero, True: DAG.getConstant(Val: 0x0200, DL, VT: MVT::i32),
3820 False: Zero, Cond: ISD::SETNE), N2: DAG.getConstant(Val: 0x7c00, DL, VT: MVT::i32));
3821
3822 // N = M | (E << 12);
3823 SDValue N = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M,
3824 N2: DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: E,
3825 N2: DAG.getConstant(Val: 12, DL, VT: MVT::i32)));
3826
3827 // B = clamp(1-E, 0, 13);
3828 SDValue OneSubExp = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i32,
3829 N1: One, N2: E);
3830 SDValue B = DAG.getNode(Opcode: ISD::SMAX, DL, VT: MVT::i32, N1: OneSubExp, N2: Zero);
3831 B = DAG.getNode(Opcode: ISD::SMIN, DL, VT: MVT::i32, N1: B,
3832 N2: DAG.getConstant(Val: 13, DL, VT: MVT::i32));
3833
3834 SDValue SigSetHigh = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M,
3835 N2: DAG.getConstant(Val: 0x1000, DL, VT: MVT::i32));
3836
3837 SDValue D = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: SigSetHigh, N2: B);
3838 SDValue D0 = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: D, N2: B);
3839 SDValue D1 = DAG.getSelectCC(DL, LHS: D0, RHS: SigSetHigh, True: One, False: Zero, Cond: ISD::SETNE);
3840 D = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: D, N2: D1);
3841
3842 SDValue V = DAG.getSelectCC(DL, LHS: E, RHS: One, True: D, False: N, Cond: ISD::SETLT);
3843 SDValue VLow3 = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: V,
3844 N2: DAG.getConstant(Val: 0x7, DL, VT: MVT::i32));
3845 V = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: V,
3846 N2: DAG.getConstant(Val: 2, DL, VT: MVT::i32));
3847 SDValue V0 = DAG.getSelectCC(DL, LHS: VLow3, RHS: DAG.getConstant(Val: 3, DL, VT: MVT::i32),
3848 True: One, False: Zero, Cond: ISD::SETEQ);
3849 SDValue V1 = DAG.getSelectCC(DL, LHS: VLow3, RHS: DAG.getConstant(Val: 5, DL, VT: MVT::i32),
3850 True: One, False: Zero, Cond: ISD::SETGT);
3851 V1 = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: V0, N2: V1);
3852 V = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: V, N2: V1);
3853
3854 V = DAG.getSelectCC(DL, LHS: E, RHS: DAG.getConstant(Val: 30, DL, VT: MVT::i32),
3855 True: DAG.getConstant(Val: 0x7c00, DL, VT: MVT::i32), False: V, Cond: ISD::SETGT);
3856 V = DAG.getSelectCC(DL, LHS: E, RHS: DAG.getConstant(Val: 1039, DL, VT: MVT::i32),
3857 True: I, False: V, Cond: ISD::SETEQ);
3858
3859 // Extract the sign bit.
3860 SDValue Sign = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3861 N2: DAG.getConstant(Val: 16, DL, VT: MVT::i32));
3862 Sign = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: Sign,
3863 N2: DAG.getConstant(Val: 0x8000, DL, VT: MVT::i32));
3864
3865 return DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Sign, N2: V);
3866}
3867
3868SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3869 SelectionDAG &DAG) const {
3870 SDValue Src = Op.getOperand(i: 0);
3871 unsigned OpOpcode = Op.getOpcode();
3872 EVT SrcVT = Src.getValueType();
3873 EVT DestVT = Op.getValueType();
3874
3875 // Will be selected natively
3876 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3877 return Op;
3878
3879 if (SrcVT == MVT::bf16 || (SrcVT == MVT::f16 && DestVT == MVT::i32)) {
3880 SDLoc DL(Op);
3881 SDValue PromotedSrc = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: Src);
3882 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DestVT, Operand: PromotedSrc);
3883 }
3884
3885 // Promote i16 to i32
3886 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3887 SDLoc DL(Op);
3888
3889 SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, Operand: Src);
3890 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToInt32);
3891 }
3892
3893 if (DestVT != MVT::i64)
3894 return Op;
3895
3896 if (SrcVT == MVT::f16 ||
3897 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3898 SDLoc DL(Op);
3899
3900 SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, Operand: Src);
3901 unsigned Ext =
3902 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3903 return DAG.getNode(Opcode: Ext, DL, VT: MVT::i64, Operand: FpToInt32);
3904 }
3905
3906 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3907 return LowerFP_TO_INT64(Op, DAG, Signed: OpOpcode == ISD::FP_TO_SINT);
3908
3909 return SDValue();
3910}
3911
3912SDValue AMDGPUTargetLowering::LowerFP_TO_INT_SAT(const SDValue Op,
3913 SelectionDAG &DAG) const {
3914 SDValue Src = Op.getOperand(i: 0);
3915 unsigned OpOpcode = Op.getOpcode();
3916 EVT SrcVT = Src.getValueType();
3917 EVT DstVT = Op.getValueType();
3918 SDValue SatVTOp = Op.getNode()->getOperand(Num: 1);
3919 EVT SatVT = cast<VTSDNode>(Val&: SatVTOp)->getVT();
3920 SDLoc DL(Op);
3921
3922 uint64_t DstWidth = DstVT.getScalarSizeInBits();
3923 uint64_t SatWidth = SatVT.getScalarSizeInBits();
3924 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
3925
3926 // Scalar cases will be selected natively to v_cvt_/s_cvt_ instructions.
3927 // v2f32 -> v2i16 will be selected natively to v_cvt_pk_[iu]16_f32.
3928 if (SatWidth == DstWidth) {
3929 if ((DstVT == MVT::i32 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
3930 (DstVT == MVT::i16 && (SrcVT == MVT::f16 || SrcVT == MVT::f32)) ||
3931 (DstVT == MVT::v2i16 && SrcVT == MVT::v2f32))
3932 return Op;
3933 }
3934
3935 // Vectors can only be selected natively.
3936 if (DstVT.isVector())
3937 return SDValue();
3938
3939 // Perform all saturation at selected width (i16 or i32) and truncate
3940 if (SatWidth < DstWidth && SatWidth <= 32) {
3941 // For f16 conversion with sub-i16 saturation perform saturation
3942 // at i16, if available in the target. This removes the need for extra f16
3943 // to f32 conversion. For all the others use i32.
3944 MVT ResultVT =
3945 Subtarget->has16BitInsts() && SrcVT == MVT::f16 && SatWidth < 16
3946 ? MVT::i16
3947 : MVT::i32;
3948
3949 const SDValue ResultVTOp = DAG.getValueType(ResultVT);
3950 const uint64_t ResultWidth = ResultVT.getScalarSizeInBits();
3951
3952 // First, convert input float into selected integer (i16 or i32)
3953 SDValue FpToInt = DAG.getNode(Opcode: OpOpcode, DL, VT: ResultVT, N1: Src, N2: ResultVTOp);
3954 SDValue IntSatVal;
3955
3956 // Then, clamp at the saturation width using either i16 or i32 instructions
3957 if (OpOpcode == ISD::FP_TO_SINT_SAT) {
3958 SDValue MinConst = DAG.getConstant(
3959 Val: APInt::getSignedMaxValue(numBits: SatWidth).sext(width: ResultWidth), DL, VT: ResultVT);
3960 SDValue MaxConst = DAG.getConstant(
3961 Val: APInt::getSignedMinValue(numBits: SatWidth).sext(width: ResultWidth), DL, VT: ResultVT);
3962 SDValue MinVal = DAG.getNode(Opcode: ISD::SMIN, DL, VT: ResultVT, N1: FpToInt, N2: MinConst);
3963 IntSatVal = DAG.getNode(Opcode: ISD::SMAX, DL, VT: ResultVT, N1: MinVal, N2: MaxConst);
3964 } else {
3965 SDValue MinConst = DAG.getConstant(
3966 Val: APInt::getMaxValue(numBits: SatWidth).zext(width: ResultWidth), DL, VT: ResultVT);
3967 IntSatVal = DAG.getNode(Opcode: ISD::UMIN, DL, VT: ResultVT, N1: FpToInt, N2: MinConst);
3968 }
3969
3970 // Finally, after saturating at i16 or i32 fit into the destination type
3971 return DAG.getExtOrTrunc(IsSigned: OpOpcode == ISD::FP_TO_SINT_SAT, Op: IntSatVal, DL,
3972 VT: DstVT);
3973 }
3974
3975 // SatWidth == DstWidth or SatWidth > 32
3976
3977 // Saturate at i32 for i64 dst and f16/bf16 src (will invoke f16 promotion
3978 // below)
3979 if (DstVT == MVT::i64 &&
3980 (SrcVT == MVT::f16 || SrcVT == MVT::bf16 ||
3981 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP))) {
3982 const SDValue Int32VTOp = DAG.getValueType(MVT::i32);
3983 return DAG.getNode(Opcode: OpOpcode, DL, VT: DstVT, N1: Src, N2: Int32VTOp);
3984 }
3985
3986 // Promote f16/bf16 src to f32 for i32 conversion
3987 if (DstVT == MVT::i32 && (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
3988 SDValue PromotedSrc = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: Src);
3989 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: PromotedSrc, N2: SatVTOp);
3990 }
3991
3992 // For DstWidth < 16, promote i1 and i8 dst to i16 (if legal) with sub-i16
3993 // saturation. For DstWidth == 16, promote i16 dst to i32 with sub-i32
3994 // saturation; this covers i16.f32 and i16.f64
3995 if (DstWidth < 32) {
3996 // Note: this triggers SatWidth < DstWidth above to generate saturated
3997 // truncate by requesting MVT::i16/i32 destination with SatWidth < 16/32.
3998 MVT PromoteVT =
3999 (DstWidth < 16 && Subtarget->has16BitInsts()) ? MVT::i16 : MVT::i32;
4000 SDValue FpToInt = DAG.getNode(Opcode: OpOpcode, DL, VT: PromoteVT, N1: Src, N2: SatVTOp);
4001 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: FpToInt);
4002 }
4003
4004 // TODO: can we implement i64 dst for f32/f64?
4005
4006 return SDValue();
4007}
4008
4009SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
4010 SelectionDAG &DAG) const {
4011 EVT ExtraVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
4012 MVT VT = Op.getSimpleValueType();
4013 MVT ScalarVT = VT.getScalarType();
4014
4015 assert(VT.isVector());
4016
4017 SDValue Src = Op.getOperand(i: 0);
4018 SDLoc DL(Op);
4019
4020 // TODO: Don't scalarize on Evergreen?
4021 unsigned NElts = VT.getVectorNumElements();
4022 SmallVector<SDValue, 8> Args;
4023 DAG.ExtractVectorElements(Op: Src, Args, Start: 0, Count: NElts);
4024
4025 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
4026 for (unsigned I = 0; I < NElts; ++I)
4027 Args[I] = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: ScalarVT, N1: Args[I], N2: VTOp);
4028
4029 return DAG.getBuildVector(VT, DL, Ops: Args);
4030}
4031
4032//===----------------------------------------------------------------------===//
4033// Custom DAG optimizations
4034//===----------------------------------------------------------------------===//
4035
4036static bool isU24(SDValue Op, SelectionDAG &DAG) {
4037 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
4038}
4039
4040static bool isI24(SDValue Op, SelectionDAG &DAG) {
4041 EVT VT = Op.getValueType();
4042 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
4043 // as unsigned 24-bit values.
4044 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
4045}
4046
4047static SDValue simplifyMul24(SDNode *Node24,
4048 TargetLowering::DAGCombinerInfo &DCI) {
4049 SelectionDAG &DAG = DCI.DAG;
4050 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4051 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
4052
4053 SDValue LHS = IsIntrin ? Node24->getOperand(Num: 1) : Node24->getOperand(Num: 0);
4054 SDValue RHS = IsIntrin ? Node24->getOperand(Num: 2) : Node24->getOperand(Num: 1);
4055 unsigned NewOpcode = Node24->getOpcode();
4056 if (IsIntrin) {
4057 unsigned IID = Node24->getConstantOperandVal(Num: 0);
4058 switch (IID) {
4059 case Intrinsic::amdgcn_mul_i24:
4060 NewOpcode = AMDGPUISD::MUL_I24;
4061 break;
4062 case Intrinsic::amdgcn_mul_u24:
4063 NewOpcode = AMDGPUISD::MUL_U24;
4064 break;
4065 case Intrinsic::amdgcn_mulhi_i24:
4066 NewOpcode = AMDGPUISD::MULHI_I24;
4067 break;
4068 case Intrinsic::amdgcn_mulhi_u24:
4069 NewOpcode = AMDGPUISD::MULHI_U24;
4070 break;
4071 default:
4072 llvm_unreachable("Expected 24-bit mul intrinsic");
4073 }
4074 }
4075
4076 APInt Demanded = APInt::getLowBitsSet(numBits: LHS.getValueSizeInBits(), loBitsSet: 24);
4077
4078 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
4079 // the operands to have other uses, but will only perform simplifications that
4080 // involve bypassing some nodes for this user.
4081 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(Op: LHS, DemandedBits: Demanded, DAG);
4082 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(Op: RHS, DemandedBits: Demanded, DAG);
4083 if (DemandedLHS || DemandedRHS)
4084 return DAG.getNode(Opcode: NewOpcode, DL: SDLoc(Node24), VTList: Node24->getVTList(),
4085 N1: DemandedLHS ? DemandedLHS : LHS,
4086 N2: DemandedRHS ? DemandedRHS : RHS);
4087
4088 // Now try SimplifyDemandedBits which can simplify the nodes used by our
4089 // operands if this node is the only user.
4090 if (TLI.SimplifyDemandedBits(Op: LHS, DemandedBits: Demanded, DCI))
4091 return SDValue(Node24, 0);
4092 if (TLI.SimplifyDemandedBits(Op: RHS, DemandedBits: Demanded, DCI))
4093 return SDValue(Node24, 0);
4094
4095 return SDValue();
4096}
4097
4098template <typename IntTy>
4099static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
4100 uint32_t Width, const SDLoc &DL) {
4101 if (Width + Offset < 32) {
4102 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
4103 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
4104 if constexpr (std::is_signed_v<IntTy>) {
4105 return DAG.getSignedConstant(Val: Result, DL, VT: MVT::i32);
4106 } else {
4107 return DAG.getConstant(Result, DL, MVT::i32);
4108 }
4109 }
4110
4111 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
4112}
4113
4114static bool hasVolatileUser(SDNode *Val) {
4115 for (SDNode *U : Val->users()) {
4116 if (MemSDNode *M = dyn_cast<MemSDNode>(Val: U)) {
4117 if (M->isVolatile())
4118 return true;
4119 }
4120 }
4121
4122 return false;
4123}
4124
4125bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
4126 // i32 vectors are the canonical memory type.
4127 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
4128 return false;
4129
4130 if (!VT.isByteSized())
4131 return false;
4132
4133 unsigned Size = VT.getStoreSize();
4134
4135 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
4136 return false;
4137
4138 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
4139 return false;
4140
4141 return true;
4142}
4143
4144// Replace load of an illegal type with a bitcast from a load of a friendlier
4145// type.
4146SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
4147 DAGCombinerInfo &DCI) const {
4148 if (!DCI.isBeforeLegalize())
4149 return SDValue();
4150
4151 LoadSDNode *LN = cast<LoadSDNode>(Val: N);
4152 if (!LN->isSimple() || !ISD::isNormalLoad(N: LN) || hasVolatileUser(Val: LN))
4153 return SDValue();
4154
4155 SDLoc SL(N);
4156 SelectionDAG &DAG = DCI.DAG;
4157 EVT VT = LN->getMemoryVT();
4158
4159 unsigned Size = VT.getStoreSize();
4160 Align Alignment = LN->getAlign();
4161 if (Alignment < Size && isTypeLegal(VT)) {
4162 unsigned IsFast;
4163 unsigned AS = LN->getAddressSpace();
4164
4165 // Expand unaligned loads earlier than legalization. Due to visitation order
4166 // problems during legalization, the emitted instructions to pack and unpack
4167 // the bytes again are not eliminated in the case of an unaligned copy.
4168 if (!allowsMisalignedMemoryAccesses(
4169 VT, AddrSpace: AS, Alignment, Flags: LN->getMemOperand()->getFlags(), &IsFast)) {
4170 if (VT.isVector())
4171 return SplitVectorLoad(Op: SDValue(LN, 0), DAG);
4172
4173 SDValue Ops[2];
4174 std::tie(args&: Ops[0], args&: Ops[1]) = expandUnalignedLoad(LD: LN, DAG);
4175
4176 return DAG.getMergeValues(Ops, dl: SDLoc(N));
4177 }
4178
4179 if (!IsFast)
4180 return SDValue();
4181 }
4182
4183 if (!shouldCombineMemoryType(VT))
4184 return SDValue();
4185
4186 EVT NewVT = getEquivalentMemType(Ctx&: *DAG.getContext(), VT);
4187
4188 SDValue NewLoad
4189 = DAG.getLoad(VT: NewVT, dl: SL, Chain: LN->getChain(),
4190 Ptr: LN->getBasePtr(), MMO: LN->getMemOperand());
4191
4192 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewLoad);
4193 DCI.CombineTo(N, Res0: BC, Res1: NewLoad.getValue(R: 1));
4194 return SDValue(N, 0);
4195}
4196
4197// Replace store of an illegal type with a store of a bitcast to a friendlier
4198// type.
4199SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
4200 DAGCombinerInfo &DCI) const {
4201 if (!DCI.isBeforeLegalize())
4202 return SDValue();
4203
4204 StoreSDNode *SN = cast<StoreSDNode>(Val: N);
4205 if (!SN->isSimple() || !ISD::isNormalStore(N: SN))
4206 return SDValue();
4207
4208 EVT VT = SN->getMemoryVT();
4209 unsigned Size = VT.getStoreSize();
4210
4211 SDLoc SL(N);
4212 SelectionDAG &DAG = DCI.DAG;
4213 Align Alignment = SN->getAlign();
4214 if (Alignment < Size && isTypeLegal(VT)) {
4215 unsigned IsFast;
4216 unsigned AS = SN->getAddressSpace();
4217
4218 // Expand unaligned stores earlier than legalization. Due to visitation
4219 // order problems during legalization, the emitted instructions to pack and
4220 // unpack the bytes again are not eliminated in the case of an unaligned
4221 // copy.
4222 if (!allowsMisalignedMemoryAccesses(
4223 VT, AddrSpace: AS, Alignment, Flags: SN->getMemOperand()->getFlags(), &IsFast)) {
4224 if (VT.isVector())
4225 return SplitVectorStore(Op: SDValue(SN, 0), DAG);
4226
4227 return expandUnalignedStore(ST: SN, DAG);
4228 }
4229
4230 if (!IsFast)
4231 return SDValue();
4232 }
4233
4234 if (!shouldCombineMemoryType(VT))
4235 return SDValue();
4236
4237 EVT NewVT = getEquivalentMemType(Ctx&: *DAG.getContext(), VT);
4238 SDValue Val = SN->getValue();
4239
4240 // DCI.AddToWorklist(Val.getNode());
4241
4242 bool OtherUses = !Val.hasOneUse();
4243 SDValue CastVal = DAG.getBitcast(VT: NewVT, V: Val);
4244 if (OtherUses) {
4245 SDValue CastBack = DAG.getBitcast(VT, V: CastVal);
4246 DAG.ReplaceAllUsesOfValueWith(From: Val, To: CastBack);
4247 }
4248
4249 return DAG.getStore(Chain: SN->getChain(), dl: SL, Val: CastVal,
4250 Ptr: SN->getBasePtr(), MMO: SN->getMemOperand());
4251}
4252
4253// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
4254// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
4255// issues.
4256SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
4257 DAGCombinerInfo &DCI) const {
4258 SelectionDAG &DAG = DCI.DAG;
4259 SDValue N0 = N->getOperand(Num: 0);
4260
4261 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
4262 // (vt2 (truncate (assertzext vt0:x, vt1)))
4263 if (N0.getOpcode() == ISD::TRUNCATE) {
4264 SDValue N1 = N->getOperand(Num: 1);
4265 EVT ExtVT = cast<VTSDNode>(Val&: N1)->getVT();
4266 SDLoc SL(N);
4267
4268 SDValue Src = N0.getOperand(i: 0);
4269 EVT SrcVT = Src.getValueType();
4270 if (SrcVT.bitsGE(VT: ExtVT)) {
4271 SDValue NewInReg = DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: SrcVT, N1: Src, N2: N1);
4272 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: N->getValueType(ResNo: 0), Operand: NewInReg);
4273 }
4274 }
4275
4276 return SDValue();
4277}
4278
4279SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
4280 SDNode *N, DAGCombinerInfo &DCI) const {
4281 unsigned IID = N->getConstantOperandVal(Num: 0);
4282 switch (IID) {
4283 case Intrinsic::amdgcn_mul_i24:
4284 case Intrinsic::amdgcn_mul_u24:
4285 case Intrinsic::amdgcn_mulhi_i24:
4286 case Intrinsic::amdgcn_mulhi_u24:
4287 return simplifyMul24(Node24: N, DCI);
4288 case Intrinsic::amdgcn_fract:
4289 case Intrinsic::amdgcn_rsq:
4290 case Intrinsic::amdgcn_rcp_legacy:
4291 case Intrinsic::amdgcn_rsq_legacy:
4292 case Intrinsic::amdgcn_rsq_clamp:
4293 case Intrinsic::amdgcn_tanh:
4294 case Intrinsic::amdgcn_prng_b32: {
4295 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
4296 SDValue Src = N->getOperand(Num: 1);
4297 return Src.isUndef() ? Src : SDValue();
4298 }
4299 case Intrinsic::amdgcn_frexp_exp: {
4300 // frexp_exp (fneg x) -> frexp_exp x
4301 // frexp_exp (fabs x) -> frexp_exp x
4302 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4303 SDValue Src = N->getOperand(Num: 1);
4304 SDValue PeekSign = peekFPSignOps(Val: Src);
4305 if (PeekSign == Src)
4306 return SDValue();
4307 return SDValue(DCI.DAG.UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), Op2: PeekSign),
4308 0);
4309 }
4310 default:
4311 return SDValue();
4312 }
4313}
4314
4315/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4316/// binary operation \p Opc to it with the corresponding constant operands.
4317SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
4318 DAGCombinerInfo &DCI, const SDLoc &SL,
4319 unsigned Opc, SDValue LHS,
4320 uint32_t ValLo, uint32_t ValHi) const {
4321 SelectionDAG &DAG = DCI.DAG;
4322 SDValue Lo, Hi;
4323 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: LHS, DAG);
4324
4325 SDValue LoRHS = DAG.getConstant(Val: ValLo, DL: SL, VT: MVT::i32);
4326 SDValue HiRHS = DAG.getConstant(Val: ValHi, DL: SL, VT: MVT::i32);
4327
4328 SDValue LoAnd = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, N1: Lo, N2: LoRHS);
4329 SDValue HiAnd = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, N1: Hi, N2: HiRHS);
4330
4331 // Re-visit the ands. It's possible we eliminated one of them and it could
4332 // simplify the vector.
4333 DCI.AddToWorklist(N: Lo.getNode());
4334 DCI.AddToWorklist(N: Hi.getNode());
4335
4336 SDValue Vec = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {LoAnd, HiAnd});
4337 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
4338}
4339
4340SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
4341 DAGCombinerInfo &DCI) const {
4342 EVT VT = N->getValueType(ResNo: 0);
4343 SDValue LHS = N->getOperand(Num: 0);
4344 SDValue RHS = N->getOperand(Num: 1);
4345 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
4346 SDLoc SL(N);
4347 SelectionDAG &DAG = DCI.DAG;
4348
4349 unsigned RHSVal;
4350 if (CRHS) {
4351 RHSVal = CRHS->getZExtValue();
4352 if (!RHSVal)
4353 return LHS;
4354
4355 switch (LHS->getOpcode()) {
4356 default:
4357 break;
4358 case ISD::ZERO_EXTEND:
4359 case ISD::SIGN_EXTEND:
4360 case ISD::ANY_EXTEND: {
4361 SDValue X = LHS->getOperand(Num: 0);
4362
4363 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4364 isOperationLegal(Op: ISD::BUILD_VECTOR, VT: MVT::v2i16)) {
4365 // Prefer build_vector as the canonical form if packed types are legal.
4366 // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
4367 SDValue Vec = DAG.getBuildVector(
4368 VT: MVT::v2i16, DL: SL,
4369 Ops: {DAG.getConstant(Val: 0, DL: SL, VT: MVT::i16), LHS->getOperand(Num: 0)});
4370 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Vec);
4371 }
4372
4373 // shl (ext x) => zext (shl x), if shift does not overflow int
4374 if (VT != MVT::i64)
4375 break;
4376 KnownBits Known = DAG.computeKnownBits(Op: X);
4377 unsigned LZ = Known.countMinLeadingZeros();
4378 if (LZ < RHSVal)
4379 break;
4380 EVT XVT = X.getValueType();
4381 SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: XVT, N1: X, N2: SDValue(CRHS, 0));
4382 return DAG.getZExtOrTrunc(Op: Shl, DL: SL, VT);
4383 }
4384 }
4385 }
4386
4387 if (VT.getScalarType() != MVT::i64)
4388 return SDValue();
4389
4390 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4391 // common case, splitting this into a move and a 32-bit shift is faster and
4392 // the same code size.
4393 KnownBits Known = DAG.computeKnownBits(Op: RHS);
4394
4395 EVT ElementType = VT.getScalarType();
4396 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
4397 EVT TargetType = VT.changeElementType(Context&: *DAG.getContext(), EltVT: TargetScalarType);
4398
4399 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4400 return SDValue();
4401 SDValue ShiftAmt;
4402
4403 if (CRHS) {
4404 ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
4405 VT: TargetType);
4406 } else {
4407 SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
4408 const SDValue ShiftMask =
4409 DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
4410 // This AND instruction will clamp out of bounds shift values.
4411 // It will also be removed during later instruction selection.
4412 ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
4413 }
4414
4415 SDValue Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: LHS);
4416 SDValue NewShift =
4417 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: TargetType, N1: Lo, N2: ShiftAmt, Flags: N->getFlags());
4418
4419 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: TargetScalarType);
4420 SDValue Vec;
4421
4422 if (VT.isVector()) {
4423 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
4424 unsigned NElts = TargetType.getVectorNumElements();
4425 SmallVector<SDValue, 8> HiOps;
4426 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4427
4428 DAG.ExtractVectorElements(Op: NewShift, Args&: HiOps, Start: 0, Count: NElts);
4429 for (unsigned I = 0; I != NElts; ++I)
4430 HiAndLoOps[2 * I + 1] = HiOps[I];
4431 Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
4432 } else {
4433 EVT ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
4434 Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {Zero, NewShift});
4435 }
4436 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
4437}
4438
4439SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4440 DAGCombinerInfo &DCI) const {
4441 SDValue RHS = N->getOperand(Num: 1);
4442 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
4443 EVT VT = N->getValueType(ResNo: 0);
4444 SDValue LHS = N->getOperand(Num: 0);
4445 SelectionDAG &DAG = DCI.DAG;
4446 SDLoc SL(N);
4447
4448 if (VT.getScalarType() != MVT::i64)
4449 return SDValue();
4450
4451 // For C >= 32
4452 // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), sra hi_32(x), 31))
4453
4454 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4455 // common case, splitting this into a move and a 32-bit shift is faster and
4456 // the same code size.
4457 KnownBits Known = DAG.computeKnownBits(Op: RHS);
4458
4459 EVT ElementType = VT.getScalarType();
4460 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
4461 EVT TargetType = VT.changeElementType(Context&: *DAG.getContext(), EltVT: TargetScalarType);
4462
4463 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4464 return SDValue();
4465
4466 SDValue ShiftFullAmt =
4467 DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
4468 SDValue ShiftAmt;
4469 if (CRHS) {
4470 unsigned RHSVal = CRHS->getZExtValue();
4471 ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
4472 VT: TargetType);
4473 } else if (Known.getMinValue().getZExtValue() ==
4474 (ElementType.getSizeInBits() - 1)) {
4475 ShiftAmt = ShiftFullAmt;
4476 } else {
4477 SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
4478 const SDValue ShiftMask =
4479 DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
4480 // This AND instruction will clamp out of bounds shift values.
4481 // It will also be removed during later instruction selection.
4482 ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
4483 }
4484
4485 EVT ConcatType;
4486 SDValue Hi;
4487 SDLoc LHSSL(LHS);
4488 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4489 if (VT.isVector()) {
4490 unsigned NElts = TargetType.getVectorNumElements();
4491 ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
4492 SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
4493 SmallVector<SDValue, 8> HiOps(NElts);
4494 SmallVector<SDValue, 16> HiAndLoOps;
4495
4496 DAG.ExtractVectorElements(Op: SplitLHS, Args&: HiAndLoOps, Start: 0, Count: NElts * 2);
4497 for (unsigned I = 0; I != NElts; ++I) {
4498 HiOps[I] = HiAndLoOps[2 * I + 1];
4499 }
4500 Hi = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: LHSSL, VT: TargetType, Ops: HiOps);
4501 } else {
4502 const SDValue One = DAG.getConstant(Val: 1, DL: LHSSL, VT: TargetScalarType);
4503 ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
4504 SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
4505 Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: LHSSL, VT: TargetType, N1: SplitLHS, N2: One);
4506 }
4507
4508 KnownBits KnownLHS = DAG.computeKnownBits(Op: LHS);
4509 SDValue NewShift, HiShift;
4510 if (KnownLHS.isNegative()) {
4511 HiShift = DAG.getAllOnesConstant(DL: SL, VT: TargetType);
4512 NewShift =
4513 DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: TargetType, N1: Hi, N2: ShiftAmt, Flags: N->getFlags());
4514 } else if (CRHS &&
4515 CRHS->getZExtValue() == (ElementType.getSizeInBits() - 1)) {
4516 NewShift = HiShift =
4517 DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: TargetType, N1: Hi, N2: ShiftAmt, Flags: N->getFlags());
4518 } else {
4519 Hi = DAG.getFreeze(V: Hi);
4520 HiShift = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: TargetType, N1: Hi, N2: ShiftFullAmt);
4521 NewShift =
4522 DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: TargetType, N1: Hi, N2: ShiftAmt, Flags: N->getFlags());
4523 }
4524
4525 SDValue Vec;
4526 if (VT.isVector()) {
4527 unsigned NElts = TargetType.getVectorNumElements();
4528 SmallVector<SDValue, 8> HiOps;
4529 SmallVector<SDValue, 8> LoOps;
4530 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);
4531
4532 DAG.ExtractVectorElements(Op: HiShift, Args&: HiOps, Start: 0, Count: NElts);
4533 DAG.ExtractVectorElements(Op: NewShift, Args&: LoOps, Start: 0, Count: NElts);
4534 for (unsigned I = 0; I != NElts; ++I) {
4535 HiAndLoOps[2 * I + 1] = HiOps[I];
4536 HiAndLoOps[2 * I] = LoOps[I];
4537 }
4538 Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
4539 } else {
4540 Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {NewShift, HiShift});
4541 }
4542 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
4543}
4544
4545SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4546 DAGCombinerInfo &DCI) const {
4547 SDValue RHS = N->getOperand(Num: 1);
4548 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
4549 EVT VT = N->getValueType(ResNo: 0);
4550 SDValue LHS = N->getOperand(Num: 0);
4551 SelectionDAG &DAG = DCI.DAG;
4552 SDLoc SL(N);
4553 unsigned RHSVal;
4554
4555 if (CRHS) {
4556 RHSVal = CRHS->getZExtValue();
4557
4558 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
4559 // this improves the ability to match BFE patterns in isel.
4560 if (LHS.getOpcode() == ISD::AND) {
4561 if (auto *Mask = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1))) {
4562 unsigned MaskIdx, MaskLen;
4563 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4564 MaskIdx == RHSVal) {
4565 return DAG.getNode(Opcode: ISD::AND, DL: SL, VT,
4566 N1: DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: LHS.getOperand(i: 0),
4567 N2: N->getOperand(Num: 1)),
4568 N2: DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: LHS.getOperand(i: 1),
4569 N2: N->getOperand(Num: 1)));
4570 }
4571 }
4572 }
4573 }
4574
4575 if (VT.getScalarType() != MVT::i64)
4576 return SDValue();
4577
4578 // for C >= 32
4579 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
4580
4581 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4582 // common case, splitting this into a move and a 32-bit shift is faster and
4583 // the same code size.
4584 KnownBits Known = DAG.computeKnownBits(Op: RHS);
4585
4586 EVT ElementType = VT.getScalarType();
4587 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
4588 EVT TargetType = VT.changeElementType(Context&: *DAG.getContext(), EltVT: TargetScalarType);
4589
4590 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4591 return SDValue();
4592
4593 SDValue ShiftAmt;
4594 if (CRHS) {
4595 ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
4596 VT: TargetType);
4597 } else {
4598 SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
4599 const SDValue ShiftMask =
4600 DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
4601 // This AND instruction will clamp out of bounds shift values.
4602 // It will also be removed during later instruction selection.
4603 ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
4604 }
4605
4606 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: TargetScalarType);
4607 EVT ConcatType;
4608 SDValue Hi;
4609 SDLoc LHSSL(LHS);
4610 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4611 if (VT.isVector()) {
4612 unsigned NElts = TargetType.getVectorNumElements();
4613 ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
4614 SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
4615 SmallVector<SDValue, 8> HiOps(NElts);
4616 SmallVector<SDValue, 16> HiAndLoOps;
4617
4618 DAG.ExtractVectorElements(Op: SplitLHS, Args&: HiAndLoOps, /*Start=*/0, Count: NElts * 2);
4619 for (unsigned I = 0; I != NElts; ++I)
4620 HiOps[I] = HiAndLoOps[2 * I + 1];
4621 Hi = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: LHSSL, VT: TargetType, Ops: HiOps);
4622 } else {
4623 const SDValue One = DAG.getConstant(Val: 1, DL: LHSSL, VT: TargetScalarType);
4624 ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
4625 SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
4626 Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: LHSSL, VT: TargetType, N1: SplitLHS, N2: One);
4627 }
4628
4629 SDValue NewShift =
4630 DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: TargetType, N1: Hi, N2: ShiftAmt, Flags: N->getFlags());
4631
4632 SDValue Vec;
4633 if (VT.isVector()) {
4634 unsigned NElts = TargetType.getVectorNumElements();
4635 SmallVector<SDValue, 8> LoOps;
4636 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4637
4638 DAG.ExtractVectorElements(Op: NewShift, Args&: LoOps, Start: 0, Count: NElts);
4639 for (unsigned I = 0; I != NElts; ++I)
4640 HiAndLoOps[2 * I] = LoOps[I];
4641 Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
4642 } else {
4643 Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {NewShift, Zero});
4644 }
4645 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
4646}
4647
4648SDValue AMDGPUTargetLowering::performTruncateCombine(
4649 SDNode *N, DAGCombinerInfo &DCI) const {
4650 SDLoc SL(N);
4651 SelectionDAG &DAG = DCI.DAG;
4652 EVT VT = N->getValueType(ResNo: 0);
4653 SDValue Src = N->getOperand(Num: 0);
4654
4655 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4656 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4657 SDValue Vec = Src.getOperand(i: 0);
4658 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4659 SDValue Elt0 = Vec.getOperand(i: 0);
4660 EVT EltVT = Elt0.getValueType();
4661 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4662 if (EltVT.isFloatingPoint()) {
4663 Elt0 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL,
4664 VT: EltVT.changeTypeToInteger(), Operand: Elt0);
4665 }
4666
4667 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Elt0);
4668 }
4669 }
4670 }
4671
4672 // Equivalent of above for accessing the high element of a vector as an
4673 // integer operation.
4674 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4675 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4676 if (auto *K = isConstOrConstSplat(N: Src.getOperand(i: 1))) {
4677 SDValue BV = stripBitcast(Val: Src.getOperand(i: 0));
4678 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4679 EVT SrcEltVT = BV.getOperand(i: 0).getValueType();
4680 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4681 unsigned BitIndex = K->getZExtValue();
4682 unsigned PartIndex = BitIndex / SrcEltSize;
4683
4684 if (PartIndex * SrcEltSize == BitIndex &&
4685 PartIndex < BV.getNumOperands()) {
4686 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4687 SDValue SrcElt =
4688 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: SrcEltVT.changeTypeToInteger(),
4689 Operand: BV.getOperand(i: PartIndex));
4690 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: SrcElt);
4691 }
4692 }
4693 }
4694 }
4695 }
4696
4697 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4698 //
4699 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4700 // i16 (trunc (srl (i32 (trunc x), K)))
4701 if (VT.getScalarSizeInBits() < 32) {
4702 EVT SrcVT = Src.getValueType();
4703 if (SrcVT.getScalarSizeInBits() > 32 &&
4704 (Src.getOpcode() == ISD::SRL ||
4705 Src.getOpcode() == ISD::SRA ||
4706 Src.getOpcode() == ISD::SHL)) {
4707 SDValue Amt = Src.getOperand(i: 1);
4708 KnownBits Known = DAG.computeKnownBits(Op: Amt);
4709
4710 // - For left shifts, do the transform as long as the shift
4711 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4712 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4713 // losing information stored in the high bits when truncating.
4714 const unsigned MaxCstSize =
4715 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4716 if (Known.getMaxValue().ule(RHS: MaxCstSize)) {
4717 EVT MidVT = VT.isVector() ?
4718 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
4719 NumElements: VT.getVectorNumElements()) : MVT::i32;
4720
4721 EVT NewShiftVT = getShiftAmountTy(LHSTy: MidVT, DL: DAG.getDataLayout());
4722 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MidVT,
4723 Operand: Src.getOperand(i: 0));
4724 DCI.AddToWorklist(N: Trunc.getNode());
4725
4726 if (Amt.getValueType() != NewShiftVT) {
4727 Amt = DAG.getZExtOrTrunc(Op: Amt, DL: SL, VT: NewShiftVT);
4728 DCI.AddToWorklist(N: Amt.getNode());
4729 }
4730
4731 SDValue ShrunkShift = DAG.getNode(Opcode: Src.getOpcode(), DL: SL, VT: MidVT,
4732 N1: Trunc, N2: Amt);
4733 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: ShrunkShift);
4734 }
4735 }
4736 }
4737
4738 return SDValue();
4739}
4740
4741// We need to specifically handle i64 mul here to avoid unnecessary conversion
4742// instructions. If we only match on the legalized i64 mul expansion,
4743// SimplifyDemandedBits will be unable to remove them because there will be
4744// multiple uses due to the separate mul + mulh[su].
4745static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4746 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4747 if (Size <= 32) {
4748 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4749 return DAG.getNode(Opcode: MulOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4750 }
4751
4752 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4753 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4754
4755 SDValue MulLo = DAG.getNode(Opcode: MulLoOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4756 SDValue MulHi = DAG.getNode(Opcode: MulHiOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4757
4758 return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SL, VT: MVT::i64, N1: MulLo, N2: MulHi);
4759}
4760
4761/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4762/// return SDValue().
4763static SDValue getAddOneOp(const SDNode *V) {
4764 if (V->getOpcode() != ISD::ADD)
4765 return SDValue();
4766
4767 return isOneConstant(V: V->getOperand(Num: 1)) ? V->getOperand(Num: 0) : SDValue();
4768}
4769
4770SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4771 DAGCombinerInfo &DCI) const {
4772 assert(N->getOpcode() == ISD::MUL);
4773 EVT VT = N->getValueType(ResNo: 0);
4774
4775 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4776 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4777 // unnecessarily). isDivergent() is used as an approximation of whether the
4778 // value is in an SGPR.
4779 if (!N->isDivergent())
4780 return SDValue();
4781
4782 unsigned Size = VT.getSizeInBits();
4783 if (VT.isVector() || Size > 64)
4784 return SDValue();
4785
4786 SelectionDAG &DAG = DCI.DAG;
4787 SDLoc DL(N);
4788
4789 SDValue N0 = N->getOperand(Num: 0);
4790 SDValue N1 = N->getOperand(Num: 1);
4791
4792 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4793 // matching.
4794
4795 // mul x, (add y, 1) -> add (mul x, y), x
4796 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4797 SDValue AddOp = getAddOneOp(V: V.getNode());
4798 if (!AddOp)
4799 return SDValue();
4800
4801 if (V.hasOneUse() || all_of(Range: V->users(), P: [](const SDNode *U) -> bool {
4802 return U->getOpcode() == ISD::MUL;
4803 }))
4804 return AddOp;
4805
4806 return SDValue();
4807 };
4808
4809 // FIXME: The selection pattern is not properly checking for commuted
4810 // operands, so we have to place the mul in the LHS
4811 if (SDValue MulOper = IsFoldableAdd(N0)) {
4812 SDValue MulVal = DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1, N2: MulOper);
4813 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: MulVal, N2: N1);
4814 }
4815
4816 if (SDValue MulOper = IsFoldableAdd(N1)) {
4817 SDValue MulVal = DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1: N0, N2: MulOper);
4818 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: MulVal, N2: N0);
4819 }
4820
4821 // There are i16 integer mul/mad.
4822 if (isTypeLegal(VT: MVT::i16) && VT.getScalarType().bitsLE(VT: MVT::i16))
4823 return SDValue();
4824
4825 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4826 // in the source into any_extends if the result of the mul is truncated. Since
4827 // we can assume the high bits are whatever we want, use the underlying value
4828 // to avoid the unknown high bits from interfering.
4829 if (N0.getOpcode() == ISD::ANY_EXTEND)
4830 N0 = N0.getOperand(i: 0);
4831
4832 if (N1.getOpcode() == ISD::ANY_EXTEND)
4833 N1 = N1.getOperand(i: 0);
4834
4835 SDValue Mul;
4836
4837 if (Subtarget->hasMulU24() && isU24(Op: N0, DAG) && isU24(Op: N1, DAG)) {
4838 N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4839 N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4840 Mul = getMul24(DAG, SL: DL, N0, N1, Size, Signed: false);
4841 } else if (Subtarget->hasMulI24() && isI24(Op: N0, DAG) && isI24(Op: N1, DAG)) {
4842 N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4843 N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4844 Mul = getMul24(DAG, SL: DL, N0, N1, Size, Signed: true);
4845 } else {
4846 return SDValue();
4847 }
4848
4849 // We need to use sext even for MUL_U24, because MUL_U24 is used
4850 // for signed multiply of 8 and 16-bit types.
4851 return DAG.getSExtOrTrunc(Op: Mul, DL, VT);
4852}
4853
4854SDValue
4855AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4856 DAGCombinerInfo &DCI) const {
4857 if (N->getValueType(ResNo: 0) != MVT::i32)
4858 return SDValue();
4859
4860 SelectionDAG &DAG = DCI.DAG;
4861 SDLoc DL(N);
4862
4863 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4864 SDValue N0 = N->getOperand(Num: 0);
4865 SDValue N1 = N->getOperand(Num: 1);
4866
4867 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4868 // in the source into any_extends if the result of the mul is truncated. Since
4869 // we can assume the high bits are whatever we want, use the underlying value
4870 // to avoid the unknown high bits from interfering.
4871 if (N0.getOpcode() == ISD::ANY_EXTEND)
4872 N0 = N0.getOperand(i: 0);
4873 if (N1.getOpcode() == ISD::ANY_EXTEND)
4874 N1 = N1.getOperand(i: 0);
4875
4876 // Try to use two fast 24-bit multiplies (one for each half of the result)
4877 // instead of one slow extending multiply.
4878 unsigned LoOpcode = 0;
4879 unsigned HiOpcode = 0;
4880 if (Signed) {
4881 if (Subtarget->hasMulI24() && isI24(Op: N0, DAG) && isI24(Op: N1, DAG)) {
4882 N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4883 N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4884 LoOpcode = AMDGPUISD::MUL_I24;
4885 HiOpcode = AMDGPUISD::MULHI_I24;
4886 }
4887 } else {
4888 if (Subtarget->hasMulU24() && isU24(Op: N0, DAG) && isU24(Op: N1, DAG)) {
4889 N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4890 N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4891 LoOpcode = AMDGPUISD::MUL_U24;
4892 HiOpcode = AMDGPUISD::MULHI_U24;
4893 }
4894 }
4895 if (!LoOpcode)
4896 return SDValue();
4897
4898 SDValue Lo = DAG.getNode(Opcode: LoOpcode, DL, VT: MVT::i32, N1: N0, N2: N1);
4899 SDValue Hi = DAG.getNode(Opcode: HiOpcode, DL, VT: MVT::i32, N1: N0, N2: N1);
4900 DCI.CombineTo(N, Res0: Lo, Res1: Hi);
4901 return SDValue(N, 0);
4902}
4903
4904SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4905 DAGCombinerInfo &DCI) const {
4906 EVT VT = N->getValueType(ResNo: 0);
4907
4908 if (!Subtarget->hasMulI24() || VT.isVector())
4909 return SDValue();
4910
4911 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4912 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4913 // unnecessarily). isDivergent() is used as an approximation of whether the
4914 // value is in an SGPR.
4915 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4916 // valu op anyway)
4917 if (Subtarget->hasSMulHi() && !N->isDivergent())
4918 return SDValue();
4919
4920 SelectionDAG &DAG = DCI.DAG;
4921 SDLoc DL(N);
4922
4923 SDValue N0 = N->getOperand(Num: 0);
4924 SDValue N1 = N->getOperand(Num: 1);
4925
4926 if (!isI24(Op: N0, DAG) || !isI24(Op: N1, DAG))
4927 return SDValue();
4928
4929 N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4930 N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4931
4932 SDValue Mulhi = DAG.getNode(Opcode: AMDGPUISD::MULHI_I24, DL, VT: MVT::i32, N1: N0, N2: N1);
4933 DCI.AddToWorklist(N: Mulhi.getNode());
4934 return DAG.getSExtOrTrunc(Op: Mulhi, DL, VT);
4935}
4936
4937SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4938 DAGCombinerInfo &DCI) const {
4939 EVT VT = N->getValueType(ResNo: 0);
4940
4941 if (VT.isVector() || VT.getSizeInBits() > 32 || !Subtarget->hasMulU24())
4942 return SDValue();
4943
4944 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4945 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4946 // unnecessarily). isDivergent() is used as an approximation of whether the
4947 // value is in an SGPR.
4948 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4949 // valu op anyway)
4950 if (!N->isDivergent() && Subtarget->hasSMulHi())
4951 return SDValue();
4952
4953 SelectionDAG &DAG = DCI.DAG;
4954 SDLoc DL(N);
4955
4956 SDValue N0 = N->getOperand(Num: 0);
4957 SDValue N1 = N->getOperand(Num: 1);
4958
4959 if (!isU24(Op: N0, DAG) || !isU24(Op: N1, DAG))
4960 return SDValue();
4961
4962 N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4963 N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4964
4965 SDValue Mulhi = DAG.getNode(Opcode: AMDGPUISD::MULHI_U24, DL, VT: MVT::i32, N1: N0, N2: N1);
4966 DCI.AddToWorklist(N: Mulhi.getNode());
4967 return DAG.getZExtOrTrunc(Op: Mulhi, DL, VT);
4968}
4969
4970SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4971 SDValue Op,
4972 const SDLoc &DL,
4973 unsigned Opc) const {
4974 EVT VT = Op.getValueType();
4975 if (VT.bitsGT(VT: MVT::i32))
4976 return SDValue();
4977
4978 if (VT != MVT::i32)
4979 Op = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: Op);
4980
4981 SDValue FFBX = DAG.getNode(Opcode: Opc, DL, VT: MVT::i32, Operand: Op);
4982 if (VT != MVT::i32)
4983 FFBX = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: FFBX);
4984
4985 return FFBX;
4986}
4987
4988// The native instructions return -1 on 0 input. Optimize out a select that
4989// produces -1 on 0.
4990//
4991// TODO: If zero is not undef, we could also do this if the output is compared
4992// against the bitwidth.
4993//
4994// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4995SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4996 SDValue LHS, SDValue RHS,
4997 DAGCombinerInfo &DCI) const {
4998 if (!isNullConstant(V: Cond.getOperand(i: 1)))
4999 return SDValue();
5000
5001 SelectionDAG &DAG = DCI.DAG;
5002 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val: Cond.getOperand(i: 2))->get();
5003 SDValue CmpLHS = Cond.getOperand(i: 0);
5004
5005 // select (setcc x, 0, eq), -1, (ctlz_zero_poison x) -> ffbh_u32 x
5006 // select (setcc x, 0, eq), -1, (cttz_zero_poison x) -> ffbl_u32 x
5007 if (CCOpcode == ISD::SETEQ &&
5008 (isCtlzOpc(Opc: RHS.getOpcode()) || isCttzOpc(Opc: RHS.getOpcode())) &&
5009 RHS.getOperand(i: 0) == CmpLHS && isAllOnesConstant(V: LHS)) {
5010 unsigned Opc =
5011 isCttzOpc(Opc: RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
5012 return getFFBX_U32(DAG, Op: CmpLHS, DL: SL, Opc);
5013 }
5014
5015 // select (setcc x, 0, ne), (ctlz_zero_poison x), -1 -> ffbh_u32 x
5016 // select (setcc x, 0, ne), (cttz_zero_poison x), -1 -> ffbl_u32 x
5017 if (CCOpcode == ISD::SETNE &&
5018 (isCtlzOpc(Opc: LHS.getOpcode()) || isCttzOpc(Opc: LHS.getOpcode())) &&
5019 LHS.getOperand(i: 0) == CmpLHS && isAllOnesConstant(V: RHS)) {
5020 unsigned Opc =
5021 isCttzOpc(Opc: LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
5022
5023 return getFFBX_U32(DAG, Op: CmpLHS, DL: SL, Opc);
5024 }
5025
5026 return SDValue();
5027}
5028
5029static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
5030 unsigned Op,
5031 const SDLoc &SL,
5032 SDValue Cond,
5033 SDValue N1,
5034 SDValue N2) {
5035 SelectionDAG &DAG = DCI.DAG;
5036 EVT VT = N1.getValueType();
5037
5038 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Cond,
5039 N2: N1.getOperand(i: 0), N3: N2.getOperand(i: 0));
5040 DCI.AddToWorklist(N: NewSelect.getNode());
5041 return DAG.getNode(Opcode: Op, DL: SL, VT, Operand: NewSelect);
5042}
5043
5044// Pull a free FP operation out of a select so it may fold into uses.
5045//
5046// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
5047// select c, (fneg x), k -> fneg (select c, x, (fneg k))
5048//
5049// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
5050// select c, (fabs x), +k -> fabs (select c, x, k)
5051SDValue
5052AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
5053 SDValue N) const {
5054 SelectionDAG &DAG = DCI.DAG;
5055 SDValue Cond = N.getOperand(i: 0);
5056 SDValue LHS = N.getOperand(i: 1);
5057 SDValue RHS = N.getOperand(i: 2);
5058
5059 EVT VT = N.getValueType();
5060 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
5061 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
5062 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N: N.getNode()))
5063 return SDValue();
5064
5065 return distributeOpThroughSelect(DCI, Op: LHS.getOpcode(),
5066 SL: SDLoc(N), Cond, N1: LHS, N2: RHS);
5067 }
5068
5069 bool Inv = false;
5070 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
5071 std::swap(a&: LHS, b&: RHS);
5072 Inv = true;
5073 }
5074
5075 // TODO: Support vector constants.
5076 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
5077 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
5078 !selectSupportsSourceMods(N: N.getNode())) {
5079 SDLoc SL(N);
5080 // If one side is an fneg/fabs and the other is a constant, we can push the
5081 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
5082 SDValue NewLHS = LHS.getOperand(i: 0);
5083 SDValue NewRHS = RHS;
5084
5085 // Careful: if the neg can be folded up, don't try to pull it back down.
5086 bool ShouldFoldNeg = true;
5087
5088 if (NewLHS.hasOneUse()) {
5089 unsigned Opc = NewLHS.getOpcode();
5090 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(N: NewLHS.getNode()))
5091 ShouldFoldNeg = false;
5092 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
5093 ShouldFoldNeg = false;
5094 }
5095
5096 if (ShouldFoldNeg) {
5097 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
5098 return SDValue();
5099
5100 // We're going to be forced to use a source modifier anyway, there's no
5101 // point to pulling the negate out unless we can get a size reduction by
5102 // negating the constant.
5103 //
5104 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
5105 // about cheaper constants.
5106 if (NewLHS.getOpcode() == ISD::FABS &&
5107 getConstantNegateCost(C: CRHS) != NegatibleCost::Cheaper)
5108 return SDValue();
5109
5110 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N: N.getNode()))
5111 return SDValue();
5112
5113 if (LHS.getOpcode() == ISD::FNEG)
5114 NewRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
5115
5116 if (Inv)
5117 std::swap(a&: NewLHS, b&: NewRHS);
5118
5119 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT,
5120 N1: Cond, N2: NewLHS, N3: NewRHS);
5121 DCI.AddToWorklist(N: NewSelect.getNode());
5122 return DAG.getNode(Opcode: LHS.getOpcode(), DL: SL, VT, Operand: NewSelect);
5123 }
5124 }
5125
5126 return SDValue();
5127}
5128
5129SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
5130 DAGCombinerInfo &DCI) const {
5131 if (SDValue Folded = foldFreeOpFromSelect(DCI, N: SDValue(N, 0)))
5132 return Folded;
5133
5134 SDValue Cond = N->getOperand(Num: 0);
5135 if (Cond.getOpcode() != ISD::SETCC)
5136 return SDValue();
5137
5138 EVT VT = N->getValueType(ResNo: 0);
5139 SDValue LHS = Cond.getOperand(i: 0);
5140 SDValue RHS = Cond.getOperand(i: 1);
5141 SDValue CC = Cond.getOperand(i: 2);
5142
5143 SDValue True = N->getOperand(Num: 1);
5144 SDValue False = N->getOperand(Num: 2);
5145
5146 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
5147 SelectionDAG &DAG = DCI.DAG;
5148 if (DAG.isConstantValueOfAnyType(N: True) &&
5149 !DAG.isConstantValueOfAnyType(N: False)) {
5150 // Swap cmp + select pair to move constant to false input.
5151 // This will allow using VOPC cndmasks more often.
5152 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
5153
5154 SDLoc SL(N);
5155 ISD::CondCode NewCC =
5156 getSetCCInverse(Operation: cast<CondCodeSDNode>(Val&: CC)->get(), Type: LHS.getValueType());
5157
5158 SDValue NewCond = DAG.getSetCC(DL: SL, VT: Cond.getValueType(), LHS, RHS, Cond: NewCC);
5159 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NewCond, N2: False, N3: True);
5160 }
5161
5162 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
5163 SDValue MinMax
5164 = combineFMinMaxLegacy(DL: SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
5165 // Revisit this node so we can catch min3/max3/med3 patterns.
5166 //DCI.AddToWorklist(MinMax.getNode());
5167 return MinMax;
5168 }
5169 }
5170
5171 // There's no reason to not do this if the condition has other uses.
5172 return performCtlz_CttzCombine(SL: SDLoc(N), Cond, LHS: True, RHS: False, DCI);
5173}
5174
5175static bool isInv2Pi(const APFloat &APF) {
5176 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
5177 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
5178 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
5179
5180 return APF.bitwiseIsEqual(RHS: KF16) ||
5181 APF.bitwiseIsEqual(RHS: KF32) ||
5182 APF.bitwiseIsEqual(RHS: KF64);
5183}
5184
5185// 0 and 1.0 / (0.5 * pi) do not have inline immmediates, so there is an
5186// additional cost to negate them.
5187TargetLowering::NegatibleCost
5188AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
5189 if (C->isZero())
5190 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5191
5192 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(APF: C->getValueAPF()))
5193 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5194
5195 return NegatibleCost::Neutral;
5196}
5197
5198bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
5199 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
5200 return getConstantNegateCost(C) == NegatibleCost::Expensive;
5201 return false;
5202}
5203
5204bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
5205 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
5206 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
5207 return false;
5208}
5209
5210static unsigned inverseMinMax(unsigned Opc) {
5211 switch (Opc) {
5212 case ISD::FMAXNUM:
5213 return ISD::FMINNUM;
5214 case ISD::FMINNUM:
5215 return ISD::FMAXNUM;
5216 case ISD::FMAXNUM_IEEE:
5217 return ISD::FMINNUM_IEEE;
5218 case ISD::FMINNUM_IEEE:
5219 return ISD::FMAXNUM_IEEE;
5220 case ISD::FMAXIMUM:
5221 return ISD::FMINIMUM;
5222 case ISD::FMINIMUM:
5223 return ISD::FMAXIMUM;
5224 case ISD::FMAXIMUMNUM:
5225 return ISD::FMINIMUMNUM;
5226 case ISD::FMINIMUMNUM:
5227 return ISD::FMAXIMUMNUM;
5228 case AMDGPUISD::FMAX_LEGACY:
5229 return AMDGPUISD::FMIN_LEGACY;
5230 case AMDGPUISD::FMIN_LEGACY:
5231 return AMDGPUISD::FMAX_LEGACY;
5232 default:
5233 llvm_unreachable("invalid min/max opcode");
5234 }
5235}
5236
5237/// \return true if it's profitable to try to push an fneg into its source
5238/// instruction.
5239bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
5240 // If the input has multiple uses and we can either fold the negate down, or
5241 // the other uses cannot, give up. This both prevents unprofitable
5242 // transformations and infinite loops: we won't repeatedly try to fold around
5243 // a negate that has no 'good' form.
5244 if (N0.hasOneUse()) {
5245 // This may be able to fold into the source, but at a code size cost. Don't
5246 // fold if the fold into the user is free.
5247 if (allUsesHaveSourceMods(N, CostThreshold: 0))
5248 return false;
5249 } else {
5250 if (fnegFoldsIntoOp(N: N0.getNode()) &&
5251 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N: N0.getNode())))
5252 return false;
5253 }
5254
5255 return true;
5256}
5257
/// Target combine for ISD::FNEG: try to push the negate \p N into the
/// instruction that produces its operand.
///
/// Bails out first if shouldFoldFNegIntoSrc() rejects the fold, then
/// dispatches on the operand's opcode. The cases that rebuild the source
/// operation give up if the rebuilt node folded to a different opcode, and,
/// when the original operand has other users, redirect those users to an fneg
/// of the rebuilt node.
///
/// \returns the replacement value for \p N, or an empty SDValue if no fold
/// was performed.
SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(Num: 0);
  EVT VT = N->getValueType(ResNo: 0);

  unsigned Opc = N0.getOpcode();

  if (!shouldFoldFNegIntoSrc(N, N0))
    return SDValue();

  SDLoc SL(N);
  switch (Opc) {
  case ISD::FADD: {
    // Only distribute the negate when signed zeros may be ignored, either for
    // the add itself or per the flags on the fneg.
    if (!mayIgnoreSignedZero(Op: N0) && !N->getFlags().hasNoSignedZeros())
      return SDValue();

    // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
    SDValue LHS = N0.getOperand(i: 0);
    SDValue RHS = N0.getOperand(i: 1);

    // An operand that is already an fneg is stripped instead of being negated
    // a second time.
    if (LHS.getOpcode() != ISD::FNEG)
      LHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS);
    else
      LHS = LHS.getOperand(i: 0);

    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
    else
      RHS = RHS.getOperand(i: 0);

    SDValue Res = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: LHS, N2: RHS, Flags: N0->getFlags());
    if (Res.getOpcode() != ISD::FADD)
      return SDValue(); // Op got folded away.
    // Other users of the original add now read it as (fneg Res).
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
    return Res;
  }
  case ISD::FMUL:
  case AMDGPUISD::FMUL_LEGACY: {
    // (fneg (fmul x, y)) -> (fmul x, (fneg y))
    // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
    SDValue LHS = N0.getOperand(i: 0);
    SDValue RHS = N0.getOperand(i: 1);

    // Only one factor changes sign: strip an existing fneg from either
    // operand if there is one, otherwise negate the RHS.
    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(i: 0);
    else if (RHS.getOpcode() == ISD::FNEG)
      RHS = RHS.getOperand(i: 0);
    else
      RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);

    SDValue Res = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: LHS, N2: RHS, Flags: N0->getFlags());
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    // Other users of the original multiply now read it as (fneg Res).
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
    return Res;
  }
  case ISD::FMA:
  case ISD::FMAD: {
    // TODO: handle llvm.amdgcn.fma.legacy
    // Same signed-zero requirement as the FADD case.
    if (!mayIgnoreSignedZero(Op: N0) && !N->getFlags().hasNoSignedZeros())
      return SDValue();

    // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
    SDValue LHS = N0.getOperand(i: 0);
    SDValue MHS = N0.getOperand(i: 1);
    SDValue RHS = N0.getOperand(i: 2);

    // Flip the sign of exactly one of the two multiplicands, preferring to
    // strip an existing fneg.
    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(i: 0);
    else if (MHS.getOpcode() == ISD::FNEG)
      MHS = MHS.getOperand(i: 0);
    else
      MHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: MHS);

    // The addend is always negated (or has its existing fneg stripped).
    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
    else
      RHS = RHS.getOperand(i: 0);

    // Note: unlike the other cases, the flags of N0 are not passed to the
    // rebuilt node here.
    SDValue Res = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: LHS, N2: MHS, N3: RHS);
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
    return Res;
  }
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMIN_LEGACY: {
    // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
    // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
    // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
    // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)

    SDValue LHS = N0.getOperand(i: 0);
    SDValue RHS = N0.getOperand(i: 1);

    // 0 doesn't have a negated inline immediate.
    // TODO: This constant check should be generalized to other operations.
    if (isConstantCostlierToNegate(N: RHS))
      return SDValue();

    SDValue NegLHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS);
    SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
    unsigned Opposite = inverseMinMax(Opc);

    SDValue Res = DAG.getNode(Opcode: Opposite, DL: SL, VT, N1: NegLHS, N2: NegRHS, Flags: N0->getFlags());
    if (Res.getOpcode() != Opposite)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
    return Res;
  }
  case AMDGPUISD::FMED3: {
    // med3 sorts a NaN input as smaller than everything regardless of its sign,
    // so negating all operands does not sign-flip the median when an input may
    // be NaN.
    if (!N0->getFlags().hasNoNaNs())
      return SDValue();

    // fneg (fmed3 x, y, z) -> fmed3 (fneg x), (fneg y), (fneg z)
    SDValue Ops[3];
    for (unsigned I = 0; I < 3; ++I)
      Ops[I] = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: N0->getOperand(Num: I), Flags: N0->getFlags());

    SDValue Res = DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT, Ops, Flags: N0->getFlags());
    if (Res.getOpcode() != AMDGPUISD::FMED3)
      return SDValue(); // Op got folded away.

    if (!N0.hasOneUse()) {
      SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res);
      DAG.ReplaceAllUsesWith(From: N0, To: Neg);

      // Queue the users that were just redirected to the new fneg so they are
      // revisited by the combiner.
      for (SDNode *U : Neg->users())
        DCI.AddToWorklist(N: U);
    }

    return Res;
  }
  case ISD::FP_EXTEND:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT: // XXX - Should fround be handled?
  case ISD::FROUNDEVEN:
  case ISD::FSIN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW: {
    // Single-operand operations through which the negate is moved unchanged.
    SDValue CvtSrc = N0.getOperand(i: 0);
    if (CvtSrc.getOpcode() == ISD::FNEG) {
      // (fneg (fp_extend (fneg x))) -> (fp_extend x)
      // (fneg (rcp (fneg x))) -> (rcp x)
      return DAG.getNode(Opcode: Opc, DL: SL, VT, Operand: CvtSrc.getOperand(i: 0));
    }

    // Pushing the negate below the operation is only done when the operation
    // has no other users.
    if (!N0.hasOneUse())
      return SDValue();

    // (fneg (fp_extend x)) -> (fp_extend (fneg x))
    // (fneg (rcp x)) -> (rcp (fneg x))
    SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: CvtSrc.getValueType(), Operand: CvtSrc);
    return DAG.getNode(Opcode: Opc, DL: SL, VT, Operand: Neg, Flags: N0->getFlags());
  }
  case ISD::FP_ROUND: {
    // Same as the single-operand case above, but the second operand of the
    // fp_round is carried over to the rebuilt node.
    SDValue CvtSrc = N0.getOperand(i: 0);

    if (CvtSrc.getOpcode() == ISD::FNEG) {
      // (fneg (fp_round (fneg x))) -> (fp_round x)
      return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT,
                         N1: CvtSrc.getOperand(i: 0), N2: N0.getOperand(i: 1));
    }

    if (!N0.hasOneUse())
      return SDValue();

    // (fneg (fp_round x)) -> (fp_round (fneg x))
    SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: CvtSrc.getValueType(), Operand: CvtSrc);
    return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Neg, N2: N0.getOperand(i: 1));
  }
  case ISD::FP16_TO_FP: {
    // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
    // f16, but legalization of f16 fneg ends up pulling it out of the source.
    // Put the fneg back as a legal source operation that can be matched later.
    SDLoc SL(N);

    SDValue Src = N0.getOperand(i: 0);
    EVT SrcVT = Src.getValueType();

    // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
    // 0x8000 is the sign bit of the 16-bit half value held in the integer
    // source.
    SDValue IntFNeg = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: SrcVT, N1: Src,
                                  N2: DAG.getConstant(Val: 0x8000, DL: SL, VT: SrcVT));
    return DAG.getNode(Opcode: ISD::FP16_TO_FP, DL: SL, VT: N->getValueType(ResNo: 0), Operand: IntFNeg);
  }
  case ISD::SELECT: {
    // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
    // TODO: Invert conditions of foldFreeOpFromSelect
    // Not implemented yet: no fold is performed for a select source.
    return SDValue();
  }
  case ISD::BITCAST: {
    SDLoc SL(N);
    SDValue BCSrc = N0.getOperand(i: 0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      // The last build_vector element holds the high bits, including the sign
      // bit of the f64. Only handle an f64 result whose high element is 32
      // bits wide and is produced by an operation the negate can fold into.
      SDValue HighBits = BCSrc.getOperand(i: BCSrc.getNumOperands() - 1);
      if (VT != MVT::f64 || HighBits.getValueType().getSizeInBits() != 32 ||
          !fnegFoldsIntoOp(N: HighBits.getNode()))
        return SDValue();

      // f64 fneg only really needs to operate on the high half of the
      // register, so try to force it to an f32 operation to help make use of
      // source modifiers.
      //
      //
      // fneg (f64 (bitcast (build_vector x, y))) ->
      // f64 (bitcast (build_vector (bitcast i32:x to f32),
      // (fneg (bitcast i32:y to f32)))

      SDValue CastHi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: HighBits);
      SDValue NegHi = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: CastHi);
      SDValue CastBack =
          DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: HighBits.getValueType(), Operand: NegHi);

      // Rebuild the vector with only the high element replaced.
      SmallVector<SDValue, 8> Ops(BCSrc->ops());
      Ops.back() = CastBack;
      // Queue the new f32 fneg so the combiner gets a chance to fold it into
      // its source.
      DCI.AddToWorklist(N: NegHi.getNode());
      SDValue Build =
          DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: BCSrc.getValueType(), Ops);
      SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Build);

      // Other users of the original bitcast now read it as (fneg Result).
      if (!N0.hasOneUse())
        DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Result));
      return Result;
    }

    if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
        BCSrc.hasOneUse()) {
      // fneg (f32 (bitcast (select cond, i32:lhs, i32:rhs))) ->
      // select cond, (fneg (bitcast i32:lhs to f32)),
      //              (fneg (bitcast i32:rhs to f32))

      // TODO: Cast back result for multiple uses is beneficial in some cases.

      SDValue LHS =
          DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: BCSrc.getOperand(i: 1));
      SDValue RHS =
          DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: BCSrc.getOperand(i: 2));

      SDValue NegLHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: LHS);
      SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: RHS);

      return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f32, N1: BCSrc.getOperand(i: 0), N2: NegLHS,
                         N3: NegRHS);
    }

    return SDValue();
  }
  default:
    return SDValue();
  }
}
5527
5528SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5529 DAGCombinerInfo &DCI) const {
5530 SelectionDAG &DAG = DCI.DAG;
5531 SDValue N0 = N->getOperand(Num: 0);
5532
5533 if (!N0.hasOneUse())
5534 return SDValue();
5535
5536 switch (N0.getOpcode()) {
5537 case ISD::FP16_TO_FP: {
5538 assert(!isTypeLegal(MVT::f16) && "should only see if f16 is illegal");
5539 SDLoc SL(N);
5540 SDValue Src = N0.getOperand(i: 0);
5541 EVT SrcVT = Src.getValueType();
5542
5543 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5544 SDValue IntFAbs = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SrcVT, N1: Src,
5545 N2: DAG.getConstant(Val: 0x7fff, DL: SL, VT: SrcVT));
5546 return DAG.getNode(Opcode: ISD::FP16_TO_FP, DL: SL, VT: N->getValueType(ResNo: 0), Operand: IntFAbs);
5547 }
5548 default:
5549 return SDValue();
5550 }
5551}
5552
5553SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5554 DAGCombinerInfo &DCI) const {
5555 const auto *CFP = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: 0));
5556 if (!CFP)
5557 return SDValue();
5558
5559 // XXX - Should this flush denormals?
5560 const APFloat &Val = CFP->getValueAPF();
5561 APFloat One(Val.getSemantics(), "1.0");
5562 return DCI.DAG.getConstantFP(Val: One / Val, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
5563}
5564
5565bool AMDGPUTargetLowering::isInt64ImmLegal(SDNode *N, SelectionDAG &DAG) const {
5566 if (!Subtarget->isGCN())
5567 return false;
5568
5569 ConstantSDNode *SDConstant = dyn_cast<ConstantSDNode>(Val: N);
5570 ConstantFPSDNode *SDFPConstant = dyn_cast<ConstantFPSDNode>(Val: N);
5571 auto &ST = DAG.getSubtarget<GCNSubtarget>();
5572 const auto *TII = ST.getInstrInfo();
5573
5574 if (!ST.hasVMovB64Inst() || (!SDConstant && !SDFPConstant))
5575 return false;
5576
5577 if (ST.has64BitLiterals())
5578 return true;
5579
5580 if (SDConstant) {
5581 const APInt &APVal = SDConstant->getAPIntValue();
5582 return isUInt<32>(x: APVal.getZExtValue()) || TII->isInlineConstant(Imm: APVal);
5583 }
5584
5585 APInt Val = SDFPConstant->getValueAPF().bitcastToAPInt();
5586 return isUInt<32>(x: Val.getZExtValue()) || TII->isInlineConstant(Imm: Val);
5587}
5588
5589SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5590 DAGCombinerInfo &DCI) const {
5591 SelectionDAG &DAG = DCI.DAG;
5592 SDLoc DL(N);
5593
5594 switch(N->getOpcode()) {
5595 default:
5596 break;
5597 case ISD::BITCAST: {
5598 EVT DestVT = N->getValueType(ResNo: 0);
5599
5600 // Push casts through vector builds. This helps avoid emitting a large
5601 // number of copies when materializing floating point vector constants.
5602 //
5603 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5604 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5605 if (DestVT.isVector()) {
5606 SDValue Src = N->getOperand(Num: 0);
5607 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5608 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5609 isOperationLegal(Op: ISD::BUILD_VECTOR, VT: DestVT))) {
5610 EVT SrcVT = Src.getValueType();
5611 unsigned NElts = DestVT.getVectorNumElements();
5612
5613 if (SrcVT.getVectorNumElements() == NElts) {
5614 EVT DestEltVT = DestVT.getVectorElementType();
5615
5616 SmallVector<SDValue, 8> CastedElts;
5617 SDLoc SL(N);
5618 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5619 SDValue Elt = Src.getOperand(i: I);
5620 CastedElts.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: DestEltVT, Operand: Elt));
5621 }
5622
5623 return DAG.getBuildVector(VT: DestVT, DL: SL, Ops: CastedElts);
5624 }
5625 }
5626 }
5627
5628 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5629 break;
5630
5631 // Fold bitcasts of constants.
5632 //
5633 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5634 // TODO: Generalize and move to DAGCombiner
5635 SDValue Src = N->getOperand(Num: 0);
5636 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Src)) {
5637 SDLoc SL(N);
5638 if (isInt64ImmLegal(N: C, DAG))
5639 break;
5640 uint64_t CVal = C->getZExtValue();
5641 SDValue BV = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32,
5642 N1: DAG.getConstant(Val: Lo_32(Value: CVal), DL: SL, VT: MVT::i32),
5643 N2: DAG.getConstant(Val: Hi_32(Value: CVal), DL: SL, VT: MVT::i32));
5644 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: DestVT, Operand: BV);
5645 }
5646
5647 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Src)) {
5648 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5649 SDLoc SL(N);
5650 if (isInt64ImmLegal(N: C, DAG))
5651 break;
5652 uint64_t CVal = Val.getZExtValue();
5653 SDValue Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32,
5654 N1: DAG.getConstant(Val: Lo_32(Value: CVal), DL: SL, VT: MVT::i32),
5655 N2: DAG.getConstant(Val: Hi_32(Value: CVal), DL: SL, VT: MVT::i32));
5656
5657 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: DestVT, Operand: Vec);
5658 }
5659
5660 break;
5661 }
5662 case ISD::SHL:
5663 case ISD::SRA:
5664 case ISD::SRL: {
5665 // Range metadata can be invalidated when loads are converted to legal types
5666 // (e.g. v2i64 -> v4i32).
5667 // Try to convert vector shl/sra/srl before type legalization so that range
5668 // metadata can be utilized.
5669 if (!(N->getValueType(ResNo: 0).isVector() &&
5670 DCI.getDAGCombineLevel() == BeforeLegalizeTypes) &&
5671 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5672 break;
5673 if (N->getOpcode() == ISD::SHL)
5674 return performShlCombine(N, DCI);
5675 if (N->getOpcode() == ISD::SRA)
5676 return performSraCombine(N, DCI);
5677 return performSrlCombine(N, DCI);
5678 }
5679 case ISD::TRUNCATE:
5680 return performTruncateCombine(N, DCI);
5681 case ISD::MUL:
5682 return performMulCombine(N, DCI);
5683 case AMDGPUISD::MUL_U24:
5684 case AMDGPUISD::MUL_I24: {
5685 if (SDValue Simplified = simplifyMul24(Node24: N, DCI))
5686 return Simplified;
5687 break;
5688 }
5689 case AMDGPUISD::MULHI_I24:
5690 case AMDGPUISD::MULHI_U24:
5691 return simplifyMul24(Node24: N, DCI);
5692 case ISD::SMUL_LOHI:
5693 case ISD::UMUL_LOHI:
5694 return performMulLoHiCombine(N, DCI);
5695 case ISD::MULHS:
5696 return performMulhsCombine(N, DCI);
5697 case ISD::MULHU:
5698 return performMulhuCombine(N, DCI);
5699 case ISD::SELECT:
5700 return performSelectCombine(N, DCI);
5701 case ISD::FNEG:
5702 return performFNegCombine(N, DCI);
5703 case ISD::FABS:
5704 return performFAbsCombine(N, DCI);
5705 case AMDGPUISD::BFE_I32:
5706 case AMDGPUISD::BFE_U32: {
5707 assert(!N->getValueType(0).isVector() &&
5708 "Vector handling of BFE not implemented");
5709 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 2));
5710 if (!Width)
5711 break;
5712
5713 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5714 if (WidthVal == 0)
5715 return DAG.getConstant(Val: 0, DL, VT: MVT::i32);
5716
5717 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
5718 if (!Offset)
5719 break;
5720
5721 SDValue BitsFrom = N->getOperand(Num: 0);
5722 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5723
5724 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5725
5726 if (OffsetVal == 0) {
5727 // This is already sign / zero extended, so try to fold away extra BFEs.
5728 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5729
5730 unsigned OpSignBits = DAG.ComputeNumSignBits(Op: BitsFrom);
5731 if (OpSignBits >= SignBits)
5732 return BitsFrom;
5733
5734 EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WidthVal);
5735 if (Signed) {
5736 // This is a sign_extend_inreg. Replace it to take advantage of existing
5737 // DAG Combines. If not eliminated, we will match back to BFE during
5738 // selection.
5739
5740 // TODO: The sext_inreg of extended types ends, although we can could
5741 // handle them in a single BFE.
5742 return DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: MVT::i32, N1: BitsFrom,
5743 N2: DAG.getValueType(SmallVT));
5744 }
5745
5746 return DAG.getZeroExtendInReg(Op: BitsFrom, DL, VT: SmallVT);
5747 }
5748
5749 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(Val&: BitsFrom)) {
5750 if (Signed) {
5751 return constantFoldBFE<int32_t>(DAG,
5752 Src0: CVal->getSExtValue(),
5753 Offset: OffsetVal,
5754 Width: WidthVal,
5755 DL);
5756 }
5757
5758 return constantFoldBFE<uint32_t>(DAG,
5759 Src0: CVal->getZExtValue(),
5760 Offset: OffsetVal,
5761 Width: WidthVal,
5762 DL);
5763 }
5764
5765 if ((OffsetVal + WidthVal) >= 32 &&
5766 !(OffsetVal == 16 && WidthVal == 16 && Subtarget->hasSDWA())) {
5767 SDValue ShiftVal = DAG.getConstant(Val: OffsetVal, DL, VT: MVT::i32);
5768 return DAG.getNode(Opcode: Signed ? ISD::SRA : ISD::SRL, DL, VT: MVT::i32,
5769 N1: BitsFrom, N2: ShiftVal);
5770 }
5771
5772 if (BitsFrom.hasOneUse()) {
5773 APInt Demanded = APInt::getBitsSet(numBits: 32,
5774 loBit: OffsetVal,
5775 hiBit: OffsetVal + WidthVal);
5776
5777 KnownBits Known;
5778 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5779 !DCI.isBeforeLegalizeOps());
5780 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5781 if (TLI.ShrinkDemandedConstant(Op: BitsFrom, DemandedBits: Demanded, TLO) ||
5782 TLI.SimplifyDemandedBits(Op: BitsFrom, DemandedBits: Demanded, Known, TLO)) {
5783 DCI.CommitTargetLoweringOpt(TLO);
5784 }
5785 }
5786
5787 break;
5788 }
5789 case ISD::LOAD:
5790 return performLoadCombine(N, DCI);
5791 case ISD::STORE:
5792 return performStoreCombine(N, DCI);
5793 case AMDGPUISD::RCP:
5794 case AMDGPUISD::RCP_IFLAG:
5795 return performRcpCombine(N, DCI);
5796 case ISD::AssertZext:
5797 case ISD::AssertSext:
5798 return performAssertSZExtCombine(N, DCI);
5799 case ISD::INTRINSIC_WO_CHAIN:
5800 return performIntrinsicWOChainCombine(N, DCI);
5801 case AMDGPUISD::FMAD_FTZ: {
5802 SDValue N0 = N->getOperand(Num: 0);
5803 SDValue N1 = N->getOperand(Num: 1);
5804 SDValue N2 = N->getOperand(Num: 2);
5805 EVT VT = N->getValueType(ResNo: 0);
5806
5807 // FMAD_FTZ is a FMAD + flush denormals to zero.
5808 // We flush the inputs, the intermediate step, and the output.
5809 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(Val&: N0);
5810 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(Val&: N1);
5811 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(Val&: N2);
5812 if (N0CFP && N1CFP && N2CFP) {
5813 const auto FTZ = [](const APFloat &V) {
5814 if (V.isDenormal()) {
5815 APFloat Zero(V.getSemantics(), 0);
5816 return V.isNegative() ? -Zero : Zero;
5817 }
5818 return V;
5819 };
5820
5821 APFloat V0 = FTZ(N0CFP->getValueAPF());
5822 APFloat V1 = FTZ(N1CFP->getValueAPF());
5823 APFloat V2 = FTZ(N2CFP->getValueAPF());
5824 V0.multiply(RHS: V1, RM: APFloat::rmNearestTiesToEven);
5825 V0 = FTZ(V0);
5826 V0.add(RHS: V2, RM: APFloat::rmNearestTiesToEven);
5827 return DAG.getConstantFP(Val: FTZ(V0), DL, VT);
5828 }
5829 break;
5830 }
5831 }
5832 return SDValue();
5833}
5834
5835bool AMDGPUTargetLowering::SimplifyDemandedBitsForTargetNode(
5836 SDValue Op, const APInt &OriginalDemandedBits,
5837 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
5838 unsigned Depth) const {
5839 switch (Op.getOpcode()) {
5840 case ISD::INTRINSIC_WO_CHAIN: {
5841 switch (Op.getConstantOperandVal(i: 0)) {
5842 case Intrinsic::amdgcn_readfirstlane:
5843 case Intrinsic::amdgcn_readlane:
5844 case Intrinsic::amdgcn_set_inactive:
5845 case Intrinsic::amdgcn_wwm: {
5846 if (SimplifyDemandedBits(Op: Op.getOperand(i: 1), DemandedBits: OriginalDemandedBits,
5847 DemandedElts: OriginalDemandedElts, Known, TLO, Depth: Depth + 1))
5848 return true;
5849 break;
5850 }
5851 default:
5852 break;
5853 }
5854 break;
5855 }
5856 default:
5857 break;
5858 }
5859
5860 return false;
5861}
5862
5863//===----------------------------------------------------------------------===//
5864// Helper functions
5865//===----------------------------------------------------------------------===//
5866
5867SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5868 const TargetRegisterClass *RC,
5869 Register Reg, EVT VT,
5870 const SDLoc &SL,
5871 bool RawReg) const {
5872 MachineFunction &MF = DAG.getMachineFunction();
5873 MachineRegisterInfo &MRI = MF.getRegInfo();
5874 Register VReg;
5875
5876 if (!MRI.isLiveIn(Reg)) {
5877 VReg = MRI.createVirtualRegister(RegClass: RC);
5878 MRI.addLiveIn(Reg, vreg: VReg);
5879 } else {
5880 VReg = MRI.getLiveInVirtReg(PReg: Reg);
5881 }
5882
5883 if (RawReg)
5884 return DAG.getRegister(Reg: VReg, VT);
5885
5886 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: VReg, VT);
5887}
5888
5889// This may be called multiple times, and nothing prevents creating multiple
5890// objects at the same offset. See if we already defined this object.
5891static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5892 int64_t Offset) {
5893 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5894 if (MFI.getObjectOffset(ObjectIdx: I) == Offset) {
5895 assert(MFI.getObjectSize(I) == Size);
5896 return I;
5897 }
5898 }
5899
5900 return MFI.CreateFixedObject(Size, SPOffset: Offset, IsImmutable: true);
5901}
5902
5903SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5904 EVT VT,
5905 const SDLoc &SL,
5906 int64_t Offset) const {
5907 MachineFunction &MF = DAG.getMachineFunction();
5908 MachineFrameInfo &MFI = MF.getFrameInfo();
5909 int FI = getOrCreateFixedStackObject(MFI, Size: VT.getStoreSize(), Offset);
5910
5911 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5912 SDValue Ptr = DAG.getFrameIndex(FI, VT: MVT::i32);
5913
5914 return DAG.getLoad(VT, dl: SL, Chain: DAG.getEntryNode(), Ptr, PtrInfo: SrcPtrInfo, Alignment: Align(4),
5915 MMOFlags: MachineMemOperand::MODereferenceable |
5916 MachineMemOperand::MOInvariant);
5917}
5918
5919SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5920 const SDLoc &SL,
5921 SDValue Chain,
5922 SDValue ArgVal,
5923 int64_t Offset) const {
5924 MachineFunction &MF = DAG.getMachineFunction();
5925 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5926 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5927
5928 SDValue Ptr = DAG.getConstant(Val: Offset, DL: SL, VT: MVT::i32);
5929 // Stores to the argument stack area are relative to the stack pointer.
5930 SDValue SP =
5931 DAG.getCopyFromReg(Chain, dl: SL, Reg: Info->getStackPtrOffsetReg(), VT: MVT::i32);
5932 Ptr = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: SP, N2: Ptr);
5933 SDValue Store = DAG.getStore(Chain, dl: SL, Val: ArgVal, Ptr, PtrInfo: DstInfo, Alignment: Align(4),
5934 MMOFlags: MachineMemOperand::MODereferenceable);
5935 return Store;
5936}
5937
5938SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5939 const TargetRegisterClass *RC,
5940 EVT VT, const SDLoc &SL,
5941 const ArgDescriptor &Arg) const {
5942 assert(Arg && "Attempting to load missing argument");
5943
5944 SDValue V = Arg.isRegister() ?
5945 CreateLiveInRegister(DAG, RC, Reg: Arg.getRegister(), VT, SL) :
5946 loadStackInputValue(DAG, VT, SL, Offset: Arg.getStackOffset());
5947
5948 if (!Arg.isMasked())
5949 return V;
5950
5951 unsigned Mask = Arg.getMask();
5952 unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);
5953 V = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: V,
5954 N2: DAG.getShiftAmountConstant(Val: Shift, VT, DL: SL));
5955 return DAG.getNode(Opcode: ISD::AND, DL: SL, VT, N1: V,
5956 N2: DAG.getConstant(Val: Mask >> Shift, DL: SL, VT));
5957}
5958
5959uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5960 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5961 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5962 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5963 uint64_t ArgOffset =
5964 alignTo(Size: ExplicitKernArgSize, A: Alignment) + ExplicitArgOffset;
5965 switch (Param) {
5966 case FIRST_IMPLICIT:
5967 return ArgOffset;
5968 case PRIVATE_BASE:
5969 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5970 case SHARED_BASE:
5971 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5972 case QUEUE_PTR:
5973 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5974 }
5975 llvm_unreachable("unexpected implicit parameter type");
5976}
5977
5978uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5979 const MachineFunction &MF, const ImplicitParameter Param) const {
5980 const AMDGPUMachineFunctionInfo *MFI =
5981 MF.getInfo<AMDGPUMachineFunctionInfo>();
5982 return getImplicitParameterOffset(ExplicitKernArgSize: MFI->getExplicitKernArgSize(), Param);
5983}
5984
5985SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5986 SelectionDAG &DAG, int Enabled,
5987 int &RefinementSteps,
5988 bool &UseOneConstNR,
5989 bool Reciprocal) const {
5990 EVT VT = Operand.getValueType();
5991
5992 if (VT == MVT::f32) {
5993 RefinementSteps = 0;
5994 return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc(Operand), VT, Operand);
5995 }
5996
5997 // TODO: There is also f64 rsq instruction, but the documentation is less
5998 // clear on its precision.
5999
6000 return SDValue();
6001}
6002
6003SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
6004 SelectionDAG &DAG, int Enabled,
6005 int &RefinementSteps) const {
6006 EVT VT = Operand.getValueType();
6007
6008 if (VT == MVT::f32) {
6009 // Reciprocal, < 1 ulp error.
6010 //
6011 // This reciprocal approximation converges to < 0.5 ulp error with one
6012 // newton rhapson performed with two fused multiple adds (FMAs).
6013
6014 RefinementSteps = 0;
6015 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SDLoc(Operand), VT, Operand);
6016 }
6017
6018 // TODO: There is also f64 rcp instruction, but the documentation is less
6019 // clear on its precision.
6020
6021 return SDValue();
6022}
6023
6024static unsigned workitemIntrinsicDim(unsigned ID) {
6025 switch (ID) {
6026 case Intrinsic::amdgcn_workitem_id_x:
6027 return 0;
6028 case Intrinsic::amdgcn_workitem_id_y:
6029 return 1;
6030 case Intrinsic::amdgcn_workitem_id_z:
6031 return 2;
6032 default:
6033 llvm_unreachable("not a workitem intrinsic");
6034 }
6035}
6036
6037void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
6038 const SDValue Op, KnownBits &Known,
6039 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
6040
6041 Known.resetAll(); // Don't know anything.
6042
6043 unsigned Opc = Op.getOpcode();
6044
6045 switch (Opc) {
6046 default:
6047 break;
6048 case AMDGPUISD::CARRY:
6049 case AMDGPUISD::BORROW: {
6050 Known.Zero = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 31);
6051 break;
6052 }
6053
6054 case AMDGPUISD::BFE_I32:
6055 case AMDGPUISD::BFE_U32: {
6056 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
6057 if (!CWidth)
6058 return;
6059
6060 uint32_t Width = CWidth->getZExtValue() & 0x1f;
6061
6062 if (Opc == AMDGPUISD::BFE_U32)
6063 Known.Zero = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 32 - Width);
6064
6065 break;
6066 }
6067 case AMDGPUISD::FP_TO_FP16: {
6068 unsigned BitWidth = Known.getBitWidth();
6069
6070 // High bits are zero.
6071 Known.Zero = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - 16);
6072 break;
6073 }
6074 case AMDGPUISD::MUL_U24:
6075 case AMDGPUISD::MUL_I24: {
6076 KnownBits LHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
6077 KnownBits RHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
6078 unsigned BitWidth = Op.getScalarValueSizeInBits();
6079
6080 // Sign/Zero extend from 24 bits.
6081 if (Opc == AMDGPUISD::MUL_I24) {
6082 LHSKnown = LHSKnown.trunc(BitWidth: 24).sext(BitWidth);
6083 RHSKnown = RHSKnown.trunc(BitWidth: 24).sext(BitWidth);
6084 } else {
6085 LHSKnown = LHSKnown.trunc(BitWidth: 24).zext(BitWidth);
6086 RHSKnown = RHSKnown.trunc(BitWidth: 24).zext(BitWidth);
6087 }
6088
6089 // TODO: SelfMultiply can be poison, but not undef.
6090 bool SelfMultiply = Op.getOperand(i: 0) == Op.getOperand(i: 1);
6091 if (SelfMultiply)
6092 SelfMultiply &= DAG.isGuaranteedNotToBeUndefOrPoison(
6093 Op: Op.getOperand(i: 0), DemandedElts, Kind: UndefPoisonKind::UndefOrPoison,
6094 Depth: Depth + 1);
6095
6096 Known = KnownBits::mul(LHS: LHSKnown, RHS: RHSKnown, NoUndefSelfMultiply: SelfMultiply);
6097 break;
6098 }
6099 case AMDGPUISD::PERM: {
6100 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
6101 if (!CMask)
6102 return;
6103
6104 KnownBits LHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
6105 KnownBits RHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
6106 unsigned Sel = CMask->getZExtValue();
6107
6108 for (unsigned I = 0; I < 32; I += 8) {
6109 unsigned SelBits = Sel & 0xff;
6110 if (SelBits < 4) {
6111 SelBits *= 8;
6112 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
6113 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
6114 } else if (SelBits < 7) {
6115 SelBits = (SelBits & 3) * 8;
6116 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
6117 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
6118 } else if (SelBits == 0x0c) {
6119 Known.Zero |= 0xFFull << I;
6120 } else if (SelBits > 0x0c) {
6121 Known.One |= 0xFFull << I;
6122 }
6123 Sel >>= 8;
6124 }
6125 break;
6126 }
6127 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
6128 Known.Zero.setHighBits(24);
6129 break;
6130 }
6131 case AMDGPUISD::BUFFER_LOAD_USHORT: {
6132 Known.Zero.setHighBits(16);
6133 break;
6134 }
6135 case AMDGPUISD::LDS: {
6136 auto *GA = cast<GlobalAddressSDNode>(Val: Op.getOperand(i: 0).getNode());
6137 Align Alignment = GA->getGlobal()->getPointerAlignment(DL: DAG.getDataLayout());
6138
6139 Known.Zero.setHighBits(16);
6140 Known.Zero.setLowBits(Log2(A: Alignment));
6141 break;
6142 }
6143 case AMDGPUISD::SMIN3:
6144 case AMDGPUISD::SMAX3:
6145 case AMDGPUISD::SMED3:
6146 case AMDGPUISD::UMIN3:
6147 case AMDGPUISD::UMAX3:
6148 case AMDGPUISD::UMED3: {
6149 KnownBits Known2 = DAG.computeKnownBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
6150 if (Known2.isUnknown())
6151 break;
6152
6153 KnownBits Known1 = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
6154 if (Known1.isUnknown())
6155 break;
6156
6157 KnownBits Known0 = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
6158 if (Known0.isUnknown())
6159 break;
6160
6161 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
6162 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
6163 Known.One = Known0.One & Known1.One & Known2.One;
6164 break;
6165 }
6166 case ISD::INTRINSIC_WO_CHAIN: {
6167 unsigned IID = Op.getConstantOperandVal(i: 0);
6168 switch (IID) {
6169 case Intrinsic::amdgcn_workitem_id_x:
6170 case Intrinsic::amdgcn_workitem_id_y:
6171 case Intrinsic::amdgcn_workitem_id_z: {
6172 unsigned MaxValue = Subtarget->getMaxWorkitemID(
6173 Kernel: DAG.getMachineFunction().getFunction(), Dimension: workitemIntrinsicDim(ID: IID));
6174 Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue));
6175 break;
6176 }
6177 default:
6178 break;
6179 }
6180 }
6181 }
6182}
6183
6184unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
6185 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6186 unsigned Depth) const {
6187 switch (Op.getOpcode()) {
6188 case AMDGPUISD::BFE_I32: {
6189 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
6190 if (!Width)
6191 return 1;
6192
6193 unsigned SignBits = 32 - (Width->getZExtValue() & 0x1f) + 1;
6194 if (!isNullConstant(V: Op.getOperand(i: 1)))
6195 return SignBits;
6196
6197 // TODO: Could probably figure something out with non-0 offsets.
6198 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
6199 return std::max(a: SignBits, b: Op0SignBits);
6200 }
6201
6202 case AMDGPUISD::BFE_U32: {
6203 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
6204 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
6205 }
6206
6207 case AMDGPUISD::CARRY:
6208 case AMDGPUISD::BORROW:
6209 return 31;
6210 case AMDGPUISD::BUFFER_LOAD_BYTE:
6211 return 25;
6212 case AMDGPUISD::BUFFER_LOAD_SHORT:
6213 return 17;
6214 case AMDGPUISD::BUFFER_LOAD_UBYTE:
6215 return 24;
6216 case AMDGPUISD::BUFFER_LOAD_USHORT:
6217 return 16;
6218 case AMDGPUISD::FP_TO_FP16:
6219 return 16;
6220 case AMDGPUISD::SMIN3:
6221 case AMDGPUISD::SMAX3:
6222 case AMDGPUISD::SMED3:
6223 case AMDGPUISD::UMIN3:
6224 case AMDGPUISD::UMAX3:
6225 case AMDGPUISD::UMED3: {
6226 unsigned Tmp2 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
6227 if (Tmp2 == 1)
6228 return 1; // Early out.
6229
6230 unsigned Tmp1 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
6231 if (Tmp1 == 1)
6232 return 1; // Early out.
6233
6234 unsigned Tmp0 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
6235 if (Tmp0 == 1)
6236 return 1; // Early out.
6237
6238 return std::min(l: {Tmp0, Tmp1, Tmp2});
6239 }
6240 default:
6241 return 1;
6242 }
6243}
6244
6245unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
6246 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
6247 const MachineRegisterInfo &MRI, unsigned Depth) const {
6248 const MachineInstr *MI = MRI.getVRegDef(Reg: R);
6249 if (!MI)
6250 return 1;
6251
6252 // TODO: Check range metadata on MMO.
6253 switch (MI->getOpcode()) {
6254 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
6255 return 25;
6256 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
6257 return 17;
6258 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
6259 return 24;
6260 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
6261 return 16;
6262 case AMDGPU::G_AMDGPU_SMED3:
6263 case AMDGPU::G_AMDGPU_UMED3: {
6264 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
6265 unsigned Tmp2 = Analysis.computeNumSignBits(R: Src2, DemandedElts, Depth: Depth + 1);
6266 if (Tmp2 == 1)
6267 return 1;
6268 unsigned Tmp1 = Analysis.computeNumSignBits(R: Src1, DemandedElts, Depth: Depth + 1);
6269 if (Tmp1 == 1)
6270 return 1;
6271 unsigned Tmp0 = Analysis.computeNumSignBits(R: Src0, DemandedElts, Depth: Depth + 1);
6272 if (Tmp0 == 1)
6273 return 1;
6274 return std::min(l: {Tmp0, Tmp1, Tmp2});
6275 }
6276 default:
6277 return 1;
6278 }
6279}
6280
6281bool AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode(
6282 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6283 UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const {
6284 unsigned Opcode = Op.getOpcode();
6285 switch (Opcode) {
6286 case AMDGPUISD::BFE_I32:
6287 case AMDGPUISD::BFE_U32:
6288 return false;
6289 }
6290 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
6291 Op, DemandedElts, DAG, Kind, ConsiderFlags, Depth);
6292}
6293
6294bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
6295 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
6296 unsigned Depth) const {
6297 unsigned Opcode = Op.getOpcode();
6298 switch (Opcode) {
6299 case AMDGPUISD::FMIN_LEGACY:
6300 case AMDGPUISD::FMAX_LEGACY: {
6301 if (SNaN)
6302 return true;
6303
6304 // TODO: Can check no nans on one of the operands for each one, but which
6305 // one?
6306 return false;
6307 }
6308 case AMDGPUISD::FMUL_LEGACY:
6309 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
6310 if (SNaN)
6311 return true;
6312 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1) &&
6313 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1);
6314 }
6315 case AMDGPUISD::FMED3:
6316 case AMDGPUISD::FMIN3:
6317 case AMDGPUISD::FMAX3:
6318 case AMDGPUISD::FMINIMUM3:
6319 case AMDGPUISD::FMAXIMUM3:
6320 case AMDGPUISD::FMAD_FTZ: {
6321 if (SNaN)
6322 return true;
6323 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1) &&
6324 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
6325 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1);
6326 }
6327 case AMDGPUISD::CVT_F32_UBYTE0:
6328 case AMDGPUISD::CVT_F32_UBYTE1:
6329 case AMDGPUISD::CVT_F32_UBYTE2:
6330 case AMDGPUISD::CVT_F32_UBYTE3:
6331 return true;
6332
6333 case AMDGPUISD::RCP:
6334 case AMDGPUISD::RSQ:
6335 case AMDGPUISD::RCP_LEGACY:
6336 case AMDGPUISD::RSQ_CLAMP: {
6337 if (SNaN)
6338 return true;
6339
6340 // TODO: Need is known positive check.
6341 return false;
6342 }
6343 case ISD::FLDEXP:
6344 case AMDGPUISD::FRACT: {
6345 if (SNaN)
6346 return true;
6347 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1);
6348 }
6349 case AMDGPUISD::DIV_SCALE:
6350 case AMDGPUISD::DIV_FMAS:
6351 case AMDGPUISD::DIV_FIXUP:
6352 // TODO: Refine on operands.
6353 return SNaN;
6354 case AMDGPUISD::SIN_HW:
6355 case AMDGPUISD::COS_HW: {
6356 // TODO: Need check for infinity
6357 return SNaN;
6358 }
6359 case ISD::INTRINSIC_WO_CHAIN: {
6360 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
6361 // TODO: Handle more intrinsics
6362 switch (IntrinsicID) {
6363 case Intrinsic::amdgcn_cubeid:
6364 case Intrinsic::amdgcn_cvt_off_f32_i4:
6365 return true;
6366
6367 case Intrinsic::amdgcn_frexp_mant: {
6368 if (SNaN)
6369 return true;
6370 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1);
6371 }
6372 case Intrinsic::amdgcn_cvt_pkrtz: {
6373 if (SNaN)
6374 return true;
6375 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
6376 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1);
6377 }
6378 case Intrinsic::amdgcn_rcp:
6379 case Intrinsic::amdgcn_rsq:
6380 case Intrinsic::amdgcn_rcp_legacy:
6381 case Intrinsic::amdgcn_rsq_legacy:
6382 case Intrinsic::amdgcn_rsq_clamp:
6383 case Intrinsic::amdgcn_tanh: {
6384 if (SNaN)
6385 return true;
6386
6387 // TODO: Need is known positive check.
6388 return false;
6389 }
6390 case Intrinsic::amdgcn_trig_preop:
6391 case Intrinsic::amdgcn_fdot2:
6392 // TODO: Refine on operand
6393 return SNaN;
6394 case Intrinsic::amdgcn_fma_legacy:
6395 if (SNaN)
6396 return true;
6397 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
6398 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1) &&
6399 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 3), SNaN, Depth: Depth + 1);
6400 default:
6401 return false;
6402 }
6403 }
6404 default:
6405 return false;
6406 }
6407}
6408
6409bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
6410 Register N0, Register N1) const {
6411 return MRI.hasOneNonDBGUse(RegNo: N0); // FIXME: handle regbanks
6412}
6413