//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUMachineFunction.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUSelectionDAGInfo.h"
21#include "SIMachineFunctionInfo.h"
22#include "llvm/CodeGen/Analysis.h"
23#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
24#include "llvm/CodeGen/MachineFrameInfo.h"
25#include "llvm/IR/DiagnosticInfo.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27#include "llvm/Support/CommandLine.h"
28#include "llvm/Support/KnownBits.h"
29#include "llvm/Target/TargetMachine.h"
30
31using namespace llvm;
32
33#include "AMDGPUGenCallingConv.inc"
34
35static cl::opt<bool> AMDGPUBypassSlowDiv(
36 "amdgpu-bypass-slow-div",
37 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
38 cl::init(Val: true));
39
40// Find a larger type to do a load / store of a vector with.
41EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
42 unsigned StoreSize = VT.getStoreSizeInBits();
43 if (StoreSize <= 32)
44 return EVT::getIntegerVT(Context&: Ctx, BitWidth: StoreSize);
45
46 if (StoreSize % 32 == 0)
47 return EVT::getVectorVT(Context&: Ctx, VT: MVT::i32, NumElements: StoreSize / 32);
48
49 return VT;
50}
51
52unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
53 return DAG.computeKnownBits(Op).countMaxActiveBits();
54}
55
56unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
57 // In order for this to be a signed 24-bit value, bit 23, must
58 // be a sign bit.
59 return DAG.ComputeMaxSignificantBits(Op);
60}
61
62AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
63 const TargetSubtargetInfo &STI,
64 const AMDGPUSubtarget &AMDGPUSTI)
65 : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
66 // Always lower memset, memcpy, and memmove intrinsics to load/store
67 // instructions, rather then generating calls to memset, mempcy or memmove.
68 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
69 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
70 MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;
71
72 // Enable ganging up loads and stores in the memcpy DAG lowering.
73 MaxGluedStoresPerMemcpy = 16;
74
75 // Lower floating point store/load to integer store/load to reduce the number
76 // of patterns in tablegen.
77 setOperationAction(Op: ISD::LOAD, VT: MVT::f32, Action: Promote);
78 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f32, DestVT: MVT::i32);
79
80 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f32, Action: Promote);
81 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
82
83 setOperationAction(Op: ISD::LOAD, VT: MVT::v3f32, Action: Promote);
84 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
85
86 setOperationAction(Op: ISD::LOAD, VT: MVT::v4f32, Action: Promote);
87 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
88
89 setOperationAction(Op: ISD::LOAD, VT: MVT::v5f32, Action: Promote);
90 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
91
92 setOperationAction(Op: ISD::LOAD, VT: MVT::v6f32, Action: Promote);
93 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
94
95 setOperationAction(Op: ISD::LOAD, VT: MVT::v7f32, Action: Promote);
96 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
97
98 setOperationAction(Op: ISD::LOAD, VT: MVT::v8f32, Action: Promote);
99 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f32, DestVT: MVT::v8i32);
100
101 setOperationAction(Op: ISD::LOAD, VT: MVT::v9f32, Action: Promote);
102 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
103
104 setOperationAction(Op: ISD::LOAD, VT: MVT::v10f32, Action: Promote);
105 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
106
107 setOperationAction(Op: ISD::LOAD, VT: MVT::v11f32, Action: Promote);
108 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
109
110 setOperationAction(Op: ISD::LOAD, VT: MVT::v12f32, Action: Promote);
111 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
112
113 setOperationAction(Op: ISD::LOAD, VT: MVT::v16f32, Action: Promote);
114 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f32, DestVT: MVT::v16i32);
115
116 setOperationAction(Op: ISD::LOAD, VT: MVT::v32f32, Action: Promote);
117 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32f32, DestVT: MVT::v32i32);
118
119 setOperationAction(Op: ISD::LOAD, VT: MVT::i64, Action: Promote);
120 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::i64, DestVT: MVT::v2i32);
121
122 setOperationAction(Op: ISD::LOAD, VT: MVT::v2i64, Action: Promote);
123 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2i64, DestVT: MVT::v4i32);
124
125 setOperationAction(Op: ISD::LOAD, VT: MVT::f64, Action: Promote);
126 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f64, DestVT: MVT::v2i32);
127
128 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f64, Action: Promote);
129 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f64, DestVT: MVT::v4i32);
130
131 setOperationAction(Op: ISD::LOAD, VT: MVT::v3i64, Action: Promote);
132 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3i64, DestVT: MVT::v6i32);
133
134 setOperationAction(Op: ISD::LOAD, VT: MVT::v4i64, Action: Promote);
135 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4i64, DestVT: MVT::v8i32);
136
137 setOperationAction(Op: ISD::LOAD, VT: MVT::v3f64, Action: Promote);
138 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3f64, DestVT: MVT::v6i32);
139
140 setOperationAction(Op: ISD::LOAD, VT: MVT::v4f64, Action: Promote);
141 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f64, DestVT: MVT::v8i32);
142
143 setOperationAction(Op: ISD::LOAD, VT: MVT::v8i64, Action: Promote);
144 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8i64, DestVT: MVT::v16i32);
145
146 setOperationAction(Op: ISD::LOAD, VT: MVT::v8f64, Action: Promote);
147 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f64, DestVT: MVT::v16i32);
148
149 setOperationAction(Op: ISD::LOAD, VT: MVT::v16i64, Action: Promote);
150 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16i64, DestVT: MVT::v32i32);
151
152 setOperationAction(Op: ISD::LOAD, VT: MVT::v16f64, Action: Promote);
153 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f64, DestVT: MVT::v32i32);
154
155 setOperationAction(Op: ISD::LOAD, VT: MVT::i128, Action: Promote);
156 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::i128, DestVT: MVT::v4i32);
157
158 // TODO: Would be better to consume as directly legal
159 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f32, Action: Promote);
160 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f32, DestVT: MVT::i32);
161
162 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f64, Action: Promote);
163 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f64, DestVT: MVT::i64);
164
165 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f16, Action: Promote);
166 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f16, DestVT: MVT::i16);
167
168 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::bf16, Action: Promote);
169 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::bf16, DestVT: MVT::i16);
170
171 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f32, Action: Promote);
172 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f32, DestVT: MVT::i32);
173
174 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f64, Action: Promote);
175 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f64, DestVT: MVT::i64);
176
177 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f16, Action: Promote);
178 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f16, DestVT: MVT::i16);
179
180 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::bf16, Action: Promote);
181 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::bf16, DestVT: MVT::i16);
182
183 // There are no 64-bit extloads. These should be done as a 32-bit extload and
184 // an extension to 64-bit.
185 for (MVT VT : MVT::integer_valuetypes())
186 setLoadExtAction(ExtTypes: {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, ValVT: MVT::i64, MemVT: VT,
187 Action: Expand);
188
189 for (MVT VT : MVT::integer_valuetypes()) {
190 if (VT == MVT::i64)
191 continue;
192
193 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
194 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i1, Action: Promote);
195 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i8, Action: Legal);
196 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i16, Action: Legal);
197 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i32, Action: Expand);
198 }
199 }
200
201 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
202 for (auto MemVT :
203 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
204 setLoadExtAction(ExtTypes: {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, ValVT: VT, MemVT,
205 Action: Expand);
206
207 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f32, MemVT: MVT::f16, Action: Expand);
208 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f32, MemVT: MVT::bf16, Action: Expand);
209 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f32, MemVT: MVT::v2f16, Action: Expand);
210 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f32, MemVT: MVT::v2bf16, Action: Expand);
211 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f32, MemVT: MVT::v3f16, Action: Expand);
212 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f32, MemVT: MVT::v3bf16, Action: Expand);
213 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f32, MemVT: MVT::v4f16, Action: Expand);
214 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f32, MemVT: MVT::v4bf16, Action: Expand);
215 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f32, MemVT: MVT::v8f16, Action: Expand);
216 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f32, MemVT: MVT::v8bf16, Action: Expand);
217 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f32, MemVT: MVT::v16f16, Action: Expand);
218 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f32, MemVT: MVT::v16bf16, Action: Expand);
219 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v32f32, MemVT: MVT::v32f16, Action: Expand);
220 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v32f32, MemVT: MVT::v32bf16, Action: Expand);
221
222 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f32, Action: Expand);
223 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2f32, Action: Expand);
224 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3f32, Action: Expand);
225 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4f32, Action: Expand);
226 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8f32, Action: Expand);
227 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16f32, Action: Expand);
228
229 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f16, Action: Expand);
230 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::bf16, Action: Expand);
231 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2f16, Action: Expand);
232 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2bf16, Action: Expand);
233 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3f16, Action: Expand);
234 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3bf16, Action: Expand);
235 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4f16, Action: Expand);
236 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4bf16, Action: Expand);
237 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8f16, Action: Expand);
238 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8bf16, Action: Expand);
239 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16f16, Action: Expand);
240 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16bf16, Action: Expand);
241
242 setOperationAction(Op: ISD::STORE, VT: MVT::f32, Action: Promote);
243 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f32, DestVT: MVT::i32);
244
245 setOperationAction(Op: ISD::STORE, VT: MVT::v2f32, Action: Promote);
246 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
247
248 setOperationAction(Op: ISD::STORE, VT: MVT::v3f32, Action: Promote);
249 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
250
251 setOperationAction(Op: ISD::STORE, VT: MVT::v4f32, Action: Promote);
252 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
253
254 setOperationAction(Op: ISD::STORE, VT: MVT::v5f32, Action: Promote);
255 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
256
257 setOperationAction(Op: ISD::STORE, VT: MVT::v6f32, Action: Promote);
258 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
259
260 setOperationAction(Op: ISD::STORE, VT: MVT::v7f32, Action: Promote);
261 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
262
263 setOperationAction(Op: ISD::STORE, VT: MVT::v8f32, Action: Promote);
264 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f32, DestVT: MVT::v8i32);
265
266 setOperationAction(Op: ISD::STORE, VT: MVT::v9f32, Action: Promote);
267 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
268
269 setOperationAction(Op: ISD::STORE, VT: MVT::v10f32, Action: Promote);
270 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
271
272 setOperationAction(Op: ISD::STORE, VT: MVT::v11f32, Action: Promote);
273 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
274
275 setOperationAction(Op: ISD::STORE, VT: MVT::v12f32, Action: Promote);
276 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
277
278 setOperationAction(Op: ISD::STORE, VT: MVT::v16f32, Action: Promote);
279 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f32, DestVT: MVT::v16i32);
280
281 setOperationAction(Op: ISD::STORE, VT: MVT::v32f32, Action: Promote);
282 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32f32, DestVT: MVT::v32i32);
283
284 setOperationAction(Op: ISD::STORE, VT: MVT::i64, Action: Promote);
285 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::i64, DestVT: MVT::v2i32);
286
287 setOperationAction(Op: ISD::STORE, VT: MVT::v2i64, Action: Promote);
288 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2i64, DestVT: MVT::v4i32);
289
290 setOperationAction(Op: ISD::STORE, VT: MVT::f64, Action: Promote);
291 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f64, DestVT: MVT::v2i32);
292
293 setOperationAction(Op: ISD::STORE, VT: MVT::v2f64, Action: Promote);
294 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f64, DestVT: MVT::v4i32);
295
296 setOperationAction(Op: ISD::STORE, VT: MVT::v3i64, Action: Promote);
297 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3i64, DestVT: MVT::v6i32);
298
299 setOperationAction(Op: ISD::STORE, VT: MVT::v3f64, Action: Promote);
300 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3f64, DestVT: MVT::v6i32);
301
302 setOperationAction(Op: ISD::STORE, VT: MVT::v4i64, Action: Promote);
303 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i64, DestVT: MVT::v8i32);
304
305 setOperationAction(Op: ISD::STORE, VT: MVT::v4f64, Action: Promote);
306 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f64, DestVT: MVT::v8i32);
307
308 setOperationAction(Op: ISD::STORE, VT: MVT::v8i64, Action: Promote);
309 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8i64, DestVT: MVT::v16i32);
310
311 setOperationAction(Op: ISD::STORE, VT: MVT::v8f64, Action: Promote);
312 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f64, DestVT: MVT::v16i32);
313
314 setOperationAction(Op: ISD::STORE, VT: MVT::v16i64, Action: Promote);
315 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16i64, DestVT: MVT::v32i32);
316
317 setOperationAction(Op: ISD::STORE, VT: MVT::v16f64, Action: Promote);
318 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f64, DestVT: MVT::v32i32);
319
320 setOperationAction(Op: ISD::STORE, VT: MVT::i128, Action: Promote);
321 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::i128, DestVT: MVT::v4i32);
322
323 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i1, Action: Expand);
324 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i8, Action: Expand);
325 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i16, Action: Expand);
326 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i32, Action: Expand);
327
328 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i1, Action: Expand);
329 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i8, Action: Expand);
330 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i16, Action: Expand);
331 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i32, Action: Expand);
332
333 setTruncStoreAction(ValVT: MVT::f32, MemVT: MVT::bf16, Action: Expand);
334 setTruncStoreAction(ValVT: MVT::f32, MemVT: MVT::f16, Action: Expand);
335 setTruncStoreAction(ValVT: MVT::v2f32, MemVT: MVT::v2bf16, Action: Expand);
336 setTruncStoreAction(ValVT: MVT::v2f32, MemVT: MVT::v2f16, Action: Expand);
337 setTruncStoreAction(ValVT: MVT::v3f32, MemVT: MVT::v3bf16, Action: Expand);
338 setTruncStoreAction(ValVT: MVT::v3f32, MemVT: MVT::v3f16, Action: Expand);
339 setTruncStoreAction(ValVT: MVT::v4f32, MemVT: MVT::v4bf16, Action: Expand);
340 setTruncStoreAction(ValVT: MVT::v4f32, MemVT: MVT::v4f16, Action: Expand);
341 setTruncStoreAction(ValVT: MVT::v6f32, MemVT: MVT::v6f16, Action: Expand);
342 setTruncStoreAction(ValVT: MVT::v8f32, MemVT: MVT::v8bf16, Action: Expand);
343 setTruncStoreAction(ValVT: MVT::v8f32, MemVT: MVT::v8f16, Action: Expand);
344 setTruncStoreAction(ValVT: MVT::v16f32, MemVT: MVT::v16bf16, Action: Expand);
345 setTruncStoreAction(ValVT: MVT::v16f32, MemVT: MVT::v16f16, Action: Expand);
346 setTruncStoreAction(ValVT: MVT::v32f32, MemVT: MVT::v32bf16, Action: Expand);
347 setTruncStoreAction(ValVT: MVT::v32f32, MemVT: MVT::v32f16, Action: Expand);
348
349 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::bf16, Action: Expand);
350 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f16, Action: Expand);
351 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f32, Action: Expand);
352
353 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2f32, Action: Expand);
354 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2bf16, Action: Expand);
355 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2f16, Action: Expand);
356
357 setTruncStoreAction(ValVT: MVT::v3i32, MemVT: MVT::v3i8, Action: Expand);
358
359 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i32, Action: Expand);
360 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i16, Action: Expand);
361 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i8, Action: Expand);
362 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i1, Action: Expand);
363 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3f32, Action: Expand);
364 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3bf16, Action: Expand);
365 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3f16, Action: Expand);
366
367 setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i32, Action: Expand);
368 setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i16, Action: Expand);
369 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4f32, Action: Expand);
370 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4bf16, Action: Expand);
371 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4f16, Action: Expand);
372
373 setTruncStoreAction(ValVT: MVT::v5i32, MemVT: MVT::v5i1, Action: Expand);
374 setTruncStoreAction(ValVT: MVT::v5i32, MemVT: MVT::v5i8, Action: Expand);
375 setTruncStoreAction(ValVT: MVT::v5i32, MemVT: MVT::v5i16, Action: Expand);
376
377 setTruncStoreAction(ValVT: MVT::v6i32, MemVT: MVT::v6i1, Action: Expand);
378 setTruncStoreAction(ValVT: MVT::v6i32, MemVT: MVT::v6i8, Action: Expand);
379 setTruncStoreAction(ValVT: MVT::v6i32, MemVT: MVT::v6i16, Action: Expand);
380
381 setTruncStoreAction(ValVT: MVT::v7i32, MemVT: MVT::v7i1, Action: Expand);
382 setTruncStoreAction(ValVT: MVT::v7i32, MemVT: MVT::v7i8, Action: Expand);
383 setTruncStoreAction(ValVT: MVT::v7i32, MemVT: MVT::v7i16, Action: Expand);
384
385 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8f32, Action: Expand);
386 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8bf16, Action: Expand);
387 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8f16, Action: Expand);
388
389 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16f32, Action: Expand);
390 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16bf16, Action: Expand);
391 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16f16, Action: Expand);
392 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i16, Action: Expand);
393 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i8, Action: Expand);
394 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i8, Action: Expand);
395 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i1, Action: Expand);
396
397 setOperationAction(Ops: ISD::Constant, VTs: {MVT::i32, MVT::i64}, Action: Legal);
398 setOperationAction(Ops: ISD::ConstantFP, VTs: {MVT::f32, MVT::f64}, Action: Legal);
399
400 setOperationAction(Ops: {ISD::BR_JT, ISD::BRIND}, VT: MVT::Other, Action: Expand);
401
402 // For R600, this is totally unsupported, just custom lower to produce an
403 // error.
404 setOperationAction(Op: ISD::DYNAMIC_STACKALLOC, VT: MVT::i32, Action: Custom);
405
406 // Library functions. These default to Expand, but we have instructions
407 // for them.
408 setOperationAction(Ops: {ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
409 ISD::FROUNDEVEN, ISD::FTRUNC},
410 VTs: {MVT::f16, MVT::f32}, Action: Legal);
411 setOperationAction(Ops: {ISD::FMINNUM, ISD::FMAXNUM}, VT: MVT::f32, Action: Legal);
412
413 setOperationAction(Op: ISD::FLOG2, VT: MVT::f32, Action: Custom);
414 setOperationAction(Ops: ISD::FROUND, VTs: {MVT::f32, MVT::f64}, Action: Custom);
415 setOperationAction(Ops: {ISD::LROUND, ISD::LLROUND},
416 VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Expand);
417
418 setOperationAction(
419 Ops: {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, VT: MVT::f32,
420 Action: Custom);
421 setOperationAction(Ops: {ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, VT: MVT::f64, Action: Custom);
422
423 setOperationAction(Ops: ISD::FNEARBYINT, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Custom);
424
425 setOperationAction(Ops: ISD::FRINT, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Custom);
426
427 setOperationAction(Ops: {ISD::LRINT, ISD::LLRINT}, VTs: {MVT::f16, MVT::f32, MVT::f64},
428 Action: Expand);
429
430 setOperationAction(Ops: ISD::FREM, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Expand);
431 setOperationAction(Ops: ISD::IS_FPCLASS, VTs: {MVT::f32, MVT::f64}, Action: Legal);
432 setOperationAction(Ops: {ISD::FLOG2, ISD::FEXP2}, VT: MVT::f16, Action: Custom);
433
434 setOperationAction(Ops: {ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, VT: MVT::f16,
435 Action: Custom);
436
437 setOperationAction(Ops: ISD::FCANONICALIZE, VTs: {MVT::f32, MVT::f64}, Action: Legal);
438
439 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
440 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
441 // default unless marked custom/legal.
442 setOperationAction(Ops: ISD::IS_FPCLASS,
443 VTs: {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
444 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
445 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
446 MVT::v16f64},
447 Action: Custom);
448
449 // Expand to fneg + fadd.
450 setOperationAction(Op: ISD::FSUB, VT: MVT::f64, Action: Expand);
451
452 setOperationAction(Ops: ISD::CONCAT_VECTORS,
453 VTs: {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
454 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
455 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
456 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
457 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
458 Action: Custom);
459
460 setOperationAction(
461 Ops: ISD::EXTRACT_SUBVECTOR,
462 VTs: {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
463 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
464 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
465 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
466 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
467 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
468 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
469 Action: Custom);
470
471 setOperationAction(Ops: {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP}, VT: MVT::f64,
472 Action: Expand);
473 setOperationAction(Ops: ISD::FP_TO_FP16, VTs: {MVT::f64, MVT::f32}, Action: Custom);
474
475 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
476 for (MVT VT : ScalarIntVTs) {
477 // These should use [SU]DIVREM, so set them to expand
478 setOperationAction(Ops: {ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
479 Action: Expand);
480
481 // GPU does not have divrem function for signed or unsigned.
482 setOperationAction(Ops: {ISD::SDIVREM, ISD::UDIVREM}, VT, Action: Custom);
483
484 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
485 setOperationAction(Ops: {ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Action: Expand);
486
487 setOperationAction(Ops: {ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Action: Expand);
488
489 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
490 setOperationAction(Ops: {ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Action: Legal);
491 }
492
493 // The hardware supports 32-bit FSHR, but not FSHL.
494 setOperationAction(Op: ISD::FSHR, VT: MVT::i32, Action: Legal);
495
496 setOperationAction(Ops: {ISD::ROTL, ISD::ROTR}, VTs: {MVT::i32, MVT::i64}, Action: Expand);
497
498 setOperationAction(Ops: {ISD::MULHU, ISD::MULHS}, VT: MVT::i16, Action: Expand);
499
500 setOperationAction(Ops: {ISD::MUL, ISD::MULHU, ISD::MULHS}, VT: MVT::i64, Action: Expand);
501 setOperationAction(Ops: {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT,
502 ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
503 ISD::FP_TO_UINT_SAT},
504 VT: MVT::i64, Action: Custom);
505 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::i64, Action: Expand);
506
507 setOperationAction(Ops: {ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, VT: MVT::i32,
508 Action: Legal);
509
510 setOperationAction(
511 Ops: {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
512 VT: MVT::i64, Action: Custom);
513
514 for (auto VT : {MVT::i8, MVT::i16})
515 setOperationAction(Ops: {ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Action: Custom);
516
517 static const MVT::SimpleValueType VectorIntTypes[] = {
518 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
519 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
520
521 for (MVT VT : VectorIntTypes) {
522 // Expand the following operations for the current type by default.
523 // clang-format off
524 setOperationAction(Ops: {ISD::ADD, ISD::AND,
525 ISD::FP_TO_SINT, ISD::FP_TO_UINT,
526 ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT,
527 ISD::MUL, ISD::MULHU,
528 ISD::MULHS, ISD::OR,
529 ISD::SHL, ISD::SRA,
530 ISD::SRL, ISD::ROTL,
531 ISD::ROTR, ISD::SUB,
532 ISD::SINT_TO_FP, ISD::UINT_TO_FP,
533 ISD::SDIV, ISD::UDIV,
534 ISD::SREM, ISD::UREM,
535 ISD::SMUL_LOHI, ISD::UMUL_LOHI,
536 ISD::SDIVREM, ISD::UDIVREM,
537 ISD::SELECT, ISD::VSELECT,
538 ISD::SELECT_CC, ISD::XOR,
539 ISD::BSWAP, ISD::CTPOP,
540 ISD::CTTZ, ISD::CTLZ,
541 ISD::VECTOR_SHUFFLE, ISD::SETCC,
542 ISD::ADDRSPACECAST},
543 VT, Action: Expand);
544 // clang-format on
545 }
546
547 static const MVT::SimpleValueType FloatVectorTypes[] = {
548 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
549 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
550
551 for (MVT VT : FloatVectorTypes) {
552 setOperationAction(
553 Ops: {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
554 ISD::FADD, ISD::FCEIL, ISD::FCOS,
555 ISD::FDIV, ISD::FEXP2, ISD::FEXP,
556 ISD::FEXP10, ISD::FLOG2, ISD::FREM,
557 ISD::FLOG, ISD::FLOG10, ISD::FPOW,
558 ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
559 ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
560 ISD::FSQRT, ISD::FSIN, ISD::FSUB,
561 ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
562 ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC,
563 ISD::FCANONICALIZE, ISD::FROUNDEVEN},
564 VT, Action: Expand);
565 }
566
567 // This causes using an unrolled select operation rather than expansion with
568 // bit operations. This is in general better, but the alternative using BFI
569 // instructions may be better if the select sources are SGPRs.
570 setOperationAction(Op: ISD::SELECT, VT: MVT::v2f32, Action: Promote);
571 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
572
573 setOperationAction(Op: ISD::SELECT, VT: MVT::v3f32, Action: Promote);
574 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
575
576 setOperationAction(Op: ISD::SELECT, VT: MVT::v4f32, Action: Promote);
577 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
578
579 setOperationAction(Op: ISD::SELECT, VT: MVT::v5f32, Action: Promote);
580 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
581
582 setOperationAction(Op: ISD::SELECT, VT: MVT::v6f32, Action: Promote);
583 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
584
585 setOperationAction(Op: ISD::SELECT, VT: MVT::v7f32, Action: Promote);
586 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
587
588 setOperationAction(Op: ISD::SELECT, VT: MVT::v9f32, Action: Promote);
589 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
590
591 setOperationAction(Op: ISD::SELECT, VT: MVT::v10f32, Action: Promote);
592 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
593
594 setOperationAction(Op: ISD::SELECT, VT: MVT::v11f32, Action: Promote);
595 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
596
597 setOperationAction(Op: ISD::SELECT, VT: MVT::v12f32, Action: Promote);
598 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
599
600 setSchedulingPreference(Sched::RegPressure);
601 setJumpIsExpensive(true);
602
603 setMinCmpXchgSizeInBits(32);
604 setSupportsUnalignedAtomics(false);
605
606 PredictableSelectIsExpensive = false;
607
608 // We want to find all load dependencies for long chains of stores to enable
609 // merging into very wide vectors. The problem is with vectors with > 4
610 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
611 // vectors are a legal type, even though we have to split the loads
612 // usually. When we can more precisely specify load legality per address
613 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
614 // smarter so that they can figure out what to do in 2 iterations without all
615 // N > 4 stores on the same chain.
616 GatherAllAliasesMaxDepth = 16;
617
618 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
619 // about these during lowering.
620 MaxStoresPerMemcpy = 0xffffffff;
621 MaxStoresPerMemmove = 0xffffffff;
622 MaxStoresPerMemset = 0xffffffff;
623
624 // The expansion for 64-bit division is enormous.
625 if (AMDGPUBypassSlowDiv)
626 addBypassSlowDiv(SlowBitWidth: 64, FastBitWidth: 32);
627
628 setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
629 ISD::SRA, ISD::SRL,
630 ISD::TRUNCATE, ISD::MUL,
631 ISD::SMUL_LOHI, ISD::UMUL_LOHI,
632 ISD::MULHU, ISD::MULHS,
633 ISD::SELECT, ISD::SELECT_CC,
634 ISD::STORE, ISD::FADD,
635 ISD::FSUB, ISD::FNEG,
636 ISD::FABS, ISD::AssertZext,
637 ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
638
639 setMaxAtomicSizeInBitsSupported(64);
640 setMaxDivRemBitWidthSupported(64);
641 setMaxLargeFPConvertBitWidthSupported(64);
642}
643
644bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
645 const auto Flags = Op.getNode()->getFlags();
646 if (Flags.hasNoSignedZeros())
647 return true;
648
649 return false;
650}
651
652//===----------------------------------------------------------------------===//
653// Target Information
654//===----------------------------------------------------------------------===//
655
/// Return true if an fneg of a value produced by opcode \p Opc can be folded
/// away as a source modifier on the instruction itself.
LLVM_READNONE
static bool fnegFoldsIntoOpcode(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
  case ISD::SELECT:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  case ISD::BITCAST:
    // Bitcast needs to inspect its operand; callers must go through
    // fnegFoldsIntoOp instead.
    llvm_unreachable("bitcast is special cased");
  default:
    return false;
  }
}
695
696static bool fnegFoldsIntoOp(const SDNode *N) {
697 unsigned Opc = N->getOpcode();
698 if (Opc == ISD::BITCAST) {
699 // TODO: Is there a benefit to checking the conditions performFNegCombine
700 // does? We don't for the other cases.
701 SDValue BCSrc = N->getOperand(Num: 0);
702 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
703 return BCSrc.getNumOperands() == 2 &&
704 BCSrc.getOperand(i: 1).getValueSizeInBits() == 32;
705 }
706
707 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
708 }
709
710 return fnegFoldsIntoOpcode(Opc);
711}
712
713/// \p returns true if the operation will definitely need to use a 64-bit
714/// encoding, and thus will use a VOP3 encoding regardless of the source
715/// modifiers.
716LLVM_READONLY
717static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
718 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
719 VT == MVT::f64;
720}
721
722/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
723/// type for ISD::SELECT.
724LLVM_READONLY
725static bool selectSupportsSourceMods(const SDNode *N) {
726 // TODO: Only applies if select will be vector
727 return N->getValueType(ResNo: 0) == MVT::f32;
728}
729
// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READONLY
static bool hasSourceMods(const SDNode *N) {
  // Memory operations never fold source modifiers.
  if (isa<MemSDNode>(Val: N))
    return false;

  switch (N->getOpcode()) {
  // These opcodes either have no source-modifier encoding or must observe the
  // raw operand value.
  case ISD::CopyToReg:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
  case ISD::INTRINSIC_W_CHAIN:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    // The interpolation intrinsics do not take source modifiers on their
    // inputs; all other intrinsics are assumed to.
    switch (N->getConstantOperandVal(Num: 0)) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  case ISD::SELECT:
    return selectSupportsSourceMods(N);
  default:
    return true;
  }
}
769
770bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
771 unsigned CostThreshold) {
772 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
773 // it is truly free to use a source modifier in all cases. If there are
774 // multiple users but for each one will necessitate using VOP3, there will be
775 // a code size increase. Try to avoid increasing code size unless we know it
776 // will save on the instruction count.
777 unsigned NumMayIncreaseSize = 0;
778 MVT VT = N->getValueType(ResNo: 0).getScalarType().getSimpleVT();
779
780 assert(!N->use_empty());
781
782 // XXX - Should this limit number of uses to check?
783 for (const SDNode *U : N->users()) {
784 if (!hasSourceMods(N: U))
785 return false;
786
787 if (!opMustUseVOP3Encoding(N: U, VT)) {
788 if (++NumMayIncreaseSize > CostThreshold)
789 return false;
790 }
791 }
792
793 return true;
794}
795
796EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
797 ISD::NodeType ExtendKind) const {
798 assert(!VT.isVector() && "only scalar expected");
799
800 // Round to the next multiple of 32-bits.
801 unsigned Size = VT.getSizeInBits();
802 if (Size <= 32)
803 return MVT::i32;
804 return EVT::getIntegerVT(Context, BitWidth: 32 * ((Size + 31) / 32));
805}
806
// Vector element indices are always 32 bits wide, independent of the pointer
// width given by the data layout.
unsigned AMDGPUTargetLowering::getVectorIdxWidth(const DataLayout &) const {
  return 32;
}
810
// All select flavors (scalar, vector-of-results, vector condition) are
// supported.
bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}
814
// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  // Note: the immediate value itself is not inspected; legality depends only
  // on the scalar type being legal for the subtarget.
  return isTypeLegal(VT: VT.getScalarType());
}
821
822// We don't want to shrink f64 / f32 constants.
823bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
824 EVT ScalarVT = VT.getScalarType();
825 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
826}
827
/// Decide whether it is profitable to narrow the extending load \p N to the
/// smaller memory type \p NewVT (optionally at byte offset \p ByteOffset).
bool AMDGPUTargetLowering::shouldReduceLoadWidth(
    SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
    std::optional<unsigned> ByteOffset) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(Load: N, ExtTy, NewVT, ByteOffset))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(ResNo: 0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(Val: N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  // TODO: Update this for GFX12 which does have scalar sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(Val: N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
        MN->isInvariant())) &&
      AMDGPU::isUniformMMO(MMO: MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in continuing
  // to reduce the width.
  return (OldSize < 32);
}
867
868bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
869 const SelectionDAG &DAG,
870 const MachineMemOperand &MMO) const {
871
872 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
873
874 if (LoadTy.getScalarType() == MVT::i32)
875 return false;
876
877 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
878 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
879
880 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
881 return false;
882
883 unsigned Fast = 0;
884 return allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
885 VT: CastTy, MMO, Fast: &Fast) &&
886 Fast;
887}
888
// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
// profitable with the expansion for 64-bit since it's generally good to
// speculate things.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  // Always cheap, regardless of the bit width.
  return true;
}
895
// See the comment on isCheapToSpeculateCttz above: ctlz is equally cheap.
bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return true;
}
899
900bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
901 switch (N->getOpcode()) {
902 case ISD::EntryToken:
903 case ISD::TokenFactor:
904 return true;
905 case ISD::INTRINSIC_WO_CHAIN: {
906 unsigned IntrID = N->getConstantOperandVal(Num: 0);
907 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
908 }
909 case ISD::INTRINSIC_W_CHAIN: {
910 unsigned IntrID = N->getConstantOperandVal(Num: 1);
911 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
912 }
913 case ISD::LOAD:
914 if (cast<LoadSDNode>(Val: N)->getMemOperand()->getAddrSpace() ==
915 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
916 return true;
917 return false;
918 case AMDGPUISD::SETCC: // ballot-style instruction
919 return true;
920 }
921 return false;
922}
923
/// DAGCombiner hook: return a negated form of \p Op when negation is
/// profitable, or an empty SDValue to decline and fall back to the default.
SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {

  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(N: Op.getNode()))
      return SDValue();
    break;
  }
  case AMDGPUISD::RCP: {
    // rcp(-x) == -rcp(x): push the negation into the source operand when the
    // source itself can be negated.
    SDValue Src = Op.getOperand(i: 0);
    EVT VT = Op.getValueType();
    SDLoc SL(Op);

    SDValue NegSrc = getNegatedExpression(Op: Src, DAG, LegalOperations,
                                          ForCodeSize, Cost, Depth: Depth + 1);
    if (NegSrc)
      return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: NegSrc, Flags: Op->getFlags());
    return SDValue();
  }
  default:
    break;
  }

  // Defer everything else to the generic implementation.
  return TargetLowering::getNegatedExpression(Op, DAG, LegalOps: LegalOperations,
                                              OptForSize: ForCodeSize, Cost, Depth);
}
954
955//===---------------------------------------------------------------------===//
956// Target Properties
957//===---------------------------------------------------------------------===//
958
bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  // Report this based on the end legalized type.
  // Note: unlike isFNegFree, vector types are deliberately not scalarized
  // here, so any vector VT returns false.
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
}
966
bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Report this based on the end legalized type.
  // In contrast to isFAbsFree, vectors are accepted when their element type
  // is one of the supported scalar types.
  VT = VT.getScalarType();
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
}
973
// Vector constant stores are always treated as cheap; never scalarize them
// into per-element stores, regardless of element count or address space.
bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
                                                         unsigned NumElem,
                                                         unsigned AS) const {
  return true;
}
979
/// Prefer keeping a build_vector source over extracting its elements.
bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any vector
  // operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}
991
992bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
993 // Truncate is just accessing a subregister.
994
995 unsigned SrcSize = Source.getSizeInBits();
996 unsigned DestSize = Dest.getSizeInBits();
997
998 return DestSize < SrcSize && DestSize % 32 == 0 ;
999}
1000
1001bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
1002 // Truncate is just accessing a subregister.
1003
1004 unsigned SrcSize = Source->getScalarSizeInBits();
1005 unsigned DestSize = Dest->getScalarSizeInBits();
1006
1007 if (DestSize== 16 && Subtarget->has16BitInsts())
1008 return SrcSize >= 32;
1009
1010 return DestSize < SrcSize && DestSize % 32 == 0;
1011}
1012
1013bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
1014 unsigned SrcSize = Src->getScalarSizeInBits();
1015 unsigned DestSize = Dest->getScalarSizeInBits();
1016
1017 if (SrcSize == 16 && Subtarget->has16BitInsts())
1018 return DestSize >= 32;
1019
1020 return SrcSize == 32 && DestSize == 64;
1021}
1022
1023bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
1024 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1025 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1026 // this will enable reducing 64-bit operations the 32-bit, which is always
1027 // good.
1028
1029 if (Src == MVT::i16)
1030 return Dest == MVT::i32 ||Dest == MVT::i64 ;
1031
1032 return Src == MVT::i32 && Dest == MVT::i64;
1033}
1034
/// Return true if narrowing operation \p N from \p SrcVT to \p DestVT is
/// expected to be profitable.
bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
                                                 EVT DestVT) const {
  switch (N->getOpcode()) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::MUL:
  case ISD::SETCC:
  case ISD::SELECT:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:
    if (isTypeLegal(VT: MVT::i16) &&
        (!DestVT.isVector() ||
         !isOperationLegal(Op: ISD::ADD, VT: MVT::v2i16))) { // Check if VOP3P
      // Don't narrow back down to i16 if promoted to i32 already.
      if (!N->isDivergent() && DestVT.isInteger() &&
          DestVT.getScalarSizeInBits() > 1 &&
          DestVT.getScalarSizeInBits() <= 16 &&
          SrcVT.getScalarSizeInBits() > 16) {
        return false;
      }
    }
    return true;
  default:
    break;
  }

  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  if (isa<LoadSDNode>(Val: N))
    return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;

  return true;
}
1080
/// Decide whether DAGCombiner may commute the shift \p N with its operand's
/// operation. Declines when doing so would destroy a BFE pattern or an
/// or(shl(zextload), zextload) merge opportunity.
bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
    const SDNode* N, CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");

  // Only commute when the shifted value has a single use (and, through a
  // sign_extend, a single use of the extended value as well).
  SDValue ShiftLHS = N->getOperand(Num: 0);
  if (!ShiftLHS->hasOneUse())
    return false;

  if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
      !ShiftLHS.getOperand(i: 0)->hasOneUse())
    return false;

  // Always commute pre-type legalization and right shifts.
  // We're looking for shl(or(x,y),z) patterns.
  if (Level < CombineLevel::AfterLegalizeTypes ||
      N->getOpcode() != ISD::SHL || N->getOperand(Num: 0).getOpcode() != ISD::OR)
    return true;

  // If only user is a i32 right-shift, then don't destroy a BFE pattern.
  if (N->getValueType(ResNo: 0) == MVT::i32 && N->hasOneUse() &&
      (N->user_begin()->getOpcode() == ISD::SRA ||
       N->user_begin()->getOpcode() == ISD::SRL))
    return false;

  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
    // Matches shl(zextload, width-of-load) combined (via or) with another
    // zextload, i.e. two loads being packed into one register.
    if (LHS.getOpcode() != ISD::SHL)
      return false;
    auto *RHSLd = dyn_cast<LoadSDNode>(Val&: RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(Val: LHS.getOperand(i: 0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
           RHSLd->getExtensionType() == ISD::ZEXTLOAD;
  };
  SDValue LHS = N->getOperand(Num: 0).getOperand(i: 0);
  SDValue RHS = N->getOperand(Num: 0).getOperand(i: 1);
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
}
1122
1123//===---------------------------------------------------------------------===//
1124// TargetLowering Callbacks
1125//===---------------------------------------------------------------------===//
1126
/// Select the calling-convention assignment function used for the arguments
/// of an outgoing call with convention \p CC.
CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  // Shader stage entry points share one assignment table.
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CC_AMDGPU_CS_CHAIN;
  // Ordinary (non-kernel) functions.
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
  case CallingConv::AMDGPU_Gfx_WholeWave:
    return CC_SI_Gfx;
  // Kernels cannot be called, so there is no assignment function for them.
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    reportFatalUsageError(reason: "unsupported calling convention for call");
  }
}
1154
/// Select the calling-convention assignment function used for the return
/// values of a function with convention \p CC.
CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  // Shader stage entry points share one return-value table.
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
  case CallingConv::AMDGPU_Gfx_WholeWave:
    return RetCC_SI_Gfx;
  // Ordinary (non-kernel) functions.
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    reportFatalUsageError(reason: "unsupported calling convention");
  }
}
1182
1183/// The SelectionDAGBuilder will automatically promote function arguments
1184/// with illegal types. However, this does not work for the AMDGPU targets
1185/// since the function arguments are stored in memory as these illegal types.
1186/// In order to handle this properly we need to get the original types sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
1188/// passing them to AnalyzeFormalArguments()
1189
1190/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1191/// input values across multiple registers. Each item in the Ins array
1192/// represents a single value that will be stored in registers. Ins[x].VT is
1193/// the value type of the value that will be stored in the register, so
1194/// whatever SDNode we lower the argument to needs to be this type.
1195///
1196/// In order to correctly lower the arguments we need to know the size of each
1197/// argument. Since Ins[x].VT gives us the size of the register that will
1198/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1199/// for the original function argument so that we can deduce the correct memory
1200/// type to use for Ins[x]. In most cases the correct memory type will be
1201/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1202/// we have a kernel argument of type v8i8, this argument will be split into
1203/// 8 parts and each part will be represented by its own item in the Ins array.
1204/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1205/// the argument before it was split. From this, we deduce that the memory type
1206/// for each individual part is i8. We pass the memory type as LocVT to the
1207/// calling convention analysis function and the register type (Ins[x].VT) as
1208/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State,
    const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getContext();
  const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset();
  CallingConv::ID CC = Fn.getCallingConv();

  // Running in-memory layout state for the kernel argument segment.
  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getDataLayout();

  // Index into Ins; advanced once per register-sized part we assign.
  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    // byref arguments are laid out with the pointee type, not the pointer.
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        Alignment: IsByRef ? Arg.getParamAlign() : std::nullopt, Ty: MemArgTy);
    MaxAlign = std::max(a: Alignment, b: MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(Ty: MemArgTy);

    // ArgOffset is the absolute offset of this argument; ExplicitArgOffset
    // then advances past it for the next iteration.
    uint64_t ArgOffset = alignTo(Size: ExplicitArgOffset, A: Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(Size: ExplicitArgOffset, A: Alignment) + AllocSize;

    // We're basically throwing away everything passed into us and starting over
    // to get accurate in-memory offsets. The "PartOffset" is completely useless
    // to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(TLI: *this, DL, Ty: BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
                    FixedOffsets: &Offsets, StartingOffset: ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Context&: Ctx, CC, VT: ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Context&: Ctx, CC, VT: ArgVT);

      // Deduce the memory type (MemVT) each register-sized part was stored
      // with, based on how legalization split ArgVT across NumRegs registers.
      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(Context&: State.getContext(), BitWidth: MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different elements size.
          EVT ScalarVT = EVT::getIntegerVT(Context&: State.getContext(),
                                           BitWidth: MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(Context&: State.getContext(), VT: ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 argument.
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        MemVT = MemVT.getPow2VectorType(Context&: State.getContext());
      } else if (!MemVT.isSimple() && !MemVT.isVector()) {
        MemVT = MemVT.getRoundIntegerType(Context&: State.getContext());
      }

      // Emit one location per register part, carrying the register type as
      // ValVT and the deduced memory type as LocVT.
      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(V: CCValAssign::getCustomMem(ValNo: InIndex++, ValVT: RegisterVT,
                                                Offset: BasePartOffset + PartOffset,
                                                LocVT: MemVT.getSimpleVT(),
                                                HTP: CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}
1322
/// Returns at this level have no meaningful values to pass back; lower them
/// to an ENDPGM terminator chained on \p Chain.
SDValue AMDGPUTargetLowering::LowerReturn(
  SDValue Chain, CallingConv::ID CallConv,
  bool isVarArg,
  const SmallVectorImpl<ISD::OutputArg> &Outs,
  const SmallVectorImpl<SDValue> &OutVals,
  const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  // "wave terminate should not have return values");
  return DAG.getNode(Opcode: AMDGPUISD::ENDPGM, DL, VT: MVT::Other, Operand: Chain);
}
1334
1335//===---------------------------------------------------------------------===//
1336// Target specific lowering
1337//===---------------------------------------------------------------------===//
1338
/// Selects the correct CCAssignFn for a given CallingConvention value.
/// Thin forwarder to the shared AMDGPUCallLowering implementation.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}
1344
/// Selects the CCAssignFn for return values; forwards to AMDGPUCallLowering.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}
1349
/// Build a TokenFactor tying \p Chain to every load of a stack argument slot
/// that overlaps the clobbered frame index \p ClobberedFI, so those loads
/// cannot be reordered past a store into that slot.
SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  // Byte interval [FirstByte, LastByte] occupied by the clobbered object.
  int64_t FirstByte = MFI.getObjectOffset(ObjectIdx: ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ObjectIdx: ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Elt: Chain);

  // Add a chain value for each stack argument corresponding
  for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(Val: U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: L->getBasePtr())) {
        // Negative indices are fixed objects, i.e. incoming stack arguments.
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(ObjectIdx: FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(ObjectIdx: FI->getIndex()) - 1;

          // Chain in any load whose byte range overlaps the clobbered range.
          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(Elt: SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ArgChains);
}
1383
/// Diagnose an unsupported call with \p Reason and produce a harmless
/// substitute: poison in-values and an empty call sequence so that lowering
/// can continue past the error.
SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  // Extract a callee name for the diagnostic when one is available.
  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Val&: Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee))
    FuncName = G->getGlobal()->getName();

  DAG.getContext()->diagnose(
      DI: DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));

  // Satisfy the expected return values with poison.
  if (!CLI.IsTailCall) {
    for (ISD::InputArg &Arg : CLI.Ins)
      InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
  }

  // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
  if (getTargetMachine().getTargetTriple().getArch() == Triple::r600)
    return CLI.Chain;

  // Emit an empty CALLSEQ_START/CALLSEQ_END pair to keep the DAG well-formed.
  SDValue Chain = DAG.getCALLSEQ_START(Chain: CLI.Chain, InSize: 0, OutSize: 0, DL: CLI.DL);
  return DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, /*InGlue=*/Glue: SDValue(), DL: CLI.DL);
}
1414
// At this level all calls are reported as unsupported; subclasses that can
// lower calls override this.
SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, Reason: "unsupported call to function ");
}
1419
/// Dynamic stack allocation is unsupported here: emit a diagnostic and return
/// a zero pointer plus the incoming chain so lowering can proceed.
SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
      Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
  // Result 0: the "allocated" pointer (0); result 1: the chain (operand 0).
  auto Ops = {DAG.getConstant(Val: 0, DL: SDLoc(), VT: Op.getValueType()), Op.getOperand(i: 0)};
  return DAG.getMergeValues(Ops, dl: SDLoc());
}
1429
/// Dispatch operations marked Custom to their specific lowering routines.
SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    // Reaching here means an operation was marked Custom without a handler.
    Op->print(OS&: errs(), G: &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM:
    return LowerSDIVREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUNDEVEN:
    return LowerFROUNDEVEN(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG2:
    return LowerFLOG2(Op, DAG);
  case ISD::FLOG:
  case ISD::FLOG10:
    return LowerFLOGCommon(Op, DAG);
  case ISD::FEXP:
  case ISD::FEXP10:
    return lowerFEXP(Op, DAG);
  case ISD::FEXP2:
    return lowerFEXP2(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::FP_TO_SINT_SAT:
  case ISD::FP_TO_UINT_SAT:
    return LowerFP_TO_INT_SAT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}
1480
/// Provide replacement values for custom-lowered nodes whose result type is
/// illegal; an empty Results vector means "no replacement".
void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  case ISD::FLOG2:
    if (SDValue Lowered = LowerFLOG2(Op: SDValue(N, 0), DAG))
      Results.push_back(Elt: Lowered);
    return;
  case ISD::FLOG:
  case ISD::FLOG10:
    if (SDValue Lowered = LowerFLOGCommon(Op: SDValue(N, 0), DAG))
      Results.push_back(Elt: Lowered);
    return;
  case ISD::FEXP2:
    if (SDValue Lowered = lowerFEXP2(Op: SDValue(N, 0), DAG))
      Results.push_back(Elt: Lowered);
    return;
  case ISD::FEXP:
  case ISD::FEXP10:
    if (SDValue Lowered = lowerFEXP(Op: SDValue(N, 0), DAG))
      Results.push_back(Elt: Lowered);
    return;
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    if (auto Lowered = lowerCTLZResults(Op: SDValue(N, 0u), DAG))
      Results.push_back(Elt: Lowered);
    return;
  default:
    return;
  }
}
1520
// Lower a GlobalAddress node. This routine only produces a value for LDS
// (LOCAL/REGION address space) globals: absolute-addressed LDS variables
// (including named barriers) fold to their assigned constant address, and
// other LDS globals are allocated an offset here. For any other address
// space it returns a null SDValue so the caller can handle it.
SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Val&: Op);
  const GlobalValue *GV = G->getGlobal();

  if (!MFI->isModuleEntryFunction()) {
    // NOTE(review): this cast assumes the global is a GlobalVariable here —
    // confirm non-variable globals cannot reach this path.
    auto IsNamedBarrier = AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV));
    if (std::optional<uint32_t> Address =
            AMDGPUMachineFunction::getLDSAbsoluteAddress(GV: *GV)) {
      if (IsNamedBarrier) {
        // Divide the global's size by 16 to get the barrier count —
        // presumably each named barrier occupies 16 bytes; verify against
        // the named-barrier ABI.
        unsigned BarCnt = cast<GlobalVariable>(Val: GV)->getGlobalSize(DL) / 16;
        MFI->recordNumNamedBarriers(GVAddr: Address.value(), BarCnt);
      }
      // Absolute-addressed LDS globals lower directly to their address.
      return DAG.getConstant(Val: *Address, DL: SDLoc(Op), VT: Op.getValueType());
    } else if (IsNamedBarrier) {
      llvm_unreachable("named barrier should have an assigned address");
    }
  }

  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
      G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        GV->getName() != "llvm.amdgcn.module.lds" &&
        !AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV))) {
      SDLoc DL(Op);
      const Function &Fn = DAG.getMachineFunction().getFunction();
      // Diagnose as a warning rather than an error; see the comment below.
      DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
          Fn, "local memory global used by non-kernel function",
          DL.getDebugLoc(), DS_Warning));

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      SDValue Trap = DAG.getNode(Opcode: ISD::TRAP, DL, VT: MVT::Other, Operand: DAG.getEntryNode());
      // Chain the trap into the DAG root so it is not dead-code eliminated.
      SDValue OutputChain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other,
                                        N1: Trap, N2: DAG.getRoot());
      DAG.setRoot(OutputChain);
      return DAG.getPOISON(VT: Op.getValueType());
    }

    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with an non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    unsigned Offset = MFI->allocateLDSGlobal(DL, GV: *cast<GlobalVariable>(Val: GV));
    return DAG.getConstant(Val: Offset, DL: SDLoc(Op), VT: Op.getValueType());
  }
  return SDValue();
}
1578
// Lower CONCAT_VECTORS by decomposing the operands into scalar elements and
// rebuilding with BUILD_VECTOR. For sub-32-bit element types whose operands
// are a multiple of 32 bits, the operands are first bitcast to i32 (or i32
// vectors) so the concat is built out of full 32-bit registers.
SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;
  SDLoc SL(Op);

  EVT VT = Op.getValueType();
  if (VT.getVectorElementType().getSizeInBits() < 32) {
    unsigned OpBitSize = Op.getOperand(i: 0).getValueType().getSizeInBits();
    if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
      // Reinterpret each operand as i32 (single dword) or <N x i32>.
      unsigned NewNumElt = OpBitSize / 32;
      EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
                                      : EVT::getVectorVT(Context&: *DAG.getContext(),
                                                         VT: MVT::i32, NumElements: NewNumElt);
      for (const SDUse &U : Op->ops()) {
        SDValue In = U.get();
        SDValue NewIn = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewEltVT, Operand: In);
        if (NewNumElt > 1)
          DAG.ExtractVectorElements(Op: NewIn, Args);
        else
          Args.push_back(Elt: NewIn);
      }

      // Build an i32 vector covering all operands, then cast back to the
      // requested result type.
      EVT NewVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
                                   NumElements: NewNumElt * Op.getNumOperands());
      SDValue BV = DAG.getBuildVector(VT: NewVT, DL: SL, Ops: Args);
      return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: BV);
    }
  }

  // Generic path: gather every element of every operand and rebuild.
  for (const SDUse &U : Op->ops())
    DAG.ExtractVectorElements(Op: U.get(), Args);

  return DAG.getBuildVector(VT: Op.getValueType(), DL: SL, Ops: Args);
}
1613
// Lower EXTRACT_SUBVECTOR by extracting elements and rebuilding. When the
// element type is 16 bits and the start index is even, the extraction is
// done in terms of 32-bit registers (pairs of 16-bit elements) instead.
SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SmallVector<SDValue, 8> Args;
  unsigned Start = Op.getConstantOperandVal(i: 1);
  EVT VT = Op.getValueType();
  EVT SrcVT = Op.getOperand(i: 0).getValueType();

  if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
    unsigned NumElt = VT.getVectorNumElements();
    unsigned NumSrcElt = SrcVT.getVectorNumElements();
    assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");

    // Extract 32-bit registers at a time.
    EVT NewSrcVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: NumSrcElt / 2);
    EVT NewVT = NumElt == 2
                    ? MVT::i32
                    : EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: NumElt / 2);
    SDValue Tmp = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewSrcVT, Operand: Op.getOperand(i: 0));

    // Indices and counts are halved since each i32 holds two 16-bit elements.
    DAG.ExtractVectorElements(Op: Tmp, Args, Start: Start / 2, Count: NumElt / 2);
    if (NumElt == 2)
      Tmp = Args[0];
    else
      Tmp = DAG.getBuildVector(VT: NewVT, DL: SL, Ops: Args);

    return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Tmp);
  }

  // Generic path: extract the requested element range and rebuild.
  DAG.ExtractVectorElements(Op: Op.getOperand(i: 0), Args, Start,
                            Count: VT.getVectorNumElements());

  return DAG.getBuildVector(VT: Op.getValueType(), DL: SL, Ops: Args);
}
1648
1649// TODO: Handle fabs too
1650static SDValue peekFNeg(SDValue Val) {
1651 if (Val.getOpcode() == ISD::FNEG)
1652 return Val.getOperand(i: 0);
1653
1654 return Val;
1655}
1656
1657static SDValue peekFPSignOps(SDValue Val) {
1658 if (Val.getOpcode() == ISD::FNEG)
1659 Val = Val.getOperand(i: 0);
1660 if (Val.getOpcode() == ISD::FABS)
1661 Val = Val.getOperand(i: 0);
1662 if (Val.getOpcode() == ISD::FCOPYSIGN)
1663 Val = Val.getOperand(i: 0);
1664 return Val;
1665}
1666
// Try to fold a select-of-compare into FMIN_LEGACY/FMAX_LEGACY based on the
// condition code. \p LHS/\p RHS are the compare operands, \p True/\p False
// the select values, \p CC the condition code node. Returns a null SDValue
// for condition codes that are not handled (equality/ordered-only codes) or
// when the fold must wait until after legalization.
SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
    const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
    SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val&: CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    // Equality and constant-result codes don't map to a min/max.
    break;
  case ISD::SETULE:
  case ISD::SETULT: {
    // Unordered less-than: operand order encodes which value wins on NaN.
    if (LHS == True)
      return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: RHS, N2: LHS);
    return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: LHS, N2: RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: LHS, N2: RHS);
    return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: RHS, N2: LHS);
  }
  case ISD::SETUGE:
  case ISD::SETUGT: {
    // Unordered greater-than: mirror of the SETULE/SETULT case.
    if (LHS == True)
      return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: RHS, N2: LHS);
    return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: LHS, N2: RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    // Ordered greater-than: same legalization-stage restriction as above.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: LHS, N2: RHS);
    return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: RHS, N2: LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}
1734
/// Generate Min/Max node from a select-of-compare. When the select operands
/// match the compare operands directly, defer to combineFMinMaxLegacyImpl;
/// otherwise try to undo a previously folded fneg so the pattern matches,
/// producing fneg(fmin/fmax). Returns a null SDValue when no fold applies.
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
    return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);

  SelectionDAG &DAG = DCI.DAG;

  // If we can't directly match this, try to see if we can fold an fneg to
  // match.

  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
  ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(Val&: False);
  SDValue NegTrue = peekFNeg(Val: True);

  // Undo the combine foldFreeOpFromSelect does if it helps us match the
  // fmin/fmax.
  //
  // select (fcmp olt (lhs, K)), (fneg lhs), -K
  // -> fneg (fmin_legacy lhs, K)
  //
  // TODO: Use getNegatedExpression
  if (LHS == NegTrue && CFalse && CRHS) {
    APFloat NegRHS = neg(X: CRHS->getValueAPF());
    if (NegRHS == CFalse->getValueAPF()) {
      // Combine with the un-negated true value, then re-apply the fneg on
      // top of the resulting min/max.
      SDValue Combined =
          combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True: NegTrue, False, CC, DCI);
      if (Combined)
        return DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: Combined);
      return SDValue();
    }
  }

  return SDValue();
}
1773
1774std::pair<SDValue, SDValue>
1775AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1776 SDLoc SL(Op);
1777
1778 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1779
1780 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
1781 const SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
1782
1783 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: Zero);
1784 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: One);
1785
1786 return std::pair(Lo, Hi);
1787}
1788
1789SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1790 SDLoc SL(Op);
1791
1792 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1793 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
1794 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: Zero);
1795}
1796
1797SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1798 SDLoc SL(Op);
1799
1800 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1801 const SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
1802 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: One);
1803}
1804
1805// Split a vector type into two parts. The first part is a power of two vector.
1806// The second part is whatever is left over, and is a scalar if it would
1807// otherwise be a 1-vector.
1808std::pair<EVT, EVT>
1809AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1810 EVT LoVT, HiVT;
1811 EVT EltVT = VT.getVectorElementType();
1812 unsigned NumElts = VT.getVectorNumElements();
1813 unsigned LoNumElts = PowerOf2Ceil(A: (NumElts + 1) / 2);
1814 LoVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: LoNumElts);
1815 HiVT = NumElts - LoNumElts == 1
1816 ? EltVT
1817 : EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NumElts - LoNumElts);
1818 return std::pair(LoVT, HiVT);
1819}
1820
// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
// scalar.
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
                                  const EVT &LoVT, const EVT &HiVT,
                                  SelectionDAG &DAG) const {
  EVT VT = N.getValueType();
  assert(LoVT.getVectorNumElements() +
                 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
             VT.getVectorNumElements() &&
         "More vector elements requested than available!");
  // Low part always starts at element 0.
  SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: LoVT, N1: N,
                           N2: DAG.getVectorIdxConstant(Val: 0, DL));

  unsigned LoNumElts = LoVT.getVectorNumElements();

  if (HiVT.isVector()) {
    unsigned HiNumElts = HiVT.getVectorNumElements();
    if ((VT.getVectorNumElements() % HiNumElts) == 0) {
      // Avoid creating an extract_subvector with an index that isn't a multiple
      // of the result type.
      SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HiVT, N1: N,
                               N2: DAG.getConstant(Val: LoNumElts, DL, VT: MVT::i32));
      return {Lo, Hi};
    }

    // Misaligned index: fall back to element-wise extraction + build_vector.
    SmallVector<SDValue, 8> Elts;
    DAG.ExtractVectorElements(Op: N, Args&: Elts, /*Start=*/LoNumElts,
                              /*Count=*/HiNumElts);
    SDValue Hi = DAG.getBuildVector(VT: HiVT, DL, Ops: Elts);
    return {Lo, Hi};
  }

  // Scalar high part: extract the single remaining element.
  SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: HiVT, N1: N,
                           N2: DAG.getVectorIdxConstant(Val: LoNumElts, DL));
  return {Lo, Hi};
}
1858
// Split a vector load into two loads of the types produced by
// getSplitDestVTs, then rejoin the pieces and merge the two output chains.
// Two-element vectors are scalarized instead of split.
SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Val: Op);
  EVT VT = Op.getValueType();
  SDLoc SL(Op);


  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2) {
    SDValue Ops[2];
    std::tie(args&: Ops[0], args&: Ops[1]) = scalarizeVectorLoad(LD: Load, DAG);
    return DAG.getMergeValues(Ops, dl: SL);
  }

  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  // Split both the register type and the memory type consistently.
  std::tie(args&: LoVT, args&: HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(args&: LoMemVT, args&: HiMemVT) = getSplitDestVTs(VT: MemVT, DAG);
  std::tie(args&: Lo, args&: Hi) = splitVector(N: Op, DL: SL, LoVT, HiVT, DAG);

  // The high load's alignment is the base alignment reduced by the byte
  // offset of the low part.
  unsigned Size = LoMemVT.getStoreSize();
  Align BaseAlign = Load->getAlign();
  Align HiAlign = commonAlignment(A: BaseAlign, Offset: Size);

  SDValue LoLoad = DAG.getExtLoad(
      ExtType: Load->getExtensionType(), dl: SL, VT: LoVT, Chain: Load->getChain(), Ptr: BasePtr, PtrInfo: SrcValue,
      MemVT: LoMemVT, Alignment: BaseAlign, MMOFlags: Load->getMemOperand()->getFlags(), AAInfo: Load->getAAInfo());
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Size));
  SDValue HiLoad = DAG.getExtLoad(
      ExtType: Load->getExtensionType(), dl: SL, VT: HiVT, Chain: Load->getChain(), Ptr: HiPtr,
      PtrInfo: SrcValue.getWithOffset(O: LoMemVT.getStoreSize()), MemVT: HiMemVT, Alignment: HiAlign,
      MMOFlags: Load->getMemOperand()->getFlags(), AAInfo: Load->getAAInfo());

  SDValue Join;
  if (LoVT == HiVT) {
    // This is the case that the vector is power of two so was evenly split.
    Join = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT, N1: LoLoad, N2: HiLoad);
  } else {
    // Uneven split: insert the pieces into a poison vector; the high part may
    // be a subvector or a single scalar element.
    Join = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: SL, VT, N1: DAG.getPOISON(VT), N2: LoLoad,
                       N3: DAG.getVectorIdxConstant(Val: 0, DL: SL));
    Join = DAG.getNode(
        Opcode: HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, DL: SL,
        VT, N1: Join, N2: HiLoad,
        N3: DAG.getVectorIdxConstant(Val: LoVT.getVectorNumElements(), DL: SL));
  }

  // Merge the two load chains so neither load can be dropped.
  SDValue Ops[] = {Join, DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other,
                                     N1: LoLoad.getValue(R: 1), N2: HiLoad.getValue(R: 1))};

  return DAG.getMergeValues(Ops, dl: SL);
}
1918
// For a 3-element vector load that is sufficiently aligned (>= 8 bytes) or
// 16-byte dereferenceable, widen it to a 4-element load and extract the
// first three elements; otherwise split the load.
SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
                                                     SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
  EVT VT = Op.getValueType();
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);
  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
  Align BaseAlign = Load->getAlign();
  unsigned NumElements = MemVT.getVectorNumElements();

  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
  if (NumElements != 3 ||
      (BaseAlign < Align(8) &&
       !SrcValue.isDereferenceable(Size: 16, C&: *DAG.getContext(), DL: DAG.getDataLayout())))
    return SplitVectorLoad(Op, DAG);

  assert(NumElements == 3);

  // Widen both the register and memory types to 4 elements.
  EVT WideVT =
      EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: 4);
  EVT WideMemVT =
      EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(), NumElements: 4);
  SDValue WideLoad = DAG.getExtLoad(
      ExtType: Load->getExtensionType(), dl: SL, VT: WideVT, Chain: Load->getChain(), Ptr: BasePtr, PtrInfo: SrcValue,
      MemVT: WideMemVT, Alignment: BaseAlign, MMOFlags: Load->getMemOperand()->getFlags());
  // Return the vec3 slice of the wide result plus the load's chain.
  return DAG.getMergeValues(
      Ops: {DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT, N1: WideLoad,
                   N2: DAG.getVectorIdxConstant(Val: 0, DL: SL)),
       WideLoad.getValue(R: 1)},
      dl: SL);
}
1952
// Split a vector store into two stores of the types produced by
// getSplitDestVTs and return the token factor of both chains. Two-element
// vectors are scalarized instead of split.
SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
  SDValue Val = Store->getValue();
  EVT VT = Val.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return scalarizeVectorStore(ST: Store, DAG);

  EVT MemVT = Store->getMemoryVT();
  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  SDLoc SL(Op);

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  // Split both the register type and the memory type consistently, then
  // split the stored value itself.
  std::tie(args&: LoVT, args&: HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(args&: LoMemVT, args&: HiMemVT) = getSplitDestVTs(VT: MemVT, DAG);
  std::tie(args&: Lo, args&: Hi) = splitVector(N: Val, DL: SL, LoVT, HiVT, DAG);

  // High store address is offset by the low part's in-memory size.
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: LoMemVT.getStoreSize());

  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
  Align BaseAlign = Store->getAlign();
  unsigned Size = LoMemVT.getStoreSize();
  // Alignment of the high store is the base alignment reduced by the offset.
  Align HiAlign = commonAlignment(A: BaseAlign, Offset: Size);

  SDValue LoStore =
      DAG.getTruncStore(Chain, dl: SL, Val: Lo, Ptr: BasePtr, PtrInfo: SrcValue, SVT: LoMemVT, Alignment: BaseAlign,
                        MMOFlags: Store->getMemOperand()->getFlags(), AAInfo: Store->getAAInfo());
  SDValue HiStore = DAG.getTruncStore(
      Chain, dl: SL, Val: Hi, Ptr: HiPtr, PtrInfo: SrcValue.getWithOffset(O: Size), SVT: HiMemVT, Alignment: HiAlign,
      MMOFlags: Store->getMemOperand()->getFlags(), AAInfo: Store->getAAInfo());

  return DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: LoStore, N2: HiStore);
}
1993
// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The fractional part of a
// float is enough to accurately represent up to a 24-bit signed integer.
//
// Lowers a DIVREM node whose operands are known to be narrow enough (at
// least 9 sign/leading-zero bits) via f32 arithmetic. \p Sign selects
// signed vs. unsigned semantics. Returns merged {Div, Rem}, or a null
// SDValue when the operands may be too wide for the f32 trick.
SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
                                            bool Sign) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(i: 0);
  SDValue RHS = Op.getOperand(i: 1);
  MVT IntVT = MVT::i32;
  MVT FltVT = MVT::f32;

  // Bail out unless both operands provably fit in the f32-representable
  // range (fewer than 9 known sign bits means the value may be too wide).
  unsigned LHSSignBits = DAG.ComputeNumSignBits(Op: LHS);
  if (LHSSignBits < 9)
    return SDValue();

  unsigned RHSSignBits = DAG.ComputeNumSignBits(Op: RHS);
  if (RHSSignBits < 9)
    return SDValue();

  unsigned BitSize = VT.getSizeInBits();
  unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
  unsigned DivBits = BitSize - SignBits;
  if (Sign)
    ++DivBits;

  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;

  // jq is the rounding-correction term added to the truncated quotient;
  // for unsigned division it is always +1 (gated by cv below).
  SDValue jq = DAG.getConstant(Val: 1, DL, VT: IntVT);

  if (Sign) {
    // char|short jq = ia ^ ib;
    jq = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHS, N2: RHS);

    // jq = jq >> (bitsize - 2)
    jq = DAG.getNode(Opcode: ISD::SRA, DL, VT, N1: jq,
                     N2: DAG.getConstant(Val: BitSize - 2, DL, VT));

    // jq = jq | 0x1
    // Result is +1 or -1 depending on the sign of the quotient.
    jq = DAG.getNode(Opcode: ISD::OR, DL, VT, N1: jq, N2: DAG.getConstant(Val: 1, DL, VT));
  }

  // int ia = (int)LHS;
  SDValue ia = LHS;

  // int ib, (int)RHS;
  SDValue ib = RHS;

  // float fa = (float)ia;
  SDValue fa = DAG.getNode(Opcode: ToFp, DL, VT: FltVT, Operand: ia);

  // float fb = (float)ib;
  SDValue fb = DAG.getNode(Opcode: ToFp, DL, VT: FltVT, Operand: ib);

  // Approximate quotient: fa * rcp(fb).
  SDValue fq = DAG.getNode(Opcode: ISD::FMUL, DL, VT: FltVT,
                           N1: fa, N2: DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT: FltVT, Operand: fb));

  // fq = trunc(fq);
  fq = DAG.getNode(Opcode: ISD::FTRUNC, DL, VT: FltVT, Operand: fq);

  // float fqneg = -fq;
  SDValue fqneg = DAG.getNode(Opcode: ISD::FNEG, DL, VT: FltVT, Operand: fq);

  MachineFunction &MF = DAG.getMachineFunction();

  // Select the fused-multiply-add flavor based on subtarget capabilities
  // and the function's FP32 denormal mode.
  bool UseFmadFtz = false;
  if (Subtarget->isGCN()) {
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    UseFmadFtz =
        MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
  }

  // float fr = mad(fqneg, fb, fa);
  unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
                    : UseFmadFtz                    ? (unsigned)AMDGPUISD::FMAD_FTZ
                                                    : (unsigned)ISD::FMAD;
  SDValue fr = DAG.getNode(Opcode: OpCode, DL, VT: FltVT, N1: fqneg, N2: fb, N3: fa);

  // int iq = (int)fq;
  SDValue iq = DAG.getNode(Opcode: ToInt, DL, VT: IntVT, Operand: fq);

  // fr = fabs(fr);
  fr = DAG.getNode(Opcode: ISD::FABS, DL, VT: FltVT, Operand: fr);

  // fb = fabs(fb);
  fb = DAG.getNode(Opcode: ISD::FABS, DL, VT: FltVT, Operand: fb);

  EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);

  // int cv = fr >= fb;
  SDValue cv = DAG.getSetCC(DL, VT: SetCCVT, LHS: fr, RHS: fb, Cond: ISD::SETOGE);

  // jq = (cv ? jq : 0);
  SDValue Rem needs no correction when the residual is small.
  jq = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: cv, N2: jq, N3: DAG.getConstant(Val: 0, DL, VT));

  // dst = iq + jq;
  SDValue Div = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: iq, N2: jq);

  // Rem needs compensation, it's easier to recompute it
  SDValue Rem = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Div, N2: RHS);
  Rem = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: LHS, N2: Rem);

  // Truncate to number of bits this divide really is.
  if (Sign) {
    SDValue InRegSize
      = DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: DivBits));
    Div = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Div, N2: InRegSize);
    Rem = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Rem, N2: InRegSize);
  } else {
    SDValue TruncMask = DAG.getConstant(Val: (UINT64_C(1) << DivBits) - 1, DL, VT);
    Div = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Div, N2: TruncMask);
    Rem = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Rem, N2: TruncMask);
  }

  return DAG.getMergeValues(Ops: { Div, Rem }, dl: DL);
}
2111
2112void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2113 SelectionDAG &DAG,
2114 SmallVectorImpl<SDValue> &Results) const {
2115 SDLoc DL(Op);
2116 EVT VT = Op.getValueType();
2117
2118 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2119
2120 EVT HalfVT = VT.getHalfSizedIntegerVT(Context&: *DAG.getContext());
2121
2122 SDValue One = DAG.getConstant(Val: 1, DL, VT: HalfVT);
2123 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: HalfVT);
2124
2125 //HiLo split
2126 SDValue LHS_Lo, LHS_Hi;
2127 SDValue LHS = Op.getOperand(i: 0);
2128 std::tie(args&: LHS_Lo, args&: LHS_Hi) = DAG.SplitScalar(N: LHS, DL, LoVT: HalfVT, HiVT: HalfVT);
2129
2130 SDValue RHS_Lo, RHS_Hi;
2131 SDValue RHS = Op.getOperand(i: 1);
2132 std::tie(args&: RHS_Lo, args&: RHS_Hi) = DAG.SplitScalar(N: RHS, DL, LoVT: HalfVT, HiVT: HalfVT);
2133
2134 if (DAG.MaskedValueIsZero(Op: RHS, Mask: APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32)) &&
2135 DAG.MaskedValueIsZero(Op: LHS, Mask: APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32))) {
2136
2137 SDValue Res = DAG.getNode(Opcode: ISD::UDIVREM, DL, VTList: DAG.getVTList(VT1: HalfVT, VT2: HalfVT),
2138 N1: LHS_Lo, N2: RHS_Lo);
2139
2140 SDValue DIV = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Res.getValue(R: 0), Zero});
2141 SDValue REM = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Res.getValue(R: 1), Zero});
2142
2143 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: DIV));
2144 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: REM));
2145 return;
2146 }
2147
2148 if (isTypeLegal(VT: MVT::i64)) {
2149 // The algorithm here is based on ideas from "Software Integer Division",
2150 // Tom Rodeheffer, August 2008.
2151
2152 MachineFunction &MF = DAG.getMachineFunction();
2153 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2154
2155 // Compute denominator reciprocal.
2156 unsigned FMAD =
2157 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2158 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2159 ? (unsigned)ISD::FMAD
2160 : (unsigned)AMDGPUISD::FMAD_FTZ;
2161
2162 SDValue Cvt_Lo = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: MVT::f32, Operand: RHS_Lo);
2163 SDValue Cvt_Hi = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: MVT::f32, Operand: RHS_Hi);
2164 SDValue Mad1 = DAG.getNode(Opcode: FMAD, DL, VT: MVT::f32, N1: Cvt_Hi,
2165 N2: DAG.getConstantFP(Val: APInt(32, 0x4f800000).bitsToFloat(), DL, VT: MVT::f32),
2166 N3: Cvt_Lo);
2167 SDValue Rcp = DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT: MVT::f32, Operand: Mad1);
2168 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f32, N1: Rcp,
2169 N2: DAG.getConstantFP(Val: APInt(32, 0x5f7ffffc).bitsToFloat(), DL, VT: MVT::f32));
2170 SDValue Mul2 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f32, N1: Mul1,
2171 N2: DAG.getConstantFP(Val: APInt(32, 0x2f800000).bitsToFloat(), DL, VT: MVT::f32));
2172 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL, VT: MVT::f32, Operand: Mul2);
2173 SDValue Mad2 = DAG.getNode(Opcode: FMAD, DL, VT: MVT::f32, N1: Trunc,
2174 N2: DAG.getConstantFP(Val: APInt(32, 0xcf800000).bitsToFloat(), DL, VT: MVT::f32),
2175 N3: Mul1);
2176 SDValue Rcp_Lo = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL, VT: HalfVT, Operand: Mad2);
2177 SDValue Rcp_Hi = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL, VT: HalfVT, Operand: Trunc);
2178 SDValue Rcp64 = DAG.getBitcast(VT,
2179 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Rcp_Lo, Rcp_Hi}));
2180
2181 SDValue Zero64 = DAG.getConstant(Val: 0, DL, VT);
2182 SDValue One64 = DAG.getConstant(Val: 1, DL, VT);
2183 SDValue Zero1 = DAG.getConstant(Val: 0, DL, VT: MVT::i1);
2184 SDVTList HalfCarryVT = DAG.getVTList(VT1: HalfVT, VT2: MVT::i1);
2185
2186 // First round of UNR (Unsigned integer Newton-Raphson).
2187 SDValue Neg_RHS = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero64, N2: RHS);
2188 SDValue Mullo1 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Neg_RHS, N2: Rcp64);
2189 SDValue Mulhi1 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Rcp64, N2: Mullo1);
2190 SDValue Mulhi1_Lo, Mulhi1_Hi;
2191 std::tie(args&: Mulhi1_Lo, args&: Mulhi1_Hi) =
2192 DAG.SplitScalar(N: Mulhi1, DL, LoVT: HalfVT, HiVT: HalfVT);
2193 SDValue Add1_Lo = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Rcp_Lo,
2194 N2: Mulhi1_Lo, N3: Zero1);
2195 SDValue Add1_Hi = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Rcp_Hi,
2196 N2: Mulhi1_Hi, N3: Add1_Lo.getValue(R: 1));
2197 SDValue Add1 = DAG.getBitcast(VT,
2198 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Add1_Lo, Add1_Hi}));
2199
2200 // Second round of UNR.
2201 SDValue Mullo2 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Neg_RHS, N2: Add1);
2202 SDValue Mulhi2 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Add1, N2: Mullo2);
2203 SDValue Mulhi2_Lo, Mulhi2_Hi;
2204 std::tie(args&: Mulhi2_Lo, args&: Mulhi2_Hi) =
2205 DAG.SplitScalar(N: Mulhi2, DL, LoVT: HalfVT, HiVT: HalfVT);
2206 SDValue Add2_Lo = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Add1_Lo,
2207 N2: Mulhi2_Lo, N3: Zero1);
2208 SDValue Add2_Hi = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Add1_Hi,
2209 N2: Mulhi2_Hi, N3: Add2_Lo.getValue(R: 1));
2210 SDValue Add2 = DAG.getBitcast(VT,
2211 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Add2_Lo, Add2_Hi}));
2212
2213 SDValue Mulhi3 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: LHS, N2: Add2);
2214
2215 SDValue Mul3 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: RHS, N2: Mulhi3);
2216
2217 SDValue Mul3_Lo, Mul3_Hi;
2218 std::tie(args&: Mul3_Lo, args&: Mul3_Hi) = DAG.SplitScalar(N: Mul3, DL, LoVT: HalfVT, HiVT: HalfVT);
2219 SDValue Sub1_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: LHS_Lo,
2220 N2: Mul3_Lo, N3: Zero1);
2221 SDValue Sub1_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: LHS_Hi,
2222 N2: Mul3_Hi, N3: Sub1_Lo.getValue(R: 1));
2223 SDValue Sub1_Mi = DAG.getNode(Opcode: ISD::SUB, DL, VT: HalfVT, N1: LHS_Hi, N2: Mul3_Hi);
2224 SDValue Sub1 = DAG.getBitcast(VT,
2225 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub1_Lo, Sub1_Hi}));
2226
2227 SDValue MinusOne = DAG.getConstant(Val: 0xffffffffu, DL, VT: HalfVT);
2228 SDValue C1 = DAG.getSelectCC(DL, LHS: Sub1_Hi, RHS: RHS_Hi, True: MinusOne, False: Zero,
2229 Cond: ISD::SETUGE);
2230 SDValue C2 = DAG.getSelectCC(DL, LHS: Sub1_Lo, RHS: RHS_Lo, True: MinusOne, False: Zero,
2231 Cond: ISD::SETUGE);
2232 SDValue C3 = DAG.getSelectCC(DL, LHS: Sub1_Hi, RHS: RHS_Hi, True: C2, False: C1, Cond: ISD::SETEQ);
2233
2234 // TODO: Here and below portions of the code can be enclosed into if/endif.
2235 // Currently control flow is unconditional and we have 4 selects after
2236 // potential endif to substitute PHIs.
2237
2238 // if C3 != 0 ...
2239 SDValue Sub2_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub1_Lo,
2240 N2: RHS_Lo, N3: Zero1);
2241 SDValue Sub2_Mi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub1_Mi,
2242 N2: RHS_Hi, N3: Sub1_Lo.getValue(R: 1));
2243 SDValue Sub2_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Mi,
2244 N2: Zero, N3: Sub2_Lo.getValue(R: 1));
2245 SDValue Sub2 = DAG.getBitcast(VT,
2246 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub2_Lo, Sub2_Hi}));
2247
2248 SDValue Add3 = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Mulhi3, N2: One64);
2249
2250 SDValue C4 = DAG.getSelectCC(DL, LHS: Sub2_Hi, RHS: RHS_Hi, True: MinusOne, False: Zero,
2251 Cond: ISD::SETUGE);
2252 SDValue C5 = DAG.getSelectCC(DL, LHS: Sub2_Lo, RHS: RHS_Lo, True: MinusOne, False: Zero,
2253 Cond: ISD::SETUGE);
2254 SDValue C6 = DAG.getSelectCC(DL, LHS: Sub2_Hi, RHS: RHS_Hi, True: C5, False: C4, Cond: ISD::SETEQ);
2255
2256 // if (C6 != 0)
2257 SDValue Add4 = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Add3, N2: One64);
2258
2259 SDValue Sub3_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Lo,
2260 N2: RHS_Lo, N3: Zero1);
2261 SDValue Sub3_Mi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Mi,
2262 N2: RHS_Hi, N3: Sub2_Lo.getValue(R: 1));
2263 SDValue Sub3_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub3_Mi,
2264 N2: Zero, N3: Sub3_Lo.getValue(R: 1));
2265 SDValue Sub3 = DAG.getBitcast(VT,
2266 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub3_Lo, Sub3_Hi}));
2267
2268 // endif C6
2269 // endif C3
2270
2271 SDValue Sel1 = DAG.getSelectCC(DL, LHS: C6, RHS: Zero, True: Add4, False: Add3, Cond: ISD::SETNE);
2272 SDValue Div = DAG.getSelectCC(DL, LHS: C3, RHS: Zero, True: Sel1, False: Mulhi3, Cond: ISD::SETNE);
2273
2274 SDValue Sel2 = DAG.getSelectCC(DL, LHS: C6, RHS: Zero, True: Sub3, False: Sub2, Cond: ISD::SETNE);
2275 SDValue Rem = DAG.getSelectCC(DL, LHS: C3, RHS: Zero, True: Sel2, False: Sub1, Cond: ISD::SETNE);
2276
2277 Results.push_back(Elt: Div);
2278 Results.push_back(Elt: Rem);
2279
2280 return;
2281 }
2282
  // r600 expansion.
2284 // Get Speculative values
2285 SDValue DIV_Part = DAG.getNode(Opcode: ISD::UDIV, DL, VT: HalfVT, N1: LHS_Hi, N2: RHS_Lo);
2286 SDValue REM_Part = DAG.getNode(Opcode: ISD::UREM, DL, VT: HalfVT, N1: LHS_Hi, N2: RHS_Lo);
2287
2288 SDValue REM_Lo = DAG.getSelectCC(DL, LHS: RHS_Hi, RHS: Zero, True: REM_Part, False: LHS_Hi, Cond: ISD::SETEQ);
2289 SDValue REM = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {REM_Lo, Zero});
2290 REM = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: REM);
2291
2292 SDValue DIV_Hi = DAG.getSelectCC(DL, LHS: RHS_Hi, RHS: Zero, True: DIV_Part, False: Zero, Cond: ISD::SETEQ);
2293 SDValue DIV_Lo = Zero;
2294
2295 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2296
2297 for (unsigned i = 0; i < halfBitWidth; ++i) {
2298 const unsigned bitPos = halfBitWidth - i - 1;
2299 SDValue POS = DAG.getConstant(Val: bitPos, DL, VT: HalfVT);
2300 // Get value of high bit
2301 SDValue HBit = DAG.getNode(Opcode: ISD::SRL, DL, VT: HalfVT, N1: LHS_Lo, N2: POS);
2302 HBit = DAG.getNode(Opcode: ISD::AND, DL, VT: HalfVT, N1: HBit, N2: One);
2303 HBit = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: HBit);
2304
2305 // Shift
2306 REM = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: REM, N2: DAG.getConstant(Val: 1, DL, VT));
2307 // Add LHS high bit
2308 REM = DAG.getNode(Opcode: ISD::OR, DL, VT, N1: REM, N2: HBit);
2309
2310 SDValue BIT = DAG.getConstant(Val: 1ULL << bitPos, DL, VT: HalfVT);
2311 SDValue realBIT = DAG.getSelectCC(DL, LHS: REM, RHS, True: BIT, False: Zero, Cond: ISD::SETUGE);
2312
2313 DIV_Lo = DAG.getNode(Opcode: ISD::OR, DL, VT: HalfVT, N1: DIV_Lo, N2: realBIT);
2314
2315 // Update REM
2316 SDValue REM_sub = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: REM, N2: RHS);
2317 REM = DAG.getSelectCC(DL, LHS: REM, RHS, True: REM_sub, False: REM, Cond: ISD::SETUGE);
2318 }
2319
2320 SDValue DIV = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {DIV_Lo, DIV_Hi});
2321 DIV = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: DIV);
2322 Results.push_back(Elt: DIV);
2323 Results.push_back(Elt: REM);
2324}
2325
// Lower unsigned integer divide + remainder, producing both results.
// i64 goes through the dedicated 64-bit expansion; i32 first tries the
// 24-bit floating-point fast path, then falls back to refining the hardware
// 32-bit reciprocal estimate.
SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::i64) {
    SmallVector<SDValue, 2> Results;
    LowerUDIVREM64(Op, DAG, Results);
    return DAG.getMergeValues(Ops: Results, dl: DL);
  }

  if (VT == MVT::i32) {
    // Try the 24-bit fast path first.
    if (SDValue Res = LowerDIVREM24(Op, DAG, Sign: false))
      return Res;
  }

  SDValue X = Op.getOperand(i: 0);
  SDValue Y = Op.getOperand(i: 1);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  SDValue Z = DAG.getNode(Opcode: AMDGPUISD::URECIP, DL, VT, Operand: Y);

  // One round of UNR (unsigned Newton-Raphson) to refine the estimate:
  // z += mulhu(z, -y * z).
  SDValue NegY = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), N2: Y);
  SDValue NegYZ = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: NegY, N2: Z);
  Z = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Z,
                  N2: DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Z, N2: NegYZ));

  // Quotient/remainder estimate.
  SDValue Q = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: X, N2: Z);
  SDValue R =
      DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: X, N2: DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Q, N2: Y));

  // First quotient/remainder refinement: while R >= Y, bump the quotient and
  // reduce the remainder. Two rounds of this correction are applied.
  EVT CCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
  SDValue One = DAG.getConstant(Val: 1, DL, VT);
  SDValue Cond = DAG.getSetCC(DL, VT: CCVT, LHS: R, RHS: Y, Cond: ISD::SETUGE);
  Q = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
                  N2: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Q, N2: One), N3: Q);
  R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
                  N2: DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: R, N2: Y), N3: R);

  // Second quotient/remainder refinement.
  Cond = DAG.getSetCC(DL, VT: CCVT, LHS: R, RHS: Y, Cond: ISD::SETUGE);
  Q = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
                  N2: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Q, N2: One), N3: Q);
  R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
                  N2: DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: R, N2: Y), N3: R);

  return DAG.getMergeValues(Ops: {Q, R}, dl: DL);
}
2380
// Lower signed divide + remainder by reducing to the unsigned expansion.
// i32 tries the 24-bit fast path; i64 operands that are really sign-extended
// 32-bit values are narrowed to a 32-bit SDIVREM. Otherwise the operand
// signs are stripped, UDIVREM does the work, and the signs are re-applied.
SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(i: 0);
  SDValue RHS = Op.getOperand(i: 1);

  SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
  SDValue NegOne = DAG.getAllOnesConstant(DL, VT);

  if (VT == MVT::i32) {
    if (SDValue Res = LowerDIVREM24(Op, DAG, Sign: true))
      return Res;
  }

  // If both i64 operands are really sign-extended 32-bit values, do the
  // division in 32 bits and sign-extend the results.
  if (VT == MVT::i64 &&
      DAG.ComputeNumSignBits(Op: LHS) > 32 &&
      DAG.ComputeNumSignBits(Op: RHS) > 32) {
    EVT HalfVT = VT.getHalfSizedIntegerVT(Context&: *DAG.getContext());

    //HiLo split
    SDValue LHS_Lo = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL, VT: HalfVT, N1: LHS, N2: Zero);
    SDValue RHS_Lo = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL, VT: HalfVT, N1: RHS, N2: Zero);
    SDValue DIVREM = DAG.getNode(Opcode: ISD::SDIVREM, DL, VTList: DAG.getVTList(VT1: HalfVT, VT2: HalfVT),
                                 N1: LHS_Lo, N2: RHS_Lo);
    SDValue Res[2] = {
      DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: DIVREM.getValue(R: 0)),
      DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: DIVREM.getValue(R: 1))
    };
    return DAG.getMergeValues(Ops: Res, dl: DL);
  }

  // The sign masks are all-ones exactly when the operand is negative, so
  // (x + mask) ^ mask computes |x| (two's-complement negation when the mask
  // is set, identity otherwise).
  SDValue LHSign = DAG.getSelectCC(DL, LHS, RHS: Zero, True: NegOne, False: Zero, Cond: ISD::SETLT);
  SDValue RHSign = DAG.getSelectCC(DL, LHS: RHS, RHS: Zero, True: NegOne, False: Zero, Cond: ISD::SETLT);
  // Quotient is negative iff the operand signs differ.
  SDValue DSign = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHSign, N2: RHSign);
  SDValue RSign = LHSign; // Remainder sign is the same as LHS

  LHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: LHS, N2: LHSign);
  RHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: RHSign);

  LHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHS, N2: LHSign);
  RHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: RHS, N2: RHSign);

  SDValue Div = DAG.getNode(Opcode: ISD::UDIVREM, DL, VTList: DAG.getVTList(VT1: VT, VT2: VT), N1: LHS, N2: RHS);
  SDValue Rem = Div.getValue(R: 1);

  // Re-apply the signs with the inverse trick: (x ^ mask) - mask.
  Div = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: Div, N2: DSign);
  Rem = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: Rem, N2: RSign);

  Div = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Div, N2: DSign);
  Rem = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Rem, N2: RSign);

  SDValue Res[2] = {
    Div,
    Rem
  };
  return DAG.getMergeValues(Ops: Res, dl: DL);
}
2440
2441SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2442 SDLoc SL(Op);
2443 SDValue Src = Op.getOperand(i: 0);
2444
2445 // result = trunc(src)
2446 // if (src > 0.0 && src != result)
2447 // result += 1.0
2448
2449 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: MVT::f64, Operand: Src);
2450
2451 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT: MVT::f64);
2452 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f64);
2453
2454 EVT SetCCVT =
2455 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2456
2457 SDValue Lt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Zero, Cond: ISD::SETOGT);
2458 SDValue NeTrunc = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Trunc, Cond: ISD::SETONE);
2459 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SetCCVT, N1: Lt0, N2: NeTrunc);
2460
2461 SDValue Add = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f64, N1: And, N2: One, N3: Zero);
2462 // TODO: Should this propagate fast-math-flags?
2463 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Trunc, N2: Add);
2464}
2465
2466static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2467 SelectionDAG &DAG) {
2468 const unsigned FractBits = 52;
2469 const unsigned ExpBits = 11;
2470
2471 SDValue ExpPart = DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT: MVT::i32,
2472 N1: Hi,
2473 N2: DAG.getConstant(Val: FractBits - 32, DL: SL, VT: MVT::i32),
2474 N3: DAG.getConstant(Val: ExpBits, DL: SL, VT: MVT::i32));
2475 SDValue Exp = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: ExpPart,
2476 N2: DAG.getConstant(Val: 1023, DL: SL, VT: MVT::i32));
2477
2478 return Exp;
2479}
2480
// Lower f64 FTRUNC with integer bit manipulation: clear the fraction bits
// below the binary point, as selected by the unbiased exponent.
SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(i: 0);

  assert(Op.getValueType() == MVT::f64);

  const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  SDValue Hi = getHiHalf64(Op: Src, DAG);

  SDValue Exp = extractF64Exponent(Hi, SL, DAG);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, DL: SL, VT: MVT::i32);
  SDValue SignBit = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: Hi, N2: SignBitMask);

  // Extend back to 64-bits.
  SDValue SignBit64 = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Zero, SignBit});
  SignBit64 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: SignBit64);

  SDValue BcInt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Src);
  const SDValue FractMask
    = DAG.getConstant(Val: (UINT64_C(1) << FractBits) - 1, DL: SL, VT: MVT::i64);

  // Shift the fraction mask right by the exponent so only the sub-integer
  // fraction bits remain set, then clear exactly those bits of the source.
  SDValue Shr = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i64, N1: FractMask, N2: Exp);
  SDValue Not = DAG.getNOT(DL: SL, Val: Shr, VT: MVT::i64);
  SDValue Tmp0 = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i64, N1: BcInt, N2: Not);

  EVT SetCCVT =
      getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::i32);

  const SDValue FiftyOne = DAG.getConstant(Val: FractBits - 1, DL: SL, VT: MVT::i32);

  SDValue ExpLt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Exp, RHS: Zero, Cond: ISD::SETLT);
  SDValue ExpGt51 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Exp, RHS: FiftyOne, Cond: ISD::SETGT);

  // Exponent < 0: |src| < 1, so the truncated result is a signed zero.
  SDValue Tmp1 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: ExpLt0, N2: SignBit64, N3: Tmp0);
  // Exponent > 51: no fraction bits below the binary point, so pass the
  // source through unchanged (this also covers inf/nan, whose biased
  // exponent is maximal).
  SDValue Tmp2 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: ExpGt51, N2: BcInt, N3: Tmp1);

  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f64, Operand: Tmp2);
}
2526
2527SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2528 SelectionDAG &DAG) const {
2529 SDLoc SL(Op);
2530 SDValue Src = Op.getOperand(i: 0);
2531
2532 assert(Op.getValueType() == MVT::f64);
2533
2534 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2535 SDValue C1 = DAG.getConstantFP(Val: C1Val, DL: SL, VT: MVT::f64);
2536 SDValue CopySign = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT: MVT::f64, N1: C1, N2: Src);
2537
2538 // TODO: Should this propagate fast-math-flags?
2539
2540 SDValue Tmp1 = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Src, N2: CopySign);
2541 SDValue Tmp2 = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT: MVT::f64, N1: Tmp1, N2: CopySign);
2542
2543 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: MVT::f64, Operand: Src);
2544
2545 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2546 SDValue C2 = DAG.getConstantFP(Val: C2Val, DL: SL, VT: MVT::f64);
2547
2548 EVT SetCCVT =
2549 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2550 SDValue Cond = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Fabs, RHS: C2, Cond: ISD::SETOGT);
2551
2552 return DAG.getSelect(DL: SL, VT: MVT::f64, Cond, LHS: Src, RHS: Tmp2);
2553}
2554
2555SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2556 SelectionDAG &DAG) const {
2557 // FNEARBYINT and FRINT are the same, except in their handling of FP
2558 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2559 // rint, so just treat them as equivalent.
2560 return DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SDLoc(Op), VT: Op.getValueType(),
2561 Operand: Op.getOperand(i: 0));
2562}
2563
2564SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2565 auto VT = Op.getValueType();
2566 auto Arg = Op.getOperand(i: 0u);
2567 return DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SDLoc(Op), VT, Operand: Arg);
2568}
2569
2570// XXX - May require not supporting f32 denormals?
2571
2572// Don't handle v2f16. The extra instructions to scalarize and repack around the
2573// compare and vselect end up producing worse code than scalarizing the whole
2574// operation.
2575SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2576 SDLoc SL(Op);
2577 SDValue X = Op.getOperand(i: 0);
2578 EVT VT = Op.getValueType();
2579
2580 SDValue T = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: X);
2581
2582 // TODO: Should this propagate fast-math-flags?
2583
2584 SDValue Diff = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: X, N2: T);
2585
2586 SDValue AbsDiff = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: Diff);
2587
2588 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2589 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2590
2591 EVT SetCCVT =
2592 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2593
2594 const SDValue Half = DAG.getConstantFP(Val: 0.5, DL: SL, VT);
2595 SDValue Cmp = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsDiff, RHS: Half, Cond: ISD::SETOGE);
2596 SDValue OneOrZeroFP = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Cmp, N2: One, N3: Zero);
2597
2598 SDValue SignedOffset = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT, N1: OneOrZeroFP, N2: X);
2599 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: T, N2: SignedOffset);
2600}
2601
2602SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2603 SDLoc SL(Op);
2604 SDValue Src = Op.getOperand(i: 0);
2605
2606 // result = trunc(src);
2607 // if (src < 0.0 && src != result)
2608 // result += -1.0.
2609
2610 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: MVT::f64, Operand: Src);
2611
2612 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT: MVT::f64);
2613 const SDValue NegOne = DAG.getConstantFP(Val: -1.0, DL: SL, VT: MVT::f64);
2614
2615 EVT SetCCVT =
2616 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2617
2618 SDValue Lt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Zero, Cond: ISD::SETOLT);
2619 SDValue NeTrunc = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Trunc, Cond: ISD::SETONE);
2620 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SetCCVT, N1: Lt0, N2: NeTrunc);
2621
2622 SDValue Add = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f64, N1: And, N2: NegOne, N3: Zero);
2623 // TODO: Should this propagate fast-math-flags?
2624 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Trunc, N2: Add);
2625}
2626
2627/// Return true if it's known that \p Src can never be an f32 denormal value.
2628static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2629 switch (Src.getOpcode()) {
2630 case ISD::FP_EXTEND:
2631 return Src.getOperand(i: 0).getValueType() == MVT::f16;
2632 case ISD::FP16_TO_FP:
2633 case ISD::FFREXP:
2634 case ISD::FSQRT:
2635 case AMDGPUISD::LOG:
2636 case AMDGPUISD::EXP:
2637 return true;
2638 case ISD::INTRINSIC_WO_CHAIN: {
2639 unsigned IntrinsicID = Src.getConstantOperandVal(i: 0);
2640 switch (IntrinsicID) {
2641 case Intrinsic::amdgcn_frexp_mant:
2642 case Intrinsic::amdgcn_log:
2643 case Intrinsic::amdgcn_log_clamp:
2644 case Intrinsic::amdgcn_exp2:
2645 case Intrinsic::amdgcn_sqrt:
2646 return true;
2647 default:
2648 return false;
2649 }
2650 }
2651 default:
2652 return false;
2653 }
2654
2655 llvm_unreachable("covered opcode switch");
2656}
2657
2658bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
2659 SDNodeFlags Flags) {
2660 return Flags.hasApproximateFuncs();
2661}
2662
2663bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
2664 SDValue Src,
2665 SDNodeFlags Flags) {
2666 return !valueIsKnownNeverF32Denorm(Src) &&
2667 DAG.getMachineFunction()
2668 .getDenormalMode(FPType: APFloat::IEEEsingle())
2669 .Input != DenormalMode::PreserveSign;
2670}
2671
2672SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2673 SDValue Src,
2674 SDNodeFlags Flags) const {
2675 SDLoc SL(Src);
2676 EVT VT = Src.getValueType();
2677 const fltSemantics &Semantics = VT.getFltSemantics();
2678 SDValue SmallestNormal =
2679 DAG.getConstantFP(Val: APFloat::getSmallestNormalized(Sem: Semantics), DL: SL, VT);
2680
2681 // Want to scale denormals up, but negatives and 0 work just as well on the
2682 // scaled path.
2683 SDValue IsLtSmallestNormal = DAG.getSetCC(
2684 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Src,
2685 RHS: SmallestNormal, Cond: ISD::SETOLT);
2686
2687 return IsLtSmallestNormal;
2688}
2689
2690SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2691 SDNodeFlags Flags) const {
2692 SDLoc SL(Src);
2693 EVT VT = Src.getValueType();
2694 const fltSemantics &Semantics = VT.getFltSemantics();
2695 SDValue Inf = DAG.getConstantFP(Val: APFloat::getInf(Sem: Semantics), DL: SL, VT);
2696
2697 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: Src, Flags);
2698 SDValue IsFinite = DAG.getSetCC(
2699 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Fabs,
2700 RHS: Inf, Cond: ISD::SETOLT);
2701 return IsFinite;
2702}
2703
2704/// If denormal handling is required return the scaled input to FLOG2, and the
2705/// check for denormal range. Otherwise, return null values.
2706std::pair<SDValue, SDValue>
2707AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
2708 SDValue Src, SDNodeFlags Flags) const {
2709 if (!needsDenormHandlingF32(DAG, Src, Flags))
2710 return {};
2711
2712 MVT VT = MVT::f32;
2713 const fltSemantics &Semantics = APFloat::IEEEsingle();
2714 SDValue SmallestNormal =
2715 DAG.getConstantFP(Val: APFloat::getSmallestNormalized(Sem: Semantics), DL: SL, VT);
2716
2717 SDValue IsLtSmallestNormal = DAG.getSetCC(
2718 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Src,
2719 RHS: SmallestNormal, Cond: ISD::SETOLT);
2720
2721 SDValue Scale32 = DAG.getConstantFP(Val: 0x1.0p+32, DL: SL, VT);
2722 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2723 SDValue ScaleFactor =
2724 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLtSmallestNormal, N2: Scale32, N3: One, Flags);
2725
2726 SDValue ScaledInput = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Src, N2: ScaleFactor, Flags);
2727 return {ScaledInput, IsLtSmallestNormal};
2728}
2729
SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
  // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)

  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  SDValue Src = Op.getOperand(i: 0);
  SDNodeFlags Flags = Op->getFlags();

  if (VT == MVT::f16) {
    // Nothing in half is a denormal when promoted to f32.
    assert(!isTypeLegal(VT));
    // Promote to f32, take the hardware log there, and round back down.
    SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src, Flags);
    SDValue Log = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT: MVT::f32, Operand: Ext, Flags);
    return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Log,
                       N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
  }

  auto [ScaledInput, IsLtSmallestNormal] =
      getScaledLogInput(DAG, SL, Src, Flags);
  // A null ScaledInput means no denormal handling is required.
  if (!ScaledInput)
    return DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: Src, Flags);

  SDValue Log2 = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: ScaledInput, Flags);

  // Undo the 2^32 input scaling: log2(x * 2^32) - 32 == log2(x).
  SDValue ThirtyTwo = DAG.getConstantFP(Val: 32.0, DL: SL, VT);
  SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
  SDValue ResultOffset =
      DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLtSmallestNormal, N2: ThirtyTwo, N3: Zero);
  return DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: Log2, N2: ResultOffset, Flags);
}
2764
2765static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2766 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2767 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: Y, Flags);
2768 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Mul, N2: C, Flags);
2769}
2770
// Common lowering for FLOG and FLOG10: take the hardware log2 and rescale by
// ln(2) (or ln(2)/ln(10)) with an extended-precision multiply, with fixups
// for denormal inputs and non-finite results. The approximate-funcs path
// instead uses the cheaper single-multiply expansion.
SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDValue X = Op.getOperand(i: 0);
  EVT VT = Op.getValueType();
  SDNodeFlags Flags = Op->getFlags();
  SDLoc DL(Op);
  const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
  assert(IsLog10 || Op.getOpcode() == ISD::FLOG);

  if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
    // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
    // depending on !fpmath metadata.

    bool PromoteToF32 = VT == MVT::f16 && (!Flags.hasApproximateFuncs() ||
                                           !isTypeLegal(VT: MVT::f16));

    if (PromoteToF32) {
      // Log and multiply in f32 is always good enough for f16.
      X = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: X, Flags);
    }

    SDValue Lowered = LowerFLOGUnsafe(Op: X, SL: DL, DAG, IsLog10, Flags);
    if (PromoteToF32) {
      return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Lowered,
                         N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32), Flags);
    }

    return Lowered;
  }

  // Scale denormal inputs into the normal range so the hardware log is
  // accurate; the corresponding offset is subtracted back out at the end.
  SDValue ScaledInput, IsScaled;
  if (VT == MVT::f16)
    X = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: X, Flags);
  else {
    std::tie(args&: ScaledInput, args&: IsScaled) = getScaledLogInput(DAG, SL: DL, Src: X, Flags);
    if (ScaledInput)
      X = ScaledInput;
  }

  SDValue Y = DAG.getNode(Opcode: AMDGPUISD::LOG, DL, VT, Operand: X, Flags);

  SDValue R;
  if (Subtarget->hasFastFMAF32()) {
    // c+cc are ln(2)/ln(10) to more than 49 bits
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    // c + cc is ln(2) to more than 49 bits
    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    SDValue C = DAG.getConstantFP(Val: IsLog10 ? c_log10 : c_log, DL, VT);
    SDValue CC = DAG.getConstantFP(Val: IsLog10 ? cc_log10 : cc_log, DL, VT);
    // This adds correction terms for which contraction may lead to an increase
    // in the error of the approximation, so disable it.
    Flags.setAllowContract(false);
    // R = Y*C plus the residual of the product and the low-order constant,
    // accumulated with FMAs so the low bits aren't lost.
    R = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Y, N2: C, Flags);
    SDValue NegR = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: R, Flags);
    SDValue FMA0 = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: Y, N2: C, N3: NegR, Flags);
    SDValue FMA1 = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: Y, N2: CC, N3: FMA0, Flags);
    R = DAG.getNode(Opcode: ISD::FADD, DL, VT, N1: R, N2: FMA1, Flags);
  } else {
    // ch+ct is ln(2)/ln(10) to more than 36 bits
    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    // ch + ct is ln(2) to more than 36 bits
    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    SDValue CH = DAG.getConstantFP(Val: IsLog10 ? ch_log10 : ch_log, DL, VT);
    SDValue CT = DAG.getConstantFP(Val: IsLog10 ? ct_log10 : ct_log, DL, VT);

    // Split Y into a head with the low 12 mantissa bits cleared and a tail
    // Y - head, so each partial product stays exact.
    SDValue YAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Y);
    SDValue MaskConst = DAG.getConstant(Val: 0xfffff000, DL, VT: MVT::i32);
    SDValue YHInt = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: YAsInt, N2: MaskConst);
    SDValue YH = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: YHInt);
    SDValue YT = DAG.getNode(Opcode: ISD::FSUB, DL, VT, N1: Y, N2: YH, Flags);
    // This adds correction terms for which contraction may lead to an increase
    // in the error of the approximation, so disable it.
    Flags.setAllowContract(false);
    SDValue YTCT = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: YT, N2: CT, Flags);
    SDValue Mad0 = getMad(DAG, SL: DL, VT, X: YH, Y: CT, C: YTCT, Flags);
    SDValue Mad1 = getMad(DAG, SL: DL, VT, X: YT, Y: CH, C: Mad0, Flags);
    R = getMad(DAG, SL: DL, VT, X: YH, Y: CH, C: Mad1);
  }

  const bool IsFiniteOnly = Flags.hasNoNaNs() && Flags.hasNoInfs();

  // TODO: Check if known finite from source value.
  if (!IsFiniteOnly) {
    // Propagate inf/nan from the raw log result instead of the rescaled one.
    SDValue IsFinite = getIsFinite(DAG, Src: Y, Flags);
    R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsFinite, N2: R, N3: Y, Flags);
  }

  if (IsScaled) {
    // Undo the 2^32 input scaling: subtract 32 * log(2) in the requested
    // base when the input was below the smallest normal.
    SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL, VT);
    SDValue ShiftK =
        DAG.getConstantFP(Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
    SDValue Shift =
        DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsScaled, N2: ShiftK, N3: Zero, Flags);
    R = DAG.getNode(Opcode: ISD::FSUB, DL, VT, N1: R, N2: Shift, Flags);
  }

  return R;
}
2877
2878SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2879 return LowerFLOGCommon(Op, DAG);
2880}
2881
// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
// promote f16 operation.
SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
                                              SelectionDAG &DAG, bool IsLog10,
                                              SDNodeFlags Flags) const {
  EVT VT = Src.getValueType();
  // Only f32 uses the hardware log node directly; other types go through the
  // generic FLOG2 node.
  unsigned LogOp =
      VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;

  // log(x) = log2(x) * ln(2); log10(x) = log2(x) * ln(2)/ln(10).
  double Log2BaseInverted =
      IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;

  if (VT == MVT::f32) {
    auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
    if (ScaledInput) {
      SDValue LogSrc = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: ScaledInput, Flags);
      // Fold the -32 correction for the 2^32 input scaling into the rescale:
      // (log2(x) - 32) * k == log2(x) * k - 32 * k.
      SDValue ScaledResultOffset =
          DAG.getConstantFP(Val: -32.0 * Log2BaseInverted, DL: SL, VT);

      SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL: SL, VT);

      SDValue ResultOffset = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsScaled,
                                         N2: ScaledResultOffset, N3: Zero, Flags);

      SDValue Log2Inv = DAG.getConstantFP(Val: Log2BaseInverted, DL: SL, VT);

      // Prefer a single FMA when the target has a fast one.
      if (Subtarget->hasFastFMAF32())
        return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: LogSrc, N2: Log2Inv, N3: ResultOffset,
                           Flags);
      SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LogSrc, N2: Log2Inv, Flags);
      return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Mul, N2: ResultOffset);
    }
  }

  SDValue Log2Operand = DAG.getNode(Opcode: LogOp, DL: SL, VT, Operand: Src, Flags);
  SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Val: Log2BaseInverted, DL: SL, VT);

  return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Log2Operand, N2: Log2BaseInvertedOperand,
                     Flags);
}
2922
2923// This expansion gives a result slightly better than 1ulp.
SDValue AMDGPUTargetLowering::lowerFEXPF64(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue X = Op.getOperand(i: 0);

  // TODO: Check if reassoc is safe. There is an output change in exp2 and
  // exp10, which slightly increases ulp.
  SDNodeFlags Flags = Op->getFlags() & ~SDNodeFlags::AllowReassociation;

  // Shared f64 reduction scheme for exp2/exp10/exp:
  //   result = 2^dn * e^t
  // with dn integral. e^t is evaluated with the polynomial below and the
  // 2^dn factor is applied with ldexp.
  //   DN: integral part of the reduction (rounded with rint).
  //   F:  reduced fractional argument (only used to build T).
  //   T:  the argument handed to the e^t polynomial.
  SDValue DN, F, T;

  if (Op.getOpcode() == ISD::FEXP2) {
    // dn = rint(x)
    DN = DAG.getNode(Opcode: ISD::FRINT, DL, VT: MVT::f64, Operand: X, Flags);
    // f = x - dn
    F = DAG.getNode(Opcode: ISD::FSUB, DL, VT: MVT::f64, N1: X, N2: DN, Flags);
    // t = f*C1 + f*C2, where C1 + C2 is ln(2) split into high/low parts in
    // extended precision, so that 2^f == e^t.
    SDValue C1 = DAG.getConstantFP(Val: 0x1.62e42fefa39efp-1, DL, VT: MVT::f64);
    SDValue C2 = DAG.getConstantFP(Val: 0x1.abc9e3b39803fp-56, DL, VT: MVT::f64);
    SDValue Mul2 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: F, N2: C2, Flags);
    T = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: F, N2: C1, N3: Mul2, Flags);
  } else if (Op.getOpcode() == ISD::FEXP10) {
    // dn = rint(x * C1), with C1 ~= log2(10).
    SDValue C1 = DAG.getConstantFP(Val: 0x1.a934f0979a371p+1, DL, VT: MVT::f64);
    SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: X, N2: C1, Flags);
    DN = DAG.getNode(Opcode: ISD::FRINT, DL, VT: MVT::f64, Operand: Mul, Flags);

    // f = FMA(-dn, C2, FMA(-dn, C3, x)); C3 + C2 is log10(2) split into
    // high/low parts, so f = x - dn*log10(2) and 10^x = 2^dn * 10^f.
    SDValue NegDN = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: DN, Flags);
    SDValue C2 = DAG.getConstantFP(Val: -0x1.9dc1da994fd21p-59, DL, VT: MVT::f64);
    SDValue C3 = DAG.getConstantFP(Val: 0x1.34413509f79ffp-2, DL, VT: MVT::f64);
    SDValue Inner = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegDN, N2: C3, N3: X, Flags);
    F = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegDN, N2: C2, N3: Inner, Flags);

    // t = FMA(f, C4, f*C5); C4 + C5 is ln(10) split into high/low parts, so
    // 10^f == e^t.
    SDValue C4 = DAG.getConstantFP(Val: 0x1.26bb1bbb55516p+1, DL, VT: MVT::f64);
    SDValue C5 = DAG.getConstantFP(Val: -0x1.f48ad494ea3e9p-53, DL, VT: MVT::f64);
    SDValue MulF = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: F, N2: C5, Flags);
    T = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: F, N2: C4, N3: MulF, Flags);
  } else { // ISD::FEXP
    // dn = rint(x * C1), with C1 ~= log2(e).
    SDValue C1 = DAG.getConstantFP(Val: 0x1.71547652b82fep+0, DL, VT: MVT::f64);
    SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: X, N2: C1, Flags);
    DN = DAG.getNode(Opcode: ISD::FRINT, DL, VT: MVT::f64, Operand: Mul, Flags);

    // t = FMA(-dn, C2, FMA(-dn, C3, x)); C3 + C2 is ln(2) split into
    // high/low parts, so t = x - dn*ln(2) is already the polynomial
    // argument (F stays unset on this path).
    SDValue NegDN = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: DN, Flags);
    SDValue C2 = DAG.getConstantFP(Val: 0x1.abc9e3b39803fp-56, DL, VT: MVT::f64);
    SDValue C3 = DAG.getConstantFP(Val: 0x1.62e42fefa39efp-1, DL, VT: MVT::f64);
    SDValue Inner = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegDN, N2: C3, N3: X, Flags);
    T = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegDN, N2: C2, N3: Inner, Flags);
  }

  // Polynomial expansion for p: Horner evaluation of the Taylor coefficients
  // of e^t from roughly 1/11! down to 1/2!.
  SDValue P = DAG.getConstantFP(Val: 0x1.ade156a5dcb37p-26, DL, VT: MVT::f64);
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
                  N3: DAG.getConstantFP(Val: 0x1.28af3fca7ab0cp-22, DL, VT: MVT::f64),
                  Flags);
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
                  N3: DAG.getConstantFP(Val: 0x1.71dee623fde64p-19, DL, VT: MVT::f64),
                  Flags);
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
                  N3: DAG.getConstantFP(Val: 0x1.a01997c89e6b0p-16, DL, VT: MVT::f64),
                  Flags);
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
                  N3: DAG.getConstantFP(Val: 0x1.a01a014761f6ep-13, DL, VT: MVT::f64),
                  Flags);
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
                  N3: DAG.getConstantFP(Val: 0x1.6c16c1852b7b0p-10, DL, VT: MVT::f64),
                  Flags);
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
                  N3: DAG.getConstantFP(Val: 0x1.1111111122322p-7, DL, VT: MVT::f64), Flags);
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
                  N3: DAG.getConstantFP(Val: 0x1.55555555502a1p-5, DL, VT: MVT::f64), Flags);
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
                  N3: DAG.getConstantFP(Val: 0x1.5555555555511p-3, DL, VT: MVT::f64), Flags);
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P,
                  N3: DAG.getConstantFP(Val: 0x1.000000000000bp-1, DL, VT: MVT::f64), Flags);

  SDValue One = DAG.getConstantFP(Val: 1.0, DL, VT: MVT::f64);

  // Fold in the remaining terms: e^t ~= 1 + t*(1 + t*p), applied as two
  // chained FMAs.
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P, N3: One, Flags);
  P = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: T, N2: P, N3: One, Flags);

  // z = ldexp(p, (int)dn)
  SDValue DNInt = DAG.getNode(Opcode: ISD::FP_TO_SINT, DL, VT: MVT::i32, Operand: DN);
  SDValue Z = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: P, N2: DNInt, Flags);

  // Overflow/underflow guards. The unordered compares (SETULE / SETUGE) keep
  // the computed result for a NaN input, so NaN propagates instead of being
  // clamped.
  // NOTE(review): 1024 / -1075 are the exact exp2 overflow/underflow bounds;
  // for exp and exp10 these are conservative and presumably rely on ldexp
  // saturating first — confirm against the OCML reference implementation.
  SDValue CondHi = DAG.getSetCC(
      DL, VT: MVT::i1, LHS: X, RHS: DAG.getConstantFP(Val: 1024.0, DL, VT: MVT::f64), Cond: ISD::SETULE);

  if (!Flags.hasNoInfs()) {
    // Inputs beyond the representable range overflow to +inf, unless the
    // caller asserted no infinities.
    SDValue PInf = DAG.getConstantFP(Val: std::numeric_limits<double>::infinity(),
                                     DL, VT: MVT::f64);
    Z = DAG.getSelect(DL, VT: MVT::f64, Cond: CondHi, LHS: Z, RHS: PInf, Flags);
  }

  // Inputs below the smallest-denormal threshold flush to +0.
  SDValue CondLo = DAG.getSetCC(
      DL, VT: MVT::i1, LHS: X, RHS: DAG.getConstantFP(Val: -1075.0, DL, VT: MVT::f64), Cond: ISD::SETUGE);
  SDValue Zero = DAG.getConstantFP(Val: 0.0, DL, VT: MVT::f64);
  Z = DAG.getSelect(DL, VT: MVT::f64, Cond: CondLo, LHS: Z, RHS: Zero, Flags);

  return Z;
}
3029
3030SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
3031 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3032 // If we have to handle denormals, scale up the input and adjust the result.
3033
3034 EVT VT = Op.getValueType();
3035 if (VT == MVT::f64)
3036 return lowerFEXPF64(Op, DAG);
3037
3038 SDLoc SL(Op);
3039 SDValue Src = Op.getOperand(i: 0);
3040 SDNodeFlags Flags = Op->getFlags();
3041
3042 if (VT == MVT::f16) {
3043 // Nothing in half is a denormal when promoted to f32.
3044 assert(!isTypeLegal(MVT::f16));
3045 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src, Flags);
3046 SDValue Log = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT: MVT::f32, Operand: Ext, Flags);
3047 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Log,
3048 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
3049 }
3050
3051 assert(VT == MVT::f32);
3052
3053 if (!needsDenormHandlingF32(DAG, Src, Flags))
3054 return DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT: MVT::f32, Operand: Src, Flags);
3055
3056 // bool needs_scaling = x < -0x1.f80000p+6f;
3057 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3058
3059 // -nextafter(128.0, -1)
3060 SDValue RangeCheckConst = DAG.getConstantFP(Val: -0x1.f80000p+6f, DL: SL, VT);
3061
3062 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
3063
3064 SDValue NeedsScaling =
3065 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: RangeCheckConst, Cond: ISD::SETOLT);
3066
3067 SDValue SixtyFour = DAG.getConstantFP(Val: 0x1.0p+6f, DL: SL, VT);
3068 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
3069
3070 SDValue AddOffset =
3071 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: SixtyFour, N3: Zero);
3072
3073 SDValue AddInput = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Src, N2: AddOffset, Flags);
3074 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: AddInput, Flags);
3075
3076 SDValue TwoExpNeg64 = DAG.getConstantFP(Val: 0x1.0p-64f, DL: SL, VT);
3077 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
3078 SDValue ResultScale =
3079 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: TwoExpNeg64, N3: One);
3080
3081 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2, N2: ResultScale, Flags);
3082}
3083
3084SDValue AMDGPUTargetLowering::lowerFEXPUnsafeImpl(SDValue X, const SDLoc &SL,
3085 SelectionDAG &DAG,
3086 SDNodeFlags Flags,
3087 bool IsExp10) const {
3088 // exp(x) -> exp2(M_LOG2E_F * x);
3089 // exp10(x) -> exp2(log2(10) * x);
3090 EVT VT = X.getValueType();
3091 SDValue Const =
3092 DAG.getConstantFP(Val: IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, DL: SL, VT);
3093
3094 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: Const, Flags);
3095 return DAG.getNode(Opcode: VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
3096 : (unsigned)ISD::FEXP2,
3097 DL: SL, VT, Operand: Mul, Flags);
3098}
3099
3100SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
3101 SelectionDAG &DAG,
3102 SDNodeFlags Flags) const {
3103 EVT VT = X.getValueType();
3104 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, Src: X, Flags))
3105 return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);
3106
3107 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
3108
3109 SDValue Threshold = DAG.getConstantFP(Val: -0x1.5d58a0p+6f, DL: SL, VT);
3110 SDValue NeedsScaling = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: Threshold, Cond: ISD::SETOLT);
3111
3112 SDValue ScaleOffset = DAG.getConstantFP(Val: 0x1.0p+6f, DL: SL, VT);
3113
3114 SDValue ScaledX = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: X, N2: ScaleOffset, Flags);
3115
3116 SDValue AdjustedX =
3117 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: ScaledX, N3: X);
3118
3119 const SDValue Log2E = DAG.getConstantFP(Val: numbers::log2e, DL: SL, VT);
3120 SDValue ExpInput = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: Log2E, Flags);
3121
3122 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: ExpInput, Flags);
3123
3124 SDValue ResultScaleFactor = DAG.getConstantFP(Val: 0x1.969d48p-93f, DL: SL, VT);
3125 SDValue AdjustedResult =
3126 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2, N2: ResultScaleFactor, Flags);
3127
3128 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: AdjustedResult, N3: Exp2,
3129 Flags);
3130}
3131
3132/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
3133/// handled correctly.
3134SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
3135 SelectionDAG &DAG,
3136 SDNodeFlags Flags) const {
3137 const EVT VT = X.getValueType();
3138
3139 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
3140 : static_cast<unsigned>(ISD::FEXP2);
3141
3142 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, Src: X, Flags)) {
3143 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3144 SDValue K0 = DAG.getConstantFP(Val: 0x1.a92000p+1f, DL: SL, VT);
3145 SDValue K1 = DAG.getConstantFP(Val: 0x1.4f0978p-11f, DL: SL, VT);
3146
3147 SDValue Mul0 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: K0, Flags);
3148 SDValue Exp2_0 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul0, Flags);
3149 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: K1, Flags);
3150 SDValue Exp2_1 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul1, Flags);
3151 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2_0, N2: Exp2_1);
3152 }
3153
3154 // bool s = x < -0x1.2f7030p+5f;
3155 // x += s ? 0x1.0p+5f : 0.0f;
3156 // exp10 = exp2(x * 0x1.a92000p+1f) *
3157 // exp2(x * 0x1.4f0978p-11f) *
3158 // (s ? 0x1.9f623ep-107f : 1.0f);
3159
3160 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
3161
3162 SDValue Threshold = DAG.getConstantFP(Val: -0x1.2f7030p+5f, DL: SL, VT);
3163 SDValue NeedsScaling = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: Threshold, Cond: ISD::SETOLT);
3164
3165 SDValue ScaleOffset = DAG.getConstantFP(Val: 0x1.0p+5f, DL: SL, VT);
3166 SDValue ScaledX = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: X, N2: ScaleOffset, Flags);
3167 SDValue AdjustedX =
3168 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: ScaledX, N3: X);
3169
3170 SDValue K0 = DAG.getConstantFP(Val: 0x1.a92000p+1f, DL: SL, VT);
3171 SDValue K1 = DAG.getConstantFP(Val: 0x1.4f0978p-11f, DL: SL, VT);
3172
3173 SDValue Mul0 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: K0, Flags);
3174 SDValue Exp2_0 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul0, Flags);
3175 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: K1, Flags);
3176 SDValue Exp2_1 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul1, Flags);
3177
3178 SDValue MulExps = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2_0, N2: Exp2_1, Flags);
3179
3180 SDValue ResultScaleFactor = DAG.getConstantFP(Val: 0x1.9f623ep-107f, DL: SL, VT);
3181 SDValue AdjustedResult =
3182 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: MulExps, N2: ResultScaleFactor, Flags);
3183
3184 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: AdjustedResult, N3: MulExps,
3185 Flags);
3186}
3187
/// Lower ISD::FEXP / ISD::FEXP10. f64 is forwarded to lowerFEXPF64, f16 is
/// promoted through f32, and f32 uses an extended-precision reduction onto
/// the hardware base-2 exponential.
SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT == MVT::f64)
    return lowerFEXPF64(Op, DAG);

  SDLoc SL(Op);
  SDValue X = Op.getOperand(i: 0);
  SDNodeFlags Flags = Op->getFlags();
  const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;

  // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
  // library behavior. Also, is known-not-daz source sufficient?
  if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
    return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
                   : lowerFEXPUnsafe(X, SL, DAG, Flags);
  }

  if (VT.getScalarType() == MVT::f16) {
    // Vector f16 is expected to be handled (scalarized) elsewhere.
    if (VT.isVector())
      return SDValue();

    // Nothing in half is a denormal when promoted to f32.
    //
    // exp(f16 x) ->
    //  fptrunc (v_exp_f32 (fmul (fpext x), log2e))
    //
    // exp10(f16 x) ->
    //  fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
    SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: X, Flags);
    SDValue Lowered = lowerFEXPUnsafeImpl(X: Ext, SL, DAG, Flags, IsExp10);
    return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Lowered,
                       N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
  }

  assert(VT == MVT::f32);

  // Algorithm:
  //
  // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
  //
  // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
  // n = 64*m + j, 0 <= j < 64
  //
  // e^x = 2^((64*m + j + f)/64)
  // = (2^m) * (2^(j/64)) * 2^(f/64)
  // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
  //
  // f = x*(64/ln(2)) - n
  // r = f*(ln(2)/64) = x - n*(ln(2)/64)
  //
  // e^x = (2^m) * (2^(j/64)) * e^r
  //
  // (2^(j/64)) is precomputed
  //
  // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  // e^r = 1 + q
  //
  // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  //
  // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
  SDNodeFlags FlagsNoContract = Flags;
  FlagsNoContract.setAllowContract(false);

  // PH/PL approximate x*log2(e) (or x*log2(10)) in roughly double-float
  // precision: PH is the rounded high product, PL is the accumulated
  // correction.
  SDValue PH, PL;
  if (Subtarget->hasFastFMAF32()) {
    // With fast FMA, the exact rounding error of x*C is recovered with
    // FMA(x, C, -PH) and the tail constant CC is folded into PL.
    const float c_exp = numbers::log2ef;
    const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    SDValue C = DAG.getConstantFP(Val: IsExp10 ? c_exp10 : c_exp, DL: SL, VT);
    SDValue CC = DAG.getConstantFP(Val: IsExp10 ? cc_exp10 : cc_exp, DL: SL, VT);

    PH = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: C, Flags);
    SDValue NegPH = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: PH, Flags);
    SDValue FMA0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: X, N2: C, N3: NegPH, Flags);
    PL = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: X, N2: CC, N3: FMA0, Flags);
  } else {
    // Without fast FMA, split x into a high part XH (low 12 mantissa bits
    // masked off) and remainder XL so XH*CH is exact, then accumulate the
    // remaining cross terms with mad.
    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    SDValue CH = DAG.getConstantFP(Val: IsExp10 ? ch_exp10 : ch_exp, DL: SL, VT);
    SDValue CL = DAG.getConstantFP(Val: IsExp10 ? cl_exp10 : cl_exp, DL: SL, VT);

    SDValue XAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: X);
    SDValue MaskConst = DAG.getConstant(Val: 0xfffff000, DL: SL, VT: MVT::i32);
    SDValue XHAsInt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: XAsInt, N2: MaskConst);
    SDValue XH = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: XHAsInt);
    SDValue XL = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: X, N2: XH, Flags);

    PH = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: XH, N2: CH, Flags);

    SDValue XLCL = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: XL, N2: CL, Flags);
    SDValue Mad0 = getMad(DAG, SL, VT, X: XL, Y: CH, C: XLCL, Flags);
    PL = getMad(DAG, SL, VT, X: XH, Y: CL, C: Mad0, Flags);
  }

  // E is the integral part of the scaled exponent; the 2^E factor is applied
  // with ldexp after the hardware exp.
  SDValue E = DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SL, VT, Operand: PH, Flags);

  // It is unsafe to contract this fsub into the PH multiply.
  SDValue PHSubE = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: PH, N2: E, Flags: FlagsNoContract);

  // Fractional argument for the hardware base-2 exp: (ph - e) + pl.
  SDValue A = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: PHSubE, N2: PL, Flags);
  SDValue IntE = DAG.getNode(Opcode: ISD::FP_TO_SINT, DL: SL, VT: MVT::i32, Operand: E);
  SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: A, Flags);

  SDValue R = DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT, N1: Exp2, N2: IntE, Flags);

  // Below this input even the smallest denormal result rounds to zero
  // (~ -103.28 = ln(2^-149) for exp; ~ -44.85 = log10(2^-149) for exp10).
  SDValue UnderflowCheckConst =
      DAG.getConstantFP(Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, DL: SL, VT);

  EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
  SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
  SDValue Underflow =
      DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: UnderflowCheckConst, Cond: ISD::SETOLT);

  R = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Underflow, N2: Zero, N3: R);

  if (!Flags.hasNoInfs()) {
    // Above ~88.72 (~ ln(FLT_MAX)) for exp, ~38.53 (~ log10(FLT_MAX)) for
    // exp10, the result overflows; pin it to +inf unless infinities are
    // assumed absent.
    SDValue OverflowCheckConst =
        DAG.getConstantFP(Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, DL: SL, VT);
    SDValue Overflow =
        DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: OverflowCheckConst, Cond: ISD::SETOGT);
    SDValue Inf =
        DAG.getConstantFP(Val: APFloat::getInf(Sem: APFloat::IEEEsingle()), DL: SL, VT);
    R = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Overflow, N2: Inf, N3: R);
  }

  return R;
}
3322
3323static bool isCtlzOpc(unsigned Opc) {
3324 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3325}
3326
3327static bool isCttzOpc(unsigned Opc) {
3328 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3329}
3330
3331SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3332 SelectionDAG &DAG) const {
3333 auto SL = SDLoc(Op);
3334 auto Opc = Op.getOpcode();
3335 auto Arg = Op.getOperand(i: 0u);
3336 auto ResultVT = Op.getValueType();
3337
3338 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3339 return {};
3340
3341 assert(isCtlzOpc(Opc));
3342 assert(ResultVT == Arg.getValueType());
3343
3344 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3345 SDValue NumExtBits = DAG.getConstant(Val: 32u - NumBits, DL: SL, VT: MVT::i32);
3346 SDValue NewOp;
3347
3348 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3349 NewOp = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Arg);
3350 NewOp = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: NewOp, N2: NumExtBits);
3351 NewOp = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, Operand: NewOp);
3352 } else {
3353 NewOp = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Arg);
3354 NewOp = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, Operand: NewOp);
3355 NewOp = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: NewOp, N2: NumExtBits);
3356 }
3357
3358 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ResultVT, Operand: NewOp);
3359}
3360
/// Lower i32/i64 ctlz/cttz (and their _ZERO_UNDEF variants) onto the FFBH /
/// FFBL hardware bit scans, which produce a 32-bit count.
SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(i: 0);

  assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
  bool Ctlz = isCtlzOpc(Opc: Op.getOpcode());
  unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;

  bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
                   Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
  // Uniform i64 inputs can use the scalar 64-bit scan instructions directly
  // instead of splitting into two 32-bit halves.
  bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;

  if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
    // (ctlz hi:lo) -> (umin (ffbh src), 32)
    // (cttz hi:lo) -> (umin (ffbl src), 32)
    // (ctlz_zero_undef src) -> (ffbh src)
    // (cttz_zero_undef src) -> (ffbl src)

    // 64-bit scalar version produce 32-bit result
    // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
    // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
    // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
    // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
    SDValue NewOpr = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Src);
    if (!ZeroUndef) {
      // Clamp the scan's zero-input result to the operand bit width (32/64);
      // the scan presumably yields >= that for a zero input — see the umin
      // patterns above.
      const SDValue ConstVal = DAG.getConstant(
          Val: Op.getValueType().getScalarSizeInBits(), DL: SL, VT: MVT::i32);
      NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewOpr, N2: ConstVal);
    }
    // The scan produces a 32-bit count; widen back to the original type.
    return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: Src.getValueType(), Operand: NewOpr);
  }

  // Divergent i64: scan both 32-bit halves and combine the counts.
  SDValue Lo, Hi;
  std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);

  SDValue OprLo = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Lo);
  SDValue OprHi = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Hi);

  // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
  // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
  // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
  // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))

  // UADDSAT keeps the +32 adjustment from wrapping when the scanned half is
  // zero; with zero-undef semantics a plain ADD suffices.
  unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
  const SDValue Const32 = DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32);
  if (Ctlz)
    OprLo = DAG.getNode(Opcode: AddOpc, DL: SL, VT: MVT::i32, N1: OprLo, N2: Const32);
  else
    OprHi = DAG.getNode(Opcode: AddOpc, DL: SL, VT: MVT::i32, N1: OprHi, N2: Const32);

  SDValue NewOpr;
  NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: OprLo, N2: OprHi);
  if (!ZeroUndef) {
    // All-zero 64-bit input: clamp the combined count to 64.
    const SDValue Const64 = DAG.getConstant(Val: 64, DL: SL, VT: MVT::i32);
    NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewOpr, N2: Const64);
  }

  return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i64, Operand: NewOpr);
}
3420
3421SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3422 bool Signed) const {
3423 // The regular method converting a 64-bit integer to float roughly consists of
3424 // 2 steps: normalization and rounding. In fact, after normalization, the
3425 // conversion from a 64-bit integer to a float is essentially the same as the
3426 // one from a 32-bit integer. The only difference is that it has more
3427 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3428 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3429 // converted into the correct float number. The basic steps for the unsigned
3430 // conversion are illustrated in the following pseudo code:
3431 //
3432 // f32 uitofp(i64 u) {
3433 // i32 hi, lo = split(u);
3434 // // Only count the leading zeros in hi as we have native support of the
3435 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3436 // // reduced to a 32-bit one automatically.
3437 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3438 // u <<= shamt;
3439 // hi, lo = split(u);
3440 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3441 // // convert it as a 32-bit integer and scale the result back.
3442 // return uitofp(hi) * 2^(32 - shamt);
3443 // }
3444 //
3445 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3446 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3447 // converted instead followed by negation based its sign bit.
3448
3449 SDLoc SL(Op);
3450 SDValue Src = Op.getOperand(i: 0);
3451
3452 SDValue Lo, Hi;
3453 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3454 SDValue Sign;
3455 SDValue ShAmt;
3456 if (Signed && Subtarget->isGCN()) {
3457 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3458 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3459 // account. That is, the maximal shift is
3460 // - 32 if Lo and Hi have opposite signs;
3461 // - 33 if Lo and Hi have the same sign.
3462 //
3463 // Or, MaxShAmt = 33 + OppositeSign, where
3464 //
3465 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3466 // - -1 if Lo and Hi have opposite signs; and
3467 // - 0 otherwise.
3468 //
3469 // All in all, ShAmt is calculated as
3470 //
3471 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3472 //
3473 // or
3474 //
3475 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3476 //
3477 // to reduce the critical path.
3478 SDValue OppositeSign = DAG.getNode(
3479 Opcode: ISD::SRA, DL: SL, VT: MVT::i32, N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: Lo, N2: Hi),
3480 N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
3481 SDValue MaxShAmt =
3482 DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32),
3483 N2: OppositeSign);
3484 // Count the leading sign bits.
3485 ShAmt = DAG.getNode(Opcode: AMDGPUISD::FFBH_I32, DL: SL, VT: MVT::i32, Operand: Hi);
3486 // Different from unsigned conversion, the shift should be one bit less to
3487 // preserve the sign bit.
3488 ShAmt = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: ShAmt,
3489 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
3490 ShAmt = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: ShAmt, N2: MaxShAmt);
3491 } else {
3492 if (Signed) {
3493 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3494 // absolute value first.
3495 Sign = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i64, N1: Src,
3496 N2: DAG.getConstant(Val: 63, DL: SL, VT: MVT::i64));
3497 SDValue Abs =
3498 DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i64,
3499 N1: DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i64, N1: Src, N2: Sign), N2: Sign);
3500 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Abs, DAG);
3501 }
3502 // Count the leading zeros.
3503 ShAmt = DAG.getNode(Opcode: ISD::CTLZ, DL: SL, VT: MVT::i32, Operand: Hi);
3504 // The shift amount for signed integers is [0, 32].
3505 }
3506 // Normalize the given 64-bit integer.
3507 SDValue Norm = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i64, N1: Src, N2: ShAmt);
3508 // Split it again.
3509 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Norm, DAG);
3510 // Calculate the adjust bit for rounding.
3511 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3512 SDValue Adjust = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32,
3513 N1: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32), N2: Lo);
3514 // Get the 32-bit normalized integer.
3515 Norm = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Hi, N2: Adjust);
3516 // Convert the normalized 32-bit integer into f32.
3517
3518 bool UseLDEXP = isOperationLegal(Op: ISD::FLDEXP, VT: MVT::f32);
3519 unsigned Opc = Signed && UseLDEXP ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3520 SDValue FVal = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::f32, Operand: Norm);
3521
3522 // Finally, need to scale back the converted floating number as the original
3523 // 64-bit integer is converted as a 32-bit one.
3524 ShAmt = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32),
3525 N2: ShAmt);
3526 // On GCN, use LDEXP directly.
3527 if (UseLDEXP)
3528 return DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT: MVT::f32, N1: FVal, N2: ShAmt);
3529
3530 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3531 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3532 // exponent is enough to avoid overflowing into the sign bit.
3533 SDValue Exp = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: ShAmt,
3534 N2: DAG.getConstant(Val: 23, DL: SL, VT: MVT::i32));
3535 SDValue IVal =
3536 DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32,
3537 N1: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: FVal), N2: Exp);
3538 if (Signed) {
3539 // Set the sign bit.
3540 Sign = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32,
3541 N1: DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Sign),
3542 N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
3543 IVal = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: IVal, N2: Sign);
3544 }
3545 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: IVal);
3546}
3547
3548SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3549 bool Signed) const {
3550 SDLoc SL(Op);
3551 SDValue Src = Op.getOperand(i: 0);
3552
3553 SDValue Lo, Hi;
3554 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3555
3556 SDValue CvtHi = DAG.getNode(Opcode: Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3557 DL: SL, VT: MVT::f64, Operand: Hi);
3558
3559 SDValue CvtLo = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL: SL, VT: MVT::f64, Operand: Lo);
3560
3561 SDValue LdExp = DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT: MVT::f64, N1: CvtHi,
3562 N2: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32));
3563 // TODO: Should this propagate fast-math-flags?
3564 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: LdExp, N2: CvtLo);
3565}
3566
3567SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3568 SelectionDAG &DAG) const {
3569 // TODO: Factor out code common with LowerSINT_TO_FP.
3570 EVT DestVT = Op.getValueType();
3571 SDValue Src = Op.getOperand(i: 0);
3572 EVT SrcVT = Src.getValueType();
3573
3574 if (SrcVT == MVT::i16) {
3575 if (DestVT == MVT::f16)
3576 return Op;
3577 SDLoc DL(Op);
3578
3579 // Promote src to i32
3580 SDValue Ext = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: Src);
3581 return DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: DestVT, Operand: Ext);
3582 }
3583
3584 if (DestVT == MVT::bf16) {
3585 SDLoc SL(Op);
3586 SDValue ToF32 = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL: SL, VT: MVT::f32, Operand: Src);
3587 SDValue FPRoundFlag = DAG.getIntPtrConstant(Val: 0, DL: SL, /*isTarget=*/true);
3588 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::bf16, N1: ToF32, N2: FPRoundFlag);
3589 }
3590
3591 if (SrcVT != MVT::i64)
3592 return Op;
3593
3594 if (DestVT == MVT::f16 && isTypeLegal(VT: MVT::f16)) {
3595 SDLoc DL(Op);
3596
3597 SDValue IntToFp32 = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: MVT::f32, Operand: Src);
3598 SDValue FPRoundFlag =
3599 DAG.getIntPtrConstant(Val: 0, DL: SDLoc(Op), /*isTarget=*/true);
3600 SDValue FPRound =
3601 DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: IntToFp32, N2: FPRoundFlag);
3602
3603 return FPRound;
3604 }
3605
3606 if (DestVT == MVT::f32)
3607 return LowerINT_TO_FP32(Op, DAG, Signed: false);
3608
3609 assert(DestVT == MVT::f64);
3610 return LowerINT_TO_FP64(Op, DAG, Signed: false);
3611}
3612
3613SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3614 SelectionDAG &DAG) const {
3615 EVT DestVT = Op.getValueType();
3616
3617 SDValue Src = Op.getOperand(i: 0);
3618 EVT SrcVT = Src.getValueType();
3619
3620 if (SrcVT == MVT::i16) {
3621 if (DestVT == MVT::f16)
3622 return Op;
3623
3624 SDLoc DL(Op);
3625 // Promote src to i32
3626 SDValue Ext = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: MVT::i32, Operand: Src);
3627 return DAG.getNode(Opcode: ISD::SINT_TO_FP, DL, VT: DestVT, Operand: Ext);
3628 }
3629
3630 if (DestVT == MVT::bf16) {
3631 SDLoc SL(Op);
3632 SDValue ToF32 = DAG.getNode(Opcode: ISD::SINT_TO_FP, DL: SL, VT: MVT::f32, Operand: Src);
3633 SDValue FPRoundFlag = DAG.getIntPtrConstant(Val: 0, DL: SL, /*isTarget=*/true);
3634 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::bf16, N1: ToF32, N2: FPRoundFlag);
3635 }
3636
3637 if (SrcVT != MVT::i64)
3638 return Op;
3639
3640 // TODO: Factor out code common with LowerUINT_TO_FP.
3641
3642 if (DestVT == MVT::f16 && isTypeLegal(VT: MVT::f16)) {
3643 SDLoc DL(Op);
3644 SDValue Src = Op.getOperand(i: 0);
3645
3646 SDValue IntToFp32 = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: MVT::f32, Operand: Src);
3647 SDValue FPRoundFlag =
3648 DAG.getIntPtrConstant(Val: 0, DL: SDLoc(Op), /*isTarget=*/true);
3649 SDValue FPRound =
3650 DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: IntToFp32, N2: FPRoundFlag);
3651
3652 return FPRound;
3653 }
3654
3655 if (DestVT == MVT::f32)
3656 return LowerINT_TO_FP32(Op, DAG, Signed: true);
3657
3658 assert(DestVT == MVT::f64);
3659 return LowerINT_TO_FP64(Op, DAG, Signed: true);
3660}
3661
// Lower an i64 FP_TO_SINT/FP_TO_UINT of an f32/f64 source by computing the
// high and low 32-bit halves separately and recombining them with a bitcast
// from v2i32.
SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  SDLoc SL(Op);

  SDValue Src = Op.getOperand(i: 0);
  EVT SrcVT = Src.getValueType();

  assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //     tf := trunc(val);
  //    hif := floor(tf * 2^-32);
  //    lof := tf - hif * 2^32; // lof is always positive due to floor.
  //     hi := fptoi(hif);
  //     lo := fptoi(lof);
  //
  SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: SrcVT, Operand: Src);
  SDValue Sign;
  if (Signed && SrcVT == MVT::f32) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, We need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    // Sign is the sign bit broadcast to all 32 bits: all 0s or all 1s.
    Sign = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i32,
                       N1: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Trunc),
                       N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
    Trunc = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: SrcVT, Operand: Trunc);
  }

  // K0 = 2^-32 and K1 = -2^32 in the source FP type (given as bit patterns).
  SDValue K0, K1;
  if (SrcVT == MVT::f64) {
    K0 = DAG.getConstantFP(
        Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), DL: SL,
        VT: SrcVT);
    K1 = DAG.getConstantFP(
        Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), DL: SL,
        VT: SrcVT);
  } else {
    K0 = DAG.getConstantFP(
        Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), DL: SL, VT: SrcVT);
    K1 = DAG.getConstantFP(
        Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), DL: SL, VT: SrcVT);
  }
  // TODO: Should this propagate fast-math-flags?
  SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: SrcVT, N1: Trunc, N2: K0);

  SDValue FloorMul = DAG.getNode(Opcode: ISD::FFLOOR, DL: SL, VT: SrcVT, Operand: Mul);

  // Fma = Trunc + FloorMul * -2^32, i.e. the (non-negative) low part.
  SDValue Fma = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: SrcVT, N1: FloorMul, N2: K1, N3: Trunc);

  // The high half keeps its sign only for a signed f64 conversion; in the
  // signed f32 case the value was made non-negative above.
  SDValue Hi = DAG.getNode(Opcode: (Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
                                  : ISD::FP_TO_UINT,
                           DL: SL, VT: MVT::i32, Operand: FloorMul);
  SDValue Lo = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL: SL, VT: MVT::i32, Operand: Fma);

  SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64,
                               Operand: DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Lo, Hi}));

  if (Signed && SrcVT == MVT::f32) {
    assert(Sign);
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64,
                       Operand: DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Sign, Sign}));
    // r := xor(r, sign) - sign;  (two's-complement conditional negate)
    Result =
        DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i64,
                    N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i64, N1: Result, N2: Sign), N2: Sign);
  }

  return Result;
}
3736
3737SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3738 SDLoc DL(Op);
3739 SDValue N0 = Op.getOperand(i: 0);
3740
3741 // Convert to target node to get known bits
3742 if (N0.getValueType() == MVT::f32)
3743 return DAG.getNode(Opcode: AMDGPUISD::FP_TO_FP16, DL, VT: Op.getValueType(), Operand: N0);
3744
3745 if (Op->getFlags().hasApproximateFuncs()) {
3746 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3747 return SDValue();
3748 }
3749
3750 return LowerF64ToF16Safe(Src: N0, DL, DAG);
3751}
3752
// Emit a correctly rounded (round-to-nearest-even) f64 -> f16 conversion.
// Returns the f16 result bits zero-extended in an i32.
SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL,
                                                SelectionDAG &DAG) const {
  assert(Src.getSimpleValueType() == MVT::f64);

  // f64 -> f16 conversion using round-to-nearest-even rounding mode.
  // TODO: We can generate better code for True16.
  const unsigned ExpMask = 0x7ff;
  const unsigned ExpBiasf64 = 1023;
  const unsigned ExpBiasf16 = 15;
  SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
  SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
  // Split the f64 bit pattern into high (UH) and low (U) 32-bit words.
  SDValue U = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: Src);
  SDValue UH = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64, N1: U,
                           N2: DAG.getConstant(Val: 32, DL, VT: MVT::i64));
  UH = DAG.getZExtOrTrunc(Op: UH, DL, VT: MVT::i32);
  U = DAG.getZExtOrTrunc(Op: U, DL, VT: MVT::i32);
  // E = the 11-bit biased f64 exponent, rebased to the f16 bias.
  SDValue E = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
                          N2: DAG.getConstant(Val: 20, DL, VT: MVT::i64));
  E = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: E,
                  N2: DAG.getConstant(Val: ExpMask, DL, VT: MVT::i32));
  // Subtract the fp64 exponent bias (1023) to get the real exponent and
  // add the f16 bias (15) to get the biased exponent for the f16 format.
  E = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: E,
                  N2: DAG.getConstant(Val: -ExpBiasf64 + ExpBiasf16, DL, VT: MVT::i32));

  // M = the top mantissa bits moved toward f16 position, keeping an extra
  // low bit for rounding.
  SDValue M = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
                          N2: DAG.getConstant(Val: 8, DL, VT: MVT::i32));
  M = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: M,
                  N2: DAG.getConstant(Val: 0xffe, DL, VT: MVT::i32));

  // MaskedSig gathers all discarded low mantissa bits (9 bits of UH plus the
  // whole low word); non-zero means the conversion is inexact.
  SDValue MaskedSig = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: UH,
                                  N2: DAG.getConstant(Val: 0x1ff, DL, VT: MVT::i32));
  MaskedSig = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: MaskedSig, N2: U);

  // Fold the inexactness into M's low bit (sticky bit).
  SDValue Lo40Set = DAG.getSelectCC(DL, LHS: MaskedSig, RHS: Zero, True: Zero, False: One, Cond: ISD::SETEQ);
  M = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M, N2: Lo40Set);

  // Inf/NaN result bits: (M != 0 ? 0x0200 : 0) | 0x7c00;
  // (a quieted NaN keeps a non-zero payload bit, Inf has a zero mantissa).
  SDValue I = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32,
                          N1: DAG.getSelectCC(DL, LHS: M, RHS: Zero, True: DAG.getConstant(Val: 0x0200, DL, VT: MVT::i32),
                                             False: Zero, Cond: ISD::SETNE), N2: DAG.getConstant(Val: 0x7c00, DL, VT: MVT::i32));

  // Normal-number encoding: N = M | (E << 12);
  SDValue N = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M,
                          N2: DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: E,
                                          N2: DAG.getConstant(Val: 12, DL, VT: MVT::i32)));

  // Denormal shift amount: B = clamp(1-E, 0, 13);
  SDValue OneSubExp = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i32,
                                  N1: One, N2: E);
  SDValue B = DAG.getNode(Opcode: ISD::SMAX, DL, VT: MVT::i32, N1: OneSubExp, N2: Zero);
  B = DAG.getNode(Opcode: ISD::SMIN, DL, VT: MVT::i32, N1: B,
                  N2: DAG.getConstant(Val: 13, DL, VT: MVT::i32));

  // Add the implicit leading one before shifting a denormal significand.
  SDValue SigSetHigh = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M,
                                   N2: DAG.getConstant(Val: 0x1000, DL, VT: MVT::i32));

  // D = the denormal significand; D1 is a sticky bit for bits shifted out.
  SDValue D = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: SigSetHigh, N2: B);
  SDValue D0 = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: D, N2: B);
  SDValue D1 = DAG.getSelectCC(DL, LHS: D0, RHS: SigSetHigh, True: One, False: Zero, Cond: ISD::SETNE);
  D = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: D, N2: D1);

  // Pick denormal (E < 1) or normal encoding, then round to nearest-even
  // using the low bits in VLow3 (increment when they read 3, 6 or 7).
  SDValue V = DAG.getSelectCC(DL, LHS: E, RHS: One, True: D, False: N, Cond: ISD::SETLT);
  SDValue VLow3 = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: V,
                              N2: DAG.getConstant(Val: 0x7, DL, VT: MVT::i32));
  V = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: V,
                  N2: DAG.getConstant(Val: 2, DL, VT: MVT::i32));
  SDValue V0 = DAG.getSelectCC(DL, LHS: VLow3, RHS: DAG.getConstant(Val: 3, DL, VT: MVT::i32),
                               True: One, False: Zero, Cond: ISD::SETEQ);
  SDValue V1 = DAG.getSelectCC(DL, LHS: VLow3, RHS: DAG.getConstant(Val: 5, DL, VT: MVT::i32),
                               True: One, False: Zero, Cond: ISD::SETGT);
  V1 = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: V0, N2: V1);
  V = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: V, N2: V1);

  // Exponents above 30 overflow to infinity; E == 1039 corresponds to an
  // all-ones f64 exponent (2047 - 1023 + 15), i.e. the Inf/NaN pattern I.
  V = DAG.getSelectCC(DL, LHS: E, RHS: DAG.getConstant(Val: 30, DL, VT: MVT::i32),
                      True: DAG.getConstant(Val: 0x7c00, DL, VT: MVT::i32), False: V, Cond: ISD::SETGT);
  V = DAG.getSelectCC(DL, LHS: E, RHS: DAG.getConstant(Val: 1039, DL, VT: MVT::i32),
                      True: I, False: V, Cond: ISD::SETEQ);

  // Extract the sign bit.
  SDValue Sign = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
                             N2: DAG.getConstant(Val: 16, DL, VT: MVT::i32));
  Sign = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: Sign,
                     N2: DAG.getConstant(Val: 0x8000, DL, VT: MVT::i32));

  return DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Sign, N2: V);
}
3841
3842SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3843 SelectionDAG &DAG) const {
3844 SDValue Src = Op.getOperand(i: 0);
3845 unsigned OpOpcode = Op.getOpcode();
3846 EVT SrcVT = Src.getValueType();
3847 EVT DestVT = Op.getValueType();
3848
3849 // Will be selected natively
3850 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3851 return Op;
3852
3853 if (SrcVT == MVT::bf16) {
3854 SDLoc DL(Op);
3855 SDValue PromotedSrc = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: Src);
3856 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DestVT, Operand: PromotedSrc);
3857 }
3858
3859 // Promote i16 to i32
3860 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3861 SDLoc DL(Op);
3862
3863 SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, Operand: Src);
3864 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToInt32);
3865 }
3866
3867 if (DestVT != MVT::i64)
3868 return Op;
3869
3870 if (SrcVT == MVT::f16 ||
3871 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3872 SDLoc DL(Op);
3873
3874 SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, Operand: Src);
3875 unsigned Ext =
3876 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3877 return DAG.getNode(Opcode: Ext, DL, VT: MVT::i64, Operand: FpToInt32);
3878 }
3879
3880 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3881 return LowerFP_TO_INT64(Op, DAG, Signed: OpOpcode == ISD::FP_TO_SINT);
3882
3883 return SDValue();
3884}
3885
// Custom-lower FP_TO_SINT_SAT/FP_TO_UINT_SAT by funneling everything through
// the natively supported i32-result, f32/f64-source form.
SDValue AMDGPUTargetLowering::LowerFP_TO_INT_SAT(const SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(i: 0);
  unsigned OpOpcode = Op.getOpcode();
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();
  // Operand 1 is the saturation width, carried as a VT node.
  SDValue SatVTOp = Op.getNode()->getOperand(Num: 1);
  EVT SatVT = cast<VTSDNode>(Val&: SatVTOp)->getVT();
  SDLoc DL(Op);

  uint64_t DstWidth = DstVT.getScalarSizeInBits();
  uint64_t SatWidth = SatVT.getScalarSizeInBits();
  assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");

  // Will be selected natively
  if (DstVT == MVT::i32 && SatWidth == DstWidth &&
      (SrcVT == MVT::f32 || SrcVT == MVT::f64))
    return Op;

  const SDValue Int32VT = DAG.getValueType(MVT::i32);

  // Perform all saturation at i32 and truncate
  if (SatWidth < DstWidth) {
    const uint64_t Int32Width = 32;
    // First saturate to the full i32 range, then clamp to the requested
    // narrower range with integer min/max.
    SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, N1: Src, N2: Int32VT);
    SDValue Int32SatVal;

    if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
      // Signed clamp: smax(smin(x, SMAX_sat), SMIN_sat).
      SDValue MinConst = DAG.getConstant(
          Val: APInt::getSignedMaxValue(numBits: SatWidth).sext(width: Int32Width), DL, VT: MVT::i32);
      SDValue MaxConst = DAG.getConstant(
          Val: APInt::getSignedMinValue(numBits: SatWidth).sext(width: Int32Width), DL, VT: MVT::i32);
      SDValue MinVal =
          DAG.getNode(Opcode: ISD::SMIN, DL, VT: MVT::i32, N1: FpToInt32, N2: MinConst);
      Int32SatVal = DAG.getNode(Opcode: ISD::SMAX, DL, VT: MVT::i32, N1: MinVal, N2: MaxConst);
    } else {
      // Unsigned clamp: umin(x, UMAX_sat); the low bound is already 0.
      SDValue MinConst = DAG.getConstant(
          Val: APInt::getMaxValue(numBits: SatWidth).zext(width: Int32Width), DL, VT: MVT::i32);
      Int32SatVal = DAG.getNode(Opcode: ISD::UMIN, DL, VT: MVT::i32, N1: FpToInt32, N2: MinConst);
    }

    if (DstWidth == Int32Width)
      return Int32SatVal;
    if (DstWidth < Int32Width)
      return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: Int32SatVal);

    // DstWidth > Int32Width
    // NOTE(review): this extends the i32-saturated value, not Int32SatVal;
    // it appears this path is only reached with SatWidth == 32 (where the
    // extra clamp is a no-op) — confirm against the callers below.
    const unsigned Ext =
        OpOpcode == ISD::FP_TO_SINT_SAT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    return DAG.getNode(Opcode: Ext, DL, VT: DstVT, Operand: FpToInt32);
  }

  // SatWidth == DstWidth

  // Saturate at i32 for i64 dst and 16b src (will invoke f16 promotion below)
  if (DstVT == MVT::i64 &&
      (SrcVT == MVT::f16 || SrcVT == MVT::bf16 ||
       (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP))) {
    return DAG.getNode(Opcode: OpOpcode, DL, VT: DstVT, N1: Src, N2: Int32VT);
  }

  // Promote f16/bf16 src to f32
  if (SrcVT == MVT::f16 || SrcVT == MVT::bf16) {
    SDValue PromotedSrc = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: Src);
    return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: PromotedSrc, N2: SatVTOp);
  }

  // Promote sub-i32 dst to i32 with sub-i32 saturation
  if (DstWidth < 32) {
    // Note: this triggers SatWidth < DstWidth above to generate saturated
    // truncate by requesting MVT::i32 destination with SatWidth < 32.
    SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, N1: Src, N2: SatVTOp);
    return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: FpToInt32);
  }

  // TODO: can we implement i64 dst for f32/f64?

  return SDValue();
}
3965
3966SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3967 SelectionDAG &DAG) const {
3968 EVT ExtraVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
3969 MVT VT = Op.getSimpleValueType();
3970 MVT ScalarVT = VT.getScalarType();
3971
3972 assert(VT.isVector());
3973
3974 SDValue Src = Op.getOperand(i: 0);
3975 SDLoc DL(Op);
3976
3977 // TODO: Don't scalarize on Evergreen?
3978 unsigned NElts = VT.getVectorNumElements();
3979 SmallVector<SDValue, 8> Args;
3980 DAG.ExtractVectorElements(Op: Src, Args, Start: 0, Count: NElts);
3981
3982 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3983 for (unsigned I = 0; I < NElts; ++I)
3984 Args[I] = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: ScalarVT, N1: Args[I], N2: VTOp);
3985
3986 return DAG.getBuildVector(VT, DL, Ops: Args);
3987}
3988
3989//===----------------------------------------------------------------------===//
3990// Custom DAG optimizations
3991//===----------------------------------------------------------------------===//
3992
3993static bool isU24(SDValue Op, SelectionDAG &DAG) {
3994 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3995}
3996
3997static bool isI24(SDValue Op, SelectionDAG &DAG) {
3998 EVT VT = Op.getValueType();
3999 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
4000 // as unsigned 24-bit values.
4001 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
4002}
4003
// Simplify the operands of a 24-bit multiply (either a MUL*_24 target node or
// one of the amdgcn 24-bit mul intrinsics) by demanding only their low 24
// bits; the intrinsic forms are also rewritten to the target nodes.
static SDValue simplifyMul24(SDNode *Node24,
                             TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;

  // Intrinsics carry the intrinsic ID as operand 0, so the multiplicands are
  // shifted by one position relative to the target-node form.
  SDValue LHS = IsIntrin ? Node24->getOperand(Num: 1) : Node24->getOperand(Num: 0);
  SDValue RHS = IsIntrin ? Node24->getOperand(Num: 2) : Node24->getOperand(Num: 1);
  unsigned NewOpcode = Node24->getOpcode();
  if (IsIntrin) {
    // Map each intrinsic onto the corresponding AMDGPUISD node.
    unsigned IID = Node24->getConstantOperandVal(Num: 0);
    switch (IID) {
    case Intrinsic::amdgcn_mul_i24:
      NewOpcode = AMDGPUISD::MUL_I24;
      break;
    case Intrinsic::amdgcn_mul_u24:
      NewOpcode = AMDGPUISD::MUL_U24;
      break;
    case Intrinsic::amdgcn_mulhi_i24:
      NewOpcode = AMDGPUISD::MULHI_I24;
      break;
    case Intrinsic::amdgcn_mulhi_u24:
      NewOpcode = AMDGPUISD::MULHI_U24;
      break;
    default:
      llvm_unreachable("Expected 24-bit mul intrinsic");
    }
  }

  // Only the low 24 bits of each multiplicand matter.
  APInt Demanded = APInt::getLowBitsSet(numBits: LHS.getValueSizeInBits(), loBitsSet: 24);

  // First try to simplify using SimplifyMultipleUseDemandedBits which allows
  // the operands to have other uses, but will only perform simplifications that
  // involve bypassing some nodes for this user.
  SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(Op: LHS, DemandedBits: Demanded, DAG);
  SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(Op: RHS, DemandedBits: Demanded, DAG);
  if (DemandedLHS || DemandedRHS)
    return DAG.getNode(Opcode: NewOpcode, DL: SDLoc(Node24), VTList: Node24->getVTList(),
                       N1: DemandedLHS ? DemandedLHS : LHS,
                       N2: DemandedRHS ? DemandedRHS : RHS);

  // Now try SimplifyDemandedBits which can simplify the nodes used by our
  // operands if this node is the only user.
  if (TLI.SimplifyDemandedBits(Op: LHS, DemandedBits: Demanded, DCI))
    return SDValue(Node24, 0);
  if (TLI.SimplifyDemandedBits(Op: RHS, DemandedBits: Demanded, DCI))
    return SDValue(Node24, 0);

  return SDValue();
}
4054
4055template <typename IntTy>
4056static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
4057 uint32_t Width, const SDLoc &DL) {
4058 if (Width + Offset < 32) {
4059 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
4060 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
4061 if constexpr (std::is_signed_v<IntTy>) {
4062 return DAG.getSignedConstant(Val: Result, DL, VT: MVT::i32);
4063 } else {
4064 return DAG.getConstant(Result, DL, MVT::i32);
4065 }
4066 }
4067
4068 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
4069}
4070
4071static bool hasVolatileUser(SDNode *Val) {
4072 for (SDNode *U : Val->users()) {
4073 if (MemSDNode *M = dyn_cast<MemSDNode>(Val: U)) {
4074 if (M->isVolatile())
4075 return true;
4076 }
4077 }
4078
4079 return false;
4080}
4081
4082bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
4083 // i32 vectors are the canonical memory type.
4084 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
4085 return false;
4086
4087 if (!VT.isByteSized())
4088 return false;
4089
4090 unsigned Size = VT.getStoreSize();
4091
4092 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
4093 return false;
4094
4095 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
4096 return false;
4097
4098 return true;
4099}
4100
// Replace load of an illegal type with a bitcast from a load of a friendlier
// type.
SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  // Only act before legalization, while the memory type is still negotiable.
  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Only simple (non-atomic, non-volatile), normal (non-extending,
  // non-indexed) loads without volatile users are safe to rewrite.
  LoadSDNode *LN = cast<LoadSDNode>(Val: N);
  if (!LN->isSimple() || !ISD::isNormalLoad(N: LN) || hasVolatileUser(Val: LN))
    return SDValue();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = LN->getMemoryVT();

  unsigned Size = VT.getStoreSize();
  Align Alignment = LN->getAlign();
  if (Alignment < Size && isTypeLegal(VT)) {
    unsigned IsFast;
    unsigned AS = LN->getAddressSpace();

    // Expand unaligned loads earlier than legalization. Due to visitation order
    // problems during legalization, the emitted instructions to pack and unpack
    // the bytes again are not eliminated in the case of an unaligned copy.
    if (!allowsMisalignedMemoryAccesses(
            VT, AddrSpace: AS, Alignment, Flags: LN->getMemOperand()->getFlags(), &IsFast)) {
      if (VT.isVector())
        return SplitVectorLoad(Op: SDValue(LN, 0), DAG);

      // Scalar case: expand to value + output chain and merge them.
      SDValue Ops[2];
      std::tie(args&: Ops[0], args&: Ops[1]) = expandUnalignedLoad(LD: LN, DAG);

      return DAG.getMergeValues(Ops, dl: SDLoc(N));
    }

    // Misaligned access is allowed but slow: leave it alone.
    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  EVT NewVT = getEquivalentMemType(Ctx&: *DAG.getContext(), VT);

  SDValue NewLoad
    = DAG.getLoad(VT: NewVT, dl: SL, Chain: LN->getChain(),
                  Ptr: LN->getBasePtr(), MMO: LN->getMemOperand());

  // Replace both the value (via bitcast back to VT) and the chain result.
  SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewLoad);
  DCI.CombineTo(N, Res0: BC, Res1: NewLoad.getValue(R: 1));
  return SDValue(N, 0);
}
4153
// Replace store of an illegal type with a store of a bitcast to a friendlier
// type.
SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  // Only act before legalization, while the memory type is still negotiable.
  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Only simple (non-atomic, non-volatile), normal (non-truncating,
  // non-indexed) stores are safe to rewrite.
  StoreSDNode *SN = cast<StoreSDNode>(Val: N);
  if (!SN->isSimple() || !ISD::isNormalStore(N: SN))
    return SDValue();

  EVT VT = SN->getMemoryVT();
  unsigned Size = VT.getStoreSize();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  Align Alignment = SN->getAlign();
  if (Alignment < Size && isTypeLegal(VT)) {
    unsigned IsFast;
    unsigned AS = SN->getAddressSpace();

    // Expand unaligned stores earlier than legalization. Due to visitation
    // order problems during legalization, the emitted instructions to pack and
    // unpack the bytes again are not eliminated in the case of an unaligned
    // copy.
    if (!allowsMisalignedMemoryAccesses(
            VT, AddrSpace: AS, Alignment, Flags: SN->getMemOperand()->getFlags(), &IsFast)) {
      if (VT.isVector())
        return SplitVectorStore(Op: SDValue(SN, 0), DAG);

      return expandUnalignedStore(ST: SN, DAG);
    }

    // Misaligned access is allowed but slow: leave it alone.
    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  EVT NewVT = getEquivalentMemType(Ctx&: *DAG.getContext(), VT);
  SDValue Val = SN->getValue();

  //DCI.AddToWorklist(Val.getNode());

  // If the stored value has other users, rewrite them through a cast-back so
  // the original (bitcast NewVT (bitcast VT)) pair can fold away.
  bool OtherUses = !Val.hasOneUse();
  SDValue CastVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: Val);
  if (OtherUses) {
    SDValue CastBack = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: CastVal);
    DAG.ReplaceAllUsesOfValueWith(From: Val, To: CastBack);
  }

  return DAG.getStore(Chain: SN->getChain(), dl: SL, Val: CastVal,
                      Ptr: SN->getBasePtr(), MMO: SN->getMemOperand());
}
4209
4210// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
4211// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
4212// issues.
4213SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
4214 DAGCombinerInfo &DCI) const {
4215 SelectionDAG &DAG = DCI.DAG;
4216 SDValue N0 = N->getOperand(Num: 0);
4217
4218 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
4219 // (vt2 (truncate (assertzext vt0:x, vt1)))
4220 if (N0.getOpcode() == ISD::TRUNCATE) {
4221 SDValue N1 = N->getOperand(Num: 1);
4222 EVT ExtVT = cast<VTSDNode>(Val&: N1)->getVT();
4223 SDLoc SL(N);
4224
4225 SDValue Src = N0.getOperand(i: 0);
4226 EVT SrcVT = Src.getValueType();
4227 if (SrcVT.bitsGE(VT: ExtVT)) {
4228 SDValue NewInReg = DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: SrcVT, N1: Src, N2: N1);
4229 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: N->getValueType(ResNo: 0), Operand: NewInReg);
4230 }
4231 }
4232
4233 return SDValue();
4234}
4235
// DAG combines for side-effect-free target intrinsics.
SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
  SDNode *N, DAGCombinerInfo &DCI) const {
  unsigned IID = N->getConstantOperandVal(Num: 0);
  switch (IID) {
  case Intrinsic::amdgcn_mul_i24:
  case Intrinsic::amdgcn_mul_u24:
  case Intrinsic::amdgcn_mulhi_i24:
  case Intrinsic::amdgcn_mulhi_u24:
    // Narrow the multiplicands to their low 24 bits and use the target nodes.
    return simplifyMul24(Node24: N, DCI);
  case Intrinsic::amdgcn_fract:
  case Intrinsic::amdgcn_rsq:
  case Intrinsic::amdgcn_rcp_legacy:
  case Intrinsic::amdgcn_rsq_legacy:
  case Intrinsic::amdgcn_rsq_clamp:
  case Intrinsic::amdgcn_tanh:
  case Intrinsic::amdgcn_prng_b32: {
    // Fold these to undef when their input is undef.
    // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
    SDValue Src = N->getOperand(Num: 1);
    return Src.isUndef() ? Src : SDValue();
  }
  case Intrinsic::amdgcn_frexp_exp: {
    // The exponent is unaffected by the sign of the input:
    // frexp_exp (fneg x) -> frexp_exp x
    // frexp_exp (fabs x) -> frexp_exp x
    // frexp_exp (fneg (fabs x)) -> frexp_exp x
    SDValue Src = N->getOperand(Num: 1);
    SDValue PeekSign = peekFPSignOps(Val: Src);
    if (PeekSign == Src)
      return SDValue();
    // Update the source operand in place rather than building a new node.
    return SDValue(DCI.DAG.UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), Op2: PeekSign),
                   0);
  }
  default:
    return SDValue();
  }
}
4271
4272/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4273/// binary operation \p Opc to it with the corresponding constant operands.
4274SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
4275 DAGCombinerInfo &DCI, const SDLoc &SL,
4276 unsigned Opc, SDValue LHS,
4277 uint32_t ValLo, uint32_t ValHi) const {
4278 SelectionDAG &DAG = DCI.DAG;
4279 SDValue Lo, Hi;
4280 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: LHS, DAG);
4281
4282 SDValue LoRHS = DAG.getConstant(Val: ValLo, DL: SL, VT: MVT::i32);
4283 SDValue HiRHS = DAG.getConstant(Val: ValHi, DL: SL, VT: MVT::i32);
4284
4285 SDValue LoAnd = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, N1: Lo, N2: LoRHS);
4286 SDValue HiAnd = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, N1: Hi, N2: HiRHS);
4287
4288 // Re-visit the ands. It's possible we eliminated one of them and it could
4289 // simplify the vector.
4290 DCI.AddToWorklist(N: Lo.getNode());
4291 DCI.AddToWorklist(N: Hi.getNode());
4292
4293 SDValue Vec = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {LoAnd, HiAnd});
4294 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
4295}
4296
// Combine SHL: fold shifts of extends into packed build_vectors or narrower
// shifts, and split 64-bit shifts by >= 32 into a 32-bit shift of the low
// half placed into the high half.
SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(ResNo: 0);
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);
  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;

  unsigned RHSVal;
  if (CRHS) {
    RHSVal = CRHS->getZExtValue();
    // shl x, 0 -> x
    if (!RHSVal)
      return LHS;

    switch (LHS->getOpcode()) {
    default:
      break;
    case ISD::ZERO_EXTEND:
    case ISD::SIGN_EXTEND:
    case ISD::ANY_EXTEND: {
      SDValue X = LHS->getOperand(Num: 0);

      if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
          isOperationLegal(Op: ISD::BUILD_VECTOR, VT: MVT::v2i16)) {
        // Prefer build_vector as the canonical form if packed types are legal.
        // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
        SDValue Vec = DAG.getBuildVector(
            VT: MVT::v2i16, DL: SL,
            Ops: {DAG.getConstant(Val: 0, DL: SL, VT: MVT::i16), LHS->getOperand(Num: 0)});
        return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Vec);
      }

      // shl (ext x) => zext (shl x), if shift does not overflow int
      if (VT != MVT::i64)
        break;
      KnownBits Known = DAG.computeKnownBits(Op: X);
      unsigned LZ = Known.countMinLeadingZeros();
      // Only safe if the shifted-out bits are known zero in the narrow type.
      if (LZ < RHSVal)
        break;
      EVT XVT = X.getValueType();
      SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: XVT, N1: X, N2: SDValue(CRHS, 0));
      return DAG.getZExtOrTrunc(Op: Shl, DL: SL, VT);
    }
    }
  }

  // The remainder only applies to (vectors of) i64.
  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  KnownBits Known = DAG.computeKnownBits(Op: RHS);

  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
  EVT TargetType = VT.changeElementType(Context&: *DAG.getContext(), EltVT: TargetScalarType);

  // Only valid when the shift amount is provably >= 32: the low half of the
  // result is then known zero.
  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();
  SDValue ShiftAmt;

  if (CRHS) {
    ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
                               VT: TargetType);
  } else {
    SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
    const SDValue ShiftMask =
        DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
    ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
  }

  // hi = lo32(LHS) << (amt - 32); lo = 0.
  SDValue Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: LHS);
  SDValue NewShift =
      DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: TargetType, N1: Lo, N2: ShiftAmt, Flags: N->getFlags());

  const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: TargetScalarType);
  SDValue Vec;

  if (VT.isVector()) {
    // Interleave zero low halves with the shifted high halves:
    // element 2*I is zero, element 2*I+1 is the shifted lane I.
    EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
    unsigned NElts = TargetType.getVectorNumElements();
    SmallVector<SDValue, 8> HiOps;
    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);

    DAG.ExtractVectorElements(Op: NewShift, Args&: HiOps, Start: 0, Count: NElts);
    for (unsigned I = 0; I != NElts; ++I)
      HiAndLoOps[2 * I + 1] = HiOps[I];
    Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
  } else {
    EVT ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
    Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {Zero, NewShift});
  }
  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
}
4395
// Combine SRA on (vectors of) i64: when the shift amount is provably >= 32,
// compute the result from the high 32-bit half with 32-bit shifts.
SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue RHS = N->getOperand(Num: 1);
  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
  EVT VT = N->getValueType(ResNo: 0);
  SDValue LHS = N->getOperand(Num: 0);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // For C >= 32
  // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), sra hi_32(x), 31))

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  KnownBits Known = DAG.computeKnownBits(Op: RHS);

  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
  EVT TargetType = VT.changeElementType(Context&: *DAG.getContext(), EltVT: TargetScalarType);

  // Only valid when the shift amount is provably >= 32.
  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();

  SDValue ShiftFullAmt =
      DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
  SDValue ShiftAmt;
  if (CRHS) {
    unsigned RHSVal = CRHS->getZExtValue();
    ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
                               VT: TargetType);
  } else if (Known.getMinValue().getZExtValue() ==
             (ElementType.getSizeInBits() - 1)) {
    // Shift amount is exactly 63: the low result is hi >> 31 as well.
    ShiftAmt = ShiftFullAmt;
  } else {
    SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
    const SDValue ShiftMask =
        DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
    ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
  }

  EVT ConcatType;
  SDValue Hi;
  SDLoc LHSSL(LHS);
  // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
  if (VT.isVector()) {
    // Odd elements of the double-width vector hold the high halves.
    unsigned NElts = TargetType.getVectorNumElements();
    ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
    SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
    SmallVector<SDValue, 8> HiOps(NElts);
    SmallVector<SDValue, 16> HiAndLoOps;

    DAG.ExtractVectorElements(Op: SplitLHS, Args&: HiAndLoOps, Start: 0, Count: NElts * 2);
    for (unsigned I = 0; I != NElts; ++I) {
      HiOps[I] = HiAndLoOps[2 * I + 1];
    }
    Hi = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: LHSSL, VT: TargetType, Ops: HiOps);
  } else {
    const SDValue One = DAG.getConstant(Val: 1, DL: LHSSL, VT: TargetScalarType);
    ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
    SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
    Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: LHSSL, VT: TargetType, N1: SplitLHS, N2: One);
  }

  // HiShift fills the result's high half with sign bits: all ones when the
  // source is known negative, otherwise hi >> 31.
  KnownBits KnownLHS = DAG.computeKnownBits(Op: LHS);
  SDValue HiShift;
  if (KnownLHS.isNegative()) {
    HiShift = DAG.getAllOnesConstant(DL: SL, VT: TargetType);
  } else {
    // Freeze so the two uses of Hi below observe a single consistent value.
    Hi = DAG.getFreeze(V: Hi);
    HiShift = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: TargetType, N1: Hi, N2: ShiftFullAmt);
  }
  SDValue NewShift =
      DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: TargetType, N1: Hi, N2: ShiftAmt, Flags: N->getFlags());

  SDValue Vec;
  if (VT.isVector()) {
    // Reassemble: even elements take the shifted lanes (low halves), odd
    // elements take the sign-fill lanes (high halves).
    unsigned NElts = TargetType.getVectorNumElements();
    SmallVector<SDValue, 8> HiOps;
    SmallVector<SDValue, 8> LoOps;
    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);

    DAG.ExtractVectorElements(Op: HiShift, Args&: HiOps, Start: 0, Count: NElts);
    DAG.ExtractVectorElements(Op: NewShift, Args&: LoOps, Start: 0, Count: NElts);
    for (unsigned I = 0; I != NElts; ++I) {
      HiAndLoOps[2 * I + 1] = HiOps[I];
      HiAndLoOps[2 * I] = LoOps[I];
    }
    Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
  } else {
    Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {NewShift, HiShift});
  }
  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
}
4495
// Combine logical shift right of i64 (scalar or vector of i64 elements):
// fold (srl (and x, shifted-mask), c) to expose BFE patterns, and when every
// lane shifts by >= 32, rewrite as a 32-bit shift of the high half with a
// zero high word.
SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue RHS = N->getOperand(Num: 1);
  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
  EVT VT = N->getValueType(ResNo: 0);
  SDValue LHS = N->getOperand(Num: 0);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  // Only meaningful when CRHS is non-null; every read below is guarded by
  // the same `if (CRHS)` condition that initializes it.
  unsigned RHSVal;

  if (CRHS) {
    RHSVal = CRHS->getZExtValue();

    // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
    // this improves the ability to match BFE patterns in isel.
    if (LHS.getOpcode() == ISD::AND) {
      if (auto *Mask = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1))) {
        unsigned MaskIdx, MaskLen;
        // Only fold when the mask is a contiguous run of ones that begins
        // exactly at the shift amount, so the shifted mask stays contiguous.
        if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
            MaskIdx == RHSVal) {
          return DAG.getNode(Opcode: ISD::AND, DL: SL, VT,
                             N1: DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: LHS.getOperand(i: 0),
                                            N2: N->getOperand(Num: 1)),
                             N2: DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: LHS.getOperand(i: 1),
                                            N2: N->getOperand(Num: 1)));
        }
      }
    }
  }

  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // for C >= 32
  // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  KnownBits Known = DAG.computeKnownBits(Op: RHS);

  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
  EVT TargetType = VT.changeElementType(Context&: *DAG.getContext(), EltVT: TargetScalarType);

  // The transform is only valid when every lane is known to shift by at
  // least 32, so none of the low half of the source reaches the result.
  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();

  SDValue ShiftAmt;
  if (CRHS) {
    // Constant shift: the replacement 32-bit shift amount is simply C - 32.
    ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
                               VT: TargetType);
  } else {
    SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
    const SDValue ShiftMask =
        DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
    ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
  }

  const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: TargetScalarType);
  EVT ConcatType;
  SDValue Hi;
  SDLoc LHSSL(LHS);
  // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
    SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
    SmallVector<SDValue, 8> HiOps(NElts);
    SmallVector<SDValue, 16> HiAndLoOps;

    // After the bitcast, elements alternate lo/hi halves: odd indices hold
    // the high halves of the original i64 lanes.
    DAG.ExtractVectorElements(Op: SplitLHS, Args&: HiAndLoOps, /*Start=*/0, Count: NElts * 2);
    for (unsigned I = 0; I != NElts; ++I)
      HiOps[I] = HiAndLoOps[2 * I + 1];
    Hi = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: LHSSL, VT: TargetType, Ops: HiOps);
  } else {
    const SDValue One = DAG.getConstant(Val: 1, DL: LHSSL, VT: TargetScalarType);
    ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
    SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
    Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: LHSSL, VT: TargetType, N1: SplitLHS, N2: One);
  }

  // Logical shift right: the new high half of each lane is always zero, so
  // only the shifted high half needs computing.
  SDValue NewShift =
      DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: TargetType, N1: Hi, N2: ShiftAmt, Flags: N->getFlags());

  SDValue Vec;
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    SmallVector<SDValue, 8> LoOps;
    // Pre-fill with Zero so the odd (high-half) slots remain zero.
    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);

    DAG.ExtractVectorElements(Op: NewShift, Args&: LoOps, Start: 0, Count: NElts);
    for (unsigned I = 0; I != NElts; ++I)
      HiAndLoOps[2 * I] = LoOps[I];
    Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
  } else {
    Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {NewShift, Zero});
  }
  // Reinterpret the interleaved halves back as the original i64 type.
  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
}
4598
// Simplify truncates: look through bitcasts of build_vectors (including the
// shifted-high-element form), and shrink 64-bit shifts to 32-bit when the
// truncated result only needs the low bits.
SDValue AMDGPUTargetLowering::performTruncateCombine(
    SDNode *N, DAGCombinerInfo &DCI) const {
  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);
  SDValue Src = N->getOperand(Num: 0);

  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
  if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
    SDValue Vec = Src.getOperand(i: 0);
    if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
      SDValue Elt0 = Vec.getOperand(i: 0);
      EVT EltVT = Elt0.getValueType();
      // Only valid when element 0 fully covers the truncated result.
      if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
        // TRUNCATE requires an integer input; reinterpret FP elements first.
        if (EltVT.isFloatingPoint()) {
          Elt0 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL,
                             VT: EltVT.changeTypeToInteger(), Operand: Elt0);
        }

        return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Elt0);
      }
    }
  }

  // Equivalent of above for accessing the high element of a vector as an
  // integer operation.
  // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
  if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
    if (auto *K = isConstOrConstSplat(N: Src.getOperand(i: 1))) {
      SDValue BV = stripBitcast(Val: Src.getOperand(i: 0));
      if (BV.getOpcode() == ISD::BUILD_VECTOR) {
        EVT SrcEltVT = BV.getOperand(i: 0).getValueType();
        unsigned SrcEltSize = SrcEltVT.getSizeInBits();
        unsigned BitIndex = K->getZExtValue();
        unsigned PartIndex = BitIndex / SrcEltSize;

        // The shift amount must land exactly on an element boundary and
        // select an element that actually exists.
        if (PartIndex * SrcEltSize == BitIndex &&
            PartIndex < BV.getNumOperands()) {
          if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
            SDValue SrcElt =
                DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: SrcEltVT.changeTypeToInteger(),
                            Operand: BV.getOperand(i: PartIndex));
            return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: SrcElt);
          }
        }
      }
    }
  }

  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
  //
  // i16 (trunc (srl i64:x, K)), K <= 16 ->
  //      i16 (trunc (srl (i32 (trunc x), K)))
  if (VT.getScalarSizeInBits() < 32) {
    EVT SrcVT = Src.getValueType();
    if (SrcVT.getScalarSizeInBits() > 32 &&
        (Src.getOpcode() == ISD::SRL ||
         Src.getOpcode() == ISD::SRA ||
         Src.getOpcode() == ISD::SHL)) {
      SDValue Amt = Src.getOperand(i: 1);
      KnownBits Known = DAG.computeKnownBits(Op: Amt);

      // - For left shifts, do the transform as long as the shift
      //   amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
      // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
      //   losing information stored in the high bits when truncating.
      const unsigned MaxCstSize =
        (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
      if (Known.getMaxValue().ule(RHS: MaxCstSize)) {
        EVT MidVT = VT.isVector() ?
          EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
                           NumElements: VT.getVectorNumElements()) : MVT::i32;

        EVT NewShiftVT = getShiftAmountTy(LHSTy: MidVT, DL: DAG.getDataLayout());
        SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MidVT,
                                    Operand: Src.getOperand(i: 0));
        DCI.AddToWorklist(N: Trunc.getNode());

        // The legal shift-amount type for the 32-bit form may differ from
        // the one the 64-bit shift used; adjust it if necessary.
        if (Amt.getValueType() != NewShiftVT) {
          Amt = DAG.getZExtOrTrunc(Op: Amt, DL: SL, VT: NewShiftVT);
          DCI.AddToWorklist(N: Amt.getNode());
        }

        SDValue ShrunkShift = DAG.getNode(Opcode: Src.getOpcode(), DL: SL, VT: MidVT,
                                          N1: Trunc, N2: Amt);
        return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: ShrunkShift);
      }
    }
  }

  return SDValue();
}
4691
4692// We need to specifically handle i64 mul here to avoid unnecessary conversion
4693// instructions. If we only match on the legalized i64 mul expansion,
4694// SimplifyDemandedBits will be unable to remove them because there will be
4695// multiple uses due to the separate mul + mulh[su].
4696static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4697 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4698 if (Size <= 32) {
4699 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4700 return DAG.getNode(Opcode: MulOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4701 }
4702
4703 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4704 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4705
4706 SDValue MulLo = DAG.getNode(Opcode: MulLoOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4707 SDValue MulHi = DAG.getNode(Opcode: MulHiOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4708
4709 return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SL, VT: MVT::i64, N1: MulLo, N2: MulHi);
4710}
4711
4712/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4713/// return SDValue().
4714static SDValue getAddOneOp(const SDNode *V) {
4715 if (V->getOpcode() != ISD::ADD)
4716 return SDValue();
4717
4718 return isOneConstant(V: V->getOperand(Num: 1)) ? V->getOperand(Num: 0) : SDValue();
4719}
4720
// Try to rewrite a divergent scalar mul into AMDGPU's fast 24-bit multiply,
// and undo InstCombine's x*(y+1) canonicalization so mul+add (mad) patterns
// can match during selection.
SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::MUL);
  EVT VT = N->getValueType(ResNo: 0);

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  if (!N->isDivergent())
    return SDValue();

  unsigned Size = VT.getSizeInBits();
  if (VT.isVector() || Size > 64)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue N0 = N->getOperand(Num: 0);
  SDValue N1 = N->getOperand(Num: 1);

  // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
  // matching.

  // mul x, (add y, 1) -> add (mul x, y), x
  auto IsFoldableAdd = [](SDValue V) -> SDValue {
    SDValue AddOp = getAddOneOp(V: V.getNode());
    if (!AddOp)
      return SDValue();

    // Only fold when the add has a single use, or every user is itself a
    // multiply that benefits from the same rewrite.
    if (V.hasOneUse() || all_of(Range: V->users(), P: [](const SDNode *U) -> bool {
          return U->getOpcode() == ISD::MUL;
        }))
      return AddOp;

    return SDValue();
  };

  // FIXME: The selection pattern is not properly checking for commuted
  // operands, so we have to place the mul in the LHS
  if (SDValue MulOper = IsFoldableAdd(N0)) {
    SDValue MulVal = DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1, N2: MulOper);
    return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: MulVal, N2: N1);
  }

  if (SDValue MulOper = IsFoldableAdd(N1)) {
    SDValue MulVal = DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1: N0, N2: MulOper);
    return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: MulVal, N2: N0);
  }

  // There are i16 integer mul/mad.
  if (isTypeLegal(VT: MVT::i16) && VT.getScalarType().bitsLE(VT: MVT::i16))
    return SDValue();

  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
  // in the source into any_extends if the result of the mul is truncated. Since
  // we can assume the high bits are whatever we want, use the underlying value
  // to avoid the unknown high bits from interfering.
  if (N0.getOpcode() == ISD::ANY_EXTEND)
    N0 = N0.getOperand(i: 0);

  if (N1.getOpcode() == ISD::ANY_EXTEND)
    N1 = N1.getOperand(i: 0);

  SDValue Mul;

  // Prefer the unsigned form when both operands are known to fit in 24 bits.
  if (Subtarget->hasMulU24() && isU24(Op: N0, DAG) && isU24(Op: N1, DAG)) {
    N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
    N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
    Mul = getMul24(DAG, SL: DL, N0, N1, Size, Signed: false);
  } else if (Subtarget->hasMulI24() && isI24(Op: N0, DAG) && isI24(Op: N1, DAG)) {
    N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
    N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
    Mul = getMul24(DAG, SL: DL, N0, N1, Size, Signed: true);
  } else {
    return SDValue();
  }

  // We need to use sext even for MUL_U24, because MUL_U24 is used
  // for signed multiply of 8 and 16-bit types.
  return DAG.getSExtOrTrunc(Op: Mul, DL, VT);
}
4804
4805SDValue
4806AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4807 DAGCombinerInfo &DCI) const {
4808 if (N->getValueType(ResNo: 0) != MVT::i32)
4809 return SDValue();
4810
4811 SelectionDAG &DAG = DCI.DAG;
4812 SDLoc DL(N);
4813
4814 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4815 SDValue N0 = N->getOperand(Num: 0);
4816 SDValue N1 = N->getOperand(Num: 1);
4817
4818 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4819 // in the source into any_extends if the result of the mul is truncated. Since
4820 // we can assume the high bits are whatever we want, use the underlying value
4821 // to avoid the unknown high bits from interfering.
4822 if (N0.getOpcode() == ISD::ANY_EXTEND)
4823 N0 = N0.getOperand(i: 0);
4824 if (N1.getOpcode() == ISD::ANY_EXTEND)
4825 N1 = N1.getOperand(i: 0);
4826
4827 // Try to use two fast 24-bit multiplies (one for each half of the result)
4828 // instead of one slow extending multiply.
4829 unsigned LoOpcode = 0;
4830 unsigned HiOpcode = 0;
4831 if (Signed) {
4832 if (Subtarget->hasMulI24() && isI24(Op: N0, DAG) && isI24(Op: N1, DAG)) {
4833 N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4834 N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4835 LoOpcode = AMDGPUISD::MUL_I24;
4836 HiOpcode = AMDGPUISD::MULHI_I24;
4837 }
4838 } else {
4839 if (Subtarget->hasMulU24() && isU24(Op: N0, DAG) && isU24(Op: N1, DAG)) {
4840 N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4841 N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4842 LoOpcode = AMDGPUISD::MUL_U24;
4843 HiOpcode = AMDGPUISD::MULHI_U24;
4844 }
4845 }
4846 if (!LoOpcode)
4847 return SDValue();
4848
4849 SDValue Lo = DAG.getNode(Opcode: LoOpcode, DL, VT: MVT::i32, N1: N0, N2: N1);
4850 SDValue Hi = DAG.getNode(Opcode: HiOpcode, DL, VT: MVT::i32, N1: N0, N2: N1);
4851 DCI.CombineTo(N, Res0: Lo, Res1: Hi);
4852 return SDValue(N, 0);
4853}
4854
4855SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4856 DAGCombinerInfo &DCI) const {
4857 EVT VT = N->getValueType(ResNo: 0);
4858
4859 if (!Subtarget->hasMulI24() || VT.isVector())
4860 return SDValue();
4861
4862 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4863 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4864 // unnecessarily). isDivergent() is used as an approximation of whether the
4865 // value is in an SGPR.
4866 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4867 // valu op anyway)
4868 if (Subtarget->hasSMulHi() && !N->isDivergent())
4869 return SDValue();
4870
4871 SelectionDAG &DAG = DCI.DAG;
4872 SDLoc DL(N);
4873
4874 SDValue N0 = N->getOperand(Num: 0);
4875 SDValue N1 = N->getOperand(Num: 1);
4876
4877 if (!isI24(Op: N0, DAG) || !isI24(Op: N1, DAG))
4878 return SDValue();
4879
4880 N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4881 N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4882
4883 SDValue Mulhi = DAG.getNode(Opcode: AMDGPUISD::MULHI_I24, DL, VT: MVT::i32, N1: N0, N2: N1);
4884 DCI.AddToWorklist(N: Mulhi.getNode());
4885 return DAG.getSExtOrTrunc(Op: Mulhi, DL, VT);
4886}
4887
4888SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4889 DAGCombinerInfo &DCI) const {
4890 EVT VT = N->getValueType(ResNo: 0);
4891
4892 if (VT.isVector() || VT.getSizeInBits() > 32 || !Subtarget->hasMulU24())
4893 return SDValue();
4894
4895 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4896 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4897 // unnecessarily). isDivergent() is used as an approximation of whether the
4898 // value is in an SGPR.
4899 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4900 // valu op anyway)
4901 if (!N->isDivergent() && Subtarget->hasSMulHi())
4902 return SDValue();
4903
4904 SelectionDAG &DAG = DCI.DAG;
4905 SDLoc DL(N);
4906
4907 SDValue N0 = N->getOperand(Num: 0);
4908 SDValue N1 = N->getOperand(Num: 1);
4909
4910 if (!isU24(Op: N0, DAG) || !isU24(Op: N1, DAG))
4911 return SDValue();
4912
4913 N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4914 N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4915
4916 SDValue Mulhi = DAG.getNode(Opcode: AMDGPUISD::MULHI_U24, DL, VT: MVT::i32, N1: N0, N2: N1);
4917 DCI.AddToWorklist(N: Mulhi.getNode());
4918 return DAG.getZExtOrTrunc(Op: Mulhi, DL, VT);
4919}
4920
4921SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4922 SDValue Op,
4923 const SDLoc &DL,
4924 unsigned Opc) const {
4925 EVT VT = Op.getValueType();
4926 if (VT.bitsGT(VT: MVT::i32))
4927 return SDValue();
4928
4929 if (VT != MVT::i32)
4930 Op = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: Op);
4931
4932 SDValue FFBX = DAG.getNode(Opcode: Opc, DL, VT: MVT::i32, Operand: Op);
4933 if (VT != MVT::i32)
4934 FFBX = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: FFBX);
4935
4936 return FFBX;
4937}
4938
4939// The native instructions return -1 on 0 input. Optimize out a select that
4940// produces -1 on 0.
4941//
4942// TODO: If zero is not undef, we could also do this if the output is compared
4943// against the bitwidth.
4944//
4945// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4946SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4947 SDValue LHS, SDValue RHS,
4948 DAGCombinerInfo &DCI) const {
4949 if (!isNullConstant(V: Cond.getOperand(i: 1)))
4950 return SDValue();
4951
4952 SelectionDAG &DAG = DCI.DAG;
4953 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val: Cond.getOperand(i: 2))->get();
4954 SDValue CmpLHS = Cond.getOperand(i: 0);
4955
4956 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4957 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4958 if (CCOpcode == ISD::SETEQ &&
4959 (isCtlzOpc(Opc: RHS.getOpcode()) || isCttzOpc(Opc: RHS.getOpcode())) &&
4960 RHS.getOperand(i: 0) == CmpLHS && isAllOnesConstant(V: LHS)) {
4961 unsigned Opc =
4962 isCttzOpc(Opc: RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4963 return getFFBX_U32(DAG, Op: CmpLHS, DL: SL, Opc);
4964 }
4965
4966 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4967 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4968 if (CCOpcode == ISD::SETNE &&
4969 (isCtlzOpc(Opc: LHS.getOpcode()) || isCttzOpc(Opc: LHS.getOpcode())) &&
4970 LHS.getOperand(i: 0) == CmpLHS && isAllOnesConstant(V: RHS)) {
4971 unsigned Opc =
4972 isCttzOpc(Opc: LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4973
4974 return getFFBX_U32(DAG, Op: CmpLHS, DL: SL, Opc);
4975 }
4976
4977 return SDValue();
4978}
4979
4980static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4981 unsigned Op,
4982 const SDLoc &SL,
4983 SDValue Cond,
4984 SDValue N1,
4985 SDValue N2) {
4986 SelectionDAG &DAG = DCI.DAG;
4987 EVT VT = N1.getValueType();
4988
4989 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Cond,
4990 N2: N1.getOperand(i: 0), N3: N2.getOperand(i: 0));
4991 DCI.AddToWorklist(N: NewSelect.getNode());
4992 return DAG.getNode(Opcode: Op, DL: SL, VT, Operand: NewSelect);
4993}
4994
// Pull a free FP operation out of a select so it may fold into uses.
//
// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
// select c, (fneg x), k -> fneg (select c, x, (fneg k))
//
// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
// select c, (fabs x), +k -> fabs (select c, x, k)
SDValue
AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
                                           SDValue N) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Cond = N.getOperand(i: 0);
  SDValue LHS = N.getOperand(i: 1);
  SDValue RHS = N.getOperand(i: 2);

  EVT VT = N.getValueType();
  // Both arms carry the same modifier: hoist it above the select, but only
  // when every user of the select can absorb it as a source modifier.
  if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
      (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
    if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N: N.getNode()))
      return SDValue();

    return distributeOpThroughSelect(DCI, Op: LHS.getOpcode(),
                                     SL: SDLoc(N), Cond, N1: LHS, N2: RHS);
  }

  // Canonicalize the modified operand into LHS; Inv records that the arms
  // must be swapped back when rebuilding the select.
  bool Inv = false;
  if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
    std::swap(a&: LHS, b&: RHS);
    Inv = true;
  }

  // TODO: Support vector constants.
  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
      !selectSupportsSourceMods(N: N.getNode())) {
    SDLoc SL(N);
    // If one side is an fneg/fabs and the other is a constant, we can push the
    // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
    SDValue NewLHS = LHS.getOperand(i: 0);
    SDValue NewRHS = RHS;

    // Careful: if the neg can be folded up, don't try to pull it back down.
    bool ShouldFoldNeg = true;

    if (NewLHS.hasOneUse()) {
      unsigned Opc = NewLHS.getOpcode();
      if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(N: NewLHS.getNode()))
        ShouldFoldNeg = false;
      if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
        ShouldFoldNeg = false;
    }

    if (ShouldFoldNeg) {
      // An fabs result is never negative, so it cannot match a select arm
      // holding a negative constant.
      if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
        return SDValue();

      // We're going to be forced to use a source modifier anyway, there's no
      // point to pulling the negate out unless we can get a size reduction by
      // negating the constant.
      //
      // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
      // about cheaper constants.
      if (NewLHS.getOpcode() == ISD::FABS &&
          getConstantNegateCost(C: CRHS) != NegatibleCost::Cheaper)
        return SDValue();

      if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N: N.getNode()))
        return SDValue();

      // Pushing an fneg through the select negates the constant arm too.
      if (LHS.getOpcode() == ISD::FNEG)
        NewRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);

      if (Inv)
        std::swap(a&: NewLHS, b&: NewRHS);

      SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT,
                                      N1: Cond, N2: NewLHS, N3: NewRHS);
      DCI.AddToWorklist(N: NewSelect.getNode());
      return DAG.getNode(Opcode: LHS.getOpcode(), DL: SL, VT, Operand: NewSelect);
    }
  }

  return SDValue();
}
5079
// Combine select: pull a free fneg/fabs out of the arms, canonicalize a
// constant into the false input, form legacy fmin/fmax, and fold
// ctlz/cttz-with-zero-check patterns into ffbh/ffbl.
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  if (SDValue Folded = foldFreeOpFromSelect(DCI, N: SDValue(N, 0)))
    return Folded;

  SDValue Cond = N->getOperand(Num: 0);
  if (Cond.getOpcode() != ISD::SETCC)
    return SDValue();

  EVT VT = N->getValueType(ResNo: 0);
  SDValue LHS = Cond.getOperand(i: 0);
  SDValue RHS = Cond.getOperand(i: 1);
  SDValue CC = Cond.getOperand(i: 2);

  SDValue True = N->getOperand(Num: 1);
  SDValue False = N->getOperand(Num: 2);

  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
    SelectionDAG &DAG = DCI.DAG;
    if (DAG.isConstantValueOfAnyType(N: True) &&
        !DAG.isConstantValueOfAnyType(N: False)) {
      // Swap cmp + select pair to move constant to false input.
      // This will allow using VOPC cndmasks more often.
      // select (setcc x, y), k, x -> select (setccinv x, y), x, k

      SDLoc SL(N);
      ISD::CondCode NewCC =
          getSetCCInverse(Operation: cast<CondCodeSDNode>(Val&: CC)->get(), Type: LHS.getValueType());

      SDValue NewCond = DAG.getSetCC(DL: SL, VT: Cond.getValueType(), LHS, RHS, Cond: NewCC);
      return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NewCond, N2: False, N3: True);
    }

    if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
      SDValue MinMax
        = combineFMinMaxLegacy(DL: SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
      // Revisit this node so we can catch min3/max3/med3 patterns.
      //DCI.AddToWorklist(MinMax.getNode());
      return MinMax;
    }
  }

  // There's no reason to not do this if the condition has other uses.
  return performCtlz_CttzCombine(SL: SDLoc(N), Cond, LHS: True, RHS: False, DCI);
}
5125
5126static bool isInv2Pi(const APFloat &APF) {
5127 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
5128 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
5129 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
5130
5131 return APF.bitwiseIsEqual(RHS: KF16) ||
5132 APF.bitwiseIsEqual(RHS: KF32) ||
5133 APF.bitwiseIsEqual(RHS: KF64);
5134}
5135
5136// 0 and 1.0 / (0.5 * pi) do not have inline immmediates, so there is an
5137// additional cost to negate them.
5138TargetLowering::NegatibleCost
5139AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
5140 if (C->isZero())
5141 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5142
5143 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(APF: C->getValueAPF()))
5144 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5145
5146 return NegatibleCost::Neutral;
5147}
5148
5149bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
5150 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
5151 return getConstantNegateCost(C) == NegatibleCost::Expensive;
5152 return false;
5153}
5154
5155bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
5156 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
5157 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
5158 return false;
5159}
5160
5161static unsigned inverseMinMax(unsigned Opc) {
5162 switch (Opc) {
5163 case ISD::FMAXNUM:
5164 return ISD::FMINNUM;
5165 case ISD::FMINNUM:
5166 return ISD::FMAXNUM;
5167 case ISD::FMAXNUM_IEEE:
5168 return ISD::FMINNUM_IEEE;
5169 case ISD::FMINNUM_IEEE:
5170 return ISD::FMAXNUM_IEEE;
5171 case ISD::FMAXIMUM:
5172 return ISD::FMINIMUM;
5173 case ISD::FMINIMUM:
5174 return ISD::FMAXIMUM;
5175 case ISD::FMAXIMUMNUM:
5176 return ISD::FMINIMUMNUM;
5177 case ISD::FMINIMUMNUM:
5178 return ISD::FMAXIMUMNUM;
5179 case AMDGPUISD::FMAX_LEGACY:
5180 return AMDGPUISD::FMIN_LEGACY;
5181 case AMDGPUISD::FMIN_LEGACY:
5182 return AMDGPUISD::FMAX_LEGACY;
5183 default:
5184 llvm_unreachable("invalid min/max opcode");
5185 }
5186}
5187
5188/// \return true if it's profitable to try to push an fneg into its source
5189/// instruction.
5190bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
5191 // If the input has multiple uses and we can either fold the negate down, or
5192 // the other uses cannot, give up. This both prevents unprofitable
5193 // transformations and infinite loops: we won't repeatedly try to fold around
5194 // a negate that has no 'good' form.
5195 if (N0.hasOneUse()) {
5196 // This may be able to fold into the source, but at a code size cost. Don't
5197 // fold if the fold into the user is free.
5198 if (allUsesHaveSourceMods(N, CostThreshold: 0))
5199 return false;
5200 } else {
5201 if (fnegFoldsIntoOp(N: N0.getNode()) &&
5202 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N: N0.getNode())))
5203 return false;
5204 }
5205
5206 return true;
5207}
5208
5209SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
5210 DAGCombinerInfo &DCI) const {
5211 SelectionDAG &DAG = DCI.DAG;
5212 SDValue N0 = N->getOperand(Num: 0);
5213 EVT VT = N->getValueType(ResNo: 0);
5214
5215 unsigned Opc = N0.getOpcode();
5216
5217 if (!shouldFoldFNegIntoSrc(N, N0))
5218 return SDValue();
5219
5220 SDLoc SL(N);
5221 switch (Opc) {
5222 case ISD::FADD: {
5223 if (!mayIgnoreSignedZero(Op: N0) && !N->getFlags().hasNoSignedZeros())
5224 return SDValue();
5225
5226 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
5227 SDValue LHS = N0.getOperand(i: 0);
5228 SDValue RHS = N0.getOperand(i: 1);
5229
5230 if (LHS.getOpcode() != ISD::FNEG)
5231 LHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS);
5232 else
5233 LHS = LHS.getOperand(i: 0);
5234
5235 if (RHS.getOpcode() != ISD::FNEG)
5236 RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
5237 else
5238 RHS = RHS.getOperand(i: 0);
5239
5240 SDValue Res = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: LHS, N2: RHS, Flags: N0->getFlags());
5241 if (Res.getOpcode() != ISD::FADD)
5242 return SDValue(); // Op got folded away.
5243 if (!N0.hasOneUse())
5244 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
5245 return Res;
5246 }
5247 case ISD::FMUL:
5248 case AMDGPUISD::FMUL_LEGACY: {
5249 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
5250 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
5251 SDValue LHS = N0.getOperand(i: 0);
5252 SDValue RHS = N0.getOperand(i: 1);
5253
5254 if (LHS.getOpcode() == ISD::FNEG)
5255 LHS = LHS.getOperand(i: 0);
5256 else if (RHS.getOpcode() == ISD::FNEG)
5257 RHS = RHS.getOperand(i: 0);
5258 else
5259 RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
5260
5261 SDValue Res = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: LHS, N2: RHS, Flags: N0->getFlags());
5262 if (Res.getOpcode() != Opc)
5263 return SDValue(); // Op got folded away.
5264 if (!N0.hasOneUse())
5265 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
5266 return Res;
5267 }
5268 case ISD::FMA:
5269 case ISD::FMAD: {
5270 // TODO: handle llvm.amdgcn.fma.legacy
5271 if (!mayIgnoreSignedZero(Op: N0) && !N->getFlags().hasNoSignedZeros())
5272 return SDValue();
5273
5274 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
5275 SDValue LHS = N0.getOperand(i: 0);
5276 SDValue MHS = N0.getOperand(i: 1);
5277 SDValue RHS = N0.getOperand(i: 2);
5278
5279 if (LHS.getOpcode() == ISD::FNEG)
5280 LHS = LHS.getOperand(i: 0);
5281 else if (MHS.getOpcode() == ISD::FNEG)
5282 MHS = MHS.getOperand(i: 0);
5283 else
5284 MHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: MHS);
5285
5286 if (RHS.getOpcode() != ISD::FNEG)
5287 RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
5288 else
5289 RHS = RHS.getOperand(i: 0);
5290
5291 SDValue Res = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: LHS, N2: MHS, N3: RHS);
5292 if (Res.getOpcode() != Opc)
5293 return SDValue(); // Op got folded away.
5294 if (!N0.hasOneUse())
5295 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
5296 return Res;
5297 }
5298 case ISD::FMAXNUM:
5299 case ISD::FMINNUM:
5300 case ISD::FMAXNUM_IEEE:
5301 case ISD::FMINNUM_IEEE:
5302 case ISD::FMINIMUM:
5303 case ISD::FMAXIMUM:
5304 case ISD::FMINIMUMNUM:
5305 case ISD::FMAXIMUMNUM:
5306 case AMDGPUISD::FMAX_LEGACY:
5307 case AMDGPUISD::FMIN_LEGACY: {
5308 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
5309 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
5310 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
5311 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
5312
5313 SDValue LHS = N0.getOperand(i: 0);
5314 SDValue RHS = N0.getOperand(i: 1);
5315
5316 // 0 doesn't have a negated inline immediate.
5317 // TODO: This constant check should be generalized to other operations.
5318 if (isConstantCostlierToNegate(N: RHS))
5319 return SDValue();
5320
5321 SDValue NegLHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS);
5322 SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
5323 unsigned Opposite = inverseMinMax(Opc);
5324
5325 SDValue Res = DAG.getNode(Opcode: Opposite, DL: SL, VT, N1: NegLHS, N2: NegRHS, Flags: N0->getFlags());
5326 if (Res.getOpcode() != Opposite)
5327 return SDValue(); // Op got folded away.
5328 if (!N0.hasOneUse())
5329 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
5330 return Res;
5331 }
5332 case AMDGPUISD::FMED3: {
5333 SDValue Ops[3];
5334 for (unsigned I = 0; I < 3; ++I)
5335 Ops[I] = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: N0->getOperand(Num: I), Flags: N0->getFlags());
5336
5337 SDValue Res = DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT, Ops, Flags: N0->getFlags());
5338 if (Res.getOpcode() != AMDGPUISD::FMED3)
5339 return SDValue(); // Op got folded away.
5340
5341 if (!N0.hasOneUse()) {
5342 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res);
5343 DAG.ReplaceAllUsesWith(From: N0, To: Neg);
5344
5345 for (SDNode *U : Neg->users())
5346 DCI.AddToWorklist(N: U);
5347 }
5348
5349 return Res;
5350 }
5351 case ISD::FP_EXTEND:
5352 case ISD::FTRUNC:
5353 case ISD::FRINT:
5354 case ISD::FNEARBYINT: // XXX - Should fround be handled?
5355 case ISD::FROUNDEVEN:
5356 case ISD::FSIN:
5357 case ISD::FCANONICALIZE:
5358 case AMDGPUISD::RCP:
5359 case AMDGPUISD::RCP_LEGACY:
5360 case AMDGPUISD::RCP_IFLAG:
5361 case AMDGPUISD::SIN_HW: {
5362 SDValue CvtSrc = N0.getOperand(i: 0);
5363 if (CvtSrc.getOpcode() == ISD::FNEG) {
5364 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
5365 // (fneg (rcp (fneg x))) -> (rcp x)
5366 return DAG.getNode(Opcode: Opc, DL: SL, VT, Operand: CvtSrc.getOperand(i: 0));
5367 }
5368
5369 if (!N0.hasOneUse())
5370 return SDValue();
5371
5372 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
5373 // (fneg (rcp x)) -> (rcp (fneg x))
5374 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: CvtSrc.getValueType(), Operand: CvtSrc);
5375 return DAG.getNode(Opcode: Opc, DL: SL, VT, Operand: Neg, Flags: N0->getFlags());
5376 }
5377 case ISD::FP_ROUND: {
5378 SDValue CvtSrc = N0.getOperand(i: 0);
5379
5380 if (CvtSrc.getOpcode() == ISD::FNEG) {
5381 // (fneg (fp_round (fneg x))) -> (fp_round x)
5382 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT,
5383 N1: CvtSrc.getOperand(i: 0), N2: N0.getOperand(i: 1));
5384 }
5385
5386 if (!N0.hasOneUse())
5387 return SDValue();
5388
5389 // (fneg (fp_round x)) -> (fp_round (fneg x))
5390 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: CvtSrc.getValueType(), Operand: CvtSrc);
5391 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Neg, N2: N0.getOperand(i: 1));
5392 }
5393 case ISD::FP16_TO_FP: {
5394 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
5395 // f16, but legalization of f16 fneg ends up pulling it out of the source.
5396 // Put the fneg back as a legal source operation that can be matched later.
5397 SDLoc SL(N);
5398
5399 SDValue Src = N0.getOperand(i: 0);
5400 EVT SrcVT = Src.getValueType();
5401
5402 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
5403 SDValue IntFNeg = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: SrcVT, N1: Src,
5404 N2: DAG.getConstant(Val: 0x8000, DL: SL, VT: SrcVT));
5405 return DAG.getNode(Opcode: ISD::FP16_TO_FP, DL: SL, VT: N->getValueType(ResNo: 0), Operand: IntFNeg);
5406 }
5407 case ISD::SELECT: {
5408 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5409 // TODO: Invert conditions of foldFreeOpFromSelect
5410 return SDValue();
5411 }
5412 case ISD::BITCAST: {
5413 SDLoc SL(N);
5414 SDValue BCSrc = N0.getOperand(i: 0);
5415 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5416 SDValue HighBits = BCSrc.getOperand(i: BCSrc.getNumOperands() - 1);
5417 if (HighBits.getValueType().getSizeInBits() != 32 ||
5418 !fnegFoldsIntoOp(N: HighBits.getNode()))
5419 return SDValue();
5420
      // f64 fneg only really needs to operate on the high half of the
5422 // register, so try to force it to an f32 operation to help make use of
5423 // source modifiers.
5424 //
5425 //
5426 // fneg (f64 (bitcast (build_vector x, y))) ->
5427 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5428 // (fneg (bitcast i32:y to f32)))
5429
5430 SDValue CastHi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: HighBits);
5431 SDValue NegHi = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: CastHi);
5432 SDValue CastBack =
5433 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: HighBits.getValueType(), Operand: NegHi);
5434
5435 SmallVector<SDValue, 8> Ops(BCSrc->ops());
5436 Ops.back() = CastBack;
5437 DCI.AddToWorklist(N: NegHi.getNode());
5438 SDValue Build =
5439 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: BCSrc.getValueType(), Ops);
5440 SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Build);
5441
5442 if (!N0.hasOneUse())
5443 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Result));
5444 return Result;
5445 }
5446
5447 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5448 BCSrc.hasOneUse()) {
5449 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
5450 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
5451
5452 // TODO: Cast back result for multiple uses is beneficial in some cases.
5453
5454 SDValue LHS =
5455 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: BCSrc.getOperand(i: 1));
5456 SDValue RHS =
5457 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: BCSrc.getOperand(i: 2));
5458
5459 SDValue NegLHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: LHS);
5460 SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: RHS);
5461
5462 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f32, N1: BCSrc.getOperand(i: 0), N2: NegLHS,
5463 N3: NegRHS);
5464 }
5465
5466 return SDValue();
5467 }
5468 default:
5469 return SDValue();
5470 }
5471}
5472
5473SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5474 DAGCombinerInfo &DCI) const {
5475 SelectionDAG &DAG = DCI.DAG;
5476 SDValue N0 = N->getOperand(Num: 0);
5477
5478 if (!N0.hasOneUse())
5479 return SDValue();
5480
5481 switch (N0.getOpcode()) {
5482 case ISD::FP16_TO_FP: {
5483 assert(!isTypeLegal(MVT::f16) && "should only see if f16 is illegal");
5484 SDLoc SL(N);
5485 SDValue Src = N0.getOperand(i: 0);
5486 EVT SrcVT = Src.getValueType();
5487
5488 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5489 SDValue IntFAbs = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SrcVT, N1: Src,
5490 N2: DAG.getConstant(Val: 0x7fff, DL: SL, VT: SrcVT));
5491 return DAG.getNode(Opcode: ISD::FP16_TO_FP, DL: SL, VT: N->getValueType(ResNo: 0), Operand: IntFAbs);
5492 }
5493 default:
5494 return SDValue();
5495 }
5496}
5497
5498SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5499 DAGCombinerInfo &DCI) const {
5500 const auto *CFP = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: 0));
5501 if (!CFP)
5502 return SDValue();
5503
5504 // XXX - Should this flush denormals?
5505 const APFloat &Val = CFP->getValueAPF();
5506 APFloat One(Val.getSemantics(), "1.0");
5507 return DCI.DAG.getConstantFP(Val: One / Val, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
5508}
5509
5510bool AMDGPUTargetLowering::isInt64ImmLegal(SDNode *N, SelectionDAG &DAG) const {
5511 if (!Subtarget->isGCN())
5512 return false;
5513
5514 ConstantSDNode *SDConstant = dyn_cast<ConstantSDNode>(Val: N);
5515 ConstantFPSDNode *SDFPConstant = dyn_cast<ConstantFPSDNode>(Val: N);
5516 auto &ST = DAG.getSubtarget<GCNSubtarget>();
5517 const auto *TII = ST.getInstrInfo();
5518
5519 if (!ST.hasMovB64() || (!SDConstant && !SDFPConstant))
5520 return false;
5521
5522 if (ST.has64BitLiterals())
5523 return true;
5524
5525 if (SDConstant) {
5526 const APInt &APVal = SDConstant->getAPIntValue();
5527 return isUInt<32>(x: APVal.getZExtValue()) || TII->isInlineConstant(Imm: APVal);
5528 }
5529
5530 APInt Val = SDFPConstant->getValueAPF().bitcastToAPInt();
5531 return isUInt<32>(x: Val.getZExtValue()) || TII->isInlineConstant(Imm: Val);
5532}
5533
/// Hook for target-specific DAG combines, dispatching on the node opcode.
/// Returns the replacement value, or an empty SDValue to fall back to the
/// generic combiner.
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  switch(N->getOpcode()) {
  default:
    break;
  case ISD::BITCAST: {
    EVT DestVT = N->getValueType(ResNo: 0);

    // Push casts through vector builds. This helps avoid emitting a large
    // number of copies when materializing floating point vector constants.
    //
    // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
    //   vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
    if (DestVT.isVector()) {
      SDValue Src = N->getOperand(Num: 0);
      if (Src.getOpcode() == ISD::BUILD_VECTOR &&
          (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
           isOperationLegal(Op: ISD::BUILD_VECTOR, VT: DestVT))) {
        EVT SrcVT = Src.getValueType();
        unsigned NElts = DestVT.getVectorNumElements();

        // Only handle the same-element-count case, where each element can
        // simply be bitcast individually.
        if (SrcVT.getVectorNumElements() == NElts) {
          EVT DestEltVT = DestVT.getVectorElementType();

          SmallVector<SDValue, 8> CastedElts;
          SDLoc SL(N);
          for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
            SDValue Elt = Src.getOperand(i: I);
            CastedElts.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: DestEltVT, Operand: Elt));
          }

          return DAG.getBuildVector(VT: DestVT, DL: SL, Ops: CastedElts);
        }
      }
    }

    // The remaining bitcast folds only apply to 64-bit vector destinations.
    if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
      break;

    // Fold bitcasts of constants.
    //
    // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
    // TODO: Generalize and move to DAGCombiner
    SDValue Src = N->getOperand(Num: 0);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Src)) {
      SDLoc SL(N);
      // Keep the constant whole if the target can materialize the 64-bit
      // immediate in a single instruction.
      if (isInt64ImmLegal(N: C, DAG))
        break;
      uint64_t CVal = C->getZExtValue();
      SDValue BV = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32,
                               N1: DAG.getConstant(Val: Lo_32(Value: CVal), DL: SL, VT: MVT::i32),
                               N2: DAG.getConstant(Val: Hi_32(Value: CVal), DL: SL, VT: MVT::i32));
      return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: DestVT, Operand: BV);
    }

    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Src)) {
      // Same fold for FP constants: split the raw bit pattern into halves.
      const APInt &Val = C->getValueAPF().bitcastToAPInt();
      SDLoc SL(N);
      if (isInt64ImmLegal(N: C, DAG))
        break;
      uint64_t CVal = Val.getZExtValue();
      SDValue Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32,
                                N1: DAG.getConstant(Val: Lo_32(Value: CVal), DL: SL, VT: MVT::i32),
                                N2: DAG.getConstant(Val: Hi_32(Value: CVal), DL: SL, VT: MVT::i32));

      return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: DestVT, Operand: Vec);
    }

    break;
  }
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL: {
    // Range metadata can be invalidated when loads are converted to legal types
    // (e.g. v2i64 -> v4i32).
    // Try to convert vector shl/sra/srl before type legalization so that range
    // metadata can be utilized.
    if (!(N->getValueType(ResNo: 0).isVector() &&
          DCI.getDAGCombineLevel() == BeforeLegalizeTypes) &&
        DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;
    if (N->getOpcode() == ISD::SHL)
      return performShlCombine(N, DCI);
    if (N->getOpcode() == ISD::SRA)
      return performSraCombine(N, DCI);
    return performSrlCombine(N, DCI);
  }
  case ISD::TRUNCATE:
    return performTruncateCombine(N, DCI);
  case ISD::MUL:
    return performMulCombine(N, DCI);
  case AMDGPUISD::MUL_U24:
  case AMDGPUISD::MUL_I24: {
    if (SDValue Simplified = simplifyMul24(Node24: N, DCI))
      return Simplified;
    break;
  }
  case AMDGPUISD::MULHI_I24:
  case AMDGPUISD::MULHI_U24:
    return simplifyMul24(Node24: N, DCI);
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return performMulLoHiCombine(N, DCI);
  case ISD::MULHS:
    return performMulhsCombine(N, DCI);
  case ISD::MULHU:
    return performMulhuCombine(N, DCI);
  case ISD::SELECT:
    return performSelectCombine(N, DCI);
  case ISD::FNEG:
    return performFNegCombine(N, DCI);
  case ISD::FABS:
    return performFAbsCombine(N, DCI);
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    assert(!N->getValueType(0).isVector() &&
           "Vector handling of BFE not implemented");
    // Operands: (src, offset, width). Only constant widths can be simplified.
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 2));
    if (!Width)
      break;

    // The hardware only reads the low 5 bits of the width operand.
    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
    if (WidthVal == 0)
      return DAG.getConstant(Val: 0, DL, VT: MVT::i32);

    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
    if (!Offset)
      break;

    SDValue BitsFrom = N->getOperand(Num: 0);
    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;

    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;

    if (OffsetVal == 0) {
      // This is already sign / zero extended, so try to fold away extra BFEs.
      unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);

      unsigned OpSignBits = DAG.ComputeNumSignBits(Op: BitsFrom);
      if (OpSignBits >= SignBits)
        return BitsFrom;

      EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WidthVal);
      if (Signed) {
        // This is a sign_extend_inreg. Replace it to take advantage of existing
        // DAG Combines. If not eliminated, we will match back to BFE during
        // selection.

        // TODO: The sext_inreg of extended types ends up here, although we
        // could handle them in a single BFE.
        return DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: MVT::i32, N1: BitsFrom,
                           N2: DAG.getValueType(SmallVT));
      }

      return DAG.getZeroExtendInReg(Op: BitsFrom, DL, VT: SmallVT);
    }

    // Fully constant extract: fold to a constant.
    if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(Val&: BitsFrom)) {
      if (Signed) {
        return constantFoldBFE<int32_t>(DAG,
                                        Src0: CVal->getSExtValue(),
                                        Offset: OffsetVal,
                                        Width: WidthVal,
                                        DL);
      }

      return constantFoldBFE<uint32_t>(DAG,
                                       Src0: CVal->getZExtValue(),
                                       Offset: OffsetVal,
                                       Width: WidthVal,
                                       DL);
    }

    // A field reaching the top bit is just an (arithmetic) shift right,
    // except when SDWA can do the 16/16 extract directly.
    if ((OffsetVal + WidthVal) >= 32 &&
        !(OffsetVal == 16 && WidthVal == 16 && Subtarget->hasSDWA())) {
      SDValue ShiftVal = DAG.getConstant(Val: OffsetVal, DL, VT: MVT::i32);
      return DAG.getNode(Opcode: Signed ? ISD::SRA : ISD::SRL, DL, VT: MVT::i32,
                         N1: BitsFrom, N2: ShiftVal);
    }

    // Only the extracted field of the source is demanded; try to simplify
    // the source based on that.
    if (BitsFrom.hasOneUse()) {
      APInt Demanded = APInt::getBitsSet(numBits: 32,
                                         loBit: OffsetVal,
                                         hiBit: OffsetVal + WidthVal);

      KnownBits Known;
      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                            !DCI.isBeforeLegalizeOps());
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.ShrinkDemandedConstant(Op: BitsFrom, DemandedBits: Demanded, TLO) ||
          TLI.SimplifyDemandedBits(Op: BitsFrom, DemandedBits: Demanded, Known, TLO)) {
        DCI.CommitTargetLoweringOpt(TLO);
      }
    }

    break;
  }
  case ISD::LOAD:
    return performLoadCombine(N, DCI);
  case ISD::STORE:
    return performStoreCombine(N, DCI);
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_IFLAG:
    return performRcpCombine(N, DCI);
  case ISD::AssertZext:
  case ISD::AssertSext:
    return performAssertSZExtCombine(N, DCI);
  case ISD::INTRINSIC_WO_CHAIN:
    return performIntrinsicWOChainCombine(N, DCI);
  case AMDGPUISD::FMAD_FTZ: {
    SDValue N0 = N->getOperand(Num: 0);
    SDValue N1 = N->getOperand(Num: 1);
    SDValue N2 = N->getOperand(Num: 2);
    EVT VT = N->getValueType(ResNo: 0);

    // FMAD_FTZ is a FMAD + flush denormals to zero.
    // We flush the inputs, the intermediate step, and the output.
    ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(Val&: N0);
    ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(Val&: N1);
    ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(Val&: N2);
    if (N0CFP && N1CFP && N2CFP) {
      // Flush a denormal to a correctly-signed zero, otherwise pass through.
      const auto FTZ = [](const APFloat &V) {
        if (V.isDenormal()) {
          APFloat Zero(V.getSemantics(), 0);
          return V.isNegative() ? -Zero : Zero;
        }
        return V;
      };

      APFloat V0 = FTZ(N0CFP->getValueAPF());
      APFloat V1 = FTZ(N1CFP->getValueAPF());
      APFloat V2 = FTZ(N2CFP->getValueAPF());
      V0.multiply(RHS: V1, RM: APFloat::rmNearestTiesToEven);
      V0 = FTZ(V0);
      V0.add(RHS: V2, RM: APFloat::rmNearestTiesToEven);
      return DAG.getConstantFP(Val: FTZ(V0), DL, VT);
    }
    break;
  }
  }
  return SDValue();
}
5779
5780//===----------------------------------------------------------------------===//
5781// Helper functions
5782//===----------------------------------------------------------------------===//
5783
5784SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5785 const TargetRegisterClass *RC,
5786 Register Reg, EVT VT,
5787 const SDLoc &SL,
5788 bool RawReg) const {
5789 MachineFunction &MF = DAG.getMachineFunction();
5790 MachineRegisterInfo &MRI = MF.getRegInfo();
5791 Register VReg;
5792
5793 if (!MRI.isLiveIn(Reg)) {
5794 VReg = MRI.createVirtualRegister(RegClass: RC);
5795 MRI.addLiveIn(Reg, vreg: VReg);
5796 } else {
5797 VReg = MRI.getLiveInVirtReg(PReg: Reg);
5798 }
5799
5800 if (RawReg)
5801 return DAG.getRegister(Reg: VReg, VT);
5802
5803 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: VReg, VT);
5804}
5805
5806// This may be called multiple times, and nothing prevents creating multiple
5807// objects at the same offset. See if we already defined this object.
5808static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5809 int64_t Offset) {
5810 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5811 if (MFI.getObjectOffset(ObjectIdx: I) == Offset) {
5812 assert(MFI.getObjectSize(I) == Size);
5813 return I;
5814 }
5815 }
5816
5817 return MFI.CreateFixedObject(Size, SPOffset: Offset, IsImmutable: true);
5818}
5819
5820SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5821 EVT VT,
5822 const SDLoc &SL,
5823 int64_t Offset) const {
5824 MachineFunction &MF = DAG.getMachineFunction();
5825 MachineFrameInfo &MFI = MF.getFrameInfo();
5826 int FI = getOrCreateFixedStackObject(MFI, Size: VT.getStoreSize(), Offset);
5827
5828 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5829 SDValue Ptr = DAG.getFrameIndex(FI, VT: MVT::i32);
5830
5831 return DAG.getLoad(VT, dl: SL, Chain: DAG.getEntryNode(), Ptr, PtrInfo: SrcPtrInfo, Alignment: Align(4),
5832 MMOFlags: MachineMemOperand::MODereferenceable |
5833 MachineMemOperand::MOInvariant);
5834}
5835
5836SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5837 const SDLoc &SL,
5838 SDValue Chain,
5839 SDValue ArgVal,
5840 int64_t Offset) const {
5841 MachineFunction &MF = DAG.getMachineFunction();
5842 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5843 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5844
5845 SDValue Ptr = DAG.getConstant(Val: Offset, DL: SL, VT: MVT::i32);
5846 // Stores to the argument stack area are relative to the stack pointer.
5847 SDValue SP =
5848 DAG.getCopyFromReg(Chain, dl: SL, Reg: Info->getStackPtrOffsetReg(), VT: MVT::i32);
5849 Ptr = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: SP, N2: Ptr);
5850 SDValue Store = DAG.getStore(Chain, dl: SL, Val: ArgVal, Ptr, PtrInfo: DstInfo, Alignment: Align(4),
5851 MMOFlags: MachineMemOperand::MODereferenceable);
5852 return Store;
5853}
5854
5855SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5856 const TargetRegisterClass *RC,
5857 EVT VT, const SDLoc &SL,
5858 const ArgDescriptor &Arg) const {
5859 assert(Arg && "Attempting to load missing argument");
5860
5861 SDValue V = Arg.isRegister() ?
5862 CreateLiveInRegister(DAG, RC, Reg: Arg.getRegister(), VT, SL) :
5863 loadStackInputValue(DAG, VT, SL, Offset: Arg.getStackOffset());
5864
5865 if (!Arg.isMasked())
5866 return V;
5867
5868 unsigned Mask = Arg.getMask();
5869 unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);
5870 V = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: V,
5871 N2: DAG.getShiftAmountConstant(Val: Shift, VT, DL: SL));
5872 return DAG.getNode(Opcode: ISD::AND, DL: SL, VT, N1: V,
5873 N2: DAG.getConstant(Val: Mask >> Shift, DL: SL, VT));
5874}
5875
5876uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5877 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5878 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5879 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5880 uint64_t ArgOffset =
5881 alignTo(Size: ExplicitKernArgSize, A: Alignment) + ExplicitArgOffset;
5882 switch (Param) {
5883 case FIRST_IMPLICIT:
5884 return ArgOffset;
5885 case PRIVATE_BASE:
5886 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5887 case SHARED_BASE:
5888 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5889 case QUEUE_PTR:
5890 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5891 }
5892 llvm_unreachable("unexpected implicit parameter type");
5893}
5894
5895uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5896 const MachineFunction &MF, const ImplicitParameter Param) const {
5897 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5898 return getImplicitParameterOffset(ExplicitKernArgSize: MFI->getExplicitKernArgSize(), Param);
5899}
5900
5901SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5902 SelectionDAG &DAG, int Enabled,
5903 int &RefinementSteps,
5904 bool &UseOneConstNR,
5905 bool Reciprocal) const {
5906 EVT VT = Operand.getValueType();
5907
5908 if (VT == MVT::f32) {
5909 RefinementSteps = 0;
5910 return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc(Operand), VT, Operand);
5911 }
5912
5913 // TODO: There is also f64 rsq instruction, but the documentation is less
5914 // clear on its precision.
5915
5916 return SDValue();
5917}
5918
5919SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5920 SelectionDAG &DAG, int Enabled,
5921 int &RefinementSteps) const {
5922 EVT VT = Operand.getValueType();
5923
5924 if (VT == MVT::f32) {
5925 // Reciprocal, < 1 ulp error.
5926 //
5927 // This reciprocal approximation converges to < 0.5 ulp error with one
5928 // newton rhapson performed with two fused multiple adds (FMAs).
5929
5930 RefinementSteps = 0;
5931 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SDLoc(Operand), VT, Operand);
5932 }
5933
5934 // TODO: There is also f64 rcp instruction, but the documentation is less
5935 // clear on its precision.
5936
5937 return SDValue();
5938}
5939
5940static unsigned workitemIntrinsicDim(unsigned ID) {
5941 switch (ID) {
5942 case Intrinsic::amdgcn_workitem_id_x:
5943 return 0;
5944 case Intrinsic::amdgcn_workitem_id_y:
5945 return 1;
5946 case Intrinsic::amdgcn_workitem_id_z:
5947 return 2;
5948 default:
5949 llvm_unreachable("not a workitem intrinsic");
5950 }
5951}
5952
/// Compute known zero/one bits for AMDGPU-specific nodes so generic
/// SelectionDAG analyses can reason about them.
void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, KnownBits &Known,
    const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {

  Known.resetAll(); // Don't know anything.

  unsigned Opc = Op.getOpcode();

  switch (Opc) {
  default:
    break;
  case AMDGPUISD::CARRY:
  case AMDGPUISD::BORROW: {
    // Carry/borrow results are 0 or 1; all upper bits are zero.
    Known.Zero = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 31);
    break;
  }

  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
    if (!CWidth)
      return;

    // Hardware only reads the low 5 bits of the width operand.
    uint32_t Width = CWidth->getZExtValue() & 0x1f;

    // Unsigned extract zero-fills everything above the field. (The signed
    // form sign-extends, so nothing is known without knowing the field.)
    if (Opc == AMDGPUISD::BFE_U32)
      Known.Zero = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 32 - Width);

    break;
  }
  case AMDGPUISD::FP_TO_FP16: {
    unsigned BitWidth = Known.getBitWidth();

    // High bits are zero.
    Known.Zero = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - 16);
    break;
  }
  case AMDGPUISD::MUL_U24:
  case AMDGPUISD::MUL_I24: {
    KnownBits LHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
    KnownBits RHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
    // Trailing zeros of a product are at least the sum of the operands'.
    unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
                      RHSKnown.countMinTrailingZeros();
    Known.Zero.setLowBits(std::min(a: TrailZ, b: 32u));
    // Skip extra check if all bits are known zeros.
    if (TrailZ >= 32)
      break;

    // Truncate to 24 bits.
    LHSKnown = LHSKnown.trunc(BitWidth: 24);
    RHSKnown = RHSKnown.trunc(BitWidth: 24);

    if (Opc == AMDGPUISD::MUL_I24) {
      unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
      unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
      unsigned MaxValBits = LHSValBits + RHSValBits;
      // If the product can overflow 32 bits, the sign bits are unknown.
      if (MaxValBits > 32)
        break;
      unsigned SignBits = 32 - MaxValBits + 1;
      bool LHSNegative = LHSKnown.isNegative();
      bool LHSNonNegative = LHSKnown.isNonNegative();
      bool LHSPositive = LHSKnown.isStrictlyPositive();
      bool RHSNegative = RHSKnown.isNegative();
      bool RHSNonNegative = RHSKnown.isNonNegative();
      bool RHSPositive = RHSKnown.isStrictlyPositive();

      // Same-sign operands give a non-negative product (zero high bits);
      // strictly opposite-sign operands give a negative one (one high bits).
      if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
        Known.Zero.setHighBits(SignBits);
      else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
        Known.One.setHighBits(SignBits);
    } else {
      unsigned LHSValBits = LHSKnown.countMaxActiveBits();
      unsigned RHSValBits = RHSKnown.countMaxActiveBits();
      unsigned MaxValBits = LHSValBits + RHSValBits;
      if (MaxValBits >= 32)
        break;
      Known.Zero.setBitsFrom(MaxValBits);
    }
    break;
  }
  case AMDGPUISD::PERM: {
    // v_perm_b32 selects each result byte from the two source operands
    // according to a byte of the selector mask.
    ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
    if (!CMask)
      return;

    KnownBits LHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
    KnownBits RHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
    unsigned Sel = CMask->getZExtValue();

    for (unsigned I = 0; I < 32; I += 8) {
      unsigned SelBits = Sel & 0xff;
      if (SelBits < 4) {
        // Selector 0-3: byte from the second operand.
        SelBits *= 8;
        Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
        Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
      } else if (SelBits < 7) {
        // Selector 4-6: byte from the first operand.
        SelBits = (SelBits & 3) * 8;
        Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
        Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
      } else if (SelBits == 0x0c) {
        // Selector 0x0c: constant zero byte.
        Known.Zero |= 0xFFull << I;
      } else if (SelBits > 0x0c) {
        // Selector > 0x0c: constant 0xff byte.
        Known.One |= 0xFFull << I;
      }
      Sel >>= 8;
    }
    break;
  }
  case AMDGPUISD::BUFFER_LOAD_UBYTE: {
    Known.Zero.setHighBits(24);
    break;
  }
  case AMDGPUISD::BUFFER_LOAD_USHORT: {
    Known.Zero.setHighBits(16);
    break;
  }
  case AMDGPUISD::LDS: {
    // LDS addresses are 16-bit and at least as aligned as the global.
    auto *GA = cast<GlobalAddressSDNode>(Val: Op.getOperand(i: 0).getNode());
    Align Alignment = GA->getGlobal()->getPointerAlignment(DL: DAG.getDataLayout());

    Known.Zero.setHighBits(16);
    Known.Zero.setLowBits(Log2(A: Alignment));
    break;
  }
  case AMDGPUISD::SMIN3:
  case AMDGPUISD::SMAX3:
  case AMDGPUISD::SMED3:
  case AMDGPUISD::UMIN3:
  case AMDGPUISD::UMAX3:
  case AMDGPUISD::UMED3: {
    KnownBits Known2 = DAG.computeKnownBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
    if (Known2.isUnknown())
      break;

    KnownBits Known1 = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
    if (Known1.isUnknown())
      break;

    KnownBits Known0 = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
    if (Known0.isUnknown())
      break;

    // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
    // The result is one of the three operands, so only bits agreed upon by
    // all of them are known.
    Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
    Known.One = Known0.One & Known1.One & Known2.One;
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = Op.getConstantOperandVal(i: 0);
    switch (IID) {
    case Intrinsic::amdgcn_workitem_id_x:
    case Intrinsic::amdgcn_workitem_id_y:
    case Intrinsic::amdgcn_workitem_id_z: {
      // Workitem IDs are bounded by the maximum workgroup dimension.
      unsigned MaxValue = Subtarget->getMaxWorkitemID(
          Kernel: DAG.getMachineFunction().getFunction(), Dimension: workitemIntrinsicDim(ID: IID));
      Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue));
      break;
    }
    default:
      break;
    }
  }
  }
}
6117
6118unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
6119 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6120 unsigned Depth) const {
6121 switch (Op.getOpcode()) {
6122 case AMDGPUISD::BFE_I32: {
6123 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
6124 if (!Width)
6125 return 1;
6126
6127 unsigned SignBits = 32 - (Width->getZExtValue() & 0x1f) + 1;
6128 if (!isNullConstant(V: Op.getOperand(i: 1)))
6129 return SignBits;
6130
6131 // TODO: Could probably figure something out with non-0 offsets.
6132 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
6133 return std::max(a: SignBits, b: Op0SignBits);
6134 }
6135
6136 case AMDGPUISD::BFE_U32: {
6137 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
6138 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
6139 }
6140
6141 case AMDGPUISD::CARRY:
6142 case AMDGPUISD::BORROW:
6143 return 31;
6144 case AMDGPUISD::BUFFER_LOAD_BYTE:
6145 return 25;
6146 case AMDGPUISD::BUFFER_LOAD_SHORT:
6147 return 17;
6148 case AMDGPUISD::BUFFER_LOAD_UBYTE:
6149 return 24;
6150 case AMDGPUISD::BUFFER_LOAD_USHORT:
6151 return 16;
6152 case AMDGPUISD::FP_TO_FP16:
6153 return 16;
6154 case AMDGPUISD::SMIN3:
6155 case AMDGPUISD::SMAX3:
6156 case AMDGPUISD::SMED3:
6157 case AMDGPUISD::UMIN3:
6158 case AMDGPUISD::UMAX3:
6159 case AMDGPUISD::UMED3: {
6160 unsigned Tmp2 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
6161 if (Tmp2 == 1)
6162 return 1; // Early out.
6163
6164 unsigned Tmp1 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
6165 if (Tmp1 == 1)
6166 return 1; // Early out.
6167
6168 unsigned Tmp0 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
6169 if (Tmp0 == 1)
6170 return 1; // Early out.
6171
6172 return std::min(l: {Tmp0, Tmp1, Tmp2});
6173 }
6174 default:
6175 return 1;
6176 }
6177}
6178
6179unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
6180 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
6181 const MachineRegisterInfo &MRI, unsigned Depth) const {
6182 const MachineInstr *MI = MRI.getVRegDef(Reg: R);
6183 if (!MI)
6184 return 1;
6185
6186 // TODO: Check range metadata on MMO.
6187 switch (MI->getOpcode()) {
6188 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
6189 return 25;
6190 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
6191 return 17;
6192 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
6193 return 24;
6194 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
6195 return 16;
6196 case AMDGPU::G_AMDGPU_SMED3:
6197 case AMDGPU::G_AMDGPU_UMED3: {
6198 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
6199 unsigned Tmp2 = Analysis.computeNumSignBits(R: Src2, DemandedElts, Depth: Depth + 1);
6200 if (Tmp2 == 1)
6201 return 1;
6202 unsigned Tmp1 = Analysis.computeNumSignBits(R: Src1, DemandedElts, Depth: Depth + 1);
6203 if (Tmp1 == 1)
6204 return 1;
6205 unsigned Tmp0 = Analysis.computeNumSignBits(R: Src0, DemandedElts, Depth: Depth + 1);
6206 if (Tmp0 == 1)
6207 return 1;
6208 return std::min(l: {Tmp0, Tmp1, Tmp2});
6209 }
6210 default:
6211 return 1;
6212 }
6213}
6214
6215bool AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode(
6216 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6217 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
6218 unsigned Opcode = Op.getOpcode();
6219 switch (Opcode) {
6220 case AMDGPUISD::BFE_I32:
6221 case AMDGPUISD::BFE_U32:
6222 return false;
6223 }
6224 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
6225 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
6226}
6227
/// Determine whether \p Op is known to never produce a NaN, for
/// AMDGPU-specific nodes and intrinsics. When \p SNaN is true, only checks
/// that the result is never a signaling NaN (a quiet NaN is acceptable).
/// Returns true only when this can be proven; false means "unknown".
bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
    unsigned Depth) const {
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY: {
    // The legacy min/max never produce a signaling NaN result.
    if (SNaN)
      return true;

    // TODO: Can check no nans on one of the operands for each one, but which
    // one?
    return false;
  }
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::CVT_PKRTZ_F16_F32: {
    // Never a signaling NaN; quiet-NaN-free when both source operands are.
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1) &&
           DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1);
  }
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMINIMUM3:
  case AMDGPUISD::FMAXIMUM3:
  case AMDGPUISD::FMAD_FTZ: {
    // Three-source ops: NaN-free iff all three source operands are NaN-free.
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1) &&
           DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
           DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1);
  }
  // Unsigned-byte-to-f32 conversions always yield a finite value.
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return true;

  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_CLAMP: {
    // Never a signaling NaN, but may produce a quiet NaN for some inputs
    // (e.g. presumably rsq of a negative value — see TODO below).
    if (SNaN)
      return true;

    // TODO: Need is known positive check.
    return false;
  }
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT: {
    // Result is NaN-free when the single source operand is.
    if (SNaN)
      return true;
    return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1);
  }
  // Division helpers: only SNaN-freedom is currently claimed.
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
    // TODO: Refine on operands.
    return SNaN;
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::COS_HW: {
    // Only SNaN-freedom is claimed; an infinite input presumably yields a
    // quiet NaN, hence the TODO.
    // TODO: Need check for infinity
    return SNaN;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    // Operand 0 is the intrinsic ID; the intrinsic's arguments start at
    // operand 1.
    unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    // Always produce a non-NaN result.
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cvt_off_f32_i4:
      return true;

    case Intrinsic::amdgcn_frexp_mant: {
      // NaN-free when the source value is.
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1);
    }
    case Intrinsic::amdgcn_cvt_pkrtz: {
      // NaN-free when both packed source values are.
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
             DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1);
    }
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_tanh: {
      // Never a signaling NaN, but may produce a quiet NaN for some inputs.
      if (SNaN)
        return true;

      // TODO: Need is known positive check.
      return false;
    }
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_fdot2:
      // TODO: Refine on operand
      return SNaN;
    case Intrinsic::amdgcn_fma_legacy:
      // NaN-free when all three source values are.
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
             DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1) &&
             DAG.isKnownNeverNaN(Op: Op.getOperand(i: 3), SNaN, Depth: Depth + 1);
    default:
      // Conservatively treat unhandled intrinsics as possibly producing NaNs.
      return false;
    }
  }
  default:
    return false;
  }
}
6342
6343bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
6344 Register N0, Register N1) const {
6345 return MRI.hasOneNonDBGUse(RegNo: N0); // FIXME: handle regbanks
6346}
6347