1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUMachineFunction.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUSelectionDAGInfo.h"
21#include "SIMachineFunctionInfo.h"
22#include "llvm/CodeGen/Analysis.h"
23#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
24#include "llvm/CodeGen/MachineFrameInfo.h"
25#include "llvm/IR/DiagnosticInfo.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27#include "llvm/Support/CommandLine.h"
28#include "llvm/Support/KnownBits.h"
29#include "llvm/Target/TargetMachine.h"
30
31using namespace llvm;
32
33#include "AMDGPUGenCallingConv.inc"
34
35static cl::opt<bool> AMDGPUBypassSlowDiv(
36 "amdgpu-bypass-slow-div",
37 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
38 cl::init(Val: true));
39
40// Find a larger type to do a load / store of a vector with.
41EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
42 unsigned StoreSize = VT.getStoreSizeInBits();
43 if (StoreSize <= 32)
44 return EVT::getIntegerVT(Context&: Ctx, BitWidth: StoreSize);
45
46 if (StoreSize % 32 == 0)
47 return EVT::getVectorVT(Context&: Ctx, VT: MVT::i32, NumElements: StoreSize / 32);
48
49 return VT;
50}
51
52unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
53 return DAG.computeKnownBits(Op).countMaxActiveBits();
54}
55
unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // Minimum number of bits needed to represent Op as a sign-extended value.
  // E.g. for Op to be usable as a signed 24-bit value, bit 23 must act as a
  // sign bit, i.e. there are at most 24 significant bits.
  return DAG.ComputeMaxSignificantBits(Op);
}
61
62AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
63 const TargetSubtargetInfo &STI,
64 const AMDGPUSubtarget &AMDGPUSTI)
65 : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
66 // Always lower memset, memcpy, and memmove intrinsics to load/store
67 // instructions, rather then generating calls to memset, mempcy or memmove.
68 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
69 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
70 MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;
71
72 // Enable ganging up loads and stores in the memcpy DAG lowering.
73 MaxGluedStoresPerMemcpy = 16;
74
75 // Lower floating point store/load to integer store/load to reduce the number
76 // of patterns in tablegen.
77 setOperationAction(Op: ISD::LOAD, VT: MVT::f32, Action: Promote);
78 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f32, DestVT: MVT::i32);
79
80 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f32, Action: Promote);
81 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
82
83 setOperationAction(Op: ISD::LOAD, VT: MVT::v3f32, Action: Promote);
84 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
85
86 setOperationAction(Op: ISD::LOAD, VT: MVT::v4f32, Action: Promote);
87 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
88
89 setOperationAction(Op: ISD::LOAD, VT: MVT::v5f32, Action: Promote);
90 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
91
92 setOperationAction(Op: ISD::LOAD, VT: MVT::v6f32, Action: Promote);
93 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
94
95 setOperationAction(Op: ISD::LOAD, VT: MVT::v7f32, Action: Promote);
96 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
97
98 setOperationAction(Op: ISD::LOAD, VT: MVT::v8f32, Action: Promote);
99 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f32, DestVT: MVT::v8i32);
100
101 setOperationAction(Op: ISD::LOAD, VT: MVT::v9f32, Action: Promote);
102 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
103
104 setOperationAction(Op: ISD::LOAD, VT: MVT::v10f32, Action: Promote);
105 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
106
107 setOperationAction(Op: ISD::LOAD, VT: MVT::v11f32, Action: Promote);
108 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
109
110 setOperationAction(Op: ISD::LOAD, VT: MVT::v12f32, Action: Promote);
111 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
112
113 setOperationAction(Op: ISD::LOAD, VT: MVT::v16f32, Action: Promote);
114 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f32, DestVT: MVT::v16i32);
115
116 setOperationAction(Op: ISD::LOAD, VT: MVT::v32f32, Action: Promote);
117 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32f32, DestVT: MVT::v32i32);
118
119 setOperationAction(Op: ISD::LOAD, VT: MVT::i64, Action: Promote);
120 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::i64, DestVT: MVT::v2i32);
121
122 setOperationAction(Op: ISD::LOAD, VT: MVT::v2i64, Action: Promote);
123 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2i64, DestVT: MVT::v4i32);
124
125 setOperationAction(Op: ISD::LOAD, VT: MVT::f64, Action: Promote);
126 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f64, DestVT: MVT::v2i32);
127
128 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f64, Action: Promote);
129 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f64, DestVT: MVT::v4i32);
130
131 setOperationAction(Op: ISD::LOAD, VT: MVT::v3i64, Action: Promote);
132 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3i64, DestVT: MVT::v6i32);
133
134 setOperationAction(Op: ISD::LOAD, VT: MVT::v4i64, Action: Promote);
135 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4i64, DestVT: MVT::v8i32);
136
137 setOperationAction(Op: ISD::LOAD, VT: MVT::v3f64, Action: Promote);
138 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3f64, DestVT: MVT::v6i32);
139
140 setOperationAction(Op: ISD::LOAD, VT: MVT::v4f64, Action: Promote);
141 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f64, DestVT: MVT::v8i32);
142
143 setOperationAction(Op: ISD::LOAD, VT: MVT::v8i64, Action: Promote);
144 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8i64, DestVT: MVT::v16i32);
145
146 setOperationAction(Op: ISD::LOAD, VT: MVT::v8f64, Action: Promote);
147 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f64, DestVT: MVT::v16i32);
148
149 setOperationAction(Op: ISD::LOAD, VT: MVT::v16i64, Action: Promote);
150 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16i64, DestVT: MVT::v32i32);
151
152 setOperationAction(Op: ISD::LOAD, VT: MVT::v16f64, Action: Promote);
153 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f64, DestVT: MVT::v32i32);
154
155 setOperationAction(Op: ISD::LOAD, VT: MVT::i128, Action: Promote);
156 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::i128, DestVT: MVT::v4i32);
157
158 // TODO: Would be better to consume as directly legal
159 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f32, Action: Promote);
160 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f32, DestVT: MVT::i32);
161
162 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f64, Action: Promote);
163 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f64, DestVT: MVT::i64);
164
165 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f16, Action: Promote);
166 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f16, DestVT: MVT::i16);
167
168 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::bf16, Action: Promote);
169 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::bf16, DestVT: MVT::i16);
170
171 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f32, Action: Promote);
172 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f32, DestVT: MVT::i32);
173
174 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f64, Action: Promote);
175 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f64, DestVT: MVT::i64);
176
177 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f16, Action: Promote);
178 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f16, DestVT: MVT::i16);
179
180 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::bf16, Action: Promote);
181 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::bf16, DestVT: MVT::i16);
182
183 // There are no 64-bit extloads. These should be done as a 32-bit extload and
184 // an extension to 64-bit.
185 for (MVT VT : MVT::integer_valuetypes())
186 setLoadExtAction(ExtTypes: {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, ValVT: MVT::i64, MemVT: VT,
187 Action: Expand);
188
189 for (MVT VT : MVT::integer_valuetypes()) {
190 if (VT == MVT::i64)
191 continue;
192
193 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
194 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i1, Action: Promote);
195 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i8, Action: Legal);
196 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i16, Action: Legal);
197 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i32, Action: Expand);
198 }
199 }
200
201 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
202 for (auto MemVT :
203 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
204 setLoadExtAction(ExtTypes: {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, ValVT: VT, MemVT,
205 Action: Expand);
206
207 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f32, MemVT: MVT::f16, Action: Expand);
208 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f32, MemVT: MVT::bf16, Action: Expand);
209 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f32, MemVT: MVT::v2f16, Action: Expand);
210 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f32, MemVT: MVT::v2bf16, Action: Expand);
211 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f32, MemVT: MVT::v3f16, Action: Expand);
212 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f32, MemVT: MVT::v3bf16, Action: Expand);
213 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f32, MemVT: MVT::v4f16, Action: Expand);
214 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f32, MemVT: MVT::v4bf16, Action: Expand);
215 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f32, MemVT: MVT::v8f16, Action: Expand);
216 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f32, MemVT: MVT::v8bf16, Action: Expand);
217 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f32, MemVT: MVT::v16f16, Action: Expand);
218 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f32, MemVT: MVT::v16bf16, Action: Expand);
219 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v32f32, MemVT: MVT::v32f16, Action: Expand);
220 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v32f32, MemVT: MVT::v32bf16, Action: Expand);
221
222 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f32, Action: Expand);
223 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2f32, Action: Expand);
224 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3f32, Action: Expand);
225 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4f32, Action: Expand);
226 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8f32, Action: Expand);
227 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16f32, Action: Expand);
228
229 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f16, Action: Expand);
230 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::bf16, Action: Expand);
231 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2f16, Action: Expand);
232 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2bf16, Action: Expand);
233 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3f16, Action: Expand);
234 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3bf16, Action: Expand);
235 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4f16, Action: Expand);
236 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4bf16, Action: Expand);
237 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8f16, Action: Expand);
238 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8bf16, Action: Expand);
239 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16f16, Action: Expand);
240 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16bf16, Action: Expand);
241
242 setOperationAction(Op: ISD::STORE, VT: MVT::f32, Action: Promote);
243 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f32, DestVT: MVT::i32);
244
245 setOperationAction(Op: ISD::STORE, VT: MVT::v2f32, Action: Promote);
246 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
247
248 setOperationAction(Op: ISD::STORE, VT: MVT::v3f32, Action: Promote);
249 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
250
251 setOperationAction(Op: ISD::STORE, VT: MVT::v4f32, Action: Promote);
252 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
253
254 setOperationAction(Op: ISD::STORE, VT: MVT::v5f32, Action: Promote);
255 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
256
257 setOperationAction(Op: ISD::STORE, VT: MVT::v6f32, Action: Promote);
258 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
259
260 setOperationAction(Op: ISD::STORE, VT: MVT::v7f32, Action: Promote);
261 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
262
263 setOperationAction(Op: ISD::STORE, VT: MVT::v8f32, Action: Promote);
264 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f32, DestVT: MVT::v8i32);
265
266 setOperationAction(Op: ISD::STORE, VT: MVT::v9f32, Action: Promote);
267 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
268
269 setOperationAction(Op: ISD::STORE, VT: MVT::v10f32, Action: Promote);
270 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
271
272 setOperationAction(Op: ISD::STORE, VT: MVT::v11f32, Action: Promote);
273 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
274
275 setOperationAction(Op: ISD::STORE, VT: MVT::v12f32, Action: Promote);
276 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
277
278 setOperationAction(Op: ISD::STORE, VT: MVT::v16f32, Action: Promote);
279 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f32, DestVT: MVT::v16i32);
280
281 setOperationAction(Op: ISD::STORE, VT: MVT::v32f32, Action: Promote);
282 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32f32, DestVT: MVT::v32i32);
283
284 setOperationAction(Op: ISD::STORE, VT: MVT::i64, Action: Promote);
285 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::i64, DestVT: MVT::v2i32);
286
287 setOperationAction(Op: ISD::STORE, VT: MVT::v2i64, Action: Promote);
288 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2i64, DestVT: MVT::v4i32);
289
290 setOperationAction(Op: ISD::STORE, VT: MVT::f64, Action: Promote);
291 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f64, DestVT: MVT::v2i32);
292
293 setOperationAction(Op: ISD::STORE, VT: MVT::v2f64, Action: Promote);
294 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f64, DestVT: MVT::v4i32);
295
296 setOperationAction(Op: ISD::STORE, VT: MVT::v3i64, Action: Promote);
297 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3i64, DestVT: MVT::v6i32);
298
299 setOperationAction(Op: ISD::STORE, VT: MVT::v3f64, Action: Promote);
300 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3f64, DestVT: MVT::v6i32);
301
302 setOperationAction(Op: ISD::STORE, VT: MVT::v4i64, Action: Promote);
303 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i64, DestVT: MVT::v8i32);
304
305 setOperationAction(Op: ISD::STORE, VT: MVT::v4f64, Action: Promote);
306 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f64, DestVT: MVT::v8i32);
307
308 setOperationAction(Op: ISD::STORE, VT: MVT::v8i64, Action: Promote);
309 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8i64, DestVT: MVT::v16i32);
310
311 setOperationAction(Op: ISD::STORE, VT: MVT::v8f64, Action: Promote);
312 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f64, DestVT: MVT::v16i32);
313
314 setOperationAction(Op: ISD::STORE, VT: MVT::v16i64, Action: Promote);
315 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16i64, DestVT: MVT::v32i32);
316
317 setOperationAction(Op: ISD::STORE, VT: MVT::v16f64, Action: Promote);
318 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f64, DestVT: MVT::v32i32);
319
320 setOperationAction(Op: ISD::STORE, VT: MVT::i128, Action: Promote);
321 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::i128, DestVT: MVT::v4i32);
322
323 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i1, Action: Expand);
324 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i8, Action: Expand);
325 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i16, Action: Expand);
326 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i32, Action: Expand);
327
328 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i1, Action: Expand);
329 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i8, Action: Expand);
330 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i16, Action: Expand);
331 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i32, Action: Expand);
332
333 setTruncStoreAction(ValVT: MVT::f32, MemVT: MVT::bf16, Action: Expand);
334 setTruncStoreAction(ValVT: MVT::f32, MemVT: MVT::f16, Action: Expand);
335 setTruncStoreAction(ValVT: MVT::v2f32, MemVT: MVT::v2bf16, Action: Expand);
336 setTruncStoreAction(ValVT: MVT::v2f32, MemVT: MVT::v2f16, Action: Expand);
337 setTruncStoreAction(ValVT: MVT::v3f32, MemVT: MVT::v3bf16, Action: Expand);
338 setTruncStoreAction(ValVT: MVT::v3f32, MemVT: MVT::v3f16, Action: Expand);
339 setTruncStoreAction(ValVT: MVT::v4f32, MemVT: MVT::v4bf16, Action: Expand);
340 setTruncStoreAction(ValVT: MVT::v4f32, MemVT: MVT::v4f16, Action: Expand);
341 setTruncStoreAction(ValVT: MVT::v6f32, MemVT: MVT::v6f16, Action: Expand);
342 setTruncStoreAction(ValVT: MVT::v8f32, MemVT: MVT::v8bf16, Action: Expand);
343 setTruncStoreAction(ValVT: MVT::v8f32, MemVT: MVT::v8f16, Action: Expand);
344 setTruncStoreAction(ValVT: MVT::v16f32, MemVT: MVT::v16bf16, Action: Expand);
345 setTruncStoreAction(ValVT: MVT::v16f32, MemVT: MVT::v16f16, Action: Expand);
346 setTruncStoreAction(ValVT: MVT::v32f32, MemVT: MVT::v32bf16, Action: Expand);
347 setTruncStoreAction(ValVT: MVT::v32f32, MemVT: MVT::v32f16, Action: Expand);
348
349 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::bf16, Action: Expand);
350 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f16, Action: Expand);
351 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f32, Action: Expand);
352
353 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2f32, Action: Expand);
354 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2bf16, Action: Expand);
355 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2f16, Action: Expand);
356
357 setTruncStoreAction(ValVT: MVT::v3i32, MemVT: MVT::v3i8, Action: Expand);
358
359 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i32, Action: Expand);
360 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i16, Action: Expand);
361 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i8, Action: Expand);
362 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i1, Action: Expand);
363 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3f32, Action: Expand);
364 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3bf16, Action: Expand);
365 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3f16, Action: Expand);
366
367 setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i32, Action: Expand);
368 setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i16, Action: Expand);
369 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4f32, Action: Expand);
370 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4bf16, Action: Expand);
371 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4f16, Action: Expand);
372
373 setTruncStoreAction(ValVT: MVT::v5i32, MemVT: MVT::v5i1, Action: Expand);
374 setTruncStoreAction(ValVT: MVT::v5i32, MemVT: MVT::v5i8, Action: Expand);
375 setTruncStoreAction(ValVT: MVT::v5i32, MemVT: MVT::v5i16, Action: Expand);
376
377 setTruncStoreAction(ValVT: MVT::v6i32, MemVT: MVT::v6i1, Action: Expand);
378 setTruncStoreAction(ValVT: MVT::v6i32, MemVT: MVT::v6i8, Action: Expand);
379 setTruncStoreAction(ValVT: MVT::v6i32, MemVT: MVT::v6i16, Action: Expand);
380
381 setTruncStoreAction(ValVT: MVT::v7i32, MemVT: MVT::v7i1, Action: Expand);
382 setTruncStoreAction(ValVT: MVT::v7i32, MemVT: MVT::v7i8, Action: Expand);
383 setTruncStoreAction(ValVT: MVT::v7i32, MemVT: MVT::v7i16, Action: Expand);
384
385 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8f32, Action: Expand);
386 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8bf16, Action: Expand);
387 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8f16, Action: Expand);
388
389 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16f32, Action: Expand);
390 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16bf16, Action: Expand);
391 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16f16, Action: Expand);
392 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i16, Action: Expand);
393 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i8, Action: Expand);
394 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i8, Action: Expand);
395 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i1, Action: Expand);
396
397 setOperationAction(Ops: ISD::Constant, VTs: {MVT::i32, MVT::i64}, Action: Legal);
398 setOperationAction(Ops: ISD::ConstantFP, VTs: {MVT::f32, MVT::f64}, Action: Legal);
399
400 setOperationAction(Ops: {ISD::BR_JT, ISD::BRIND}, VT: MVT::Other, Action: Expand);
401
402 // For R600, this is totally unsupported, just custom lower to produce an
403 // error.
404 setOperationAction(Op: ISD::DYNAMIC_STACKALLOC, VT: MVT::i32, Action: Custom);
405
406 // Library functions. These default to Expand, but we have instructions
407 // for them.
408 setOperationAction(Ops: {ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
409 ISD::FROUNDEVEN, ISD::FTRUNC},
410 VTs: {MVT::f16, MVT::f32}, Action: Legal);
411 setOperationAction(Ops: {ISD::FMINNUM, ISD::FMAXNUM}, VT: MVT::f32, Action: Legal);
412
413 setOperationAction(Op: ISD::FLOG2, VT: MVT::f32, Action: Custom);
414 setOperationAction(Ops: ISD::FROUND, VTs: {MVT::f32, MVT::f64}, Action: Custom);
415 setOperationAction(Ops: {ISD::LROUND, ISD::LLROUND},
416 VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Expand);
417
418 setOperationAction(
419 Ops: {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, VT: MVT::f32,
420 Action: Custom);
421
422 setOperationAction(Ops: ISD::FNEARBYINT, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Custom);
423
424 setOperationAction(Ops: ISD::FRINT, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Custom);
425
426 setOperationAction(Ops: {ISD::LRINT, ISD::LLRINT}, VTs: {MVT::f16, MVT::f32, MVT::f64},
427 Action: Expand);
428
429 setOperationAction(Ops: ISD::FREM, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Expand);
430 setOperationAction(Ops: ISD::IS_FPCLASS, VTs: {MVT::f32, MVT::f64}, Action: Legal);
431 setOperationAction(Ops: {ISD::FLOG2, ISD::FEXP2}, VT: MVT::f16, Action: Custom);
432
433 setOperationAction(Ops: {ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, VT: MVT::f16,
434 Action: Custom);
435
436 setOperationAction(Ops: ISD::FCANONICALIZE, VTs: {MVT::f32, MVT::f64}, Action: Legal);
437
438 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
439 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
440 // default unless marked custom/legal.
441 setOperationAction(Ops: ISD::IS_FPCLASS,
442 VTs: {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
443 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
444 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
445 MVT::v16f64},
446 Action: Custom);
447
448 // Expand to fneg + fadd.
449 setOperationAction(Op: ISD::FSUB, VT: MVT::f64, Action: Expand);
450
451 setOperationAction(Ops: ISD::CONCAT_VECTORS,
452 VTs: {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
453 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
454 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
455 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
456 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
457 Action: Custom);
458
459 setOperationAction(
460 Ops: ISD::EXTRACT_SUBVECTOR,
461 VTs: {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
462 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
463 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
464 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
465 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
466 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
467 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
468 Action: Custom);
469
470 setOperationAction(Ops: {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP}, VT: MVT::f64,
471 Action: Expand);
472 setOperationAction(Ops: ISD::FP_TO_FP16, VTs: {MVT::f64, MVT::f32}, Action: Custom);
473
474 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
475 for (MVT VT : ScalarIntVTs) {
476 // These should use [SU]DIVREM, so set them to expand
477 setOperationAction(Ops: {ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
478 Action: Expand);
479
480 // GPU does not have divrem function for signed or unsigned.
481 setOperationAction(Ops: {ISD::SDIVREM, ISD::UDIVREM}, VT, Action: Custom);
482
483 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
484 setOperationAction(Ops: {ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Action: Expand);
485
486 setOperationAction(Ops: {ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Action: Expand);
487
488 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
489 setOperationAction(Ops: {ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Action: Legal);
490 }
491
492 // The hardware supports 32-bit FSHR, but not FSHL.
493 setOperationAction(Op: ISD::FSHR, VT: MVT::i32, Action: Legal);
494
495 setOperationAction(Ops: {ISD::ROTL, ISD::ROTR}, VTs: {MVT::i32, MVT::i64}, Action: Expand);
496
497 setOperationAction(Ops: {ISD::MULHU, ISD::MULHS}, VT: MVT::i16, Action: Expand);
498
499 setOperationAction(Ops: {ISD::MUL, ISD::MULHU, ISD::MULHS}, VT: MVT::i64, Action: Expand);
500 setOperationAction(Ops: {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT,
501 ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
502 ISD::FP_TO_UINT_SAT},
503 VT: MVT::i64, Action: Custom);
504 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::i64, Action: Expand);
505
506 setOperationAction(Ops: {ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, VT: MVT::i32,
507 Action: Legal);
508
509 setOperationAction(
510 Ops: {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
511 VT: MVT::i64, Action: Custom);
512
513 for (auto VT : {MVT::i8, MVT::i16})
514 setOperationAction(Ops: {ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Action: Custom);
515
516 static const MVT::SimpleValueType VectorIntTypes[] = {
517 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
518 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
519
520 for (MVT VT : VectorIntTypes) {
521 // Expand the following operations for the current type by default.
522 // clang-format off
523 setOperationAction(Ops: {ISD::ADD, ISD::AND,
524 ISD::FP_TO_SINT, ISD::FP_TO_UINT,
525 ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT,
526 ISD::MUL, ISD::MULHU,
527 ISD::MULHS, ISD::OR,
528 ISD::SHL, ISD::SRA,
529 ISD::SRL, ISD::ROTL,
530 ISD::ROTR, ISD::SUB,
531 ISD::SINT_TO_FP, ISD::UINT_TO_FP,
532 ISD::SDIV, ISD::UDIV,
533 ISD::SREM, ISD::UREM,
534 ISD::SMUL_LOHI, ISD::UMUL_LOHI,
535 ISD::SDIVREM, ISD::UDIVREM,
536 ISD::SELECT, ISD::VSELECT,
537 ISD::SELECT_CC, ISD::XOR,
538 ISD::BSWAP, ISD::CTPOP,
539 ISD::CTTZ, ISD::CTLZ,
540 ISD::VECTOR_SHUFFLE, ISD::SETCC,
541 ISD::ADDRSPACECAST},
542 VT, Action: Expand);
543 // clang-format on
544 }
545
546 static const MVT::SimpleValueType FloatVectorTypes[] = {
547 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
548 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
549
550 for (MVT VT : FloatVectorTypes) {
551 setOperationAction(
552 Ops: {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
553 ISD::FADD, ISD::FCEIL, ISD::FCOS,
554 ISD::FDIV, ISD::FEXP2, ISD::FEXP,
555 ISD::FEXP10, ISD::FLOG2, ISD::FREM,
556 ISD::FLOG, ISD::FLOG10, ISD::FPOW,
557 ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
558 ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
559 ISD::FSQRT, ISD::FSIN, ISD::FSUB,
560 ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
561 ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC,
562 ISD::FCANONICALIZE, ISD::FROUNDEVEN},
563 VT, Action: Expand);
564 }
565
566 // This causes using an unrolled select operation rather than expansion with
567 // bit operations. This is in general better, but the alternative using BFI
568 // instructions may be better if the select sources are SGPRs.
569 setOperationAction(Op: ISD::SELECT, VT: MVT::v2f32, Action: Promote);
570 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
571
572 setOperationAction(Op: ISD::SELECT, VT: MVT::v3f32, Action: Promote);
573 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
574
575 setOperationAction(Op: ISD::SELECT, VT: MVT::v4f32, Action: Promote);
576 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
577
578 setOperationAction(Op: ISD::SELECT, VT: MVT::v5f32, Action: Promote);
579 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
580
581 setOperationAction(Op: ISD::SELECT, VT: MVT::v6f32, Action: Promote);
582 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
583
584 setOperationAction(Op: ISD::SELECT, VT: MVT::v7f32, Action: Promote);
585 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
586
587 setOperationAction(Op: ISD::SELECT, VT: MVT::v9f32, Action: Promote);
588 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
589
590 setOperationAction(Op: ISD::SELECT, VT: MVT::v10f32, Action: Promote);
591 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
592
593 setOperationAction(Op: ISD::SELECT, VT: MVT::v11f32, Action: Promote);
594 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
595
596 setOperationAction(Op: ISD::SELECT, VT: MVT::v12f32, Action: Promote);
597 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
598
599 setSchedulingPreference(Sched::RegPressure);
600 setJumpIsExpensive(true);
601
602 setMinCmpXchgSizeInBits(32);
603 setSupportsUnalignedAtomics(false);
604
605 PredictableSelectIsExpensive = false;
606
607 // We want to find all load dependencies for long chains of stores to enable
608 // merging into very wide vectors. The problem is with vectors with > 4
609 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
610 // vectors are a legal type, even though we have to split the loads
611 // usually. When we can more precisely specify load legality per address
612 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
613 // smarter so that they can figure out what to do in 2 iterations without all
614 // N > 4 stores on the same chain.
615 GatherAllAliasesMaxDepth = 16;
616
617 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
618 // about these during lowering.
619 MaxStoresPerMemcpy = 0xffffffff;
620 MaxStoresPerMemmove = 0xffffffff;
621 MaxStoresPerMemset = 0xffffffff;
622
623 // The expansion for 64-bit division is enormous.
624 if (AMDGPUBypassSlowDiv)
625 addBypassSlowDiv(SlowBitWidth: 64, FastBitWidth: 32);
626
627 setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
628 ISD::SRA, ISD::SRL,
629 ISD::TRUNCATE, ISD::MUL,
630 ISD::SMUL_LOHI, ISD::UMUL_LOHI,
631 ISD::MULHU, ISD::MULHS,
632 ISD::SELECT, ISD::SELECT_CC,
633 ISD::STORE, ISD::FADD,
634 ISD::FSUB, ISD::FNEG,
635 ISD::FABS, ISD::AssertZext,
636 ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
637
638 setMaxAtomicSizeInBitsSupported(64);
639 setMaxDivRemBitWidthSupported(64);
640 setMaxLargeFPConvertBitWidthSupported(64);
641}
642
643bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
644 const auto Flags = Op.getNode()->getFlags();
645 if (Flags.hasNoSignedZeros())
646 return true;
647
648 return false;
649}
650
651//===----------------------------------------------------------------------===//
652// Target Information
653//===----------------------------------------------------------------------===//
654
LLVM_READNONE
/// Return true if an fneg of a value produced by opcode \p Opc can be folded
/// into the producing instruction for free (as a source modifier on the
/// result, or by negating the operation itself).
static bool fnegFoldsIntoOpcode(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
  case ISD::SELECT:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
  // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  case ISD::BITCAST:
    // Bitcast has its own operand-dependent handling in fnegFoldsIntoOp.
    llvm_unreachable("bitcast is special cased");
  default:
    return false;
  }
}
694
695static bool fnegFoldsIntoOp(const SDNode *N) {
696 unsigned Opc = N->getOpcode();
697 if (Opc == ISD::BITCAST) {
698 // TODO: Is there a benefit to checking the conditions performFNegCombine
699 // does? We don't for the other cases.
700 SDValue BCSrc = N->getOperand(Num: 0);
701 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
702 return BCSrc.getNumOperands() == 2 &&
703 BCSrc.getOperand(i: 1).getValueSizeInBits() == 32;
704 }
705
706 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
707 }
708
709 return fnegFoldsIntoOpcode(Opc);
710}
711
712/// \p returns true if the operation will definitely need to use a 64-bit
713/// encoding, and thus will use a VOP3 encoding regardless of the source
714/// modifiers.
715LLVM_READONLY
716static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
717 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
718 VT == MVT::f64;
719}
720
721/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
722/// type for ISD::SELECT.
723LLVM_READONLY
724static bool selectSupportsSourceMods(const SDNode *N) {
725 // TODO: Only applies if select will be vector
726 return N->getValueType(ResNo: 0) == MVT::f32;
727}
728
// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READONLY
static bool hasSourceMods(const SDNode *N) {
  // Memory operations never take source modifiers.
  if (isa<MemSDNode>(Val: N))
    return false;

  switch (N->getOpcode()) {
  // Opcodes with no modifier-capable encoding.
  case ISD::CopyToReg:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
  case ISD::INTRINSIC_W_CHAIN:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (N->getConstantOperandVal(Num: 0)) {
    // The interpolation intrinsics are treated as not accepting modifiers.
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  case ISD::SELECT:
    // Select support depends on the result type; see helper.
    return selectSupportsSourceMods(N);
  default:
    return true;
  }
}
768
769bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
770 unsigned CostThreshold) {
771 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
772 // it is truly free to use a source modifier in all cases. If there are
773 // multiple users but for each one will necessitate using VOP3, there will be
774 // a code size increase. Try to avoid increasing code size unless we know it
775 // will save on the instruction count.
776 unsigned NumMayIncreaseSize = 0;
777 MVT VT = N->getValueType(ResNo: 0).getScalarType().getSimpleVT();
778
779 assert(!N->use_empty());
780
781 // XXX - Should this limit number of uses to check?
782 for (const SDNode *U : N->users()) {
783 if (!hasSourceMods(N: U))
784 return false;
785
786 if (!opMustUseVOP3Encoding(N: U, VT)) {
787 if (++NumMayIncreaseSize > CostThreshold)
788 return false;
789 }
790 }
791
792 return true;
793}
794
795EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
796 ISD::NodeType ExtendKind) const {
797 assert(!VT.isVector() && "only scalar expected");
798
799 // Round to the next multiple of 32-bits.
800 unsigned Size = VT.getSizeInBits();
801 if (Size <= 32)
802 return MVT::i32;
803 return EVT::getIntegerVT(Context, BitWidth: 32 * ((Size + 31) / 32));
804}
805
// Vector element indices are always 32 bits wide on AMDGPU, independent of
// the pointer width in the data layout.
unsigned AMDGPUTargetLowering::getVectorIdxWidth(const DataLayout &) const {
  return 32;
}
809
// All select flavors are supported; no kind needs to be expanded.
bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}
813
// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  // Legality is decided purely by the scalar element type, ignoring the
  // immediate's actual value.
  return isTypeLegal(VT: VT.getScalarType());
}
820
821// We don't want to shrink f64 / f32 constants.
822bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
823 EVT ScalarVT = VT.getScalarType();
824 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
825}
826
/// Decide whether it is profitable to narrow the load \p N to \p NewVT.
bool AMDGPUTargetLowering::shouldReduceLoadWidth(
    SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
    std::optional<unsigned> ByteOffset) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(Load: N, ExtTy, NewVT, ByteOffset))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(ResNo: 0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(Val: N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  // TODO: Update this for GFX12 which does have scalar sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(Val: N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
        MN->isInvariant())) &&
      AMDGPU::isUniformMMO(MMO: MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in continuing
  // to reduce the width.
  return (OldSize < 32);
}
866
/// Return true if loading as \p CastTy and bitcasting back beats loading
/// \p LoadTy directly.
bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
                                                   const SelectionDAG &DAG,
                                                   const MachineMemOperand &MMO) const {

  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  // i32-element loads are already in the preferred form.
  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  // Don't cast down to sub-dword elements.
  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  // Only beneficial if the cast type is a fast, legal memory access here.
  unsigned Fast = 0;
  return allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
                                        VT: CastTy, MMO, Fast: &Fast) &&
         Fast;
}
887
// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
// profitable with the expansion for 64-bit since it's generally good to
// speculate things.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  // Unconditionally cheap for every type.
  return true;
}
894
// ctlz is as cheap to speculate as cttz; see the rationale above.
bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return true;
}
898
899bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
900 switch (N->getOpcode()) {
901 case ISD::EntryToken:
902 case ISD::TokenFactor:
903 return true;
904 case ISD::INTRINSIC_WO_CHAIN: {
905 unsigned IntrID = N->getConstantOperandVal(Num: 0);
906 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
907 }
908 case ISD::INTRINSIC_W_CHAIN: {
909 unsigned IntrID = N->getConstantOperandVal(Num: 1);
910 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
911 }
912 case ISD::LOAD:
913 if (cast<LoadSDNode>(Val: N)->getMemOperand()->getAddrSpace() ==
914 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
915 return true;
916 return false;
917 case AMDGPUISD::SETCC: // ballot-style instruction
918 return true;
919 }
920 return false;
921}
922
/// Try to produce a cheaply-negated form of \p Op; falls back to the generic
/// TargetLowering implementation for opcodes without special handling.
SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {

  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(N: Op.getNode()))
      return SDValue();
    break;
  }
  case AMDGPUISD::RCP: {
    SDValue Src = Op.getOperand(i: 0);
    EVT VT = Op.getValueType();
    SDLoc SL(Op);

    // Push the negation into the RCP's operand: rebuild rcp(neg(src)).
    SDValue NegSrc = getNegatedExpression(Op: Src, DAG, LegalOperations,
                                          ForCodeSize, Cost, Depth: Depth + 1);
    if (NegSrc)
      return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: NegSrc, Flags: Op->getFlags());
    return SDValue();
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOps: LegalOperations,
                                              OptForSize: ForCodeSize, Cost, Depth);
}
953
954//===---------------------------------------------------------------------===//
955// Target Properties
956//===---------------------------------------------------------------------===//
957
bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  // Report this based on the end legalized type.
  // NOTE: unlike isFNegFree below, this deliberately checks the full type
  // (not the scalar type), so packed vectors like v2f16 report false.
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
}
965
966bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
967 assert(VT.isFloatingPoint());
968 // Report this based on the end legalized type.
969 VT = VT.getScalarType();
970 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
971}
972
// Storing a vector constant is always considered cheap, regardless of the
// constant's value, element count, or address space.
bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
                                                          unsigned NumElem,
                                                          unsigned AS) const {
  return true;
}
978
bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any vector
  // operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}
990
991bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
992 // Truncate is just accessing a subregister.
993
994 unsigned SrcSize = Source.getSizeInBits();
995 unsigned DestSize = Dest.getSizeInBits();
996
997 return DestSize < SrcSize && DestSize % 32 == 0 ;
998}
999
1000bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
1001 // Truncate is just accessing a subregister.
1002
1003 unsigned SrcSize = Source->getScalarSizeInBits();
1004 unsigned DestSize = Dest->getScalarSizeInBits();
1005
1006 if (DestSize== 16 && Subtarget->has16BitInsts())
1007 return SrcSize >= 32;
1008
1009 return DestSize < SrcSize && DestSize % 32 == 0;
1010}
1011
1012bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
1013 unsigned SrcSize = Src->getScalarSizeInBits();
1014 unsigned DestSize = Dest->getScalarSizeInBits();
1015
1016 if (SrcSize == 16 && Subtarget->has16BitInsts())
1017 return DestSize >= 32;
1018
1019 return SrcSize == 32 && DestSize == 64;
1020}
1021
1022bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
1023 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1024 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1025 // this will enable reducing 64-bit operations the 32-bit, which is always
1026 // good.
1027
1028 if (Src == MVT::i16)
1029 return Dest == MVT::i32 ||Dest == MVT::i64 ;
1030
1031 return Src == MVT::i32 && Dest == MVT::i64;
1032}
1033
/// Decide whether narrowing node \p N from \p SrcVT to \p DestVT is likely to
/// be profitable.
bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
                                                 EVT DestVT) const {
  switch (N->getOpcode()) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::MUL:
  case ISD::SETCC:
  case ISD::SELECT:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:
    if (isTypeLegal(VT: MVT::i16) &&
        (!DestVT.isVector() ||
         !isOperationLegal(Op: ISD::ADD, VT: MVT::v2i16))) { // Check if VOP3P
      // Don't narrow back down to i16 if promoted to i32 already.
      if (!N->isDivergent() && DestVT.isInteger() &&
          DestVT.getScalarSizeInBits() > 1 &&
          DestVT.getScalarSizeInBits() <= 16 &&
          SrcVT.getScalarSizeInBits() > 16) {
        return false;
      }
    }
    return true;
  default:
    break;
  }

  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  if (isa<LoadSDNode>(Val: N))
    return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;

  return true;
}
1079
/// Decide whether to commute a shift with its (binary-op) operand, taking care
/// not to break BFE and shifted-zextload-merge patterns.
bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
    const SDNode* N, CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");

  // Don't duplicate work for an operand with other users.
  SDValue ShiftLHS = N->getOperand(Num: 0);
  if (!ShiftLHS->hasOneUse())
    return false;

  if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
      !ShiftLHS.getOperand(i: 0)->hasOneUse())
    return false;

  // Always commute pre-type legalization and right shifts.
  // We're looking for shl(or(x,y),z) patterns.
  if (Level < CombineLevel::AfterLegalizeTypes ||
      N->getOpcode() != ISD::SHL || N->getOperand(Num: 0).getOpcode() != ISD::OR)
    return true;

  // If only user is a i32 right-shift, then don't destroy a BFE pattern.
  if (N->getValueType(ResNo: 0) == MVT::i32 && N->hasOneUse() &&
      (N->user_begin()->getOpcode() == ISD::SRA ||
       N->user_begin()->getOpcode() == ISD::SRL))
    return false;

  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
    if (LHS.getOpcode() != ISD::SHL)
      return false;
    auto *RHSLd = dyn_cast<LoadSDNode>(Val&: RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(Val: LHS.getOperand(i: 0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
    // Match zextload shifted fully above a second zextload's bits.
    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
           RHSLd->getExtensionType() == ISD::ZEXTLOAD;
  };
  SDValue LHS = N->getOperand(Num: 0).getOperand(i: 0);
  SDValue RHS = N->getOperand(Num: 0).getOperand(i: 1);
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
}
1121
1122//===---------------------------------------------------------------------===//
1123// TargetLowering Callbacks
1124//===---------------------------------------------------------------------===//
1125
/// Select the calling-convention assignment function used for the incoming
/// arguments of a call with convention \p CC. Kernel conventions are rejected
/// because kernels cannot be called.
CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                   bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CC_AMDGPU_CS_CHAIN;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
  case CallingConv::AMDGPU_Gfx_WholeWave:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    reportFatalUsageError(reason: "unsupported calling convention for call");
  }
}
1153
/// Select the calling-convention assignment function used for return values
/// under convention \p CC.
CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                     bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
  case CallingConv::AMDGPU_Gfx_WholeWave:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    reportFatalUsageError(reason: "unsupported calling convention");
  }
}
1181
1182/// The SelectionDAGBuilder will automatically promote function arguments
1183/// with illegal types. However, this does not work for the AMDGPU targets
1184/// since the function arguments are stored in memory as these illegal types.
1185/// In order to handle this properly we need to get the original types sizes
/// from the LLVM IR Function and fix up the ISD::InputArg values before
1187/// passing them to AnalyzeFormalArguments()
1188
1189/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1190/// input values across multiple registers. Each item in the Ins array
1191/// represents a single value that will be stored in registers. Ins[x].VT is
1192/// the value type of the value that will be stored in the register, so
1193/// whatever SDNode we lower the argument to needs to be this type.
1194///
1195/// In order to correctly lower the arguments we need to know the size of each
1196/// argument. Since Ins[x].VT gives us the size of the register that will
1197/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1198/// for the original function argument so that we can deduce the correct memory
1199/// type to use for Ins[x]. In most cases the correct memory type will be
1200/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1201/// we have a kernel argument of type v8i8, this argument will be split into
1202/// 8 parts and each part will be represented by its own item in the Ins array.
1203/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1204/// the argument before it was split. From this, we deduce that the memory type
1205/// for each individual part is i8. We pass the memory type as LocVT to the
1206/// calling convention analysis function and the register type (Ins[x].VT) as
1207/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State,
    const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getContext();
  const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset();
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getDataLayout();

  // Running index into Ins; one CCValAssign is added per register part.
  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    // byref arguments are laid out using the pointee type.
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        Alignment: IsByRef ? Arg.getParamAlign() : std::nullopt, Ty: MemArgTy);
    MaxAlign = std::max(a: Alignment, b: MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(Ty: MemArgTy);

    // Align the running offset for this argument, then advance it past the
    // argument's allocation.
    uint64_t ArgOffset = alignTo(Size: ExplicitArgOffset, A: Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(Size: ExplicitArgOffset, A: Alignment) + AllocSize;

    // We're basically throwing away everything passed into us and starting over
    // to get accurate in-memory offsets. The "PartOffset" is completely useless
    // to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(TLI: *this, DL, Ty: BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
                    FixedOffsets: &Offsets, StartingOffset: ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Context&: Ctx, CC, VT: ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Context&: Ctx, CC, VT: ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        // The remaining case: deduce the per-register memory type by evenly
        // dividing the argument's store size among the registers.
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(Context&: State.getContext(), BitWidth: MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different elements size.
          EVT ScalarVT = EVT::getIntegerVT(Context&: State.getContext(),
                                           BitWidth: MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(Context&: State.getContext(), VT: ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 argument.
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        MemVT = MemVT.getPow2VectorType(Context&: State.getContext());
      } else if (!MemVT.isSimple() && !MemVT.isVector()) {
        MemVT = MemVT.getRoundIntegerType(Context&: State.getContext());
      }

      // Emit one location per register part, stepping through memory by the
      // part's store size.
      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(V: CCValAssign::getCustomMem(ValNo: InIndex++, ValVT: RegisterVT,
                                                 Offset: BasePartOffset + PartOffset,
                                                 LocVT: MemVT.getSimpleVT(),
                                                 HTP: CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}
1321
/// Lower a return by emitting an end-of-program node; outgoing values are
/// ignored at this level.
SDValue AMDGPUTargetLowering::LowerReturn(
  SDValue Chain, CallingConv::ID CallConv,
  bool isVarArg,
  const SmallVectorImpl<ISD::OutputArg> &Outs,
  const SmallVectorImpl<SDValue> &OutVals,
  const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  // "wave terminate should not have return values");
  return DAG.getNode(Opcode: AMDGPUISD::ENDPGM, DL, VT: MVT::Other, Operand: Chain);
}
1333
1334//===---------------------------------------------------------------------===//
1335// Target specific lowering
1336//===---------------------------------------------------------------------===//
1337
/// Selects the correct CCAssignFn for a given CallingConvention value.
// Thin forwarder to the shared table in AMDGPUCallLowering.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}
1343
/// Selects the correct CCAssignFn for return values; forwards to the shared
/// table in AMDGPUCallLowering.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}
1348
/// Build a TokenFactor that orders \p Chain after any loads from the stack
/// slot \p ClobberedFI, so an outgoing-argument store cannot clobber an
/// incoming argument that is still being read.
SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ObjectIdx: ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ObjectIdx: ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Elt: Chain);

  // Add a chain value for each stack argument corresponding
  for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(Val: U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: L->getBasePtr())) {
        // Negative frame indices are fixed (incoming-argument) objects.
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(ObjectIdx: FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(ObjectIdx: FI->getIndex()) - 1;

          // Collect the load's chain if its byte range overlaps the
          // clobbered slot.
          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(Elt: SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ArgChains);
}
1382
/// Diagnose an unsupported call, then produce poison return values and a
/// trivial call sequence so lowering can continue.
SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  // Best-effort callee name for the diagnostic.
  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Val&: Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee))
    FuncName = G->getGlobal()->getName();

  DAG.getContext()->diagnose(
      DI: DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));

  // Supply poison for each expected return value so users stay well-formed.
  if (!CLI.IsTailCall) {
    for (ISD::InputArg &Arg : CLI.Ins)
      InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
  }

  // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
  if (getTargetMachine().getTargetTriple().getArch() == Triple::r600)
    return CLI.Chain;

  // Emit an empty call sequence to keep the chain structure valid.
  SDValue Chain = DAG.getCALLSEQ_START(Chain: CLI.Chain, InSize: 0, OutSize: 0, DL: CLI.DL);
  return DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, /*InGlue=*/Glue: SDValue(), DL: CLI.DL);
}
1413
/// Calls are unsupported at this level; diagnose and recover via
/// lowerUnhandledCall. Subclasses that support calls override this.
SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, Reason: "unsupported call to function ");
}
1418
/// Dynamic allocas are unsupported here: diagnose, then return a zero pointer
/// plus the incoming chain so lowering can proceed.
SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
      Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
  // Results: {pointer = 0, chain = original chain (operand 0)}.
  auto Ops = {DAG.getConstant(Val: 0, DL: SDLoc(), VT: Op.getValueType()), Op.getOperand(i: 0)};
  return DAG.getMergeValues(Ops, dl: SDLoc());
}
1428
/// Dispatch custom lowering for the operations this target marked Custom.
SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(OS&: errs(), G: &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM:
    return LowerSDIVREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUNDEVEN:
    return LowerFROUNDEVEN(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG2:
    return LowerFLOG2(Op, DAG);
  case ISD::FLOG:
  case ISD::FLOG10:
    return LowerFLOGCommon(Op, DAG);
  case ISD::FEXP:
  case ISD::FEXP10:
    return lowerFEXP(Op, DAG);
  case ISD::FEXP2:
    return lowerFEXP2(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::FP_TO_SINT_SAT:
  case ISD::FP_TO_UINT_SAT:
    return LowerFP_TO_INT_SAT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}
1479
/// Produce legalized replacement results for nodes with illegal result types;
/// leaving Results empty lets the default legalization proceed.
void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  case ISD::FLOG2:
    if (SDValue Lowered = LowerFLOG2(Op: SDValue(N, 0), DAG))
      Results.push_back(Elt: Lowered);
    return;
  case ISD::FLOG:
  case ISD::FLOG10:
    if (SDValue Lowered = LowerFLOGCommon(Op: SDValue(N, 0), DAG))
      Results.push_back(Elt: Lowered);
    return;
  case ISD::FEXP2:
    if (SDValue Lowered = lowerFEXP2(Op: SDValue(N, 0), DAG))
      Results.push_back(Elt: Lowered);
    return;
  case ISD::FEXP:
  case ISD::FEXP10:
    if (SDValue Lowered = lowerFEXP(Op: SDValue(N, 0), DAG))
      Results.push_back(Elt: Lowered);
    return;
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    if (auto Lowered = lowerCTLZResults(Op: SDValue(N, 0u), DAG))
      Results.push_back(Elt: Lowered);
    return;
  default:
    return;
  }
}
1519
/// Custom lowering for global addresses in the LDS (local/region) address
/// spaces. Absolute-addressed LDS globals (including named barriers) fold to
/// a constant address; other LDS globals get an offset allocated in this
/// function's LDS frame, which is only valid inside kernels. Returns an
/// empty SDValue for globals not handled here.
SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Val&: Op);
  const GlobalValue *GV = G->getGlobal();

  if (!MFI->isModuleEntryFunction()) {
    // NOTE(review): this cast assumes GV is always a GlobalVariable when the
    // current function is not a module entry point — confirm callers only
    // reach here for variables.
    auto IsNamedBarrier = AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV));
    if (std::optional<uint32_t> Address =
            AMDGPUMachineFunction::getLDSAbsoluteAddress(GV: *GV)) {
      if (IsNamedBarrier) {
        // Barrier count is derived from the variable's size in 16-byte units.
        unsigned BarCnt = cast<GlobalVariable>(Val: GV)->getGlobalSize(DL) / 16;
        MFI->recordNumNamedBarriers(GVAddr: Address.value(), BarCnt);
      }
      // The global has a known absolute address; fold it to a constant.
      return DAG.getConstant(Val: *Address, DL: SDLoc(Op), VT: Op.getValueType());
    } else if (IsNamedBarrier) {
      llvm_unreachable("named barrier should have an assigned address");
    }
  }

  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
      G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        GV->getName() != "llvm.amdgcn.module.lds" &&
        !AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV))) {
      SDLoc DL(Op);
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
          Fn, "local memory global used by non-kernel function",
          DL.getDebugLoc(), DS_Warning));

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      SDValue Trap = DAG.getNode(Opcode: ISD::TRAP, DL, VT: MVT::Other, Operand: DAG.getEntryNode());
      SDValue OutputChain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other,
                                        N1: Trap, N2: DAG.getRoot());
      DAG.setRoot(OutputChain);
      // The address itself is unusable past this point.
      return DAG.getPOISON(VT: Op.getValueType());
    }

    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with an non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    unsigned Offset = MFI->allocateLDSGlobal(DL, GV: *cast<GlobalVariable>(Val: GV));
    return DAG.getConstant(Val: Offset, DL: SDLoc(Op), VT: Op.getValueType());
  }
  // Not an LDS/region global: defer to the default lowering path.
  return SDValue();
}
1577
1578SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1579 SelectionDAG &DAG) const {
1580 SmallVector<SDValue, 8> Args;
1581 SDLoc SL(Op);
1582
1583 EVT VT = Op.getValueType();
1584 if (VT.getVectorElementType().getSizeInBits() < 32) {
1585 unsigned OpBitSize = Op.getOperand(i: 0).getValueType().getSizeInBits();
1586 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1587 unsigned NewNumElt = OpBitSize / 32;
1588 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1589 : EVT::getVectorVT(Context&: *DAG.getContext(),
1590 VT: MVT::i32, NumElements: NewNumElt);
1591 for (const SDUse &U : Op->ops()) {
1592 SDValue In = U.get();
1593 SDValue NewIn = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewEltVT, Operand: In);
1594 if (NewNumElt > 1)
1595 DAG.ExtractVectorElements(Op: NewIn, Args);
1596 else
1597 Args.push_back(Elt: NewIn);
1598 }
1599
1600 EVT NewVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
1601 NumElements: NewNumElt * Op.getNumOperands());
1602 SDValue BV = DAG.getBuildVector(VT: NewVT, DL: SL, Ops: Args);
1603 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: BV);
1604 }
1605 }
1606
1607 for (const SDUse &U : Op->ops())
1608 DAG.ExtractVectorElements(Op: U.get(), Args);
1609
1610 return DAG.getBuildVector(VT: Op.getValueType(), DL: SL, Ops: Args);
1611}
1612
1613SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1614 SelectionDAG &DAG) const {
1615 SDLoc SL(Op);
1616 SmallVector<SDValue, 8> Args;
1617 unsigned Start = Op.getConstantOperandVal(i: 1);
1618 EVT VT = Op.getValueType();
1619 EVT SrcVT = Op.getOperand(i: 0).getValueType();
1620
1621 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1622 unsigned NumElt = VT.getVectorNumElements();
1623 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1624 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1625
1626 // Extract 32-bit registers at a time.
1627 EVT NewSrcVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: NumSrcElt / 2);
1628 EVT NewVT = NumElt == 2
1629 ? MVT::i32
1630 : EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: NumElt / 2);
1631 SDValue Tmp = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewSrcVT, Operand: Op.getOperand(i: 0));
1632
1633 DAG.ExtractVectorElements(Op: Tmp, Args, Start: Start / 2, Count: NumElt / 2);
1634 if (NumElt == 2)
1635 Tmp = Args[0];
1636 else
1637 Tmp = DAG.getBuildVector(VT: NewVT, DL: SL, Ops: Args);
1638
1639 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Tmp);
1640 }
1641
1642 DAG.ExtractVectorElements(Op: Op.getOperand(i: 0), Args, Start,
1643 Count: VT.getVectorNumElements());
1644
1645 return DAG.getBuildVector(VT: Op.getValueType(), DL: SL, Ops: Args);
1646}
1647
1648// TODO: Handle fabs too
1649static SDValue peekFNeg(SDValue Val) {
1650 if (Val.getOpcode() == ISD::FNEG)
1651 return Val.getOperand(i: 0);
1652
1653 return Val;
1654}
1655
1656static SDValue peekFPSignOps(SDValue Val) {
1657 if (Val.getOpcode() == ISD::FNEG)
1658 Val = Val.getOperand(i: 0);
1659 if (Val.getOpcode() == ISD::FABS)
1660 Val = Val.getOperand(i: 0);
1661 if (Val.getOpcode() == ISD::FCOPYSIGN)
1662 Val = Val.getOperand(i: 0);
1663 return Val;
1664}
1665
/// Map a select+setcc pair (whose select operands equal the compare
/// operands — the caller guarantees this) onto an FMIN_LEGACY or
/// FMAX_LEGACY node, choosing the operand order that reproduces the
/// compare's NaN behavior. Returns an empty SDValue for condition codes
/// with no legacy min/max equivalent, or when the combine must wait until
/// after legalization.
SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
    const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
    SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val&: CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    // Equality / orderedness-style predicates have no min/max equivalent.
    break;
  case ISD::SETULE:
  case ISD::SETULT: {
    // Unordered less-than: permute operands based on which side is selected
    // so the NaN outcome matches the original select.
    if (LHS == True)
      return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: RHS, N2: LHS);
    return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: LHS, N2: RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: LHS, N2: RHS);
    return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: RHS, N2: LHS);
  }
  case ISD::SETUGE:
  case ISD::SETUGT: {
    // Unordered greater-than: mirror image of the SETULE/SETULT case.
    if (LHS == True)
      return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: RHS, N2: LHS);
    return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: LHS, N2: RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    // Ordered greater-than: mirror image of the ordered less-than case,
    // with the same post-legalization restriction.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: LHS, N2: RHS);
    return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: RHS, N2: LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}
1733
/// Generate Min/Max node
///
/// Combine a select+compare into FMIN_LEGACY/FMAX_LEGACY. The fast path
/// requires the select's operands to be exactly the compare's operands;
/// otherwise, attempt to peel an fneg off the true value so the pattern
/// matches, re-applying the fneg on the combined result.
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  // Direct match: selected values are exactly the compared values.
  if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
    return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);

  SelectionDAG &DAG = DCI.DAG;

  // If we can't directly match this, try to see if we can fold an fneg to
  // match.

  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
  ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(Val&: False);
  SDValue NegTrue = peekFNeg(Val: True);

  // Undo the combine foldFreeOpFromSelect does if it helps us match the
  // fmin/fmax.
  //
  // select (fcmp olt (lhs, K)), (fneg lhs), -K
  // -> fneg (fmin_legacy lhs, K)
  //
  // TODO: Use getNegatedExpression
  if (LHS == NegTrue && CFalse && CRHS) {
    APFloat NegRHS = neg(X: CRHS->getValueAPF());
    // Only fires when the false constant is exactly the negated RHS constant.
    if (NegRHS == CFalse->getValueAPF()) {
      // Combine on the un-negated value, then re-apply the fneg on top.
      SDValue Combined =
          combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True: NegTrue, False, CC, DCI);
      if (Combined)
        return DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: Combined);
      return SDValue();
    }
  }

  return SDValue();
}
1772
1773std::pair<SDValue, SDValue>
1774AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1775 SDLoc SL(Op);
1776
1777 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1778
1779 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
1780 const SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
1781
1782 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: Zero);
1783 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: One);
1784
1785 return std::pair(Lo, Hi);
1786}
1787
1788SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1789 SDLoc SL(Op);
1790
1791 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1792 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
1793 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: Zero);
1794}
1795
1796SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1797 SDLoc SL(Op);
1798
1799 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1800 const SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
1801 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: One);
1802}
1803
1804// Split a vector type into two parts. The first part is a power of two vector.
1805// The second part is whatever is left over, and is a scalar if it would
1806// otherwise be a 1-vector.
1807std::pair<EVT, EVT>
1808AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1809 EVT LoVT, HiVT;
1810 EVT EltVT = VT.getVectorElementType();
1811 unsigned NumElts = VT.getVectorNumElements();
1812 unsigned LoNumElts = PowerOf2Ceil(A: (NumElts + 1) / 2);
1813 LoVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: LoNumElts);
1814 HiVT = NumElts - LoNumElts == 1
1815 ? EltVT
1816 : EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NumElts - LoNumElts);
1817 return std::pair(LoVT, HiVT);
1818}
1819
1820// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1821// scalar.
1822std::pair<SDValue, SDValue>
1823AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1824 const EVT &LoVT, const EVT &HiVT,
1825 SelectionDAG &DAG) const {
1826 EVT VT = N.getValueType();
1827 assert(LoVT.getVectorNumElements() +
1828 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1829 VT.getVectorNumElements() &&
1830 "More vector elements requested than available!");
1831 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: LoVT, N1: N,
1832 N2: DAG.getVectorIdxConstant(Val: 0, DL));
1833
1834 unsigned LoNumElts = LoVT.getVectorNumElements();
1835
1836 if (HiVT.isVector()) {
1837 unsigned HiNumElts = HiVT.getVectorNumElements();
1838 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1839 // Avoid creating an extract_subvector with an index that isn't a multiple
1840 // of the result type.
1841 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HiVT, N1: N,
1842 N2: DAG.getConstant(Val: LoNumElts, DL, VT: MVT::i32));
1843 return {Lo, Hi};
1844 }
1845
1846 SmallVector<SDValue, 8> Elts;
1847 DAG.ExtractVectorElements(Op: N, Args&: Elts, /*Start=*/LoNumElts,
1848 /*Count=*/HiNumElts);
1849 SDValue Hi = DAG.getBuildVector(VT: HiVT, DL, Ops: Elts);
1850 return {Lo, Hi};
1851 }
1852
1853 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: HiVT, N1: N,
1854 N2: DAG.getVectorIdxConstant(Val: LoNumElts, DL));
1855 return {Lo, Hi};
1856}
1857
/// Break a vector load into two half-width loads and reassemble the result,
/// merging the two load chains with a TokenFactor.
SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Val: Op);
  EVT VT = Op.getValueType();
  SDLoc SL(Op);


  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2) {
    SDValue Ops[2];
    std::tie(args&: Ops[0], args&: Ops[1]) = scalarizeVectorLoad(LD: Load, DAG);
    return DAG.getMergeValues(Ops, dl: SL);
  }

  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  // Split both the register and memory types; the high part may be a scalar
  // (see getSplitDestVTs).
  std::tie(args&: LoVT, args&: HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(args&: LoMemVT, args&: HiMemVT) = getSplitDestVTs(VT: MemVT, DAG);
  std::tie(args&: Lo, args&: Hi) = splitVector(N: Op, DL: SL, LoVT, HiVT, DAG);

  // The high half's alignment is whatever the base alignment still
  // guarantees at the low half's byte size.
  unsigned Size = LoMemVT.getStoreSize();
  Align BaseAlign = Load->getAlign();
  Align HiAlign = commonAlignment(A: BaseAlign, Offset: Size);

  SDValue LoLoad = DAG.getExtLoad(
      ExtType: Load->getExtensionType(), dl: SL, VT: LoVT, Chain: Load->getChain(), Ptr: BasePtr, PtrInfo: SrcValue,
      MemVT: LoMemVT, Alignment: BaseAlign, MMOFlags: Load->getMemOperand()->getFlags(), AAInfo: Load->getAAInfo());
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Size));
  SDValue HiLoad = DAG.getExtLoad(
      ExtType: Load->getExtensionType(), dl: SL, VT: HiVT, Chain: Load->getChain(), Ptr: HiPtr,
      PtrInfo: SrcValue.getWithOffset(O: LoMemVT.getStoreSize()), MemVT: HiMemVT, Alignment: HiAlign,
      MMOFlags: Load->getMemOperand()->getFlags(), AAInfo: Load->getAAInfo());

  SDValue Join;
  if (LoVT == HiVT) {
    // This is the case that the vector is power of two so was evenly split.
    Join = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT, N1: LoLoad, N2: HiLoad);
  } else {
    // Uneven split: insert the low subvector, then the high subvector or
    // scalar element, into a poison value of the full result type.
    Join = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: SL, VT, N1: DAG.getPOISON(VT), N2: LoLoad,
                       N3: DAG.getVectorIdxConstant(Val: 0, DL: SL));
    Join = DAG.getNode(
        Opcode: HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, DL: SL,
        VT, N1: Join, N2: HiLoad,
        N3: DAG.getVectorIdxConstant(Val: LoVT.getVectorNumElements(), DL: SL));
  }

  // Result 0 is the joined value; result 1 merges both load chains.
  SDValue Ops[] = {Join, DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other,
                                     N1: LoLoad.getValue(R: 1), N2: HiLoad.getValue(R: 1))};

  return DAG.getMergeValues(Ops, dl: SL);
}
1917
/// Widen a 3-element vector load to 4 elements when it is sufficiently
/// aligned or known dereferenceable, otherwise fall back to splitting it.
SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
                                                     SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
  EVT VT = Op.getValueType();
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);
  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
  Align BaseAlign = Load->getAlign();
  unsigned NumElements = MemVT.getVectorNumElements();

  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
  if (NumElements != 3 ||
      (BaseAlign < Align(8) &&
       !SrcValue.isDereferenceable(Size: 16, C&: *DAG.getContext(), DL: DAG.getDataLayout())))
    return SplitVectorLoad(Op, DAG);

  assert(NumElements == 3);

  // Load 4 elements, then trim the result back to the original 3-element
  // type with an extract_subvector starting at index 0.
  EVT WideVT =
      EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: 4);
  EVT WideMemVT =
      EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(), NumElements: 4);
  // NOTE(review): unlike SplitVectorLoad, AAInfo is not propagated to the
  // widened load here — confirm this is intentional.
  SDValue WideLoad = DAG.getExtLoad(
      ExtType: Load->getExtensionType(), dl: SL, VT: WideVT, Chain: Load->getChain(), Ptr: BasePtr, PtrInfo: SrcValue,
      MemVT: WideMemVT, Alignment: BaseAlign, MMOFlags: Load->getMemOperand()->getFlags());
  return DAG.getMergeValues(
      Ops: {DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT, N1: WideLoad,
                   N2: DAG.getVectorIdxConstant(Val: 0, DL: SL)),
       WideLoad.getValue(R: 1)},
      dl: SL);
}
1951
/// Break a vector store into two half-width truncating stores, returning a
/// TokenFactor of both store chains.
SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
  SDValue Val = Store->getValue();
  EVT VT = Val.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return scalarizeVectorStore(ST: Store, DAG);

  EVT MemVT = Store->getMemoryVT();
  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  SDLoc SL(Op);

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  // Split the register type, memory type, and stored value; the high part
  // may be a scalar (see getSplitDestVTs).
  std::tie(args&: LoVT, args&: HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(args&: LoMemVT, args&: HiMemVT) = getSplitDestVTs(VT: MemVT, DAG);
  std::tie(args&: Lo, args&: Hi) = splitVector(N: Val, DL: SL, LoVT, HiVT, DAG);

  // The high half starts at the byte offset of the low half's store size.
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: LoMemVT.getStoreSize());

  const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
  Align BaseAlign = Store->getAlign();
  unsigned Size = LoMemVT.getStoreSize();
  // Alignment the base alignment still guarantees at the high-half offset.
  Align HiAlign = commonAlignment(A: BaseAlign, Offset: Size);

  SDValue LoStore =
      DAG.getTruncStore(Chain, dl: SL, Val: Lo, Ptr: BasePtr, PtrInfo: SrcValue, SVT: LoMemVT, Alignment: BaseAlign,
                        MMOFlags: Store->getMemOperand()->getFlags(), AAInfo: Store->getAAInfo());
  SDValue HiStore = DAG.getTruncStore(
      Chain, dl: SL, Val: Hi, Ptr: HiPtr, PtrInfo: SrcValue.getWithOffset(O: Size), SVT: HiMemVT, Alignment: HiAlign,
      MMOFlags: Store->getMemOperand()->getFlags(), AAInfo: Store->getAAInfo());

  // Chain both stores together for the caller.
  return DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: LoStore, N2: HiStore);
}
1992
// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The fractional part of a
// float is enough to accurately represent up to a 24-bit signed integer.
//
// Returns merged {quotient, remainder}, or an empty SDValue when either
// operand may not fit in 24 bits (fewer than 9 known sign bits).
SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
                                            bool Sign) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(i: 0);
  SDValue RHS = Op.getOperand(i: 1);
  MVT IntVT = MVT::i32;
  MVT FltVT = MVT::f32;

  // Require at least 9 known-redundant top bits in both operands so the
  // significant part fits within f32's 24-bit mantissa precision.
  unsigned LHSSignBits = DAG.ComputeNumSignBits(Op: LHS);
  if (LHSSignBits < 9)
    return SDValue();

  unsigned RHSSignBits = DAG.ComputeNumSignBits(Op: RHS);
  if (RHSSignBits < 9)
    return SDValue();

  // Number of significant result bits; signed division needs one extra.
  unsigned BitSize = VT.getSizeInBits();
  unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
  unsigned DivBits = BitSize - SignBits;
  if (Sign)
    ++DivBits;

  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;

  // jq is the +/-1 correction term added when the rounded quotient is off.
  SDValue jq = DAG.getConstant(Val: 1, DL, VT: IntVT);

  if (Sign) {
    // char|short jq = ia ^ ib;
    jq = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHS, N2: RHS);

    // jq = jq >> (bitsize - 2)
    jq = DAG.getNode(Opcode: ISD::SRA, DL, VT, N1: jq,
                     N2: DAG.getConstant(Val: BitSize - 2, DL, VT));

    // jq = jq | 0x1
    jq = DAG.getNode(Opcode: ISD::OR, DL, VT, N1: jq, N2: DAG.getConstant(Val: 1, DL, VT));
  }

  // int ia = (int)LHS;
  SDValue ia = LHS;

  // int ib, (int)RHS;
  SDValue ib = RHS;

  // float fa = (float)ia;
  SDValue fa = DAG.getNode(Opcode: ToFp, DL, VT: FltVT, Operand: ia);

  // float fb = (float)ib;
  SDValue fb = DAG.getNode(Opcode: ToFp, DL, VT: FltVT, Operand: ib);

  // Approximate quotient: fa * rcp(fb).
  SDValue fq = DAG.getNode(Opcode: ISD::FMUL, DL, VT: FltVT,
                           N1: fa, N2: DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT: FltVT, Operand: fb));

  // fq = trunc(fq);
  fq = DAG.getNode(Opcode: ISD::FTRUNC, DL, VT: FltVT, Operand: fq);

  // float fqneg = -fq;
  SDValue fqneg = DAG.getNode(Opcode: ISD::FNEG, DL, VT: FltVT, Operand: fq);

  MachineFunction &MF = DAG.getMachineFunction();

  // Select the fused-multiply-add flavor based on whether f32 denormals are
  // flushed in this function's mode.
  bool UseFmadFtz = false;
  if (Subtarget->isGCN()) {
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    UseFmadFtz =
        MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
  }

  // float fr = mad(fqneg, fb, fa);
  unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
                    : UseFmadFtz                    ? (unsigned)AMDGPUISD::FMAD_FTZ
                                                    : (unsigned)ISD::FMAD;
  SDValue fr = DAG.getNode(Opcode: OpCode, DL, VT: FltVT, N1: fqneg, N2: fb, N3: fa);

  // int iq = (int)fq;
  SDValue iq = DAG.getNode(Opcode: ToInt, DL, VT: IntVT, Operand: fq);

  // fr = fabs(fr);
  fr = DAG.getNode(Opcode: ISD::FABS, DL, VT: FltVT, Operand: fr);

  // fb = fabs(fb);
  fb = DAG.getNode(Opcode: ISD::FABS, DL, VT: FltVT, Operand: fb);

  EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);

  // int cv = fr >= fb;
  SDValue cv = DAG.getSetCC(DL, VT: SetCCVT, LHS: fr, RHS: fb, Cond: ISD::SETOGE);

  // jq = (cv ? jq : 0);
  jq = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: cv, N2: jq, N3: DAG.getConstant(Val: 0, DL, VT));

  // dst = iq + jq;
  SDValue Div = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: iq, N2: jq);

  // Rem needs compensation, it's easier to recompute it
  SDValue Rem = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Div, N2: RHS);
  Rem = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: LHS, N2: Rem);

  // Truncate to number of bits this divide really is.
  if (Sign) {
    SDValue InRegSize
      = DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: DivBits));
    Div = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Div, N2: InRegSize);
    Rem = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Rem, N2: InRegSize);
  } else {
    SDValue TruncMask = DAG.getConstant(Val: (UINT64_C(1) << DivBits) - 1, DL, VT);
    Div = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Div, N2: TruncMask);
    Rem = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Rem, N2: TruncMask);
  }

  // Result 0 is the quotient, result 1 the remainder.
  return DAG.getMergeValues(Ops: { Div, Rem }, dl: DL);
}
2110
2111void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2112 SelectionDAG &DAG,
2113 SmallVectorImpl<SDValue> &Results) const {
2114 SDLoc DL(Op);
2115 EVT VT = Op.getValueType();
2116
2117 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2118
2119 EVT HalfVT = VT.getHalfSizedIntegerVT(Context&: *DAG.getContext());
2120
2121 SDValue One = DAG.getConstant(Val: 1, DL, VT: HalfVT);
2122 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: HalfVT);
2123
2124 //HiLo split
2125 SDValue LHS_Lo, LHS_Hi;
2126 SDValue LHS = Op.getOperand(i: 0);
2127 std::tie(args&: LHS_Lo, args&: LHS_Hi) = DAG.SplitScalar(N: LHS, DL, LoVT: HalfVT, HiVT: HalfVT);
2128
2129 SDValue RHS_Lo, RHS_Hi;
2130 SDValue RHS = Op.getOperand(i: 1);
2131 std::tie(args&: RHS_Lo, args&: RHS_Hi) = DAG.SplitScalar(N: RHS, DL, LoVT: HalfVT, HiVT: HalfVT);
2132
2133 if (DAG.MaskedValueIsZero(Op: RHS, Mask: APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32)) &&
2134 DAG.MaskedValueIsZero(Op: LHS, Mask: APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32))) {
2135
2136 SDValue Res = DAG.getNode(Opcode: ISD::UDIVREM, DL, VTList: DAG.getVTList(VT1: HalfVT, VT2: HalfVT),
2137 N1: LHS_Lo, N2: RHS_Lo);
2138
2139 SDValue DIV = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Res.getValue(R: 0), Zero});
2140 SDValue REM = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Res.getValue(R: 1), Zero});
2141
2142 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: DIV));
2143 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: REM));
2144 return;
2145 }
2146
2147 if (isTypeLegal(VT: MVT::i64)) {
2148 // The algorithm here is based on ideas from "Software Integer Division",
2149 // Tom Rodeheffer, August 2008.
2150
2151 MachineFunction &MF = DAG.getMachineFunction();
2152 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2153
2154 // Compute denominator reciprocal.
2155 unsigned FMAD =
2156 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2157 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2158 ? (unsigned)ISD::FMAD
2159 : (unsigned)AMDGPUISD::FMAD_FTZ;
2160
2161 SDValue Cvt_Lo = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: MVT::f32, Operand: RHS_Lo);
2162 SDValue Cvt_Hi = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: MVT::f32, Operand: RHS_Hi);
2163 SDValue Mad1 = DAG.getNode(Opcode: FMAD, DL, VT: MVT::f32, N1: Cvt_Hi,
2164 N2: DAG.getConstantFP(Val: APInt(32, 0x4f800000).bitsToFloat(), DL, VT: MVT::f32),
2165 N3: Cvt_Lo);
2166 SDValue Rcp = DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT: MVT::f32, Operand: Mad1);
2167 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f32, N1: Rcp,
2168 N2: DAG.getConstantFP(Val: APInt(32, 0x5f7ffffc).bitsToFloat(), DL, VT: MVT::f32));
2169 SDValue Mul2 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f32, N1: Mul1,
2170 N2: DAG.getConstantFP(Val: APInt(32, 0x2f800000).bitsToFloat(), DL, VT: MVT::f32));
2171 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL, VT: MVT::f32, Operand: Mul2);
2172 SDValue Mad2 = DAG.getNode(Opcode: FMAD, DL, VT: MVT::f32, N1: Trunc,
2173 N2: DAG.getConstantFP(Val: APInt(32, 0xcf800000).bitsToFloat(), DL, VT: MVT::f32),
2174 N3: Mul1);
2175 SDValue Rcp_Lo = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL, VT: HalfVT, Operand: Mad2);
2176 SDValue Rcp_Hi = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL, VT: HalfVT, Operand: Trunc);
2177 SDValue Rcp64 = DAG.getBitcast(VT,
2178 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Rcp_Lo, Rcp_Hi}));
2179
2180 SDValue Zero64 = DAG.getConstant(Val: 0, DL, VT);
2181 SDValue One64 = DAG.getConstant(Val: 1, DL, VT);
2182 SDValue Zero1 = DAG.getConstant(Val: 0, DL, VT: MVT::i1);
2183 SDVTList HalfCarryVT = DAG.getVTList(VT1: HalfVT, VT2: MVT::i1);
2184
2185 // First round of UNR (Unsigned integer Newton-Raphson).
2186 SDValue Neg_RHS = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero64, N2: RHS);
2187 SDValue Mullo1 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Neg_RHS, N2: Rcp64);
2188 SDValue Mulhi1 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Rcp64, N2: Mullo1);
2189 SDValue Mulhi1_Lo, Mulhi1_Hi;
2190 std::tie(args&: Mulhi1_Lo, args&: Mulhi1_Hi) =
2191 DAG.SplitScalar(N: Mulhi1, DL, LoVT: HalfVT, HiVT: HalfVT);
2192 SDValue Add1_Lo = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Rcp_Lo,
2193 N2: Mulhi1_Lo, N3: Zero1);
2194 SDValue Add1_Hi = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Rcp_Hi,
2195 N2: Mulhi1_Hi, N3: Add1_Lo.getValue(R: 1));
2196 SDValue Add1 = DAG.getBitcast(VT,
2197 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Add1_Lo, Add1_Hi}));
2198
2199 // Second round of UNR.
2200 SDValue Mullo2 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Neg_RHS, N2: Add1);
2201 SDValue Mulhi2 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Add1, N2: Mullo2);
2202 SDValue Mulhi2_Lo, Mulhi2_Hi;
2203 std::tie(args&: Mulhi2_Lo, args&: Mulhi2_Hi) =
2204 DAG.SplitScalar(N: Mulhi2, DL, LoVT: HalfVT, HiVT: HalfVT);
2205 SDValue Add2_Lo = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Add1_Lo,
2206 N2: Mulhi2_Lo, N3: Zero1);
2207 SDValue Add2_Hi = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Add1_Hi,
2208 N2: Mulhi2_Hi, N3: Add2_Lo.getValue(R: 1));
2209 SDValue Add2 = DAG.getBitcast(VT,
2210 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Add2_Lo, Add2_Hi}));
2211
2212 SDValue Mulhi3 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: LHS, N2: Add2);
2213
2214 SDValue Mul3 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: RHS, N2: Mulhi3);
2215
2216 SDValue Mul3_Lo, Mul3_Hi;
2217 std::tie(args&: Mul3_Lo, args&: Mul3_Hi) = DAG.SplitScalar(N: Mul3, DL, LoVT: HalfVT, HiVT: HalfVT);
2218 SDValue Sub1_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: LHS_Lo,
2219 N2: Mul3_Lo, N3: Zero1);
2220 SDValue Sub1_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: LHS_Hi,
2221 N2: Mul3_Hi, N3: Sub1_Lo.getValue(R: 1));
2222 SDValue Sub1_Mi = DAG.getNode(Opcode: ISD::SUB, DL, VT: HalfVT, N1: LHS_Hi, N2: Mul3_Hi);
2223 SDValue Sub1 = DAG.getBitcast(VT,
2224 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub1_Lo, Sub1_Hi}));
2225
2226 SDValue MinusOne = DAG.getConstant(Val: 0xffffffffu, DL, VT: HalfVT);
2227 SDValue C1 = DAG.getSelectCC(DL, LHS: Sub1_Hi, RHS: RHS_Hi, True: MinusOne, False: Zero,
2228 Cond: ISD::SETUGE);
2229 SDValue C2 = DAG.getSelectCC(DL, LHS: Sub1_Lo, RHS: RHS_Lo, True: MinusOne, False: Zero,
2230 Cond: ISD::SETUGE);
2231 SDValue C3 = DAG.getSelectCC(DL, LHS: Sub1_Hi, RHS: RHS_Hi, True: C2, False: C1, Cond: ISD::SETEQ);
2232
2233 // TODO: Here and below portions of the code can be enclosed into if/endif.
2234 // Currently control flow is unconditional and we have 4 selects after
2235 // potential endif to substitute PHIs.
2236
2237 // if C3 != 0 ...
2238 SDValue Sub2_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub1_Lo,
2239 N2: RHS_Lo, N3: Zero1);
2240 SDValue Sub2_Mi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub1_Mi,
2241 N2: RHS_Hi, N3: Sub1_Lo.getValue(R: 1));
2242 SDValue Sub2_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Mi,
2243 N2: Zero, N3: Sub2_Lo.getValue(R: 1));
2244 SDValue Sub2 = DAG.getBitcast(VT,
2245 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub2_Lo, Sub2_Hi}));
2246
2247 SDValue Add3 = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Mulhi3, N2: One64);
2248
2249 SDValue C4 = DAG.getSelectCC(DL, LHS: Sub2_Hi, RHS: RHS_Hi, True: MinusOne, False: Zero,
2250 Cond: ISD::SETUGE);
2251 SDValue C5 = DAG.getSelectCC(DL, LHS: Sub2_Lo, RHS: RHS_Lo, True: MinusOne, False: Zero,
2252 Cond: ISD::SETUGE);
2253 SDValue C6 = DAG.getSelectCC(DL, LHS: Sub2_Hi, RHS: RHS_Hi, True: C5, False: C4, Cond: ISD::SETEQ);
2254
2255 // if (C6 != 0)
2256 SDValue Add4 = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Add3, N2: One64);
2257
2258 SDValue Sub3_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Lo,
2259 N2: RHS_Lo, N3: Zero1);
2260 SDValue Sub3_Mi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Mi,
2261 N2: RHS_Hi, N3: Sub2_Lo.getValue(R: 1));
2262 SDValue Sub3_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub3_Mi,
2263 N2: Zero, N3: Sub3_Lo.getValue(R: 1));
2264 SDValue Sub3 = DAG.getBitcast(VT,
2265 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub3_Lo, Sub3_Hi}));
2266
2267 // endif C6
2268 // endif C3
2269
2270 SDValue Sel1 = DAG.getSelectCC(DL, LHS: C6, RHS: Zero, True: Add4, False: Add3, Cond: ISD::SETNE);
2271 SDValue Div = DAG.getSelectCC(DL, LHS: C3, RHS: Zero, True: Sel1, False: Mulhi3, Cond: ISD::SETNE);
2272
2273 SDValue Sel2 = DAG.getSelectCC(DL, LHS: C6, RHS: Zero, True: Sub3, False: Sub2, Cond: ISD::SETNE);
2274 SDValue Rem = DAG.getSelectCC(DL, LHS: C3, RHS: Zero, True: Sel2, False: Sub1, Cond: ISD::SETNE);
2275
2276 Results.push_back(Elt: Div);
2277 Results.push_back(Elt: Rem);
2278
2279 return;
2280 }
2281
// r600 expansion.
2283 // Get Speculative values
2284 SDValue DIV_Part = DAG.getNode(Opcode: ISD::UDIV, DL, VT: HalfVT, N1: LHS_Hi, N2: RHS_Lo);
2285 SDValue REM_Part = DAG.getNode(Opcode: ISD::UREM, DL, VT: HalfVT, N1: LHS_Hi, N2: RHS_Lo);
2286
2287 SDValue REM_Lo = DAG.getSelectCC(DL, LHS: RHS_Hi, RHS: Zero, True: REM_Part, False: LHS_Hi, Cond: ISD::SETEQ);
2288 SDValue REM = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {REM_Lo, Zero});
2289 REM = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: REM);
2290
2291 SDValue DIV_Hi = DAG.getSelectCC(DL, LHS: RHS_Hi, RHS: Zero, True: DIV_Part, False: Zero, Cond: ISD::SETEQ);
2292 SDValue DIV_Lo = Zero;
2293
2294 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2295
2296 for (unsigned i = 0; i < halfBitWidth; ++i) {
2297 const unsigned bitPos = halfBitWidth - i - 1;
2298 SDValue POS = DAG.getConstant(Val: bitPos, DL, VT: HalfVT);
2299 // Get value of high bit
2300 SDValue HBit = DAG.getNode(Opcode: ISD::SRL, DL, VT: HalfVT, N1: LHS_Lo, N2: POS);
2301 HBit = DAG.getNode(Opcode: ISD::AND, DL, VT: HalfVT, N1: HBit, N2: One);
2302 HBit = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: HBit);
2303
2304 // Shift
2305 REM = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: REM, N2: DAG.getConstant(Val: 1, DL, VT));
2306 // Add LHS high bit
2307 REM = DAG.getNode(Opcode: ISD::OR, DL, VT, N1: REM, N2: HBit);
2308
2309 SDValue BIT = DAG.getConstant(Val: 1ULL << bitPos, DL, VT: HalfVT);
2310 SDValue realBIT = DAG.getSelectCC(DL, LHS: REM, RHS, True: BIT, False: Zero, Cond: ISD::SETUGE);
2311
2312 DIV_Lo = DAG.getNode(Opcode: ISD::OR, DL, VT: HalfVT, N1: DIV_Lo, N2: realBIT);
2313
2314 // Update REM
2315 SDValue REM_sub = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: REM, N2: RHS);
2316 REM = DAG.getSelectCC(DL, LHS: REM, RHS, True: REM_sub, False: REM, Cond: ISD::SETUGE);
2317 }
2318
2319 SDValue DIV = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {DIV_Lo, DIV_Hi});
2320 DIV = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: DIV);
2321 Results.push_back(Elt: DIV);
2322 Results.push_back(Elt: REM);
2323}
2324
2325SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2326 SelectionDAG &DAG) const {
2327 SDLoc DL(Op);
2328 EVT VT = Op.getValueType();
2329
2330 if (VT == MVT::i64) {
2331 SmallVector<SDValue, 2> Results;
2332 LowerUDIVREM64(Op, DAG, Results);
2333 return DAG.getMergeValues(Ops: Results, dl: DL);
2334 }
2335
2336 if (VT == MVT::i32) {
2337 if (SDValue Res = LowerDIVREM24(Op, DAG, Sign: false))
2338 return Res;
2339 }
2340
2341 SDValue X = Op.getOperand(i: 0);
2342 SDValue Y = Op.getOperand(i: 1);
2343
2344 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2345 // algorithm used here.
2346
2347 // Initial estimate of inv(y).
2348 SDValue Z = DAG.getNode(Opcode: AMDGPUISD::URECIP, DL, VT, Operand: Y);
2349
2350 // One round of UNR.
2351 SDValue NegY = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), N2: Y);
2352 SDValue NegYZ = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: NegY, N2: Z);
2353 Z = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Z,
2354 N2: DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Z, N2: NegYZ));
2355
2356 // Quotient/remainder estimate.
2357 SDValue Q = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: X, N2: Z);
2358 SDValue R =
2359 DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: X, N2: DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Q, N2: Y));
2360
2361 // First quotient/remainder refinement.
2362 EVT CCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2363 SDValue One = DAG.getConstant(Val: 1, DL, VT);
2364 SDValue Cond = DAG.getSetCC(DL, VT: CCVT, LHS: R, RHS: Y, Cond: ISD::SETUGE);
2365 Q = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2366 N2: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Q, N2: One), N3: Q);
2367 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2368 N2: DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: R, N2: Y), N3: R);
2369
2370 // Second quotient/remainder refinement.
2371 Cond = DAG.getSetCC(DL, VT: CCVT, LHS: R, RHS: Y, Cond: ISD::SETUGE);
2372 Q = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2373 N2: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Q, N2: One), N3: Q);
2374 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2375 N2: DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: R, N2: Y), N3: R);
2376
2377 return DAG.getMergeValues(Ops: {Q, R}, dl: DL);
2378}
2379
// Lower ISD::SDIVREM by reducing to an unsigned UDIVREM on the magnitudes and
// fixing up the result signs afterwards.
SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(i: 0);
  SDValue RHS = Op.getOperand(i: 1);

  SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
  SDValue NegOne = DAG.getAllOnesConstant(DL, VT);

  // Narrow (<= 24 significant bit) i32 values can use the fast f32-based path.
  if (VT == MVT::i32) {
    if (SDValue Res = LowerDIVREM24(Op, DAG, Sign: true))
      return Res;
  }

  // If both 64-bit operands are known to fit in 32 signed bits, do a 32-bit
  // SDIVREM and sign-extend the results.
  if (VT == MVT::i64 &&
      DAG.ComputeNumSignBits(Op: LHS) > 32 &&
      DAG.ComputeNumSignBits(Op: RHS) > 32) {
    EVT HalfVT = VT.getHalfSizedIntegerVT(Context&: *DAG.getContext());

    //HiLo split
    SDValue LHS_Lo = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL, VT: HalfVT, N1: LHS, N2: Zero);
    SDValue RHS_Lo = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL, VT: HalfVT, N1: RHS, N2: Zero);
    SDValue DIVREM = DAG.getNode(Opcode: ISD::SDIVREM, DL, VTList: DAG.getVTList(VT1: HalfVT, VT2: HalfVT),
                                 N1: LHS_Lo, N2: RHS_Lo);
    SDValue Res[2] = {
      DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: DIVREM.getValue(R: 0)),
      DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: DIVREM.getValue(R: 1))
    };
    return DAG.getMergeValues(Ops: Res, dl: DL);
  }

  // Sign masks: all-ones when the operand is negative, zero otherwise.
  SDValue LHSign = DAG.getSelectCC(DL, LHS, RHS: Zero, True: NegOne, False: Zero, Cond: ISD::SETLT);
  SDValue RHSign = DAG.getSelectCC(DL, LHS: RHS, RHS: Zero, True: NegOne, False: Zero, Cond: ISD::SETLT);
  SDValue DSign = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHSign, N2: RHSign);
  SDValue RSign = LHSign; // Remainder sign is the same as LHS

  // Branchless abs(x) as (x + mask) ^ mask (mask is 0 or all-ones).
  LHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: LHS, N2: LHSign);
  RHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: RHSign);

  LHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHS, N2: LHSign);
  RHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: RHS, N2: RHSign);

  SDValue Div = DAG.getNode(Opcode: ISD::UDIVREM, DL, VTList: DAG.getVTList(VT1: VT, VT2: VT), N1: LHS, N2: RHS);
  SDValue Rem = Div.getValue(R: 1);

  // Conditionally negate the results: (x ^ mask) - mask.
  Div = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: Div, N2: DSign);
  Rem = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: Rem, N2: RSign);

  Div = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Div, N2: DSign);
  Rem = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Rem, N2: RSign);

  SDValue Res[2] = {
    Div,
    Rem
  };
  return DAG.getMergeValues(Ops: Res, dl: DL);
}
2439
2440SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2441 SDLoc SL(Op);
2442 SDValue Src = Op.getOperand(i: 0);
2443
2444 // result = trunc(src)
2445 // if (src > 0.0 && src != result)
2446 // result += 1.0
2447
2448 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: MVT::f64, Operand: Src);
2449
2450 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT: MVT::f64);
2451 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f64);
2452
2453 EVT SetCCVT =
2454 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2455
2456 SDValue Lt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Zero, Cond: ISD::SETOGT);
2457 SDValue NeTrunc = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Trunc, Cond: ISD::SETONE);
2458 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SetCCVT, N1: Lt0, N2: NeTrunc);
2459
2460 SDValue Add = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f64, N1: And, N2: One, N3: Zero);
2461 // TODO: Should this propagate fast-math-flags?
2462 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Trunc, N2: Add);
2463}
2464
2465static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2466 SelectionDAG &DAG) {
2467 const unsigned FractBits = 52;
2468 const unsigned ExpBits = 11;
2469
2470 SDValue ExpPart = DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT: MVT::i32,
2471 N1: Hi,
2472 N2: DAG.getConstant(Val: FractBits - 32, DL: SL, VT: MVT::i32),
2473 N3: DAG.getConstant(Val: ExpBits, DL: SL, VT: MVT::i32));
2474 SDValue Exp = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: ExpPart,
2475 N2: DAG.getConstant(Val: 1023, DL: SL, VT: MVT::i32));
2476
2477 return Exp;
2478}
2479
// Lower f64 FTRUNC by clearing the fraction bits below the binary point with
// integer masking on the bit pattern.
SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(i: 0);

  assert(Op.getValueType() == MVT::f64);

  const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  SDValue Hi = getHiHalf64(Op: Src, DAG);

  SDValue Exp = extractF64Exponent(Hi, SL, DAG);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, DL: SL, VT: MVT::i32);
  SDValue SignBit = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: Hi, N2: SignBitMask);

  // Extend back to 64-bits.
  SDValue SignBit64 = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Zero, SignBit});
  SignBit64 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: SignBit64);

  SDValue BcInt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Src);
  const SDValue FractMask
    = DAG.getConstant(Val: (UINT64_C(1) << FractBits) - 1, DL: SL, VT: MVT::i64);

  // Arithmetic-shift the fraction mask right by the unbiased exponent: the
  // bits still set are the fractional bits of the value. Clearing them in the
  // bit pattern truncates toward zero.
  SDValue Shr = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i64, N1: FractMask, N2: Exp);
  SDValue Not = DAG.getNOT(DL: SL, Val: Shr, VT: MVT::i64);
  SDValue Tmp0 = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i64, N1: BcInt, N2: Not);

  EVT SetCCVT =
      getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::i32);

  const SDValue FiftyOne = DAG.getConstant(Val: FractBits - 1, DL: SL, VT: MVT::i32);

  SDValue ExpLt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Exp, RHS: Zero, Cond: ISD::SETLT);
  SDValue ExpGt51 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Exp, RHS: FiftyOne, Cond: ISD::SETGT);

  // Exponent < 0: |x| < 1.0, so the result is a signed zero. Exponent > 51:
  // the value is already integral, so pass it through unchanged.
  SDValue Tmp1 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: ExpLt0, N2: SignBit64, N3: Tmp0);
  SDValue Tmp2 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: ExpGt51, N2: BcInt, N3: Tmp1);

  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f64, Operand: Tmp2);
}
2525
2526SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2527 SelectionDAG &DAG) const {
2528 SDLoc SL(Op);
2529 SDValue Src = Op.getOperand(i: 0);
2530
2531 assert(Op.getValueType() == MVT::f64);
2532
2533 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2534 SDValue C1 = DAG.getConstantFP(Val: C1Val, DL: SL, VT: MVT::f64);
2535 SDValue CopySign = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT: MVT::f64, N1: C1, N2: Src);
2536
2537 // TODO: Should this propagate fast-math-flags?
2538
2539 SDValue Tmp1 = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Src, N2: CopySign);
2540 SDValue Tmp2 = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT: MVT::f64, N1: Tmp1, N2: CopySign);
2541
2542 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: MVT::f64, Operand: Src);
2543
2544 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2545 SDValue C2 = DAG.getConstantFP(Val: C2Val, DL: SL, VT: MVT::f64);
2546
2547 EVT SetCCVT =
2548 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2549 SDValue Cond = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Fabs, RHS: C2, Cond: ISD::SETOGT);
2550
2551 return DAG.getSelect(DL: SL, VT: MVT::f64, Cond, LHS: Src, RHS: Tmp2);
2552}
2553
2554SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2555 SelectionDAG &DAG) const {
2556 // FNEARBYINT and FRINT are the same, except in their handling of FP
2557 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2558 // rint, so just treat them as equivalent.
2559 return DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SDLoc(Op), VT: Op.getValueType(),
2560 Operand: Op.getOperand(i: 0));
2561}
2562
2563SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2564 auto VT = Op.getValueType();
2565 auto Arg = Op.getOperand(i: 0u);
2566 return DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SDLoc(Op), VT, Operand: Arg);
2567}
2568
2569// XXX - May require not supporting f32 denormals?
2570
2571// Don't handle v2f16. The extra instructions to scalarize and repack around the
2572// compare and vselect end up producing worse code than scalarizing the whole
2573// operation.
2574SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2575 SDLoc SL(Op);
2576 SDValue X = Op.getOperand(i: 0);
2577 EVT VT = Op.getValueType();
2578
2579 SDValue T = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: X);
2580
2581 // TODO: Should this propagate fast-math-flags?
2582
2583 SDValue Diff = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: X, N2: T);
2584
2585 SDValue AbsDiff = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: Diff);
2586
2587 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2588 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2589
2590 EVT SetCCVT =
2591 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2592
2593 const SDValue Half = DAG.getConstantFP(Val: 0.5, DL: SL, VT);
2594 SDValue Cmp = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsDiff, RHS: Half, Cond: ISD::SETOGE);
2595 SDValue OneOrZeroFP = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Cmp, N2: One, N3: Zero);
2596
2597 SDValue SignedOffset = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT, N1: OneOrZeroFP, N2: X);
2598 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: T, N2: SignedOffset);
2599}
2600
2601SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2602 SDLoc SL(Op);
2603 SDValue Src = Op.getOperand(i: 0);
2604
2605 // result = trunc(src);
2606 // if (src < 0.0 && src != result)
2607 // result += -1.0.
2608
2609 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: MVT::f64, Operand: Src);
2610
2611 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT: MVT::f64);
2612 const SDValue NegOne = DAG.getConstantFP(Val: -1.0, DL: SL, VT: MVT::f64);
2613
2614 EVT SetCCVT =
2615 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2616
2617 SDValue Lt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Zero, Cond: ISD::SETOLT);
2618 SDValue NeTrunc = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Trunc, Cond: ISD::SETONE);
2619 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SetCCVT, N1: Lt0, N2: NeTrunc);
2620
2621 SDValue Add = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f64, N1: And, N2: NegOne, N3: Zero);
2622 // TODO: Should this propagate fast-math-flags?
2623 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Trunc, N2: Add);
2624}
2625
2626/// Return true if it's known that \p Src can never be an f32 denormal value.
2627static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2628 switch (Src.getOpcode()) {
2629 case ISD::FP_EXTEND:
2630 return Src.getOperand(i: 0).getValueType() == MVT::f16;
2631 case ISD::FP16_TO_FP:
2632 case ISD::FFREXP:
2633 case ISD::FSQRT:
2634 case AMDGPUISD::LOG:
2635 case AMDGPUISD::EXP:
2636 return true;
2637 case ISD::INTRINSIC_WO_CHAIN: {
2638 unsigned IntrinsicID = Src.getConstantOperandVal(i: 0);
2639 switch (IntrinsicID) {
2640 case Intrinsic::amdgcn_frexp_mant:
2641 case Intrinsic::amdgcn_log:
2642 case Intrinsic::amdgcn_log_clamp:
2643 case Intrinsic::amdgcn_exp2:
2644 case Intrinsic::amdgcn_sqrt:
2645 return true;
2646 default:
2647 return false;
2648 }
2649 }
2650 default:
2651 return false;
2652 }
2653
2654 llvm_unreachable("covered opcode switch");
2655}
2656
// Whether approximate-function (afn) expansions may be used for a node.
// Decided purely from the node's fast-math flags; the DAG parameter is
// currently unused and kept for interface consistency.
bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
                                           SDNodeFlags Flags) {
  return Flags.hasApproximateFuncs();
}
2661
2662bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
2663 SDValue Src,
2664 SDNodeFlags Flags) {
2665 return !valueIsKnownNeverF32Denorm(Src) &&
2666 DAG.getMachineFunction()
2667 .getDenormalMode(FPType: APFloat::IEEEsingle())
2668 .Input != DenormalMode::PreserveSign;
2669}
2670
2671SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2672 SDValue Src,
2673 SDNodeFlags Flags) const {
2674 SDLoc SL(Src);
2675 EVT VT = Src.getValueType();
2676 const fltSemantics &Semantics = VT.getFltSemantics();
2677 SDValue SmallestNormal =
2678 DAG.getConstantFP(Val: APFloat::getSmallestNormalized(Sem: Semantics), DL: SL, VT);
2679
2680 // Want to scale denormals up, but negatives and 0 work just as well on the
2681 // scaled path.
2682 SDValue IsLtSmallestNormal = DAG.getSetCC(
2683 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Src,
2684 RHS: SmallestNormal, Cond: ISD::SETOLT);
2685
2686 return IsLtSmallestNormal;
2687}
2688
2689SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2690 SDNodeFlags Flags) const {
2691 SDLoc SL(Src);
2692 EVT VT = Src.getValueType();
2693 const fltSemantics &Semantics = VT.getFltSemantics();
2694 SDValue Inf = DAG.getConstantFP(Val: APFloat::getInf(Sem: Semantics), DL: SL, VT);
2695
2696 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: Src, Flags);
2697 SDValue IsFinite = DAG.getSetCC(
2698 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Fabs,
2699 RHS: Inf, Cond: ISD::SETOLT);
2700 return IsFinite;
2701}
2702
2703/// If denormal handling is required return the scaled input to FLOG2, and the
2704/// check for denormal range. Otherwise, return null values.
2705std::pair<SDValue, SDValue>
2706AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
2707 SDValue Src, SDNodeFlags Flags) const {
2708 if (!needsDenormHandlingF32(DAG, Src, Flags))
2709 return {};
2710
2711 MVT VT = MVT::f32;
2712 const fltSemantics &Semantics = APFloat::IEEEsingle();
2713 SDValue SmallestNormal =
2714 DAG.getConstantFP(Val: APFloat::getSmallestNormalized(Sem: Semantics), DL: SL, VT);
2715
2716 SDValue IsLtSmallestNormal = DAG.getSetCC(
2717 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Src,
2718 RHS: SmallestNormal, Cond: ISD::SETOLT);
2719
2720 SDValue Scale32 = DAG.getConstantFP(Val: 0x1.0p+32, DL: SL, VT);
2721 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2722 SDValue ScaleFactor =
2723 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLtSmallestNormal, N2: Scale32, N3: One, Flags);
2724
2725 SDValue ScaledInput = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Src, N2: ScaleFactor, Flags);
2726 return {ScaledInput, IsLtSmallestNormal};
2727}
2728
2729SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2730 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2731 // If we have to handle denormals, scale up the input and adjust the result.
2732
2733 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2734 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2735
2736 SDLoc SL(Op);
2737 EVT VT = Op.getValueType();
2738 SDValue Src = Op.getOperand(i: 0);
2739 SDNodeFlags Flags = Op->getFlags();
2740
2741 if (VT == MVT::f16) {
2742 // Nothing in half is a denormal when promoted to f32.
2743 assert(!isTypeLegal(VT));
2744 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src, Flags);
2745 SDValue Log = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT: MVT::f32, Operand: Ext, Flags);
2746 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Log,
2747 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
2748 }
2749
2750 auto [ScaledInput, IsLtSmallestNormal] =
2751 getScaledLogInput(DAG, SL, Src, Flags);
2752 if (!ScaledInput)
2753 return DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: Src, Flags);
2754
2755 SDValue Log2 = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: ScaledInput, Flags);
2756
2757 SDValue ThirtyTwo = DAG.getConstantFP(Val: 32.0, DL: SL, VT);
2758 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2759 SDValue ResultOffset =
2760 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLtSmallestNormal, N2: ThirtyTwo, N3: Zero);
2761 return DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: Log2, N2: ResultOffset, Flags);
2762}
2763
2764static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2765 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2766 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: Y, Flags);
2767 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Mul, N2: C, Flags);
2768}
2769
// Shared lowering for ISD::FLOG and ISD::FLOG10: compute log2(x) with the
// hardware instruction, then convert to the requested base with an
// extended-precision multiply by ln(2) (or ln(2)/ln(10)).
SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDValue X = Op.getOperand(i: 0);
  EVT VT = Op.getValueType();
  SDNodeFlags Flags = Op->getFlags();
  SDLoc DL(Op);
  const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
  assert(IsLog10 || Op.getOpcode() == ISD::FLOG);

  // Fast path: f16, or afn allows the cheap single-multiply expansion.
  if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
    // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
    // depending on !fpmath metadata.

    bool PromoteToF32 = VT == MVT::f16 && (!Flags.hasApproximateFuncs() ||
                                           !isTypeLegal(VT: MVT::f16));

    if (PromoteToF32) {
      // Log and multiply in f32 is always good enough for f16.
      X = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: X, Flags);
    }

    SDValue Lowered = LowerFLOGUnsafe(Op: X, SL: DL, DAG, IsLog10, Flags);
    if (PromoteToF32) {
      return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Lowered,
                         N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32), Flags);
    }

    return Lowered;
  }

  // Pre-scale denormal inputs into the normal range if needed; the scale is
  // compensated for at the end via IsScaled.
  SDValue ScaledInput, IsScaled;
  if (VT == MVT::f16)
    X = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: X, Flags);
  else {
    std::tie(args&: ScaledInput, args&: IsScaled) = getScaledLogInput(DAG, SL: DL, Src: X, Flags);
    if (ScaledInput)
      X = ScaledInput;
  }

  SDValue Y = DAG.getNode(Opcode: AMDGPUISD::LOG, DL, VT, Operand: X, Flags);

  SDValue R;
  if (Subtarget->hasFastFMAF32()) {
    // c+cc are ln(2)/ln(10) to more than 49 bits
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    // c + cc is ln(2) to more than 49 bits
    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    SDValue C = DAG.getConstantFP(Val: IsLog10 ? c_log10 : c_log, DL, VT);
    SDValue CC = DAG.getConstantFP(Val: IsLog10 ? cc_log10 : cc_log, DL, VT);
    // This adds correction terms for which contraction may lead to an increase
    // in the error of the approximation, so disable it.
    Flags.setAllowContract(false);
    // R = Y*C, then recover the rounding error of that product with FMA and
    // fold in the low part of the constant.
    R = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Y, N2: C, Flags);
    SDValue NegR = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: R, Flags);
    SDValue FMA0 = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: Y, N2: C, N3: NegR, Flags);
    SDValue FMA1 = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: Y, N2: CC, N3: FMA0, Flags);
    R = DAG.getNode(Opcode: ISD::FADD, DL, VT, N1: R, N2: FMA1, Flags);
  } else {
    // ch+ct is ln(2)/ln(10) to more than 36 bits
    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    // ch + ct is ln(2) to more than 36 bits
    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    SDValue CH = DAG.getConstantFP(Val: IsLog10 ? ch_log10 : ch_log, DL, VT);
    SDValue CT = DAG.getConstantFP(Val: IsLog10 ? ct_log10 : ct_log, DL, VT);

    // Split Y into a high part with the low 12 mantissa bits cleared plus a
    // low-order remainder, so partial products with the split constant stay
    // accurate without FMA.
    SDValue YAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Y);
    SDValue MaskConst = DAG.getConstant(Val: 0xfffff000, DL, VT: MVT::i32);
    SDValue YHInt = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: YAsInt, N2: MaskConst);
    SDValue YH = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: YHInt);
    SDValue YT = DAG.getNode(Opcode: ISD::FSUB, DL, VT, N1: Y, N2: YH, Flags);
    // This adds correction terms for which contraction may lead to an increase
    // in the error of the approximation, so disable it.
    Flags.setAllowContract(false);
    SDValue YTCT = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: YT, N2: CT, Flags);
    SDValue Mad0 = getMad(DAG, SL: DL, VT, X: YH, Y: CT, C: YTCT, Flags);
    SDValue Mad1 = getMad(DAG, SL: DL, VT, X: YT, Y: CH, C: Mad0, Flags);
    R = getMad(DAG, SL: DL, VT, X: YH, Y: CH, C: Mad1);
  }

  const bool IsFiniteOnly = Flags.hasNoNaNs() && Flags.hasNoInfs();

  // TODO: Check if known finite from source value.
  if (!IsFiniteOnly) {
    // For non-finite Y pass the raw hardware log result through instead of
    // the corrected value.
    SDValue IsFinite = getIsFinite(DAG, Src: Y, Flags);
    R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsFinite, N2: R, N3: Y, Flags);
  }

  if (IsScaled) {
    // Undo the 2^32 input scaling by subtracting 32*log(2) in the target
    // base (32*log10(2) or 32*ln(2)).
    SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL, VT);
    SDValue ShiftK =
        DAG.getConstantFP(Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
    SDValue Shift =
        DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsScaled, N2: ShiftK, N3: Zero, Flags);
    R = DAG.getNode(Opcode: ISD::FSUB, DL, VT, N1: R, N2: Shift, Flags);
  }

  return R;
}
2876
// log10 shares the generic log expansion; LowerFLOGCommon selects the base
// from the node's opcode.
SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
  return LowerFLOGCommon(Op, DAG);
}
2880
// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
// promote f16 operation.
SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
                                              SelectionDAG &DAG, bool IsLog10,
                                              SDNodeFlags Flags) const {
  EVT VT = Src.getValueType();
  // f32 can use the hardware log2 directly; other types fall back to the
  // generic FLOG2 node.
  unsigned LogOp =
      VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;

  // log(x) = log2(x) * ln(2); log10(x) = log2(x) * (ln(2)/ln(10)).
  double Log2BaseInverted =
      IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;

  if (VT == MVT::f32) {
    auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
    if (ScaledInput) {
      // The input was pre-scaled by 2^32 for denormal handling; compensate by
      // subtracting 32 * log_b(2) from the converted result.
      SDValue LogSrc = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: ScaledInput, Flags);
      SDValue ScaledResultOffset =
          DAG.getConstantFP(Val: -32.0 * Log2BaseInverted, DL: SL, VT);

      SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL: SL, VT);

      SDValue ResultOffset = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsScaled,
                                         N2: ScaledResultOffset, N3: Zero, Flags);

      SDValue Log2Inv = DAG.getConstantFP(Val: Log2BaseInverted, DL: SL, VT);

      // Fold the base conversion and offset into one FMA when it is fast.
      if (Subtarget->hasFastFMAF32())
        return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: LogSrc, N2: Log2Inv, N3: ResultOffset,
                           Flags);
      SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LogSrc, N2: Log2Inv, Flags);
      return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Mul, N2: ResultOffset);
    }
  }

  SDValue Log2Operand = DAG.getNode(Opcode: LogOp, DL: SL, VT, Operand: Src, Flags);
  SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Val: Log2BaseInverted, DL: SL, VT);

  return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Log2Operand, N2: Log2BaseInvertedOperand,
                     Flags);
}
2921
2922SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2923 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2924 // If we have to handle denormals, scale up the input and adjust the result.
2925
2926 SDLoc SL(Op);
2927 EVT VT = Op.getValueType();
2928 SDValue Src = Op.getOperand(i: 0);
2929 SDNodeFlags Flags = Op->getFlags();
2930
2931 if (VT == MVT::f16) {
2932 // Nothing in half is a denormal when promoted to f32.
2933 assert(!isTypeLegal(MVT::f16));
2934 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src, Flags);
2935 SDValue Log = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT: MVT::f32, Operand: Ext, Flags);
2936 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Log,
2937 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
2938 }
2939
2940 assert(VT == MVT::f32);
2941
2942 if (!needsDenormHandlingF32(DAG, Src, Flags))
2943 return DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT: MVT::f32, Operand: Src, Flags);
2944
2945 // bool needs_scaling = x < -0x1.f80000p+6f;
2946 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2947
2948 // -nextafter(128.0, -1)
2949 SDValue RangeCheckConst = DAG.getConstantFP(Val: -0x1.f80000p+6f, DL: SL, VT);
2950
2951 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2952
2953 SDValue NeedsScaling =
2954 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: RangeCheckConst, Cond: ISD::SETOLT);
2955
2956 SDValue SixtyFour = DAG.getConstantFP(Val: 0x1.0p+6f, DL: SL, VT);
2957 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2958
2959 SDValue AddOffset =
2960 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: SixtyFour, N3: Zero);
2961
2962 SDValue AddInput = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Src, N2: AddOffset, Flags);
2963 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: AddInput, Flags);
2964
2965 SDValue TwoExpNeg64 = DAG.getConstantFP(Val: 0x1.0p-64f, DL: SL, VT);
2966 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2967 SDValue ResultScale =
2968 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: TwoExpNeg64, N3: One);
2969
2970 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2, N2: ResultScale, Flags);
2971}
2972
2973SDValue AMDGPUTargetLowering::lowerFEXPUnsafeImpl(SDValue X, const SDLoc &SL,
2974 SelectionDAG &DAG,
2975 SDNodeFlags Flags,
2976 bool IsExp10) const {
2977 // exp(x) -> exp2(M_LOG2E_F * x);
2978 // exp10(x) -> exp2(log2(10) * x);
2979 EVT VT = X.getValueType();
2980 SDValue Const =
2981 DAG.getConstantFP(Val: IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, DL: SL, VT);
2982
2983 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: Const, Flags);
2984 return DAG.getNode(Opcode: VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2985 : (unsigned)ISD::FEXP2,
2986 DL: SL, VT, Operand: Mul, Flags);
2987}
2988
2989SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2990 SelectionDAG &DAG,
2991 SDNodeFlags Flags) const {
2992 EVT VT = X.getValueType();
2993 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, Src: X, Flags))
2994 return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);
2995
2996 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2997
2998 SDValue Threshold = DAG.getConstantFP(Val: -0x1.5d58a0p+6f, DL: SL, VT);
2999 SDValue NeedsScaling = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: Threshold, Cond: ISD::SETOLT);
3000
3001 SDValue ScaleOffset = DAG.getConstantFP(Val: 0x1.0p+6f, DL: SL, VT);
3002
3003 SDValue ScaledX = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: X, N2: ScaleOffset, Flags);
3004
3005 SDValue AdjustedX =
3006 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: ScaledX, N3: X);
3007
3008 const SDValue Log2E = DAG.getConstantFP(Val: numbers::log2e, DL: SL, VT);
3009 SDValue ExpInput = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: Log2E, Flags);
3010
3011 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: ExpInput, Flags);
3012
3013 SDValue ResultScaleFactor = DAG.getConstantFP(Val: 0x1.969d48p-93f, DL: SL, VT);
3014 SDValue AdjustedResult =
3015 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2, N2: ResultScaleFactor, Flags);
3016
3017 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: AdjustedResult, N3: Exp2,
3018 Flags);
3019}
3020
/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
/// handled correctly.
///
/// There is no hardware exp10, so 10^x is decomposed as
///   10^x = 2^(x * log2(10)) = exp2(x * K0) * exp2(x * K1)
/// where K0 + K1 is a two-term split of log2(10): K0 carries the high bits
/// and K1 the low-order correction, so the product keeps more precision than
/// a single f32 multiply by log2(10) would.
SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
                                                SelectionDAG &DAG,
                                                SDNodeFlags Flags) const {
  const EVT VT = X.getValueType();

  // Hardware exp2 for f32, generic FEXP2 node for every other type.
  const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
                                         : static_cast<unsigned>(ISD::FEXP2);

  // Fast path: no f32 denormal results need to be produced.
  if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, Src: X, Flags)) {
    // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
    SDValue K0 = DAG.getConstantFP(Val: 0x1.a92000p+1f, DL: SL, VT);
    SDValue K1 = DAG.getConstantFP(Val: 0x1.4f0978p-11f, DL: SL, VT);

    SDValue Mul0 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: K0, Flags);
    SDValue Exp2_0 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul0, Flags);
    SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: K1, Flags);
    SDValue Exp2_1 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul1, Flags);
    return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2_0, N2: Exp2_1);
  }

  // Denormal-producing range: shift small inputs up by 32 before the exp2,
  // then scale the result back down by 10^-32.
  //
  // bool s = x < -0x1.2f7030p+5f;
  // x += s ? 0x1.0p+5f : 0.0f;
  // exp10 = exp2(x * 0x1.a92000p+1f) *
  //         exp2(x * 0x1.4f0978p-11f) *
  //         (s ? 0x1.9f623ep-107f : 1.0f);

  EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);

  SDValue Threshold = DAG.getConstantFP(Val: -0x1.2f7030p+5f, DL: SL, VT);
  SDValue NeedsScaling = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: Threshold, Cond: ISD::SETOLT);

  SDValue ScaleOffset = DAG.getConstantFP(Val: 0x1.0p+5f, DL: SL, VT);
  SDValue ScaledX = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: X, N2: ScaleOffset, Flags);
  SDValue AdjustedX =
      DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: ScaledX, N3: X);

  SDValue K0 = DAG.getConstantFP(Val: 0x1.a92000p+1f, DL: SL, VT);
  SDValue K1 = DAG.getConstantFP(Val: 0x1.4f0978p-11f, DL: SL, VT);

  SDValue Mul0 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: K0, Flags);
  SDValue Exp2_0 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul0, Flags);
  SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: K1, Flags);
  SDValue Exp2_1 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul1, Flags);

  SDValue MulExps = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2_0, N2: Exp2_1, Flags);

  // 0x1.9f623ep-107f = 10^-32, compensating for the +32 input shift.
  SDValue ResultScaleFactor = DAG.getConstantFP(Val: 0x1.9f623ep-107f, DL: SL, VT);
  SDValue AdjustedResult =
      DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: MulExps, N2: ResultScaleFactor, Flags);

  return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: AdjustedResult, N3: MulExps,
                     Flags);
}
3076
/// Lower ISD::FEXP / ISD::FEXP10 for f16 and f32.
///
/// Without approximate-function fast math, f32 uses an extended-precision
/// argument reduction (described in the block comment below) followed by the
/// hardware exp2, plus explicit underflow-to-zero and (unless nnan/ninf-style
/// flags allow otherwise) overflow-to-inf selects.
SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc SL(Op);
  SDValue X = Op.getOperand(i: 0);
  SDNodeFlags Flags = Op->getFlags();
  const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;

  // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
  // library behavior. Also, is known-not-daz source sufficient?
  if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
    return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
                   : lowerFEXPUnsafe(X, SL, DAG, Flags);
  }

  if (VT.getScalarType() == MVT::f16) {
    // Vector f16 is left for generic handling (scalarization).
    if (VT.isVector())
      return SDValue();

    // Nothing in half is a denormal when promoted to f32.
    //
    // exp(f16 x) ->
    //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))
    //
    // exp10(f16 x) ->
    //   fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
    SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: X, Flags);
    SDValue Lowered = lowerFEXPUnsafeImpl(X: Ext, SL, DAG, Flags, IsExp10);
    return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Lowered,
                       N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
  }

  assert(VT == MVT::f32);

  // Algorithm:
  //
  // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
  //
  // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
  // n = 64*m + j,   0 <= j < 64
  //
  // e^x = 2^((64*m + j + f)/64)
  //     = (2^m) * (2^(j/64)) * 2^(f/64)
  //     = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
  //
  // f = x*(64/ln(2)) - n
  // r = f*(ln(2)/64) = x - n*(ln(2)/64)
  //
  // e^x = (2^m) * (2^(j/64)) * e^r
  //
  // (2^(j/64)) is precomputed
  //
  // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  // e^r = 1 + q
  //
  // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  //
  // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
  SDNodeFlags FlagsNoContract = Flags;
  FlagsNoContract.setAllowContract(false);

  // PH + PL is an extended-precision approximation of x * log2(base):
  // PH holds the high part of the product, PL the low-order correction.
  SDValue PH, PL;
  if (Subtarget->hasFastFMAF32()) {
    // FMA path: the constant is split as c + cc, and the product error of
    // x*c is recovered exactly with fma(x, c, -PH).
    const float c_exp = numbers::log2ef;
    const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    SDValue C = DAG.getConstantFP(Val: IsExp10 ? c_exp10 : c_exp, DL: SL, VT);
    SDValue CC = DAG.getConstantFP(Val: IsExp10 ? cc_exp10 : cc_exp, DL: SL, VT);

    PH = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: C, Flags);
    SDValue NegPH = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: PH, Flags);
    SDValue FMA0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: X, N2: C, N3: NegPH, Flags);
    PL = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: X, N2: CC, N3: FMA0, Flags);
  } else {
    // No fast FMA: split both x (by masking off the low 12 mantissa bits)
    // and the constant (ch + cl) so all partial products are exact or small.
    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    SDValue CH = DAG.getConstantFP(Val: IsExp10 ? ch_exp10 : ch_exp, DL: SL, VT);
    SDValue CL = DAG.getConstantFP(Val: IsExp10 ? cl_exp10 : cl_exp, DL: SL, VT);

    SDValue XAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: X);
    SDValue MaskConst = DAG.getConstant(Val: 0xfffff000, DL: SL, VT: MVT::i32);
    SDValue XHAsInt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: XAsInt, N2: MaskConst);
    SDValue XH = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: XHAsInt);
    SDValue XL = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: X, N2: XH, Flags);

    PH = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: XH, N2: CH, Flags);

    SDValue XLCL = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: XL, N2: CL, Flags);
    SDValue Mad0 = getMad(DAG, SL, VT, X: XL, Y: CH, C: XLCL, Flags);
    PL = getMad(DAG, SL, VT, X: XH, Y: CL, C: Mad0, Flags);
  }

  // Integer part of the exponent (round to nearest even).
  SDValue E = DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SL, VT, Operand: PH, Flags);

  // It is unsafe to contract this fsub into the PH multiply.
  SDValue PHSubE = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: PH, N2: E, Flags: FlagsNoContract);

  // Fractional remainder A = (PH - E) + PL, then 2^A * 2^E.
  SDValue A = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: PHSubE, N2: PL, Flags);
  SDValue IntE = DAG.getNode(Opcode: ISD::FP_TO_SINT, DL: SL, VT: MVT::i32, Operand: E);
  SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: A, Flags);

  SDValue R = DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT, N1: Exp2, N2: IntE, Flags);

  // Inputs below this bound underflow to +0 (exp10/exp thresholds differ).
  SDValue UnderflowCheckConst =
      DAG.getConstantFP(Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, DL: SL, VT);

  EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
  SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
  SDValue Underflow =
      DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: UnderflowCheckConst, Cond: ISD::SETOLT);

  R = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Underflow, N2: Zero, N3: R);

  // With ninf, the overflow-to-inf clamp can be skipped entirely.
  if (!Flags.hasNoInfs()) {
    SDValue OverflowCheckConst =
        DAG.getConstantFP(Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, DL: SL, VT);
    SDValue Overflow =
        DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: OverflowCheckConst, Cond: ISD::SETOGT);
    SDValue Inf =
        DAG.getConstantFP(Val: APFloat::getInf(Sem: APFloat::IEEEsingle()), DL: SL, VT);
    R = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Overflow, N2: Inf, N3: R);
  }

  return R;
}
3207
3208static bool isCtlzOpc(unsigned Opc) {
3209 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3210}
3211
3212static bool isCttzOpc(unsigned Opc) {
3213 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3214}
3215
3216SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3217 SelectionDAG &DAG) const {
3218 auto SL = SDLoc(Op);
3219 auto Opc = Op.getOpcode();
3220 auto Arg = Op.getOperand(i: 0u);
3221 auto ResultVT = Op.getValueType();
3222
3223 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3224 return {};
3225
3226 assert(isCtlzOpc(Opc));
3227 assert(ResultVT == Arg.getValueType());
3228
3229 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3230 SDValue NumExtBits = DAG.getConstant(Val: 32u - NumBits, DL: SL, VT: MVT::i32);
3231 SDValue NewOp;
3232
3233 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3234 NewOp = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Arg);
3235 NewOp = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: NewOp, N2: NumExtBits);
3236 NewOp = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, Operand: NewOp);
3237 } else {
3238 NewOp = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Arg);
3239 NewOp = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, Operand: NewOp);
3240 NewOp = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: NewOp, N2: NumExtBits);
3241 }
3242
3243 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ResultVT, Operand: NewOp);
3244}
3245
/// Lower i32/i64 ctlz/cttz (and their _ZERO_UNDEF variants) to the AMDGPU
/// find-first-bit nodes (FFBH_U32 / FFBL_B32). For divergent i64 inputs the
/// value is split into halves, each counted in 32 bits, and the counts are
/// combined with umin; uniform i64 uses the scalar 64-bit instructions
/// directly.
SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Src = Op.getOperand(i: 0);

  assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
  bool Ctlz = isCtlzOpc(Opc: Op.getOpcode());
  unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;

  // _ZERO_UNDEF variants don't need the clamp on an all-zero input.
  bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
                   Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
  // Uniform i64 can use the scalar 64-bit find-first-bit instructions.
  bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;

  if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
    // (ctlz hi:lo) -> (umin (ffbh src), 32)
    // (cttz hi:lo) -> (umin (ffbl src), 32)
    // (ctlz_zero_undef src) -> (ffbh src)
    // (cttz_zero_undef src) -> (ffbl src)

    // 64-bit scalar version produce 32-bit result
    // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
    // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
    // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
    // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
    SDValue NewOpr = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Src);
    if (!ZeroUndef) {
      // ffbh/ffbl return -1 (all ones) on a zero input; clamp that to the
      // bit width with umin.
      const SDValue ConstVal = DAG.getConstant(
          Val: Op.getValueType().getScalarSizeInBits(), DL: SL, VT: MVT::i32);
      NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewOpr, N2: ConstVal);
    }
    return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: Src.getValueType(), Operand: NewOpr);
  }

  // Divergent i64: count each 32-bit half separately.
  SDValue Lo, Hi;
  std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);

  SDValue OprLo = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Lo);
  SDValue OprHi = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Hi);

  // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
  // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
  // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
  // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))

  // uaddsat keeps the -1 (all-ones) "half was zero" result saturated instead
  // of wrapping; the _ZERO_UNDEF variants can use a plain add.
  unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
  const SDValue Const32 = DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32);
  if (Ctlz)
    OprLo = DAG.getNode(Opcode: AddOpc, DL: SL, VT: MVT::i32, N1: OprLo, N2: Const32);
  else
    OprHi = DAG.getNode(Opcode: AddOpc, DL: SL, VT: MVT::i32, N1: OprHi, N2: Const32);

  SDValue NewOpr;
  NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: OprLo, N2: OprHi);
  if (!ZeroUndef) {
    const SDValue Const64 = DAG.getConstant(Val: 64, DL: SL, VT: MVT::i32);
    NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewOpr, N2: Const64);
  }

  return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i64, Operand: NewOpr);
}
3305
/// Lower i64 -> f32 conversion (\p Signed selects the signed variant) by
/// normalizing the 64-bit value so the significant bits fit a native 32-bit
/// conversion, then scaling the result back by the shift amount.
SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  // The regular method converting a 64-bit integer to float roughly consists of
  // 2 steps: normalization and rounding. In fact, after normalization, the
  // conversion from a 64-bit integer to a float is essentially the same as the
  // one from a 32-bit integer. The only difference is that it has more
  // trailing bits to be rounded. To leverage the native 32-bit conversion, a
  // 64-bit integer could be preprocessed and fit into a 32-bit integer then
  // converted into the correct float number. The basic steps for the unsigned
  // conversion are illustrated in the following pseudo code:
  //
  // f32 uitofp(i64 u) {
  //   i32 hi, lo = split(u);
  //   // Only count the leading zeros in hi as we have native support of the
  //   // conversion from i32 to f32. If hi is all 0s, the conversion is
  //   // reduced to a 32-bit one automatically.
  //   i32 shamt = clz(hi); // Return 32 if hi is all 0s.
  //   u <<= shamt;
  //   hi, lo = split(u);
  //   hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
  //   // convert it as a 32-bit integer and scale the result back.
  //   return uitofp(hi) * 2^(32 - shamt);
  // }
  //
  // The signed one follows the same principle but uses 'ffbh_i32' to count its
  // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
  // converted instead followed by negation based its sign bit.

  SDLoc SL(Op);
  SDValue Src = Op.getOperand(i: 0);

  SDValue Lo, Hi;
  std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
  SDValue Sign;
  SDValue ShAmt;
  if (Signed && Subtarget->isGCN()) {
    // We also need to consider the sign bit in Lo if Hi has just sign bits,
    // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
    // account. That is, the maximal shift is
    // - 32 if Lo and Hi have opposite signs;
    // - 33 if Lo and Hi have the same sign.
    //
    // Or, MaxShAmt = 33 + OppositeSign, where
    //
    // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
    // - -1 if Lo and Hi have opposite signs; and
    // -  0 otherwise.
    //
    // All in all, ShAmt is calculated as
    //
    //  umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
    //
    // or
    //
    //  umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
    //
    // to reduce the critical path.
    SDValue OppositeSign = DAG.getNode(
        Opcode: ISD::SRA, DL: SL, VT: MVT::i32, N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: Lo, N2: Hi),
        N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
    SDValue MaxShAmt =
        DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32),
                    N2: OppositeSign);
    // Count the leading sign bits.
    ShAmt = DAG.getNode(Opcode: AMDGPUISD::FFBH_I32, DL: SL, VT: MVT::i32, Operand: Hi);
    // Different from unsigned conversion, the shift should be one bit less to
    // preserve the sign bit.
    ShAmt = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: ShAmt,
                        N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
    ShAmt = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: ShAmt, N2: MaxShAmt);
  } else {
    if (Signed) {
      // Without 'ffbh_i32', only leading zeros could be counted. Take the
      // absolute value first.
      Sign = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i64, N1: Src,
                         N2: DAG.getConstant(Val: 63, DL: SL, VT: MVT::i64));
      // abs(x) computed as (x + sign) ^ sign, where sign is all 0s or all 1s.
      SDValue Abs =
          DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i64,
                      N1: DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i64, N1: Src, N2: Sign), N2: Sign);
      std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Abs, DAG);
    }
    // Count the leading zeros.
    ShAmt = DAG.getNode(Opcode: ISD::CTLZ, DL: SL, VT: MVT::i32, Operand: Hi);
    // The shift amount for signed integers is [0, 32].
  }
  // Normalize the given 64-bit integer.
  SDValue Norm = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i64, N1: Src, N2: ShAmt);
  // Split it again.
  std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Norm, DAG);
  // Calculate the adjust bit for rounding.
  // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
  SDValue Adjust = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32,
                               N1: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32), N2: Lo);
  // Get the 32-bit normalized integer.
  Norm = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Hi, N2: Adjust);
  // Convert the normalized 32-bit integer into f32.

  bool UseLDEXP = isOperationLegal(Op: ISD::FLDEXP, VT: MVT::f32);
  unsigned Opc = Signed && UseLDEXP ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
  SDValue FVal = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::f32, Operand: Norm);

  // Finally, need to scale back the converted floating number as the original
  // 64-bit integer is converted as a 32-bit one.
  ShAmt = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32),
                      N2: ShAmt);
  // On GCN, use LDEXP directly.
  if (UseLDEXP)
    return DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT: MVT::f32, N1: FVal, N2: ShAmt);

  // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
  // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
  // exponent is enough to avoid overflowing into the sign bit.
  SDValue Exp = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: ShAmt,
                            N2: DAG.getConstant(Val: 23, DL: SL, VT: MVT::i32));
  SDValue IVal =
      DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32,
                  N1: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: FVal), N2: Exp);
  if (Signed) {
    // Set the sign bit.
    Sign = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32,
                       N1: DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Sign),
                       N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
    IVal = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: IVal, N2: Sign);
  }
  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: IVal);
}
3432
3433SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3434 bool Signed) const {
3435 SDLoc SL(Op);
3436 SDValue Src = Op.getOperand(i: 0);
3437
3438 SDValue Lo, Hi;
3439 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3440
3441 SDValue CvtHi = DAG.getNode(Opcode: Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3442 DL: SL, VT: MVT::f64, Operand: Hi);
3443
3444 SDValue CvtLo = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL: SL, VT: MVT::f64, Operand: Lo);
3445
3446 SDValue LdExp = DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT: MVT::f64, N1: CvtHi,
3447 N2: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32));
3448 // TODO: Should this propagate fast-math-flags?
3449 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: LdExp, N2: CvtLo);
3450}
3451
3452SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3453 SelectionDAG &DAG) const {
3454 // TODO: Factor out code common with LowerSINT_TO_FP.
3455 EVT DestVT = Op.getValueType();
3456 SDValue Src = Op.getOperand(i: 0);
3457 EVT SrcVT = Src.getValueType();
3458
3459 if (SrcVT == MVT::i16) {
3460 if (DestVT == MVT::f16)
3461 return Op;
3462 SDLoc DL(Op);
3463
3464 // Promote src to i32
3465 SDValue Ext = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: Src);
3466 return DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: DestVT, Operand: Ext);
3467 }
3468
3469 if (DestVT == MVT::bf16) {
3470 SDLoc SL(Op);
3471 SDValue ToF32 = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL: SL, VT: MVT::f32, Operand: Src);
3472 SDValue FPRoundFlag = DAG.getIntPtrConstant(Val: 0, DL: SL, /*isTarget=*/true);
3473 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::bf16, N1: ToF32, N2: FPRoundFlag);
3474 }
3475
3476 if (SrcVT != MVT::i64)
3477 return Op;
3478
3479 if (DestVT == MVT::f16 && isTypeLegal(VT: MVT::f16)) {
3480 SDLoc DL(Op);
3481
3482 SDValue IntToFp32 = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: MVT::f32, Operand: Src);
3483 SDValue FPRoundFlag =
3484 DAG.getIntPtrConstant(Val: 0, DL: SDLoc(Op), /*isTarget=*/true);
3485 SDValue FPRound =
3486 DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: IntToFp32, N2: FPRoundFlag);
3487
3488 return FPRound;
3489 }
3490
3491 if (DestVT == MVT::f32)
3492 return LowerINT_TO_FP32(Op, DAG, Signed: false);
3493
3494 assert(DestVT == MVT::f64);
3495 return LowerINT_TO_FP64(Op, DAG, Signed: false);
3496}
3497
3498SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3499 SelectionDAG &DAG) const {
3500 EVT DestVT = Op.getValueType();
3501
3502 SDValue Src = Op.getOperand(i: 0);
3503 EVT SrcVT = Src.getValueType();
3504
3505 if (SrcVT == MVT::i16) {
3506 if (DestVT == MVT::f16)
3507 return Op;
3508
3509 SDLoc DL(Op);
3510 // Promote src to i32
3511 SDValue Ext = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: MVT::i32, Operand: Src);
3512 return DAG.getNode(Opcode: ISD::SINT_TO_FP, DL, VT: DestVT, Operand: Ext);
3513 }
3514
3515 if (DestVT == MVT::bf16) {
3516 SDLoc SL(Op);
3517 SDValue ToF32 = DAG.getNode(Opcode: ISD::SINT_TO_FP, DL: SL, VT: MVT::f32, Operand: Src);
3518 SDValue FPRoundFlag = DAG.getIntPtrConstant(Val: 0, DL: SL, /*isTarget=*/true);
3519 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::bf16, N1: ToF32, N2: FPRoundFlag);
3520 }
3521
3522 if (SrcVT != MVT::i64)
3523 return Op;
3524
3525 // TODO: Factor out code common with LowerUINT_TO_FP.
3526
3527 if (DestVT == MVT::f16 && isTypeLegal(VT: MVT::f16)) {
3528 SDLoc DL(Op);
3529 SDValue Src = Op.getOperand(i: 0);
3530
3531 SDValue IntToFp32 = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: MVT::f32, Operand: Src);
3532 SDValue FPRoundFlag =
3533 DAG.getIntPtrConstant(Val: 0, DL: SDLoc(Op), /*isTarget=*/true);
3534 SDValue FPRound =
3535 DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: IntToFp32, N2: FPRoundFlag);
3536
3537 return FPRound;
3538 }
3539
3540 if (DestVT == MVT::f32)
3541 return LowerINT_TO_FP32(Op, DAG, Signed: true);
3542
3543 assert(DestVT == MVT::f64);
3544 return LowerINT_TO_FP64(Op, DAG, Signed: true);
3545}
3546
/// Lower f32/f64 -> i64 conversion (\p Signed selects fptosi vs fptoui) by
/// splitting the truncated value into two 32-bit halves that each fit a
/// native 32-bit fp-to-int conversion.
SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
                                               bool Signed) const {
  SDLoc SL(Op);

  SDValue Src = Op.getOperand(i: 0);
  EVT SrcVT = Src.getValueType();

  assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //     tf := trunc(val);
  //    hif := floor(tf * 2^-32);
  //    lof := tf - hif * 2^32; // lof is always positive due to floor.
  //     hi := fptoi(hif);
  //     lo := fptoi(lof);
  //
  SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: SrcVT, Operand: Src);
  SDValue Sign;
  if (Signed && SrcVT == MVT::f32) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, We need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    //
    // Arithmetic-shift the sign bit down to get all-0s or all-1s.
    Sign = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i32,
                       N1: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Trunc),
                       N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
    Trunc = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: SrcVT, Operand: Trunc);
  }

  // K0 = 2^-32 and K1 = -2^32 in the source type, built from exact bit
  // patterns so the constants are precise.
  SDValue K0, K1;
  if (SrcVT == MVT::f64) {
    K0 = DAG.getConstantFP(
        Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), DL: SL,
        VT: SrcVT);
    K1 = DAG.getConstantFP(
        Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), DL: SL,
        VT: SrcVT);
  } else {
    K0 = DAG.getConstantFP(
        Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), DL: SL, VT: SrcVT);
    K1 = DAG.getConstantFP(
        Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), DL: SL, VT: SrcVT);
  }
  // TODO: Should this propagate fast-math-flags?
  SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: SrcVT, N1: Trunc, N2: K0);

  SDValue FloorMul = DAG.getNode(Opcode: ISD::FFLOOR, DL: SL, VT: SrcVT, Operand: Mul);

  // lof = fma(hif, -2^32, tf) = tf - hif * 2^32.
  SDValue Fma = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: SrcVT, N1: FloorMul, N2: K1, N3: Trunc);

  SDValue Hi = DAG.getNode(Opcode: (Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
                                                             : ISD::FP_TO_UINT,
                           DL: SL, VT: MVT::i32, Operand: FloorMul);
  SDValue Lo = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL: SL, VT: MVT::i32, Operand: Fma);

  SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64,
                               Operand: DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Lo, Hi}));

  if (Signed && SrcVT == MVT::f32) {
    assert(Sign);
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64,
                       Operand: DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Sign, Sign}));
    // r := xor(r, sign) - sign;
    Result =
        DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i64,
                    N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i64, N1: Result, N2: Sign), N2: Sign);
  }

  return Result;
}
3621
3622SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3623 SDLoc DL(Op);
3624 SDValue N0 = Op.getOperand(i: 0);
3625
3626 // Convert to target node to get known bits
3627 if (N0.getValueType() == MVT::f32)
3628 return DAG.getNode(Opcode: AMDGPUISD::FP_TO_FP16, DL, VT: Op.getValueType(), Operand: N0);
3629
3630 if (Op->getFlags().hasApproximateFuncs()) {
3631 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3632 return SDValue();
3633 }
3634
3635 return LowerF64ToF16Safe(Src: N0, DL, DAG);
3636}
3637
3638// return node in i32
3639SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL,
3640 SelectionDAG &DAG) const {
3641 assert(Src.getSimpleValueType() == MVT::f64);
3642
3643 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3644 // TODO: We can generate better code for True16.
3645 const unsigned ExpMask = 0x7ff;
3646 const unsigned ExpBiasf64 = 1023;
3647 const unsigned ExpBiasf16 = 15;
3648 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
3649 SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
3650 SDValue U = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: Src);
3651 SDValue UH = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64, N1: U,
3652 N2: DAG.getConstant(Val: 32, DL, VT: MVT::i64));
3653 UH = DAG.getZExtOrTrunc(Op: UH, DL, VT: MVT::i32);
3654 U = DAG.getZExtOrTrunc(Op: U, DL, VT: MVT::i32);
3655 SDValue E = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3656 N2: DAG.getConstant(Val: 20, DL, VT: MVT::i64));
3657 E = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: E,
3658 N2: DAG.getConstant(Val: ExpMask, DL, VT: MVT::i32));
3659 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3660 // add the f16 bias (15) to get the biased exponent for the f16 format.
3661 E = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: E,
3662 N2: DAG.getConstant(Val: -ExpBiasf64 + ExpBiasf16, DL, VT: MVT::i32));
3663
3664 SDValue M = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3665 N2: DAG.getConstant(Val: 8, DL, VT: MVT::i32));
3666 M = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: M,
3667 N2: DAG.getConstant(Val: 0xffe, DL, VT: MVT::i32));
3668
3669 SDValue MaskedSig = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: UH,
3670 N2: DAG.getConstant(Val: 0x1ff, DL, VT: MVT::i32));
3671 MaskedSig = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: MaskedSig, N2: U);
3672
3673 SDValue Lo40Set = DAG.getSelectCC(DL, LHS: MaskedSig, RHS: Zero, True: Zero, False: One, Cond: ISD::SETEQ);
3674 M = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M, N2: Lo40Set);
3675
3676 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3677 SDValue I = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32,
3678 N1: DAG.getSelectCC(DL, LHS: M, RHS: Zero, True: DAG.getConstant(Val: 0x0200, DL, VT: MVT::i32),
3679 False: Zero, Cond: ISD::SETNE), N2: DAG.getConstant(Val: 0x7c00, DL, VT: MVT::i32));
3680
3681 // N = M | (E << 12);
3682 SDValue N = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M,
3683 N2: DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: E,
3684 N2: DAG.getConstant(Val: 12, DL, VT: MVT::i32)));
3685
3686 // B = clamp(1-E, 0, 13);
3687 SDValue OneSubExp = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i32,
3688 N1: One, N2: E);
3689 SDValue B = DAG.getNode(Opcode: ISD::SMAX, DL, VT: MVT::i32, N1: OneSubExp, N2: Zero);
3690 B = DAG.getNode(Opcode: ISD::SMIN, DL, VT: MVT::i32, N1: B,
3691 N2: DAG.getConstant(Val: 13, DL, VT: MVT::i32));
3692
3693 SDValue SigSetHigh = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M,
3694 N2: DAG.getConstant(Val: 0x1000, DL, VT: MVT::i32));
3695
3696 SDValue D = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: SigSetHigh, N2: B);
3697 SDValue D0 = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: D, N2: B);
3698 SDValue D1 = DAG.getSelectCC(DL, LHS: D0, RHS: SigSetHigh, True: One, False: Zero, Cond: ISD::SETNE);
3699 D = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: D, N2: D1);
3700
3701 SDValue V = DAG.getSelectCC(DL, LHS: E, RHS: One, True: D, False: N, Cond: ISD::SETLT);
3702 SDValue VLow3 = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: V,
3703 N2: DAG.getConstant(Val: 0x7, DL, VT: MVT::i32));
3704 V = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: V,
3705 N2: DAG.getConstant(Val: 2, DL, VT: MVT::i32));
3706 SDValue V0 = DAG.getSelectCC(DL, LHS: VLow3, RHS: DAG.getConstant(Val: 3, DL, VT: MVT::i32),
3707 True: One, False: Zero, Cond: ISD::SETEQ);
3708 SDValue V1 = DAG.getSelectCC(DL, LHS: VLow3, RHS: DAG.getConstant(Val: 5, DL, VT: MVT::i32),
3709 True: One, False: Zero, Cond: ISD::SETGT);
3710 V1 = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: V0, N2: V1);
3711 V = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: V, N2: V1);
3712
3713 V = DAG.getSelectCC(DL, LHS: E, RHS: DAG.getConstant(Val: 30, DL, VT: MVT::i32),
3714 True: DAG.getConstant(Val: 0x7c00, DL, VT: MVT::i32), False: V, Cond: ISD::SETGT);
3715 V = DAG.getSelectCC(DL, LHS: E, RHS: DAG.getConstant(Val: 1039, DL, VT: MVT::i32),
3716 True: I, False: V, Cond: ISD::SETEQ);
3717
3718 // Extract the sign bit.
3719 SDValue Sign = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3720 N2: DAG.getConstant(Val: 16, DL, VT: MVT::i32));
3721 Sign = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: Sign,
3722 N2: DAG.getConstant(Val: 0x8000, DL, VT: MVT::i32));
3723
3724 return DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Sign, N2: V);
3725}
3726
3727SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3728 SelectionDAG &DAG) const {
3729 SDValue Src = Op.getOperand(i: 0);
3730 unsigned OpOpcode = Op.getOpcode();
3731 EVT SrcVT = Src.getValueType();
3732 EVT DestVT = Op.getValueType();
3733
3734 // Will be selected natively
3735 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3736 return Op;
3737
3738 if (SrcVT == MVT::bf16) {
3739 SDLoc DL(Op);
3740 SDValue PromotedSrc = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: Src);
3741 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DestVT, Operand: PromotedSrc);
3742 }
3743
3744 // Promote i16 to i32
3745 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3746 SDLoc DL(Op);
3747
3748 SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, Operand: Src);
3749 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToInt32);
3750 }
3751
3752 if (DestVT != MVT::i64)
3753 return Op;
3754
3755 if (SrcVT == MVT::f16 ||
3756 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3757 SDLoc DL(Op);
3758
3759 SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, Operand: Src);
3760 unsigned Ext =
3761 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3762 return DAG.getNode(Opcode: Ext, DL, VT: MVT::i64, Operand: FpToInt32);
3763 }
3764
3765 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3766 return LowerFP_TO_INT64(Op, DAG, Signed: OpOpcode == ISD::FP_TO_SINT);
3767
3768 return SDValue();
3769}
3770
3771SDValue AMDGPUTargetLowering::LowerFP_TO_INT_SAT(const SDValue Op,
3772 SelectionDAG &DAG) const {
3773 SDValue Src = Op.getOperand(i: 0);
3774 unsigned OpOpcode = Op.getOpcode();
3775 EVT SrcVT = Src.getValueType();
3776 EVT DstVT = Op.getValueType();
3777 SDValue SatVTOp = Op.getNode()->getOperand(Num: 1);
3778 EVT SatVT = cast<VTSDNode>(Val&: SatVTOp)->getVT();
3779 SDLoc DL(Op);
3780
3781 uint64_t DstWidth = DstVT.getScalarSizeInBits();
3782 uint64_t SatWidth = SatVT.getScalarSizeInBits();
3783 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
3784
3785 // Will be selected natively
3786 if (DstVT == MVT::i32 && SatWidth == DstWidth &&
3787 (SrcVT == MVT::f32 || SrcVT == MVT::f64))
3788 return Op;
3789
3790 const SDValue Int32VT = DAG.getValueType(MVT::i32);
3791
3792 // Perform all saturation at i32 and truncate
3793 if (SatWidth < DstWidth) {
3794 const uint64_t Int32Width = 32;
3795 SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, N1: Src, N2: Int32VT);
3796 SDValue Int32SatVal;
3797
3798 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
3799 SDValue MinConst = DAG.getConstant(
3800 Val: APInt::getSignedMaxValue(numBits: SatWidth).sext(width: Int32Width), DL, VT: MVT::i32);
3801 SDValue MaxConst = DAG.getConstant(
3802 Val: APInt::getSignedMinValue(numBits: SatWidth).sext(width: Int32Width), DL, VT: MVT::i32);
3803 SDValue MinVal =
3804 DAG.getNode(Opcode: ISD::SMIN, DL, VT: MVT::i32, N1: FpToInt32, N2: MinConst);
3805 Int32SatVal = DAG.getNode(Opcode: ISD::SMAX, DL, VT: MVT::i32, N1: MinVal, N2: MaxConst);
3806 } else {
3807 SDValue MinConst = DAG.getConstant(
3808 Val: APInt::getMaxValue(numBits: SatWidth).zext(width: Int32Width), DL, VT: MVT::i32);
3809 Int32SatVal = DAG.getNode(Opcode: ISD::UMIN, DL, VT: MVT::i32, N1: FpToInt32, N2: MinConst);
3810 }
3811
3812 if (DstWidth == Int32Width)
3813 return Int32SatVal;
3814 if (DstWidth < Int32Width)
3815 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: Int32SatVal);
3816
3817 // DstWidth > Int32Width
3818 const unsigned Ext =
3819 OpOpcode == ISD::FP_TO_SINT_SAT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3820 return DAG.getNode(Opcode: Ext, DL, VT: DstVT, Operand: FpToInt32);
3821 }
3822
3823 // SatWidth == DstWidth
3824
3825 // Saturate at i32 for i64 dst and 16b src (will invoke f16 promotion below)
3826 if (DstVT == MVT::i64 &&
3827 (SrcVT == MVT::f16 || SrcVT == MVT::bf16 ||
3828 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP))) {
3829 return DAG.getNode(Opcode: OpOpcode, DL, VT: DstVT, N1: Src, N2: Int32VT);
3830 }
3831
3832 // Promote f16/bf16 src to f32
3833 if (SrcVT == MVT::f16 || SrcVT == MVT::bf16) {
3834 SDValue PromotedSrc = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: Src);
3835 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: PromotedSrc, N2: SatVTOp);
3836 }
3837
3838 // Promote sub-i32 dst to i32 with sub-i32 saturation
3839 if (DstWidth < 32) {
3840 // Note: this triggers SatWidth < DstWidth above to generate saturated
3841 // truncate by requesting MVT::i32 destination with SatWidth < 32.
3842 SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, N1: Src, N2: SatVTOp);
3843 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: FpToInt32);
3844 }
3845
3846 // TODO: can we implement i64 dst for f32/f64?
3847
3848 return SDValue();
3849}
3850
3851SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3852 SelectionDAG &DAG) const {
3853 EVT ExtraVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
3854 MVT VT = Op.getSimpleValueType();
3855 MVT ScalarVT = VT.getScalarType();
3856
3857 assert(VT.isVector());
3858
3859 SDValue Src = Op.getOperand(i: 0);
3860 SDLoc DL(Op);
3861
3862 // TODO: Don't scalarize on Evergreen?
3863 unsigned NElts = VT.getVectorNumElements();
3864 SmallVector<SDValue, 8> Args;
3865 DAG.ExtractVectorElements(Op: Src, Args, Start: 0, Count: NElts);
3866
3867 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3868 for (unsigned I = 0; I < NElts; ++I)
3869 Args[I] = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: ScalarVT, N1: Args[I], N2: VTOp);
3870
3871 return DAG.getBuildVector(VT, DL, Ops: Args);
3872}
3873
3874//===----------------------------------------------------------------------===//
3875// Custom DAG optimizations
3876//===----------------------------------------------------------------------===//
3877
3878static bool isU24(SDValue Op, SelectionDAG &DAG) {
3879 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3880}
3881
3882static bool isI24(SDValue Op, SelectionDAG &DAG) {
3883 EVT VT = Op.getValueType();
3884 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3885 // as unsigned 24-bit values.
3886 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3887}
3888
// Shrink the operands of a 24-bit multiply (an AMDGPUISD MUL*_24/MULHI_*24
// node or one of the corresponding amdgcn intrinsics): only the low 24 bits
// of each operand are demanded, so bits above that can be simplified away.
// Intrinsic forms are rewritten to the equivalent AMDGPUISD nodes.
static SDValue simplifyMul24(SDNode *Node24,
                             TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;

  // Intrinsic nodes carry the intrinsic ID as operand 0, so the multiply
  // operands are offset by one.
  SDValue LHS = IsIntrin ? Node24->getOperand(Num: 1) : Node24->getOperand(Num: 0);
  SDValue RHS = IsIntrin ? Node24->getOperand(Num: 2) : Node24->getOperand(Num: 1);
  unsigned NewOpcode = Node24->getOpcode();
  if (IsIntrin) {
    // Map the intrinsic ID onto the equivalent target node.
    unsigned IID = Node24->getConstantOperandVal(Num: 0);
    switch (IID) {
    case Intrinsic::amdgcn_mul_i24:
      NewOpcode = AMDGPUISD::MUL_I24;
      break;
    case Intrinsic::amdgcn_mul_u24:
      NewOpcode = AMDGPUISD::MUL_U24;
      break;
    case Intrinsic::amdgcn_mulhi_i24:
      NewOpcode = AMDGPUISD::MULHI_I24;
      break;
    case Intrinsic::amdgcn_mulhi_u24:
      NewOpcode = AMDGPUISD::MULHI_U24;
      break;
    default:
      llvm_unreachable("Expected 24-bit mul intrinsic");
    }
  }

  // Only the low 24 bits of each operand contribute to the result.
  APInt Demanded = APInt::getLowBitsSet(numBits: LHS.getValueSizeInBits(), loBitsSet: 24);

  // First try to simplify using SimplifyMultipleUseDemandedBits which allows
  // the operands to have other uses, but will only perform simplifications that
  // involve bypassing some nodes for this user.
  SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(Op: LHS, DemandedBits: Demanded, DAG);
  SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(Op: RHS, DemandedBits: Demanded, DAG);
  if (DemandedLHS || DemandedRHS)
    return DAG.getNode(Opcode: NewOpcode, DL: SDLoc(Node24), VTList: Node24->getVTList(),
                       N1: DemandedLHS ? DemandedLHS : LHS,
                       N2: DemandedRHS ? DemandedRHS : RHS);

  // Now try SimplifyDemandedBits which can simplify the nodes used by our
  // operands if this node is the only user.
  if (TLI.SimplifyDemandedBits(Op: LHS, DemandedBits: Demanded, DCI))
    return SDValue(Node24, 0);
  if (TLI.SimplifyDemandedBits(Op: RHS, DemandedBits: Demanded, DCI))
    return SDValue(Node24, 0);

  return SDValue();
}
3939
3940template <typename IntTy>
3941static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3942 uint32_t Width, const SDLoc &DL) {
3943 if (Width + Offset < 32) {
3944 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3945 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3946 if constexpr (std::is_signed_v<IntTy>) {
3947 return DAG.getSignedConstant(Val: Result, DL, VT: MVT::i32);
3948 } else {
3949 return DAG.getConstant(Result, DL, MVT::i32);
3950 }
3951 }
3952
3953 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3954}
3955
3956static bool hasVolatileUser(SDNode *Val) {
3957 for (SDNode *U : Val->users()) {
3958 if (MemSDNode *M = dyn_cast<MemSDNode>(Val: U)) {
3959 if (M->isVolatile())
3960 return true;
3961 }
3962 }
3963
3964 return false;
3965}
3966
3967bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3968 // i32 vectors are the canonical memory type.
3969 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3970 return false;
3971
3972 if (!VT.isByteSized())
3973 return false;
3974
3975 unsigned Size = VT.getStoreSize();
3976
3977 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3978 return false;
3979
3980 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3981 return false;
3982
3983 return true;
3984}
3985
3986// Replace load of an illegal type with a bitcast from a load of a friendlier
3987// type.
SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  // Only rewrite memory types before legalization has run.
  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Restrict to simple, non-extending/indexed loads with no volatile users;
  // a volatile user needs to observe the original value type.
  LoadSDNode *LN = cast<LoadSDNode>(Val: N);
  if (!LN->isSimple() || !ISD::isNormalLoad(N: LN) || hasVolatileUser(Val: LN))
    return SDValue();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = LN->getMemoryVT();

  unsigned Size = VT.getStoreSize();
  Align Alignment = LN->getAlign();
  if (Alignment < Size && isTypeLegal(VT)) {
    unsigned IsFast;
    unsigned AS = LN->getAddressSpace();

    // Expand unaligned loads earlier than legalization. Due to visitation order
    // problems during legalization, the emitted instructions to pack and unpack
    // the bytes again are not eliminated in the case of an unaligned copy.
    if (!allowsMisalignedMemoryAccesses(
            VT, AddrSpace: AS, Alignment, Flags: LN->getMemOperand()->getFlags(), &IsFast)) {
      if (VT.isVector())
        return SplitVectorLoad(Op: SDValue(LN, 0), DAG);

      SDValue Ops[2];
      std::tie(args&: Ops[0], args&: Ops[1]) = expandUnalignedLoad(LD: LN, DAG);

      return DAG.getMergeValues(Ops, dl: SDLoc(N));
    }

    // Allowed but slow misaligned access; leave the load alone.
    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  // Load as the equivalent i32-based type and bitcast back to the original.
  EVT NewVT = getEquivalentMemType(Ctx&: *DAG.getContext(), VT);

  SDValue NewLoad
    = DAG.getLoad(VT: NewVT, dl: SL, Chain: LN->getChain(),
                  Ptr: LN->getBasePtr(), MMO: LN->getMemOperand());

  // Replace both the value and the chain so users of the old load's chain
  // are rewired to the new load.
  SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewLoad);
  DCI.CombineTo(N, Res0: BC, Res1: NewLoad.getValue(R: 1));
  return SDValue(N, 0);
}
4038
4039// Replace store of an illegal type with a store of a bitcast to a friendlier
4040// type.
SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  // Only rewrite memory types before legalization has run.
  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Restrict to simple, non-truncating/indexed stores.
  StoreSDNode *SN = cast<StoreSDNode>(Val: N);
  if (!SN->isSimple() || !ISD::isNormalStore(N: SN))
    return SDValue();

  EVT VT = SN->getMemoryVT();
  unsigned Size = VT.getStoreSize();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  Align Alignment = SN->getAlign();
  if (Alignment < Size && isTypeLegal(VT)) {
    unsigned IsFast;
    unsigned AS = SN->getAddressSpace();

    // Expand unaligned stores earlier than legalization. Due to visitation
    // order problems during legalization, the emitted instructions to pack and
    // unpack the bytes again are not eliminated in the case of an unaligned
    // copy.
    if (!allowsMisalignedMemoryAccesses(
            VT, AddrSpace: AS, Alignment, Flags: SN->getMemOperand()->getFlags(), &IsFast)) {
      if (VT.isVector())
        return SplitVectorStore(Op: SDValue(SN, 0), DAG);

      return expandUnalignedStore(ST: SN, DAG);
    }

    // Allowed but slow misaligned access; leave the store alone.
    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  // Store the value as the equivalent i32-based type.
  EVT NewVT = getEquivalentMemType(Ctx&: *DAG.getContext(), VT);
  SDValue Val = SN->getValue();

  //DCI.AddToWorklist(Val.getNode());

  // If the stored value has other uses, rewrite them through a bitcast back
  // to the original type so the cast pair can fold away later.
  bool OtherUses = !Val.hasOneUse();
  SDValue CastVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: Val);
  if (OtherUses) {
    SDValue CastBack = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: CastVal);
    DAG.ReplaceAllUsesOfValueWith(From: Val, To: CastBack);
  }

  return DAG.getStore(Chain: SN->getChain(), dl: SL, Val: CastVal,
                      Ptr: SN->getBasePtr(), MMO: SN->getMemOperand());
}
4094
4095// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
4096// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
4097// issues.
4098SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
4099 DAGCombinerInfo &DCI) const {
4100 SelectionDAG &DAG = DCI.DAG;
4101 SDValue N0 = N->getOperand(Num: 0);
4102
4103 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
4104 // (vt2 (truncate (assertzext vt0:x, vt1)))
4105 if (N0.getOpcode() == ISD::TRUNCATE) {
4106 SDValue N1 = N->getOperand(Num: 1);
4107 EVT ExtVT = cast<VTSDNode>(Val&: N1)->getVT();
4108 SDLoc SL(N);
4109
4110 SDValue Src = N0.getOperand(i: 0);
4111 EVT SrcVT = Src.getValueType();
4112 if (SrcVT.bitsGE(VT: ExtVT)) {
4113 SDValue NewInReg = DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: SrcVT, N1: Src, N2: N1);
4114 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: N->getValueType(ResNo: 0), Operand: NewInReg);
4115 }
4116 }
4117
4118 return SDValue();
4119}
4120
// Combine chainless AMDGPU intrinsic calls: shrink the operands of the 24-bit
// multiplies, fold several unary intrinsics of undef to undef, and let
// frexp_exp look through sign-bit-only operations on its source.
SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
  SDNode *N, DAGCombinerInfo &DCI) const {
  // Operand 0 of INTRINSIC_WO_CHAIN is the intrinsic ID.
  unsigned IID = N->getConstantOperandVal(Num: 0);
  switch (IID) {
  case Intrinsic::amdgcn_mul_i24:
  case Intrinsic::amdgcn_mul_u24:
  case Intrinsic::amdgcn_mulhi_i24:
  case Intrinsic::amdgcn_mulhi_u24:
    return simplifyMul24(Node24: N, DCI);
  case Intrinsic::amdgcn_fract:
  case Intrinsic::amdgcn_rsq:
  case Intrinsic::amdgcn_rcp_legacy:
  case Intrinsic::amdgcn_rsq_legacy:
  case Intrinsic::amdgcn_rsq_clamp:
  case Intrinsic::amdgcn_tanh:
  case Intrinsic::amdgcn_prng_b32: {
    // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
    SDValue Src = N->getOperand(Num: 1);
    return Src.isUndef() ? Src : SDValue();
  }
  case Intrinsic::amdgcn_frexp_exp: {
    // frexp_exp (fneg x) -> frexp_exp x
    // frexp_exp (fabs x) -> frexp_exp x
    // frexp_exp (fneg (fabs x)) -> frexp_exp x
    SDValue Src = N->getOperand(Num: 1);
    SDValue PeekSign = peekFPSignOps(Val: Src);
    if (PeekSign == Src)
      return SDValue();
    // Update the existing node in place rather than building a new one.
    return SDValue(DCI.DAG.UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), Op2: PeekSign),
                   0);
  }
  default:
    return SDValue();
  }
}
4156
4157/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4158/// binary operation \p Opc to it with the corresponding constant operands.
4159SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
4160 DAGCombinerInfo &DCI, const SDLoc &SL,
4161 unsigned Opc, SDValue LHS,
4162 uint32_t ValLo, uint32_t ValHi) const {
4163 SelectionDAG &DAG = DCI.DAG;
4164 SDValue Lo, Hi;
4165 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: LHS, DAG);
4166
4167 SDValue LoRHS = DAG.getConstant(Val: ValLo, DL: SL, VT: MVT::i32);
4168 SDValue HiRHS = DAG.getConstant(Val: ValHi, DL: SL, VT: MVT::i32);
4169
4170 SDValue LoAnd = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, N1: Lo, N2: LoRHS);
4171 SDValue HiAnd = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, N1: Hi, N2: HiRHS);
4172
4173 // Re-visit the ands. It's possible we eliminated one of them and it could
4174 // simplify the vector.
4175 DCI.AddToWorklist(N: Lo.getNode());
4176 DCI.AddToWorklist(N: Hi.getNode());
4177
4178 SDValue Vec = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {LoAnd, HiAnd});
4179 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
4180}
4181
// Combine SHL:
//  * shl x, 0 -> x
//  * shl ([asz]ext i16:x), 16 -> build_vector 0, x (when v2i16 is legal)
//  * shl (ext x), c -> zext (shl x, c) when no set bits are shifted out
//  * 64-bit shl by an amount known >= 32 -> 32-bit shl of the low half,
//    placed into the high half of the result (low half becomes zero).
SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(ResNo: 0);
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);
  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;

  unsigned RHSVal; // Only assigned when the shift amount is constant (CRHS).
  if (CRHS) {
    RHSVal = CRHS->getZExtValue();
    // shl x, 0 -> x
    if (!RHSVal)
      return LHS;

    switch (LHS->getOpcode()) {
    default:
      break;
    case ISD::ZERO_EXTEND:
    case ISD::SIGN_EXTEND:
    case ISD::ANY_EXTEND: {
      SDValue X = LHS->getOperand(Num: 0);

      if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
          isOperationLegal(Op: ISD::BUILD_VECTOR, VT: MVT::v2i16)) {
        // Prefer build_vector as the canonical form if packed types are legal.
        // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
        SDValue Vec = DAG.getBuildVector(
            VT: MVT::v2i16, DL: SL,
            Ops: {DAG.getConstant(Val: 0, DL: SL, VT: MVT::i16), LHS->getOperand(Num: 0)});
        return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Vec);
      }

      // shl (ext x) => zext (shl x), if shift does not overflow int
      if (VT != MVT::i64)
        break;
      KnownBits Known = DAG.computeKnownBits(Op: X);
      unsigned LZ = Known.countMinLeadingZeros();
      // All bits shifted out of the narrow type must be known zero.
      if (LZ < RHSVal)
        break;
      EVT XVT = X.getValueType();
      SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: XVT, N1: X, N2: SDValue(CRHS, 0));
      return DAG.getZExtOrTrunc(Op: Shl, DL: SL, VT);
    }
    }
  }

  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  KnownBits Known = DAG.computeKnownBits(Op: RHS);

  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
  EVT TargetType = VT.changeElementType(Context&: *DAG.getContext(), EltVT: TargetScalarType);

  // Only profitable when the shift amount is known to be at least 32 (the
  // low half of the result is then known to be zero).
  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();
  SDValue ShiftAmt;

  if (CRHS) {
    ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
                               VT: TargetType);
  } else {
    SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
    const SDValue ShiftMask =
        DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
    ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
  }

  SDValue Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: LHS);
  SDValue NewShift =
      DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: TargetType, N1: Lo, N2: ShiftAmt, Flags: N->getFlags());

  // Build the result as pairs of (0, shifted-lo) 32-bit halves and bitcast
  // back to the original 64-bit (element) type.
  const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: TargetScalarType);
  SDValue Vec;

  if (VT.isVector()) {
    EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
    unsigned NElts = TargetType.getVectorNumElements();
    SmallVector<SDValue, 8> HiOps;
    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);

    // Even interleaved slots (lo halves) stay zero; odd slots get the shifts.
    DAG.ExtractVectorElements(Op: NewShift, Args&: HiOps, Start: 0, Count: NElts);
    for (unsigned I = 0; I != NElts; ++I)
      HiAndLoOps[2 * I + 1] = HiOps[I];
    Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
  } else {
    EVT ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
    Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {Zero, NewShift});
  }
  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
}
4280
// Combine 64-bit SRA with a shift amount known to be >= 32: only the high
// 32 bits of the source contribute, so perform a 32-bit sra on the extracted
// high half and fill the result's high half with its sign bits.
SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue RHS = N->getOperand(Num: 1);
  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
  EVT VT = N->getValueType(ResNo: 0);
  SDValue LHS = N->getOperand(Num: 0);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // For C >= 32
  // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), sra hi_32(x), 31))

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  KnownBits Known = DAG.computeKnownBits(Op: RHS);

  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
  EVT TargetType = VT.changeElementType(Context&: *DAG.getContext(), EltVT: TargetScalarType);

  // Only profitable when the shift amount is known to be at least 32.
  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();

  SDValue ShiftFullAmt =
      DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
  SDValue ShiftAmt;
  if (CRHS) {
    unsigned RHSVal = CRHS->getZExtValue();
    ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
                               VT: TargetType);
  } else if (Known.getMinValue().getZExtValue() ==
             (ElementType.getSizeInBits() - 1)) {
    // Shift amount is known to be exactly 63: every result bit is a copy of
    // the sign bit, i.e. (sra hi, 31) in both halves.
    ShiftAmt = ShiftFullAmt;
  } else {
    SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
    const SDValue ShiftMask =
        DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
    ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
  }

  EVT ConcatType;
  SDValue Hi;
  SDLoc LHSSL(LHS);
  // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
    SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
    SmallVector<SDValue, 8> HiOps(NElts);
    SmallVector<SDValue, 16> HiAndLoOps;

    // Odd interleaved slots hold the hi half of each 64-bit element.
    DAG.ExtractVectorElements(Op: SplitLHS, Args&: HiAndLoOps, Start: 0, Count: NElts * 2);
    for (unsigned I = 0; I != NElts; ++I) {
      HiOps[I] = HiAndLoOps[2 * I + 1];
    }
    Hi = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: LHSSL, VT: TargetType, Ops: HiOps);
  } else {
    const SDValue One = DAG.getConstant(Val: 1, DL: LHSSL, VT: TargetScalarType);
    ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
    SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
    Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: LHSSL, VT: TargetType, N1: SplitLHS, N2: One);
  }

  KnownBits KnownLHS = DAG.computeKnownBits(Op: LHS);
  SDValue HiShift;
  if (KnownLHS.isNegative()) {
    // Known-negative input: the high half of the result is all ones.
    HiShift = DAG.getAllOnesConstant(DL: SL, VT: TargetType);
  } else {
    // Hi feeds both shifts below; freeze it so both uses see the same value.
    Hi = DAG.getFreeze(V: Hi);
    HiShift = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: TargetType, N1: Hi, N2: ShiftFullAmt);
  }
  SDValue NewShift =
      DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: TargetType, N1: Hi, N2: ShiftAmt, Flags: N->getFlags());

  SDValue Vec;
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    SmallVector<SDValue, 8> HiOps;
    SmallVector<SDValue, 8> LoOps;
    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);

    // Re-interleave: even slots take the shifted value (new lo halves), odd
    // slots take the sign fill (new hi halves).
    DAG.ExtractVectorElements(Op: HiShift, Args&: HiOps, Start: 0, Count: NElts);
    DAG.ExtractVectorElements(Op: NewShift, Args&: LoOps, Start: 0, Count: NElts);
    for (unsigned I = 0; I != NElts; ++I) {
      HiAndLoOps[2 * I + 1] = HiOps[I];
      HiAndLoOps[2 * I] = LoOps[I];
    }
    Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
  } else {
    Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {NewShift, HiShift});
  }
  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
}
4380
// Combine SRL:
//  * fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1) to expose BFE
//    patterns to instruction selection.
//  * 64-bit srl with a shift amount known >= 32 -> 32-bit srl of the high
//    half, with the result's high half zeroed.
SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue RHS = N->getOperand(Num: 1);
  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
  EVT VT = N->getValueType(ResNo: 0);
  SDValue LHS = N->getOperand(Num: 0);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  unsigned RHSVal; // Only assigned when the shift amount is constant (CRHS).

  if (CRHS) {
    RHSVal = CRHS->getZExtValue();

    // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
    // this improves the ability to match BFE patterns in isel.
    if (LHS.getOpcode() == ISD::AND) {
      if (auto *Mask = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1))) {
        unsigned MaskIdx, MaskLen;
        // Only when the mask's low bit position lines up with the shift
        // amount, so the shifted mask has no offset.
        if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
            MaskIdx == RHSVal) {
          return DAG.getNode(Opcode: ISD::AND, DL: SL, VT,
                             N1: DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: LHS.getOperand(i: 0),
                                         N2: N->getOperand(Num: 1)),
                             N2: DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: LHS.getOperand(i: 1),
                                         N2: N->getOperand(Num: 1)));
        }
      }
    }
  }

  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // for C >= 32
  // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  KnownBits Known = DAG.computeKnownBits(Op: RHS);

  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
  EVT TargetType = VT.changeElementType(Context&: *DAG.getContext(), EltVT: TargetScalarType);

  // Only profitable when the shift amount is known to be at least 32.
  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();

  SDValue ShiftAmt;
  if (CRHS) {
    ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
                               VT: TargetType);
  } else {
    SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
    const SDValue ShiftMask =
        DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
    ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
  }

  const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: TargetScalarType);
  EVT ConcatType;
  SDValue Hi;
  SDLoc LHSSL(LHS);
  // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
    SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
    SmallVector<SDValue, 8> HiOps(NElts);
    SmallVector<SDValue, 16> HiAndLoOps;

    // Odd interleaved slots hold the hi half of each 64-bit element.
    DAG.ExtractVectorElements(Op: SplitLHS, Args&: HiAndLoOps, /*Start=*/0, Count: NElts * 2);
    for (unsigned I = 0; I != NElts; ++I)
      HiOps[I] = HiAndLoOps[2 * I + 1];
    Hi = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: LHSSL, VT: TargetType, Ops: HiOps);
  } else {
    const SDValue One = DAG.getConstant(Val: 1, DL: LHSSL, VT: TargetScalarType);
    ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
    SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
    Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: LHSSL, VT: TargetType, N1: SplitLHS, N2: One);
  }

  SDValue NewShift =
      DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: TargetType, N1: Hi, N2: ShiftAmt, Flags: N->getFlags());

  SDValue Vec;
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    SmallVector<SDValue, 8> LoOps;
    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);

    // Shifted values land in the even (lo) slots; odd (hi) slots stay zero.
    DAG.ExtractVectorElements(Op: NewShift, Args&: LoOps, Start: 0, Count: NElts);
    for (unsigned I = 0; I != NElts; ++I)
      HiAndLoOps[2 * I] = LoOps[I];
    Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
  } else {
    Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {NewShift, Zero});
  }
  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
}
4483
// Combine on ISD::TRUNCATE: extract narrow values directly out of
// build_vector sources and shrink >32-bit shifts whose interesting bits fit
// in 32 bits.
SDValue AMDGPUTargetLowering::performTruncateCombine(
  SDNode *N, DAGCombinerInfo &DCI) const {
  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);
  SDValue Src = N->getOperand(Num: 0);

  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
  if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
    SDValue Vec = Src.getOperand(i: 0);
    if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
      SDValue Elt0 = Vec.getOperand(i: 0);
      EVT EltVT = Elt0.getValueType();
      // Only valid when the truncated result is contained in element 0.
      if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
        if (EltVT.isFloatingPoint()) {
          // TRUNCATE needs an integer input; cast FP elements first.
          Elt0 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL,
                             VT: EltVT.changeTypeToInteger(), Operand: Elt0);
        }

        return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Elt0);
      }
    }
  }

  // Equivalent of above for accessing the high element of a vector as an
  // integer operation.
  // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
  if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
    if (auto *K = isConstOrConstSplat(N: Src.getOperand(i: 1))) {
      SDValue BV = stripBitcast(Val: Src.getOperand(i: 0));
      if (BV.getOpcode() == ISD::BUILD_VECTOR) {
        EVT SrcEltVT = BV.getOperand(i: 0).getValueType();
        unsigned SrcEltSize = SrcEltVT.getSizeInBits();
        unsigned BitIndex = K->getZExtValue();
        unsigned PartIndex = BitIndex / SrcEltSize;

        // The shift amount must land exactly on an element boundary and
        // select an element that actually exists.
        if (PartIndex * SrcEltSize == BitIndex &&
            PartIndex < BV.getNumOperands()) {
          if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
            SDValue SrcElt =
                DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: SrcEltVT.changeTypeToInteger(),
                            Operand: BV.getOperand(i: PartIndex));
            return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: SrcElt);
          }
        }
      }
    }
  }

  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
  //
  // i16 (trunc (srl i64:x, K)), K <= 16 ->
  // i16 (trunc (srl (i32 (trunc x), K)))
  if (VT.getScalarSizeInBits() < 32) {
    EVT SrcVT = Src.getValueType();
    if (SrcVT.getScalarSizeInBits() > 32 &&
        (Src.getOpcode() == ISD::SRL ||
         Src.getOpcode() == ISD::SRA ||
         Src.getOpcode() == ISD::SHL)) {
      SDValue Amt = Src.getOperand(i: 1);
      KnownBits Known = DAG.computeKnownBits(Op: Amt);

      // - For left shifts, do the transform as long as the shift
      //   amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
      // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
      //   losing information stored in the high bits when truncating.
      const unsigned MaxCstSize =
        (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
      if (Known.getMaxValue().ule(RHS: MaxCstSize)) {
        EVT MidVT = VT.isVector() ?
          EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
                           NumElements: VT.getVectorNumElements()) : MVT::i32;

        EVT NewShiftVT = getShiftAmountTy(LHSTy: MidVT, DL: DAG.getDataLayout());
        SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MidVT,
                                    Operand: Src.getOperand(i: 0));
        DCI.AddToWorklist(N: Trunc.getNode());

        // The shift amount type may change when shrinking the shift.
        if (Amt.getValueType() != NewShiftVT) {
          Amt = DAG.getZExtOrTrunc(Op: Amt, DL: SL, VT: NewShiftVT);
          DCI.AddToWorklist(N: Amt.getNode());
        }

        SDValue ShrunkShift = DAG.getNode(Opcode: Src.getOpcode(), DL: SL, VT: MidVT,
                                          N1: Trunc, N2: Amt);
        return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: ShrunkShift);
      }
    }
  }

  return SDValue();
}
4576
4577// We need to specifically handle i64 mul here to avoid unnecessary conversion
4578// instructions. If we only match on the legalized i64 mul expansion,
4579// SimplifyDemandedBits will be unable to remove them because there will be
4580// multiple uses due to the separate mul + mulh[su].
4581static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4582 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4583 if (Size <= 32) {
4584 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4585 return DAG.getNode(Opcode: MulOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4586 }
4587
4588 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4589 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4590
4591 SDValue MulLo = DAG.getNode(Opcode: MulLoOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4592 SDValue MulHi = DAG.getNode(Opcode: MulHiOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4593
4594 return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SL, VT: MVT::i64, N1: MulLo, N2: MulHi);
4595}
4596
4597/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4598/// return SDValue().
4599static SDValue getAddOneOp(const SDNode *V) {
4600 if (V->getOpcode() != ISD::ADD)
4601 return SDValue();
4602
4603 return isOneConstant(V: V->getOperand(Num: 1)) ? V->getOperand(Num: 0) : SDValue();
4604}
4605
// Combine on ISD::MUL: undo InstCombine's (add y, 1) canonicalization to
// enable mad matching, and form fast 24-bit multiplies when both operands
// are known to fit in 24 bits.
SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::MUL);
  EVT VT = N->getValueType(ResNo: 0);

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  if (!N->isDivergent())
    return SDValue();

  unsigned Size = VT.getSizeInBits();
  if (VT.isVector() || Size > 64)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue N0 = N->getOperand(Num: 0);
  SDValue N1 = N->getOperand(Num: 1);

  // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
  // matching.

  // mul x, (add y, 1) -> add (mul x, y), x
  auto IsFoldableAdd = [](SDValue V) -> SDValue {
    SDValue AddOp = getAddOneOp(V: V.getNode());
    if (!AddOp)
      return SDValue();

    // Multiple uses are acceptable only when every user is also a mul that
    // would benefit from the same fold.
    if (V.hasOneUse() || all_of(Range: V->users(), P: [](const SDNode *U) -> bool {
          return U->getOpcode() == ISD::MUL;
        }))
      return AddOp;

    return SDValue();
  };

  // FIXME: The selection pattern is not properly checking for commuted
  // operands, so we have to place the mul in the LHS
  if (SDValue MulOper = IsFoldableAdd(N0)) {
    SDValue MulVal = DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1, N2: MulOper);
    return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: MulVal, N2: N1);
  }

  if (SDValue MulOper = IsFoldableAdd(N1)) {
    SDValue MulVal = DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1: N0, N2: MulOper);
    return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: MulVal, N2: N0);
  }

  // There are i16 integer mul/mad.
  if (isTypeLegal(VT: MVT::i16) && VT.getScalarType().bitsLE(VT: MVT::i16))
    return SDValue();

  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
  // in the source into any_extends if the result of the mul is truncated. Since
  // we can assume the high bits are whatever we want, use the underlying value
  // to avoid the unknown high bits from interfering.
  if (N0.getOpcode() == ISD::ANY_EXTEND)
    N0 = N0.getOperand(i: 0);

  if (N1.getOpcode() == ISD::ANY_EXTEND)
    N1 = N1.getOperand(i: 0);

  SDValue Mul;

  // Prefer the unsigned form; fall back to the signed 24-bit multiply.
  if (Subtarget->hasMulU24() && isU24(Op: N0, DAG) && isU24(Op: N1, DAG)) {
    N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
    N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
    Mul = getMul24(DAG, SL: DL, N0, N1, Size, Signed: false);
  } else if (Subtarget->hasMulI24() && isI24(Op: N0, DAG) && isI24(Op: N1, DAG)) {
    N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
    N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
    Mul = getMul24(DAG, SL: DL, N0, N1, Size, Signed: true);
  } else {
    return SDValue();
  }

  // We need to use sext even for MUL_U24, because MUL_U24 is used
  // for signed multiply of 8 and 16-bit types.
  return DAG.getSExtOrTrunc(Op: Mul, DL, VT);
}
4689
4690SDValue
4691AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4692 DAGCombinerInfo &DCI) const {
4693 if (N->getValueType(ResNo: 0) != MVT::i32)
4694 return SDValue();
4695
4696 SelectionDAG &DAG = DCI.DAG;
4697 SDLoc DL(N);
4698
4699 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4700 SDValue N0 = N->getOperand(Num: 0);
4701 SDValue N1 = N->getOperand(Num: 1);
4702
4703 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4704 // in the source into any_extends if the result of the mul is truncated. Since
4705 // we can assume the high bits are whatever we want, use the underlying value
4706 // to avoid the unknown high bits from interfering.
4707 if (N0.getOpcode() == ISD::ANY_EXTEND)
4708 N0 = N0.getOperand(i: 0);
4709 if (N1.getOpcode() == ISD::ANY_EXTEND)
4710 N1 = N1.getOperand(i: 0);
4711
4712 // Try to use two fast 24-bit multiplies (one for each half of the result)
4713 // instead of one slow extending multiply.
4714 unsigned LoOpcode = 0;
4715 unsigned HiOpcode = 0;
4716 if (Signed) {
4717 if (Subtarget->hasMulI24() && isI24(Op: N0, DAG) && isI24(Op: N1, DAG)) {
4718 N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4719 N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4720 LoOpcode = AMDGPUISD::MUL_I24;
4721 HiOpcode = AMDGPUISD::MULHI_I24;
4722 }
4723 } else {
4724 if (Subtarget->hasMulU24() && isU24(Op: N0, DAG) && isU24(Op: N1, DAG)) {
4725 N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4726 N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4727 LoOpcode = AMDGPUISD::MUL_U24;
4728 HiOpcode = AMDGPUISD::MULHI_U24;
4729 }
4730 }
4731 if (!LoOpcode)
4732 return SDValue();
4733
4734 SDValue Lo = DAG.getNode(Opcode: LoOpcode, DL, VT: MVT::i32, N1: N0, N2: N1);
4735 SDValue Hi = DAG.getNode(Opcode: HiOpcode, DL, VT: MVT::i32, N1: N0, N2: N1);
4736 DCI.CombineTo(N, Res0: Lo, Res1: Hi);
4737 return SDValue(N, 0);
4738}
4739
4740SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4741 DAGCombinerInfo &DCI) const {
4742 EVT VT = N->getValueType(ResNo: 0);
4743
4744 if (!Subtarget->hasMulI24() || VT.isVector())
4745 return SDValue();
4746
4747 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4748 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4749 // unnecessarily). isDivergent() is used as an approximation of whether the
4750 // value is in an SGPR.
4751 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4752 // valu op anyway)
4753 if (Subtarget->hasSMulHi() && !N->isDivergent())
4754 return SDValue();
4755
4756 SelectionDAG &DAG = DCI.DAG;
4757 SDLoc DL(N);
4758
4759 SDValue N0 = N->getOperand(Num: 0);
4760 SDValue N1 = N->getOperand(Num: 1);
4761
4762 if (!isI24(Op: N0, DAG) || !isI24(Op: N1, DAG))
4763 return SDValue();
4764
4765 N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4766 N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4767
4768 SDValue Mulhi = DAG.getNode(Opcode: AMDGPUISD::MULHI_I24, DL, VT: MVT::i32, N1: N0, N2: N1);
4769 DCI.AddToWorklist(N: Mulhi.getNode());
4770 return DAG.getSExtOrTrunc(Op: Mulhi, DL, VT);
4771}
4772
4773SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4774 DAGCombinerInfo &DCI) const {
4775 EVT VT = N->getValueType(ResNo: 0);
4776
4777 if (VT.isVector() || VT.getSizeInBits() > 32 || !Subtarget->hasMulU24())
4778 return SDValue();
4779
4780 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4781 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4782 // unnecessarily). isDivergent() is used as an approximation of whether the
4783 // value is in an SGPR.
4784 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4785 // valu op anyway)
4786 if (!N->isDivergent() && Subtarget->hasSMulHi())
4787 return SDValue();
4788
4789 SelectionDAG &DAG = DCI.DAG;
4790 SDLoc DL(N);
4791
4792 SDValue N0 = N->getOperand(Num: 0);
4793 SDValue N1 = N->getOperand(Num: 1);
4794
4795 if (!isU24(Op: N0, DAG) || !isU24(Op: N1, DAG))
4796 return SDValue();
4797
4798 N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4799 N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4800
4801 SDValue Mulhi = DAG.getNode(Opcode: AMDGPUISD::MULHI_U24, DL, VT: MVT::i32, N1: N0, N2: N1);
4802 DCI.AddToWorklist(N: Mulhi.getNode());
4803 return DAG.getZExtOrTrunc(Op: Mulhi, DL, VT);
4804}
4805
4806SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4807 SDValue Op,
4808 const SDLoc &DL,
4809 unsigned Opc) const {
4810 EVT VT = Op.getValueType();
4811 if (VT.bitsGT(VT: MVT::i32))
4812 return SDValue();
4813
4814 if (VT != MVT::i32)
4815 Op = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: Op);
4816
4817 SDValue FFBX = DAG.getNode(Opcode: Opc, DL, VT: MVT::i32, Operand: Op);
4818 if (VT != MVT::i32)
4819 FFBX = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: FFBX);
4820
4821 return FFBX;
4822}
4823
4824// The native instructions return -1 on 0 input. Optimize out a select that
4825// produces -1 on 0.
4826//
4827// TODO: If zero is not undef, we could also do this if the output is compared
4828// against the bitwidth.
4829//
4830// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4831SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4832 SDValue LHS, SDValue RHS,
4833 DAGCombinerInfo &DCI) const {
4834 if (!isNullConstant(V: Cond.getOperand(i: 1)))
4835 return SDValue();
4836
4837 SelectionDAG &DAG = DCI.DAG;
4838 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val: Cond.getOperand(i: 2))->get();
4839 SDValue CmpLHS = Cond.getOperand(i: 0);
4840
4841 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4842 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4843 if (CCOpcode == ISD::SETEQ &&
4844 (isCtlzOpc(Opc: RHS.getOpcode()) || isCttzOpc(Opc: RHS.getOpcode())) &&
4845 RHS.getOperand(i: 0) == CmpLHS && isAllOnesConstant(V: LHS)) {
4846 unsigned Opc =
4847 isCttzOpc(Opc: RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4848 return getFFBX_U32(DAG, Op: CmpLHS, DL: SL, Opc);
4849 }
4850
4851 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4852 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4853 if (CCOpcode == ISD::SETNE &&
4854 (isCtlzOpc(Opc: LHS.getOpcode()) || isCttzOpc(Opc: LHS.getOpcode())) &&
4855 LHS.getOperand(i: 0) == CmpLHS && isAllOnesConstant(V: RHS)) {
4856 unsigned Opc =
4857 isCttzOpc(Opc: LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4858
4859 return getFFBX_U32(DAG, Op: CmpLHS, DL: SL, Opc);
4860 }
4861
4862 return SDValue();
4863}
4864
4865static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4866 unsigned Op,
4867 const SDLoc &SL,
4868 SDValue Cond,
4869 SDValue N1,
4870 SDValue N2) {
4871 SelectionDAG &DAG = DCI.DAG;
4872 EVT VT = N1.getValueType();
4873
4874 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Cond,
4875 N2: N1.getOperand(i: 0), N3: N2.getOperand(i: 0));
4876 DCI.AddToWorklist(N: NewSelect.getNode());
4877 return DAG.getNode(Opcode: Op, DL: SL, VT, Operand: NewSelect);
4878}
4879
// Pull a free FP operation out of a select so it may fold into uses.
//
// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
// select c, (fneg x), k -> fneg (select c, x, (fneg k))
//
// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
// select c, (fabs x), +k -> fabs (select c, x, k)
SDValue
AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
                                           SDValue N) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Cond = N.getOperand(i: 0);
  SDValue LHS = N.getOperand(i: 1);
  SDValue RHS = N.getOperand(i: 2);

  EVT VT = N.getValueType();
  // Both arms wrapped in the same op: hoist it above the select, but only if
  // every user of the select can absorb it as a source modifier.
  if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
      (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
    if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N: N.getNode()))
      return SDValue();

    return distributeOpThroughSelect(DCI, Op: LHS.getOpcode(),
                                     SL: SDLoc(N), Cond, N1: LHS, N2: RHS);
  }

  // Canonicalize so the fneg/fabs arm is on the LHS; remember to swap back.
  bool Inv = false;
  if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
    std::swap(a&: LHS, b&: RHS);
    Inv = true;
  }

  // TODO: Support vector constants.
  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
      !selectSupportsSourceMods(N: N.getNode())) {
    SDLoc SL(N);
    // If one side is an fneg/fabs and the other is a constant, we can push the
    // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
    SDValue NewLHS = LHS.getOperand(i: 0);
    SDValue NewRHS = RHS;

    // Careful: if the neg can be folded up, don't try to pull it back down.
    bool ShouldFoldNeg = true;

    if (NewLHS.hasOneUse()) {
      unsigned Opc = NewLHS.getOpcode();
      if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(N: NewLHS.getNode()))
        ShouldFoldNeg = false;
      if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
        ShouldFoldNeg = false;
    }

    if (ShouldFoldNeg) {
      // fabs of a negative constant cannot be rewritten this way.
      if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
        return SDValue();

      // We're going to be forced to use a source modifier anyway, there's no
      // point to pulling the negate out unless we can get a size reduction by
      // negating the constant.
      //
      // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
      // about cheaper constants.
      if (NewLHS.getOpcode() == ISD::FABS &&
          getConstantNegateCost(C: CRHS) != NegatibleCost::Cheaper)
        return SDValue();

      if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N: N.getNode()))
        return SDValue();

      // For fneg, the constant arm must be negated to compensate.
      if (LHS.getOpcode() == ISD::FNEG)
        NewRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);

      // Restore the original operand order if we swapped above.
      if (Inv)
        std::swap(a&: NewLHS, b&: NewRHS);

      SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT,
                                      N1: Cond, N2: NewLHS, N3: NewRHS);
      DCI.AddToWorklist(N: NewSelect.getNode());
      return DAG.getNode(Opcode: LHS.getOpcode(), DL: SL, VT, Operand: NewSelect);
    }
  }

  return SDValue();
}
4964
// Combine on ISD::SELECT: pull free FP ops out of the select, canonicalize
// constants into the false arm, form legacy fmin/fmax, and fold the
// ctlz/cttz zero-check pattern into ffbh/ffbl.
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  if (SDValue Folded = foldFreeOpFromSelect(DCI, N: SDValue(N, 0)))
    return Folded;

  SDValue Cond = N->getOperand(Num: 0);
  if (Cond.getOpcode() != ISD::SETCC)
    return SDValue();

  EVT VT = N->getValueType(ResNo: 0);
  SDValue LHS = Cond.getOperand(i: 0);
  SDValue RHS = Cond.getOperand(i: 1);
  SDValue CC = Cond.getOperand(i: 2);

  SDValue True = N->getOperand(Num: 1);
  SDValue False = N->getOperand(Num: 2);

  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
    SelectionDAG &DAG = DCI.DAG;
    if (DAG.isConstantValueOfAnyType(N: True) &&
        !DAG.isConstantValueOfAnyType(N: False)) {
      // Swap cmp + select pair to move constant to false input.
      // This will allow using VOPC cndmasks more often.
      // select (setcc x, y), k, x -> select (setccinv x, y), x, k

      SDLoc SL(N);
      ISD::CondCode NewCC =
          getSetCCInverse(Operation: cast<CondCodeSDNode>(Val&: CC)->get(), Type: LHS.getValueType());

      SDValue NewCond = DAG.getSetCC(DL: SL, VT: Cond.getValueType(), LHS, RHS, Cond: NewCC);
      return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NewCond, N2: False, N3: True);
    }

    if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
      SDValue MinMax
        = combineFMinMaxLegacy(DL: SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
      // Revisit this node so we can catch min3/max3/med3 patterns.
      //DCI.AddToWorklist(MinMax.getNode());
      return MinMax;
    }
  }

  // There's no reason to not do this if the condition has other uses.
  return performCtlz_CttzCombine(SL: SDLoc(N), Cond, LHS: True, RHS: False, DCI);
}
5010
5011static bool isInv2Pi(const APFloat &APF) {
5012 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
5013 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
5014 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
5015
5016 return APF.bitwiseIsEqual(RHS: KF16) ||
5017 APF.bitwiseIsEqual(RHS: KF32) ||
5018 APF.bitwiseIsEqual(RHS: KF64);
5019}
5020
5021// 0 and 1.0 / (0.5 * pi) do not have inline immmediates, so there is an
5022// additional cost to negate them.
5023TargetLowering::NegatibleCost
5024AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
5025 if (C->isZero())
5026 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5027
5028 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(APF: C->getValueAPF()))
5029 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5030
5031 return NegatibleCost::Neutral;
5032}
5033
5034bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
5035 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
5036 return getConstantNegateCost(C) == NegatibleCost::Expensive;
5037 return false;
5038}
5039
5040bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
5041 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
5042 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
5043 return false;
5044}
5045
5046static unsigned inverseMinMax(unsigned Opc) {
5047 switch (Opc) {
5048 case ISD::FMAXNUM:
5049 return ISD::FMINNUM;
5050 case ISD::FMINNUM:
5051 return ISD::FMAXNUM;
5052 case ISD::FMAXNUM_IEEE:
5053 return ISD::FMINNUM_IEEE;
5054 case ISD::FMINNUM_IEEE:
5055 return ISD::FMAXNUM_IEEE;
5056 case ISD::FMAXIMUM:
5057 return ISD::FMINIMUM;
5058 case ISD::FMINIMUM:
5059 return ISD::FMAXIMUM;
5060 case ISD::FMAXIMUMNUM:
5061 return ISD::FMINIMUMNUM;
5062 case ISD::FMINIMUMNUM:
5063 return ISD::FMAXIMUMNUM;
5064 case AMDGPUISD::FMAX_LEGACY:
5065 return AMDGPUISD::FMIN_LEGACY;
5066 case AMDGPUISD::FMIN_LEGACY:
5067 return AMDGPUISD::FMAX_LEGACY;
5068 default:
5069 llvm_unreachable("invalid min/max opcode");
5070 }
5071}
5072
5073/// \return true if it's profitable to try to push an fneg into its source
5074/// instruction.
5075bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
5076 // If the input has multiple uses and we can either fold the negate down, or
5077 // the other uses cannot, give up. This both prevents unprofitable
5078 // transformations and infinite loops: we won't repeatedly try to fold around
5079 // a negate that has no 'good' form.
5080 if (N0.hasOneUse()) {
5081 // This may be able to fold into the source, but at a code size cost. Don't
5082 // fold if the fold into the user is free.
5083 if (allUsesHaveSourceMods(N, CostThreshold: 0))
5084 return false;
5085 } else {
5086 if (fnegFoldsIntoOp(N: N0.getNode()) &&
5087 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N: N0.getNode())))
5088 return false;
5089 }
5090
5091 return true;
5092}
5093
// Combine on ISD::FNEG: push the negate into its source operation so it can
// be matched as a free source modifier. Each case rebuilds the source op with
// negated operands; if the source had other uses, they are redirected to an
// fneg of the new node so the DAG stays consistent.
SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(Num: 0);
  EVT VT = N->getValueType(ResNo: 0);

  unsigned Opc = N0.getOpcode();

  if (!shouldFoldFNegIntoSrc(N, N0))
    return SDValue();

  SDLoc SL(N);
  switch (Opc) {
  case ISD::FADD: {
    // Distributing the negate is only valid when -0.0 can be ignored.
    if (!mayIgnoreSignedZero(Op: N0) && !N->getFlags().hasNoSignedZeros())
      return SDValue();

    // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
    SDValue LHS = N0.getOperand(i: 0);
    SDValue RHS = N0.getOperand(i: 1);

    // Cancel an existing fneg on either operand instead of stacking one.
    if (LHS.getOpcode() != ISD::FNEG)
      LHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS);
    else
      LHS = LHS.getOperand(i: 0);

    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
    else
      RHS = RHS.getOperand(i: 0);

    SDValue Res = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: LHS, N2: RHS, Flags: N0->getFlags());
    if (Res.getOpcode() != ISD::FADD)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
    return Res;
  }
  case ISD::FMUL:
  case AMDGPUISD::FMUL_LEGACY: {
    // (fneg (fmul x, y)) -> (fmul x, (fneg y))
    // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
    SDValue LHS = N0.getOperand(i: 0);
    SDValue RHS = N0.getOperand(i: 1);

    // Only one operand needs the sign flip; prefer canceling an existing fneg.
    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(i: 0);
    else if (RHS.getOpcode() == ISD::FNEG)
      RHS = RHS.getOperand(i: 0);
    else
      RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);

    SDValue Res = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: LHS, N2: RHS, Flags: N0->getFlags());
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
    return Res;
  }
  case ISD::FMA:
  case ISD::FMAD: {
    // TODO: handle llvm.amdgcn.fma.legacy
    if (!mayIgnoreSignedZero(Op: N0) && !N->getFlags().hasNoSignedZeros())
      return SDValue();

    // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
    SDValue LHS = N0.getOperand(i: 0);
    SDValue MHS = N0.getOperand(i: 1);
    SDValue RHS = N0.getOperand(i: 2);

    // Flip exactly one of the multiply operands, canceling an existing fneg
    // when possible.
    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(i: 0);
    else if (MHS.getOpcode() == ISD::FNEG)
      MHS = MHS.getOperand(i: 0);
    else
      MHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: MHS);

    // The addend is always negated.
    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
    else
      RHS = RHS.getOperand(i: 0);

    SDValue Res = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: LHS, N2: MHS, N3: RHS);
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
    return Res;
  }
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMIN_LEGACY: {
    // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
    // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
    // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
    // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)

    SDValue LHS = N0.getOperand(i: 0);
    SDValue RHS = N0.getOperand(i: 1);

    // 0 doesn't have a negated inline immediate.
    // TODO: This constant check should be generalized to other operations.
    if (isConstantCostlierToNegate(N: RHS))
      return SDValue();

    SDValue NegLHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS);
    SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
    unsigned Opposite = inverseMinMax(Opc);

    SDValue Res = DAG.getNode(Opcode: Opposite, DL: SL, VT, N1: NegLHS, N2: NegRHS, Flags: N0->getFlags());
    if (Res.getOpcode() != Opposite)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
    return Res;
  }
  case AMDGPUISD::FMED3: {
    // fneg (fmed3 x, y, z) -> fmed3 (fneg x), (fneg y), (fneg z)
    SDValue Ops[3];
    for (unsigned I = 0; I < 3; ++I)
      Ops[I] = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: N0->getOperand(Num: I), Flags: N0->getFlags());

    SDValue Res = DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT, Ops, Flags: N0->getFlags());
    if (Res.getOpcode() != AMDGPUISD::FMED3)
      return SDValue(); // Op got folded away.

    if (!N0.hasOneUse()) {
      // Revisit the redirected users so follow-up combines can run on them.
      SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res);
      DAG.ReplaceAllUsesWith(From: N0, To: Neg);

      for (SDNode *U : Neg->users())
        DCI.AddToWorklist(N: U);
    }

    return Res;
  }
  case ISD::FP_EXTEND:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT: // XXX - Should fround be handled?
  case ISD::FROUNDEVEN:
  case ISD::FSIN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW: {
    SDValue CvtSrc = N0.getOperand(i: 0);
    if (CvtSrc.getOpcode() == ISD::FNEG) {
      // (fneg (fp_extend (fneg x))) -> (fp_extend x)
      // (fneg (rcp (fneg x))) -> (rcp x)
      return DAG.getNode(Opcode: Opc, DL: SL, VT, Operand: CvtSrc.getOperand(i: 0));
    }

    if (!N0.hasOneUse())
      return SDValue();

    // (fneg (fp_extend x)) -> (fp_extend (fneg x))
    // (fneg (rcp x)) -> (rcp (fneg x))
    SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: CvtSrc.getValueType(), Operand: CvtSrc);
    return DAG.getNode(Opcode: Opc, DL: SL, VT, Operand: Neg, Flags: N0->getFlags());
  }
  case ISD::FP_ROUND: {
    // Same as above, but FP_ROUND carries a second (rounding-control) operand
    // that must be preserved.
    SDValue CvtSrc = N0.getOperand(i: 0);

    if (CvtSrc.getOpcode() == ISD::FNEG) {
      // (fneg (fp_round (fneg x))) -> (fp_round x)
      return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT,
                         N1: CvtSrc.getOperand(i: 0), N2: N0.getOperand(i: 1));
    }

    if (!N0.hasOneUse())
      return SDValue();

    // (fneg (fp_round x)) -> (fp_round (fneg x))
    SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: CvtSrc.getValueType(), Operand: CvtSrc);
    return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Neg, N2: N0.getOperand(i: 1));
  }
  case ISD::FP16_TO_FP: {
    // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
    // f16, but legalization of f16 fneg ends up pulling it out of the source.
    // Put the fneg back as a legal source operation that can be matched later.
    SDLoc SL(N);

    SDValue Src = N0.getOperand(i: 0);
    EVT SrcVT = Src.getValueType();

    // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
    SDValue IntFNeg = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: SrcVT, N1: Src,
                                  N2: DAG.getConstant(Val: 0x8000, DL: SL, VT: SrcVT));
    return DAG.getNode(Opcode: ISD::FP16_TO_FP, DL: SL, VT: N->getValueType(ResNo: 0), Operand: IntFNeg);
  }
  case ISD::SELECT: {
    // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
    // TODO: Invert conditions of foldFreeOpFromSelect
    return SDValue();
  }
  case ISD::BITCAST: {
    SDLoc SL(N);
    SDValue BCSrc = N0.getOperand(i: 0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      SDValue HighBits = BCSrc.getOperand(i: BCSrc.getNumOperands() - 1);
      if (HighBits.getValueType().getSizeInBits() != 32 ||
          !fnegFoldsIntoOp(N: HighBits.getNode()))
        return SDValue();

      // f64 fneg only really needs to operate on the high half of the
      // register, so try to force it to an f32 operation to help make use of
      // source modifiers.
      //
      //
      // fneg (f64 (bitcast (build_vector x, y))) ->
      // f64 (bitcast (build_vector (bitcast i32:x to f32),
      //                            (fneg (bitcast i32:y to f32)))

      SDValue CastHi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: HighBits);
      SDValue NegHi = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: CastHi);
      SDValue CastBack =
          DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: HighBits.getValueType(), Operand: NegHi);

      // Rebuild the vector with only the last (sign-carrying) element changed.
      SmallVector<SDValue, 8> Ops(BCSrc->ops());
      Ops.back() = CastBack;
      DCI.AddToWorklist(N: NegHi.getNode());
      SDValue Build =
          DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: BCSrc.getValueType(), Ops);
      SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Build);

      if (!N0.hasOneUse())
        DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Result));
      return Result;
    }

    if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
        BCSrc.hasOneUse()) {
      // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
      //   select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)

      // TODO: Cast back result for multiple uses is beneficial in some cases.

      SDValue LHS =
          DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: BCSrc.getOperand(i: 1));
      SDValue RHS =
          DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: BCSrc.getOperand(i: 2));

      SDValue NegLHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: LHS);
      SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: RHS);

      return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f32, N1: BCSrc.getOperand(i: 0), N2: NegLHS,
                         N3: NegRHS);
    }

    return SDValue();
  }
  default:
    return SDValue();
  }
}
5357
5358SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5359 DAGCombinerInfo &DCI) const {
5360 SelectionDAG &DAG = DCI.DAG;
5361 SDValue N0 = N->getOperand(Num: 0);
5362
5363 if (!N0.hasOneUse())
5364 return SDValue();
5365
5366 switch (N0.getOpcode()) {
5367 case ISD::FP16_TO_FP: {
5368 assert(!isTypeLegal(MVT::f16) && "should only see if f16 is illegal");
5369 SDLoc SL(N);
5370 SDValue Src = N0.getOperand(i: 0);
5371 EVT SrcVT = Src.getValueType();
5372
5373 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5374 SDValue IntFAbs = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SrcVT, N1: Src,
5375 N2: DAG.getConstant(Val: 0x7fff, DL: SL, VT: SrcVT));
5376 return DAG.getNode(Opcode: ISD::FP16_TO_FP, DL: SL, VT: N->getValueType(ResNo: 0), Operand: IntFAbs);
5377 }
5378 default:
5379 return SDValue();
5380 }
5381}
5382
5383SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5384 DAGCombinerInfo &DCI) const {
5385 const auto *CFP = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: 0));
5386 if (!CFP)
5387 return SDValue();
5388
5389 // XXX - Should this flush denormals?
5390 const APFloat &Val = CFP->getValueAPF();
5391 APFloat One(Val.getSemantics(), "1.0");
5392 return DCI.DAG.getConstantFP(Val: One / Val, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
5393}
5394
// Top-level target DAG combine dispatcher. Routes opcodes to their dedicated
// combine helpers and implements a few folds inline: pushing bitcasts through
// build_vector, folding 64-bit bitcasts of constants, BFE simplification, and
// constant folding of FMAD_FTZ.
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  switch(N->getOpcode()) {
  default:
    break;
  case ISD::BITCAST: {
    EVT DestVT = N->getValueType(ResNo: 0);

    // Push casts through vector builds. This helps avoid emitting a large
    // number of copies when materializing floating point vector constants.
    //
    // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
    //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
    if (DestVT.isVector()) {
      SDValue Src = N->getOperand(Num: 0);
      if (Src.getOpcode() == ISD::BUILD_VECTOR &&
          (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
           isOperationLegal(Op: ISD::BUILD_VECTOR, VT: DestVT))) {
        EVT SrcVT = Src.getValueType();
        unsigned NElts = DestVT.getVectorNumElements();

        // Only handle the element-wise case (same element count both sides).
        if (SrcVT.getVectorNumElements() == NElts) {
          EVT DestEltVT = DestVT.getVectorElementType();

          SmallVector<SDValue, 8> CastedElts;
          SDLoc SL(N);
          for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
            SDValue Elt = Src.getOperand(i: I);
            CastedElts.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: DestEltVT, Operand: Elt));
          }

          return DAG.getBuildVector(VT: DestVT, DL: SL, Ops: CastedElts);
        }
      }
    }

    // The constant folds below only apply to 64-bit vector destinations.
    if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
      break;

    // Fold bitcasts of constants.
    //
    // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
    // TODO: Generalize and move to DAGCombiner
    SDValue Src = N->getOperand(Num: 0);
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Src)) {
      SDLoc SL(N);
      uint64_t CVal = C->getZExtValue();
      SDValue BV = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32,
                               N1: DAG.getConstant(Val: Lo_32(Value: CVal), DL: SL, VT: MVT::i32),
                               N2: DAG.getConstant(Val: Hi_32(Value: CVal), DL: SL, VT: MVT::i32));
      return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: DestVT, Operand: BV);
    }

    // Same fold for FP constants, going through the bit pattern.
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Src)) {
      const APInt &Val = C->getValueAPF().bitcastToAPInt();
      SDLoc SL(N);
      uint64_t CVal = Val.getZExtValue();
      SDValue Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32,
                                N1: DAG.getConstant(Val: Lo_32(Value: CVal), DL: SL, VT: MVT::i32),
                                N2: DAG.getConstant(Val: Hi_32(Value: CVal), DL: SL, VT: MVT::i32));

      return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: DestVT, Operand: Vec);
    }

    break;
  }
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL: {
    // Range metadata can be invalidated when loads are converted to legal types
    // (e.g. v2i64 -> v4i32).
    // Try to convert vector shl/sra/srl before type legalization so that range
    // metadata can be utilized.
    if (!(N->getValueType(ResNo: 0).isVector() &&
          DCI.getDAGCombineLevel() == BeforeLegalizeTypes) &&
        DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;
    if (N->getOpcode() == ISD::SHL)
      return performShlCombine(N, DCI);
    if (N->getOpcode() == ISD::SRA)
      return performSraCombine(N, DCI);
    return performSrlCombine(N, DCI);
  }
  case ISD::TRUNCATE:
    return performTruncateCombine(N, DCI);
  case ISD::MUL:
    return performMulCombine(N, DCI);
  case AMDGPUISD::MUL_U24:
  case AMDGPUISD::MUL_I24: {
    if (SDValue Simplified = simplifyMul24(Node24: N, DCI))
      return Simplified;
    break;
  }
  case AMDGPUISD::MULHI_I24:
  case AMDGPUISD::MULHI_U24:
    return simplifyMul24(Node24: N, DCI);
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return performMulLoHiCombine(N, DCI);
  case ISD::MULHS:
    return performMulhsCombine(N, DCI);
  case ISD::MULHU:
    return performMulhuCombine(N, DCI);
  case ISD::SELECT:
    return performSelectCombine(N, DCI);
  case ISD::FNEG:
    return performFNegCombine(N, DCI);
  case ISD::FABS:
    return performFAbsCombine(N, DCI);
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    assert(!N->getValueType(0).isVector() &&
           "Vector handling of BFE not implemented");
    // Operands are (src, offset, width); only constant widths are simplified.
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 2));
    if (!Width)
      break;

    // Hardware only honors the low 5 bits of the width operand.
    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
    if (WidthVal == 0)
      return DAG.getConstant(Val: 0, DL, VT: MVT::i32);

    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
    if (!Offset)
      break;

    SDValue BitsFrom = N->getOperand(Num: 0);
    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;

    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;

    if (OffsetVal == 0) {
      // This is already sign / zero extended, so try to fold away extra BFEs.
      unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);

      unsigned OpSignBits = DAG.ComputeNumSignBits(Op: BitsFrom);
      if (OpSignBits >= SignBits)
        return BitsFrom;

      EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WidthVal);
      if (Signed) {
        // This is a sign_extend_inreg. Replace it to take advantage of existing
        // DAG Combines. If not eliminated, we will match back to BFE during
        // selection.

        // TODO: The sext_inreg of extended types ends, although we can could
        // handle them in a single BFE.
        return DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: MVT::i32, N1: BitsFrom,
                           N2: DAG.getValueType(SmallVT));
      }

      return DAG.getZeroExtendInReg(Op: BitsFrom, DL, VT: SmallVT);
    }

    // Fully constant-fold when the source is also a constant.
    if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(Val&: BitsFrom)) {
      if (Signed) {
        return constantFoldBFE<int32_t>(DAG,
                                        Src0: CVal->getSExtValue(),
                                        Offset: OffsetVal,
                                        Width: WidthVal,
                                        DL);
      }

      return constantFoldBFE<uint32_t>(DAG,
                                       Src0: CVal->getZExtValue(),
                                       Offset: OffsetVal,
                                       Width: WidthVal,
                                       DL);
    }

    // An extract reaching the top bit is just a shift; prefer the plain shift
    // except for the 16/16 split, which SDWA can handle directly.
    if ((OffsetVal + WidthVal) >= 32 &&
        !(OffsetVal == 16 && WidthVal == 16 && Subtarget->hasSDWA())) {
      SDValue ShiftVal = DAG.getConstant(Val: OffsetVal, DL, VT: MVT::i32);
      return DAG.getNode(Opcode: Signed ? ISD::SRA : ISD::SRL, DL, VT: MVT::i32,
                         N1: BitsFrom, N2: ShiftVal);
    }

    // Only the extracted bit range of the source is demanded; try to simplify
    // the source based on that.
    if (BitsFrom.hasOneUse()) {
      APInt Demanded = APInt::getBitsSet(numBits: 32,
                                         loBit: OffsetVal,
                                         hiBit: OffsetVal + WidthVal);

      KnownBits Known;
      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                            !DCI.isBeforeLegalizeOps());
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.ShrinkDemandedConstant(Op: BitsFrom, DemandedBits: Demanded, TLO) ||
          TLI.SimplifyDemandedBits(Op: BitsFrom, DemandedBits: Demanded, Known, TLO)) {
        DCI.CommitTargetLoweringOpt(TLO);
      }
    }

    break;
  }
  case ISD::LOAD:
    return performLoadCombine(N, DCI);
  case ISD::STORE:
    return performStoreCombine(N, DCI);
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_IFLAG:
    return performRcpCombine(N, DCI);
  case ISD::AssertZext:
  case ISD::AssertSext:
    return performAssertSZExtCombine(N, DCI);
  case ISD::INTRINSIC_WO_CHAIN:
    return performIntrinsicWOChainCombine(N, DCI);
  case AMDGPUISD::FMAD_FTZ: {
    SDValue N0 = N->getOperand(Num: 0);
    SDValue N1 = N->getOperand(Num: 1);
    SDValue N2 = N->getOperand(Num: 2);
    EVT VT = N->getValueType(ResNo: 0);

    // FMAD_FTZ is a FMAD + flush denormals to zero.
    // We flush the inputs, the intermediate step, and the output.
    ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(Val&: N0);
    ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(Val&: N1);
    ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(Val&: N2);
    if (N0CFP && N1CFP && N2CFP) {
      // Flush a denormal value to a zero of the same sign.
      const auto FTZ = [](const APFloat &V) {
        if (V.isDenormal()) {
          APFloat Zero(V.getSemantics(), 0);
          return V.isNegative() ? -Zero : Zero;
        }
        return V;
      };

      APFloat V0 = FTZ(N0CFP->getValueAPF());
      APFloat V1 = FTZ(N1CFP->getValueAPF());
      APFloat V2 = FTZ(N2CFP->getValueAPF());
      V0.multiply(RHS: V1, RM: APFloat::rmNearestTiesToEven);
      V0 = FTZ(V0);
      V0.add(RHS: V2, RM: APFloat::rmNearestTiesToEven);
      return DAG.getConstantFP(Val: FTZ(V0), DL, VT);
    }
    break;
  }
  }
  return SDValue();
}
5636
5637//===----------------------------------------------------------------------===//
5638// Helper functions
5639//===----------------------------------------------------------------------===//
5640
5641SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5642 const TargetRegisterClass *RC,
5643 Register Reg, EVT VT,
5644 const SDLoc &SL,
5645 bool RawReg) const {
5646 MachineFunction &MF = DAG.getMachineFunction();
5647 MachineRegisterInfo &MRI = MF.getRegInfo();
5648 Register VReg;
5649
5650 if (!MRI.isLiveIn(Reg)) {
5651 VReg = MRI.createVirtualRegister(RegClass: RC);
5652 MRI.addLiveIn(Reg, vreg: VReg);
5653 } else {
5654 VReg = MRI.getLiveInVirtReg(PReg: Reg);
5655 }
5656
5657 if (RawReg)
5658 return DAG.getRegister(Reg: VReg, VT);
5659
5660 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: VReg, VT);
5661}
5662
5663// This may be called multiple times, and nothing prevents creating multiple
5664// objects at the same offset. See if we already defined this object.
5665static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5666 int64_t Offset) {
5667 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5668 if (MFI.getObjectOffset(ObjectIdx: I) == Offset) {
5669 assert(MFI.getObjectSize(I) == Size);
5670 return I;
5671 }
5672 }
5673
5674 return MFI.CreateFixedObject(Size, SPOffset: Offset, IsImmutable: true);
5675}
5676
5677SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5678 EVT VT,
5679 const SDLoc &SL,
5680 int64_t Offset) const {
5681 MachineFunction &MF = DAG.getMachineFunction();
5682 MachineFrameInfo &MFI = MF.getFrameInfo();
5683 int FI = getOrCreateFixedStackObject(MFI, Size: VT.getStoreSize(), Offset);
5684
5685 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5686 SDValue Ptr = DAG.getFrameIndex(FI, VT: MVT::i32);
5687
5688 return DAG.getLoad(VT, dl: SL, Chain: DAG.getEntryNode(), Ptr, PtrInfo: SrcPtrInfo, Alignment: Align(4),
5689 MMOFlags: MachineMemOperand::MODereferenceable |
5690 MachineMemOperand::MOInvariant);
5691}
5692
5693SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5694 const SDLoc &SL,
5695 SDValue Chain,
5696 SDValue ArgVal,
5697 int64_t Offset) const {
5698 MachineFunction &MF = DAG.getMachineFunction();
5699 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5700 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5701
5702 SDValue Ptr = DAG.getConstant(Val: Offset, DL: SL, VT: MVT::i32);
5703 // Stores to the argument stack area are relative to the stack pointer.
5704 SDValue SP =
5705 DAG.getCopyFromReg(Chain, dl: SL, Reg: Info->getStackPtrOffsetReg(), VT: MVT::i32);
5706 Ptr = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: SP, N2: Ptr);
5707 SDValue Store = DAG.getStore(Chain, dl: SL, Val: ArgVal, Ptr, PtrInfo: DstInfo, Alignment: Align(4),
5708 MMOFlags: MachineMemOperand::MODereferenceable);
5709 return Store;
5710}
5711
5712SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5713 const TargetRegisterClass *RC,
5714 EVT VT, const SDLoc &SL,
5715 const ArgDescriptor &Arg) const {
5716 assert(Arg && "Attempting to load missing argument");
5717
5718 SDValue V = Arg.isRegister() ?
5719 CreateLiveInRegister(DAG, RC, Reg: Arg.getRegister(), VT, SL) :
5720 loadStackInputValue(DAG, VT, SL, Offset: Arg.getStackOffset());
5721
5722 if (!Arg.isMasked())
5723 return V;
5724
5725 unsigned Mask = Arg.getMask();
5726 unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);
5727 V = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: V,
5728 N2: DAG.getShiftAmountConstant(Val: Shift, VT, DL: SL));
5729 return DAG.getNode(Opcode: ISD::AND, DL: SL, VT, N1: V,
5730 N2: DAG.getConstant(Val: Mask >> Shift, DL: SL, VT));
5731}
5732
5733uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5734 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5735 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5736 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5737 uint64_t ArgOffset =
5738 alignTo(Size: ExplicitKernArgSize, A: Alignment) + ExplicitArgOffset;
5739 switch (Param) {
5740 case FIRST_IMPLICIT:
5741 return ArgOffset;
5742 case PRIVATE_BASE:
5743 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5744 case SHARED_BASE:
5745 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5746 case QUEUE_PTR:
5747 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5748 }
5749 llvm_unreachable("unexpected implicit parameter type");
5750}
5751
5752uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5753 const MachineFunction &MF, const ImplicitParameter Param) const {
5754 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5755 return getImplicitParameterOffset(ExplicitKernArgSize: MFI->getExplicitKernArgSize(), Param);
5756}
5757
5758SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5759 SelectionDAG &DAG, int Enabled,
5760 int &RefinementSteps,
5761 bool &UseOneConstNR,
5762 bool Reciprocal) const {
5763 EVT VT = Operand.getValueType();
5764
5765 if (VT == MVT::f32) {
5766 RefinementSteps = 0;
5767 return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc(Operand), VT, Operand);
5768 }
5769
5770 // TODO: There is also f64 rsq instruction, but the documentation is less
5771 // clear on its precision.
5772
5773 return SDValue();
5774}
5775
5776SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5777 SelectionDAG &DAG, int Enabled,
5778 int &RefinementSteps) const {
5779 EVT VT = Operand.getValueType();
5780
5781 if (VT == MVT::f32) {
5782 // Reciprocal, < 1 ulp error.
5783 //
5784 // This reciprocal approximation converges to < 0.5 ulp error with one
5785 // newton rhapson performed with two fused multiple adds (FMAs).
5786
5787 RefinementSteps = 0;
5788 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SDLoc(Operand), VT, Operand);
5789 }
5790
5791 // TODO: There is also f64 rcp instruction, but the documentation is less
5792 // clear on its precision.
5793
5794 return SDValue();
5795}
5796
5797static unsigned workitemIntrinsicDim(unsigned ID) {
5798 switch (ID) {
5799 case Intrinsic::amdgcn_workitem_id_x:
5800 return 0;
5801 case Intrinsic::amdgcn_workitem_id_y:
5802 return 1;
5803 case Intrinsic::amdgcn_workitem_id_z:
5804 return 2;
5805 default:
5806 llvm_unreachable("not a workitem intrinsic");
5807 }
5808}
5809
// Compute known zero/one bits for AMDGPU-specific nodes so generic
// SelectionDAG analyses can reason about them. Known starts fully reset;
// cases that cannot conclude anything simply leave it unknown.
void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, KnownBits &Known,
    const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {

  Known.resetAll(); // Don't know anything.

  unsigned Opc = Op.getOpcode();

  switch (Opc) {
  default:
    break;
  case AMDGPUISD::CARRY:
  case AMDGPUISD::BORROW: {
    // Carry/borrow results are 0 or 1: all bits above bit 0 are zero.
    Known.Zero = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 31);
    break;
  }

  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
    if (!CWidth)
      return;

    // Hardware only honors the low 5 bits of the width operand.
    uint32_t Width = CWidth->getZExtValue() & 0x1f;

    // An unsigned extract zero-fills everything above the field width.
    if (Opc == AMDGPUISD::BFE_U32)
      Known.Zero = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 32 - Width);

    break;
  }
  case AMDGPUISD::FP_TO_FP16: {
    unsigned BitWidth = Known.getBitWidth();

    // High bits are zero.
    Known.Zero = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - 16);
    break;
  }
  case AMDGPUISD::MUL_U24:
  case AMDGPUISD::MUL_I24: {
    KnownBits LHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
    KnownBits RHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
    // The product has at least as many trailing zeros as the factors combined.
    unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
                      RHSKnown.countMinTrailingZeros();
    Known.Zero.setLowBits(std::min(a: TrailZ, b: 32u));
    // Skip extra check if all bits are known zeros.
    if (TrailZ >= 32)
      break;

    // Truncate to 24 bits.
    LHSKnown = LHSKnown.trunc(BitWidth: 24);
    RHSKnown = RHSKnown.trunc(BitWidth: 24);

    if (Opc == AMDGPUISD::MUL_I24) {
      // Signed 24-bit multiply: if the total significant bits of both
      // operands fit in 32, the sign bits above them are determined by the
      // operand signs.
      unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
      unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
      unsigned MaxValBits = LHSValBits + RHSValBits;
      if (MaxValBits > 32)
        break;
      unsigned SignBits = 32 - MaxValBits + 1;
      bool LHSNegative = LHSKnown.isNegative();
      bool LHSNonNegative = LHSKnown.isNonNegative();
      bool LHSPositive = LHSKnown.isStrictlyPositive();
      bool RHSNegative = RHSKnown.isNegative();
      bool RHSNonNegative = RHSKnown.isNonNegative();
      bool RHSPositive = RHSKnown.isStrictlyPositive();

      // Same-sign product is non-negative; strictly-opposite-sign product is
      // negative, so the high bits are all ones.
      if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
        Known.Zero.setHighBits(SignBits);
      else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
        Known.One.setHighBits(SignBits);
    } else {
      // Unsigned: bits above the maximum possible product width are zero.
      unsigned LHSValBits = LHSKnown.countMaxActiveBits();
      unsigned RHSValBits = RHSKnown.countMaxActiveBits();
      unsigned MaxValBits = LHSValBits + RHSValBits;
      if (MaxValBits >= 32)
        break;
      Known.Zero.setBitsFrom(MaxValBits);
    }
    break;
  }
  case AMDGPUISD::PERM: {
    ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
    if (!CMask)
      return;

    KnownBits LHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
    KnownBits RHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
    unsigned Sel = CMask->getZExtValue();

    // Each selector byte picks a source byte (0-3 from RHS, 4-7 from LHS) or
    // a constant (0x0c -> 0x00; larger values -> 0xff as handled here).
    for (unsigned I = 0; I < 32; I += 8) {
      unsigned SelBits = Sel & 0xff;
      if (SelBits < 4) {
        SelBits *= 8;
        Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
        Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
      } else if (SelBits < 7) {
        SelBits = (SelBits & 3) * 8;
        Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
        Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
      } else if (SelBits == 0x0c) {
        Known.Zero |= 0xFFull << I;
      } else if (SelBits > 0x0c) {
        Known.One |= 0xFFull << I;
      }
      Sel >>= 8;
    }
    break;
  }
  case AMDGPUISD::BUFFER_LOAD_UBYTE: {
    // Zero-extended byte load: top 24 bits are zero.
    Known.Zero.setHighBits(24);
    break;
  }
  case AMDGPUISD::BUFFER_LOAD_USHORT: {
    // Zero-extended short load: top 16 bits are zero.
    Known.Zero.setHighBits(16);
    break;
  }
  case AMDGPUISD::LDS: {
    auto *GA = cast<GlobalAddressSDNode>(Val: Op.getOperand(i: 0).getNode());
    Align Alignment = GA->getGlobal()->getPointerAlignment(DL: DAG.getDataLayout());

    // LDS addresses fit in 16 bits and are aligned to the global's alignment.
    Known.Zero.setHighBits(16);
    Known.Zero.setLowBits(Log2(A: Alignment));
    break;
  }
  case AMDGPUISD::SMIN3:
  case AMDGPUISD::SMAX3:
  case AMDGPUISD::SMED3:
  case AMDGPUISD::UMIN3:
  case AMDGPUISD::UMAX3:
  case AMDGPUISD::UMED3: {
    KnownBits Known2 = DAG.computeKnownBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
    if (Known2.isUnknown())
      break;

    KnownBits Known1 = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
    if (Known1.isUnknown())
      break;

    KnownBits Known0 = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
    if (Known0.isUnknown())
      break;

    // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
    // The result is always one of the three operands, so only bits known in
    // all three are known in the result.
    Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
    Known.One = Known0.One & Known1.One & Known2.One;
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = Op.getConstantOperandVal(i: 0);
    switch (IID) {
    case Intrinsic::amdgcn_workitem_id_x:
    case Intrinsic::amdgcn_workitem_id_y:
    case Intrinsic::amdgcn_workitem_id_z: {
      // Workitem ids are bounded by the subtarget's maximum for the dimension.
      unsigned MaxValue = Subtarget->getMaxWorkitemID(
          Kernel: DAG.getMachineFunction().getFunction(), Dimension: workitemIntrinsicDim(ID: IID));
      Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue));
      break;
    }
    default:
      break;
    }
  }
  }
}
5974
// Report a lower bound on the number of sign bits for AMDGPU-specific nodes.
// Returning 1 means "nothing known".
unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    unsigned Depth) const {
  switch (Op.getOpcode()) {
  case AMDGPUISD::BFE_I32: {
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
    if (!Width)
      return 1;

    // A signed extract of Width bits sign-extends bit (Width-1) upward.
    unsigned SignBits = 32 - Width->getZExtValue() + 1;
    if (!isNullConstant(V: Op.getOperand(i: 1)))
      return SignBits;

    // TODO: Could probably figure something out with non-0 offsets.
    // With a zero offset the source may already have more sign bits.
    unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
    return std::max(a: SignBits, b: Op0SignBits);
  }

  case AMDGPUISD::BFE_U32: {
    // An unsigned extract zero-fills above the field, giving 32-Width
    // leading zeros (which count as sign bits for a non-negative value).
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
    return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
  }

  case AMDGPUISD::CARRY:
  case AMDGPUISD::BORROW:
    return 31;
  case AMDGPUISD::BUFFER_LOAD_BYTE:
    return 25;
  case AMDGPUISD::BUFFER_LOAD_SHORT:
    return 17;
  case AMDGPUISD::BUFFER_LOAD_UBYTE:
    return 24;
  case AMDGPUISD::BUFFER_LOAD_USHORT:
    return 16;
  case AMDGPUISD::FP_TO_FP16:
    return 16;
  case AMDGPUISD::SMIN3:
  case AMDGPUISD::SMAX3:
  case AMDGPUISD::SMED3:
  case AMDGPUISD::UMIN3:
  case AMDGPUISD::UMAX3:
  case AMDGPUISD::UMED3: {
    // The result is one of the three operands, so it has at least the
    // minimum of their sign-bit counts.
    unsigned Tmp2 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
    if (Tmp2 == 1)
      return 1; // Early out.

    unsigned Tmp1 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
    if (Tmp1 == 1)
      return 1; // Early out.

    unsigned Tmp0 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
    if (Tmp0 == 1)
      return 1; // Early out.

    return std::min(l: {Tmp0, Tmp1, Tmp2});
  }
  default:
    return 1;
  }
}
6035
// GlobalISel counterpart of ComputeNumSignBitsForTargetNode: lower-bound the
// sign bits of a virtual register defined by a target-specific instruction.
unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
    GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
    const MachineRegisterInfo &MRI, unsigned Depth) const {
  const MachineInstr *MI = MRI.getVRegDef(Reg: R);
  if (!MI)
    return 1;

  // TODO: Check range metadata on MMO.
  switch (MI->getOpcode()) {
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
    return 25;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
    return 17;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    return 24;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    return 16;
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    // med3 returns one of its three sources, so the result has at least the
    // minimum of their sign-bit counts; bail early on any unknown source.
    auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
    unsigned Tmp2 = Analysis.computeNumSignBits(R: Src2, DemandedElts, Depth: Depth + 1);
    if (Tmp2 == 1)
      return 1;
    unsigned Tmp1 = Analysis.computeNumSignBits(R: Src1, DemandedElts, Depth: Depth + 1);
    if (Tmp1 == 1)
      return 1;
    unsigned Tmp0 = Analysis.computeNumSignBits(R: Src0, DemandedElts, Depth: Depth + 1);
    if (Tmp0 == 1)
      return 1;
    return std::min(l: {Tmp0, Tmp1, Tmp2});
  }
  default:
    return 1;
  }
}
6071
6072bool AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode(
6073 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6074 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
6075 unsigned Opcode = Op.getOpcode();
6076 switch (Opcode) {
6077 case AMDGPUISD::BFE_I32:
6078 case AMDGPUISD::BFE_U32:
6079 return false;
6080 }
6081 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
6082 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
6083}
6084
// Determine whether an AMDGPU-specific node can never produce a NaN (or, when
// SNaN is set, never a signaling NaN). Conservatively returns false for
// anything not explicitly handled.
bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
    unsigned Depth) const {
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY: {
    // Legacy min/max quiet their result, so a signaling NaN never escapes.
    if (SNaN)
      return true;

    // TODO: Can check no nans on one of the operands for each one, but which
    // one?
    return false;
  }
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::CVT_PKRTZ_F16_F32: {
    if (SNaN)
      return true;
    // Non-NaN inputs yield a non-NaN result for these binary ops.
    return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1) &&
           DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1);
  }
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMINIMUM3:
  case AMDGPUISD::FMAXIMUM3:
  case AMDGPUISD::FMAD_FTZ: {
    if (SNaN)
      return true;
    // Ternary ops: NaN-free if all three operands are NaN-free.
    return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1) &&
           DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
           DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1);
  }
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    // Integer byte-to-float conversions can never produce a NaN.
    return true;

  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RSQ_CLAMP: {
    if (SNaN)
      return true;

    // TODO: Need is known positive check.
    return false;
  }
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT: {
    if (SNaN)
      return true;
    // Result is NaN only if the (first) source is NaN.
    return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1);
  }
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
    // TODO: Refine on operands.
    return SNaN;
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::COS_HW: {
    // TODO: Need check for infinity
    return SNaN;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
    // TODO: Handle more intrinsics
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cvt_off_f32_i4:
      return true;

    case Intrinsic::amdgcn_frexp_mant: {
      if (SNaN)
        return true;
      // Operand 0 is the intrinsic id; operand 1 is the value.
      return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1);
    }
    case Intrinsic::amdgcn_cvt_pkrtz: {
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
             DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1);
    }
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_tanh: {
      if (SNaN)
        return true;

      // TODO: Need is known positive check.
      return false;
    }
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_fdot2:
      // TODO: Refine on operand
      return SNaN;
    case Intrinsic::amdgcn_fma_legacy:
      if (SNaN)
        return true;
      return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
             DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1) &&
             DAG.isKnownNeverNaN(Op: Op.getOperand(i: 3), SNaN, Depth: Depth + 1);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}
6199
// Reassociation is only considered profitable when the first operand has a
// single non-debug use, so rewriting cannot duplicate work for other users.
bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
                                               Register N0, Register N1) const {
  return MRI.hasOneNonDBGUse(RegNo: N0); // FIXME: handle regbanks
}
6204