//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "AMDGPUMemoryUtils.h"
#include "AMDGPUSelectionDAGInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#include "AMDGPUGenCallingConv.inc"

static cl::opt<bool> AMDGPUBypassSlowDiv(
    "amdgpu-bypass-slow-div",
    cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
    cl::init(true));

// Find a larger type to do a load / store of a vector with.
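// For example, a <2 x i16> value (32 bits) is accessed as i32 and a
// <4 x i16> value (64 bits) as <2 x i32>; sizes that are not a multiple of
// 32 bits (such as <3 x i16>) are left unchanged.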
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  if (StoreSize % 32 == 0)
    return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);

  return VT;
}

unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  return DAG.computeKnownBits(Op).countMaxActiveBits();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign
  // bit.
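  // ComputeMaxSignificantBits returns the width of the narrowest signed
  // integer that can still represent every possible value of Op.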
  return DAG.ComputeMaxSignificantBits(Op);
}

AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const TargetSubtargetInfo &STI,
                                           const AMDGPUSubtarget &AMDGPUSTI)
    : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;

  // Enable ganging up loads and stores in the memcpy DAG lowering.
  MaxGluedStoresPerMemcpy = 16;

  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
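  // In practice this means, e.g., that an f32 load is selected as an i32 load
  // whose result is bitcast back to f32.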
  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i128, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);

  // TODO: Would be better to consume as directly legal
  setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);

  setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
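  // For example, a sextload from i8 to i64 becomes a sextload from i8 to i32
  // followed by a sign extension of the 32-bit result to 64 bits.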
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
                     Expand);

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(Op, VT, MVT::i1, Promote);
      setLoadExtAction(Op, VT, MVT::i8, Legal);
      setLoadExtAction(Op, VT, MVT::i16, Legal);
      setLoadExtAction(Op, VT, MVT::i32, Expand);
    }
  }

  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
    for (auto MemVT :
         {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
      setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
                       Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);

  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v3f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v5f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::STORE, MVT::v6f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v7f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v9f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::STORE, MVT::v10f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::STORE, MVT::v11f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::STORE, MVT::v12f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v32f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v3i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v3f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v4i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v4f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v8i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v8f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v16i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::v16f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i128, Promote);
  AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);

  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);

  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);

  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
  setTruncStoreAction(MVT::v6f32, MVT::v6f16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);

  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);

  setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);

  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);

  setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);

  setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
  setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
  setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);

  setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
  setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
  setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);

  setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
  setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
  setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);

  setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);

  setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);

  setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
  setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);

  setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);

  // For R600, this is totally unsupported; just custom lower it to produce an
  // error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
  setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
                      ISD::FROUNDEVEN, ISD::FTRUNC},
                     {MVT::f16, MVT::f32}, Legal);
  setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal);

  setOperationAction(ISD::FLOG2, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
  setOperationAction({ISD::LROUND, ISD::LLROUND},
                     {MVT::f16, MVT::f32, MVT::f64}, Expand);

  setOperationAction(
      {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
      Custom);

  setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
                     Expand);

  setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
  setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
  setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);

  setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
                     Custom);

  setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);

  // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
  // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
  // default unless marked custom/legal.
  setOperationAction(ISD::IS_FPCLASS,
                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
                      MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
                      MVT::v16f64},
                     Custom);

  // Expand to fneg + fadd.
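  // That is, a - b is rewritten as a + (-b); the fneg can then fold into the
  // add's source negation modifier.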
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  setOperationAction(ISD::CONCAT_VECTORS,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
                      MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  setOperationAction(
      ISD::EXTRACT_SUBVECTOR,
      {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
       MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
       MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
       MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
       MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
       MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
       MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
      Custom);

  setOperationAction({ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP}, MVT::f64,
                     Expand);
  setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);

  const MVT ScalarIntVTs[] = {MVT::i32, MVT::i64};
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand.
    setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
                       Expand);

    // GPU does not have divrem function for signed or unsigned.
    setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);

    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);

    setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);

    // AMDGPU uses ADDC/SUBC/ADDE/SUBE.
    setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
  }

  // The hardware supports 32-bit FSHR, but not FSHL.
  setOperationAction(ISD::FSHR, MVT::i32, Legal);

  setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand);

  setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);

  setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
  setOperationAction({ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT,
                      ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
                      ISD::FP_TO_UINT_SAT},
                     MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

  setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
                     Legal);

  setOperationAction(
      {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
      MVT::i64, Custom);

  for (auto VT : {MVT::i8, MVT::i16})
    setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom);

  static const MVT::SimpleValueType VectorIntTypes[] = {
      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
      MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    // clang-format off
    setOperationAction({ISD::ADD, ISD::AND,
                        ISD::FP_TO_SINT, ISD::FP_TO_UINT,
                        ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT,
                        ISD::MUL, ISD::MULHU,
                        ISD::MULHS, ISD::OR,
                        ISD::SHL, ISD::SRA,
                        ISD::SRL, ISD::ROTL,
                        ISD::ROTR, ISD::SUB,
                        ISD::SINT_TO_FP, ISD::UINT_TO_FP,
                        ISD::SDIV, ISD::UDIV,
                        ISD::SREM, ISD::UREM,
                        ISD::SMUL_LOHI, ISD::UMUL_LOHI,
                        ISD::SDIVREM, ISD::UDIVREM,
                        ISD::SELECT, ISD::VSELECT,
                        ISD::SELECT_CC, ISD::XOR,
                        ISD::BSWAP, ISD::CTPOP,
                        ISD::CTTZ, ISD::CTLZ,
                        ISD::VECTOR_SHUFFLE, ISD::SETCC,
                        ISD::ADDRSPACECAST},
                       VT, Expand);
    // clang-format on
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
      MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};

  for (MVT VT : FloatVectorTypes) {
    setOperationAction(
        {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
         ISD::FADD, ISD::FCEIL, ISD::FCOS,
         ISD::FDIV, ISD::FEXP2, ISD::FEXP,
         ISD::FEXP10, ISD::FLOG2, ISD::FREM,
         ISD::FLOG, ISD::FLOG10, ISD::FPOW,
         ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
         ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
         ISD::FSQRT, ISD::FSIN, ISD::FSUB,
         ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
         ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC,
         ISD::FCANONICALIZE, ISD::FROUNDEVEN},
        VT, Expand);
  }

  // This causes using an unrolled select operation rather than expansion with
  // bit operations. This is generally better, but using BFI instructions
  // instead may be better if the select sources are SGPRs.
  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  setMinCmpXchgSizeInBits(32);
  setSupportsUnalignedAtomics(false);

  PredictableSelectIsExpensive = false;

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because
  // x8/x16 vectors are a legal type, even though we usually have to split the
  // loads. When we can more precisely specify load legality per address space,
  // we should be able to make FindBetterChain/MergeConsecutiveStores smarter so
  // that they can figure out what to do in 2 iterations without all N > 4
  // stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
  // about these during lowering.
  MaxStoresPerMemcpy = 0xffffffff;
  MaxStoresPerMemmove = 0xffffffff;
  MaxStoresPerMemset = 0xffffffff;

  // The expansion for 64-bit division is enormous.
  if (AMDGPUBypassSlowDiv)
    addBypassSlowDiv(64, 32);
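  // With the bypass in place, a 64-bit udiv/sdiv first tests at run time
  // whether both operands fit in 32 bits and, if so, takes the much cheaper
  // 32-bit path.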

  setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
                       ISD::SRA, ISD::SRL,
                       ISD::TRUNCATE, ISD::MUL,
                       ISD::SMUL_LOHI, ISD::UMUL_LOHI,
                       ISD::MULHU, ISD::MULHS,
                       ISD::SELECT, ISD::SELECT_CC,
                       ISD::STORE, ISD::FADD,
                       ISD::FSUB, ISD::FNEG,
                       ISD::FABS, ISD::AssertZext,
                       ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});

  setMaxAtomicSizeInBitsSupported(64);
  setMaxDivRemBitWidthSupported(64);
  setMaxLargeFPConvertBitWidthSupported(64);
}

bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
  const auto Flags = Op.getNode()->getFlags();
  if (Flags.hasNoSignedZeros())
    return true;

  return false;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

LLVM_READNONE
static bool fnegFoldsIntoOpcode(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
  case ISD::SELECT:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  case ISD::BITCAST:
    llvm_unreachable("bitcast is special cased");
  default:
    return false;
  }
}

static bool fnegFoldsIntoOp(const SDNode *N) {
  unsigned Opc = N->getOpcode();
  if (Opc == ISD::BITCAST) {
    // TODO: Is there a benefit to checking the conditions performFNegCombine
    // does? We don't for the other cases.
    SDValue BCSrc = N->getOperand(0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      return BCSrc.getNumOperands() == 2 &&
             BCSrc.getOperand(1).getValueSizeInBits() == 32;
    }

    return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
  }

  return fnegFoldsIntoOpcode(Opc);
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
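/// (Since VOP3 already encodes source-modifier bits, such an operation can
/// absorb an fabs/fneg on its operands at no additional encoding cost.)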
LLVM_READONLY
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
         VT == MVT::f64;
}

/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
/// the given type when lowering ISD::SELECT.
LLVM_READONLY
static bool selectSupportsSourceMods(const SDNode *N) {
  // TODO: Only applies if select will be vector
  return N->getValueType(0) == MVT::f32;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READONLY
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
  case ISD::INTRINSIC_W_CHAIN:

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (N->getConstantOperandVal(0)) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  case ISD::SELECT:
    return selectSupportsSourceMods(N);
  default:
    return true;
  }
}

bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users, and each one would be forced into a VOP3 encoding it did
  // not already require, there will be a code size increase. Try to avoid
  // increasing code size unless we know it will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  assert(!N->use_empty());

  // XXX - Should this limit number of uses to check?
  for (const SDNode *U : N->users()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}

EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
  assert(!VT.isVector() && "only scalar expected");

  // Round to the next multiple of 32-bits.
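  // For example, an i8 or i16 return value is widened to i32, and an i40
  // return value is widened to i64.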
  unsigned Size = VT.getSizeInBits();
  if (Size <= 32)
    return MVT::i32;
  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
}

unsigned AMDGPUTargetLowering::getVectorIdxWidth(const DataLayout &) const {
  return 32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  return isTypeLegal(VT.getScalarType());
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

bool AMDGPUTargetLowering::shouldReduceLoadWidth(
    SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
    std::optional<unsigned> ByteOffset) const {
  // TODO: This may be worth removing. Check regression tests for diffs.
  if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
    return false;

  unsigned NewSize = NewVT.getStoreSizeInBits();

  // If we are reducing to a 32-bit load or a smaller multi-dword load,
  // this is always better.
  if (NewSize >= 32)
    return true;

  EVT OldVT = N->getValueType(0);
  unsigned OldSize = OldVT.getStoreSizeInBits();

  MemSDNode *MN = cast<MemSDNode>(N);
  unsigned AS = MN->getAddressSpace();
  // Do not shrink an aligned scalar load to sub-dword.
  // Scalar engine cannot do sub-dword loads.
  // TODO: Update this for GFX12 which does have scalar sub-dword loads.
  if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
       (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
        MN->isInvariant())) &&
      AMDGPU::isUniformMMO(MN->getMemOperand()))
    return false;

  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
  // extloads, so doing one requires using a buffer_load. In cases where we
  // still couldn't use a scalar load, using the wider load shouldn't really
  // hurt anything.

  // If the old size already had to be an extload, there's no harm in
  // continuing to reduce the width.
  return (OldSize < 32);
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(
    EVT LoadTy, EVT CastTy, const SelectionDAG &DAG,
    const MachineMemOperand &MMO) const {

  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  unsigned Fast = 0;
  return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        CastTy, MMO, &Fast) &&
         Fast;
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably
// also profitable with the expansion for 64-bit since it's generally good to
// speculate things.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  case ISD::EntryToken:
  case ISD::TokenFactor:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrID = N->getConstantOperandVal(0);
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
  }
  case ISD::INTRINSIC_W_CHAIN: {
    unsigned IntrID = N->getConstantOperandVal(1);
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
  }
  case ISD::LOAD:
    if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
        AMDGPUAS::CONSTANT_ADDRESS_32BIT)
      return true;
    return false;
  case AMDGPUISD::SETCC: // ballot-style instruction
    return true;
  }
  return false;
}

SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {

  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(Op.getNode()))
      return SDValue();
    break;
  }
  case AMDGPUISD::RCP: {
    SDValue Src = Op.getOperand(0);
    EVT VT = Op.getValueType();
    SDLoc SL(Op);

    SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
                                          ForCodeSize, Cost, Depth + 1);
    if (NegSrc)
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
    return SDValue();
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
                                              ForCodeSize, Cost, Depth);
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  // Report this based on the end legalized type.
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Report this based on the end legalized type.
  VT = VT.getScalarType();
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any
  // vector operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into
  // a super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.
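  // For example, truncating i64 to i32 (or v2i64 to v2i32) is free because it
  // just reads the low half of the register pair; truncating i32 to i16 is
  // not, since 16 is not a multiple of 32.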

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For
  // all practical purposes, the extra mov 0 to load a 64-bit value is free. As
  // used, this will enable reducing 64-bit operations to 32-bit, which is
  // always good.

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
                                                 EVT DestVT) const {
  switch (N->getOpcode()) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::MUL:
  case ISD::SETCC:
  case ISD::SELECT:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:
    if (isTypeLegal(MVT::i16) &&
        (!DestVT.isVector() ||
         !isOperationLegal(ISD::ADD, MVT::v2i16))) { // Check if VOP3P
      // Don't narrow back down to i16 if promoted to i32 already.
      if (!N->isDivergent() && DestVT.isInteger() &&
          DestVT.getScalarSizeInBits() > 1 &&
          DestVT.getScalarSizeInBits() <= 16 &&
          SrcVT.getScalarSizeInBits() > 16) {
        return false;
      }
    }
    return true;
  default:
    break;
  }

  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits
  // is not profitable, and may actually be harmful.
  if (isa<LoadSDNode>(N))
    return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;

  return true;
}

bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
    const SDNode *N, CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");

  SDValue ShiftLHS = N->getOperand(0);
  if (!ShiftLHS->hasOneUse())
    return false;

  if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
      !ShiftLHS.getOperand(0)->hasOneUse())
    return false;

  // Always commute pre-type legalization and right shifts.
  // We're looking for shl(or(x,y),z) patterns.
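  // (Commuting lets the combiner rewrite shl(or(x, c1), c2) as
  // or(shl(x, c2), c1 << c2), folding the constant into the shift.)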
  if (Level < CombineLevel::AfterLegalizeTypes ||
      N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
    return true;

  // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
  if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
      (N->user_begin()->getOpcode() == ISD::SRA ||
       N->user_begin()->getOpcode() == ISD::SRL))
    return false;

  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
    if (LHS.getOpcode() != ISD::SHL)
      return false;
    auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
           RHSLd->getExtensionType() == ISD::ZEXTLOAD;
  };
  SDValue LHS = N->getOperand(0).getOperand(0);
  SDValue RHS = N->getOperand(0).getOperand(1);
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CC_AMDGPU_CS_CHAIN;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
  case CallingConv::AMDGPU_Gfx_WholeWave:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    reportFatalUsageError("unsupported calling convention for call");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
  case CallingConv::AMDGPU_Gfx_WholeWave:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    reportFatalUsageError("unsupported calling convention");
  }
}

/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original type sizes
/// from the LLVM IR Function and fix up the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments().

/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State,
    const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getContext();
  const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset();
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getDataLayout();

  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
    MaxAlign = std::max(Alignment, MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;

    // We're basically throwing away everything passed into us and starting
    // over to get accurate in-memory offsets. The "PartOffset" is completely
    // useless to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
                    &Offsets, ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different element size.
          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
                                           MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert single-element vectors to scalars.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 arguments.
1304 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1305 MemVT = MemVT.getPow2VectorType(Context&: State.getContext());
1306 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1307 MemVT = MemVT.getRoundIntegerType(Context&: State.getContext());
1308 }
1309
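      // Record one custom-mem location per register-sized piece of this value,
      // stepping the in-memory offset by the store size of MemVT each time.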
1310 unsigned PartOffset = 0;
1311 for (unsigned i = 0; i != NumRegs; ++i) {
1312 State.addLoc(V: CCValAssign::getCustomMem(ValNo: InIndex++, ValVT: RegisterVT,
1313 Offset: BasePartOffset + PartOffset,
1314 LocVT: MemVT.getSimpleVT(),
1315 HTP: CCValAssign::Full));
1316 PartOffset += MemVT.getStoreSize();
1317 }
1318 }
1319 }
1320}
1321
1322SDValue AMDGPUTargetLowering::LowerReturn(
1323 SDValue Chain, CallingConv::ID CallConv,
1324 bool isVarArg,
1325 const SmallVectorImpl<ISD::OutputArg> &Outs,
1326 const SmallVectorImpl<SDValue> &OutVals,
1327 const SDLoc &DL, SelectionDAG &DAG) const {
1328 // FIXME: Fails for r600 tests
1329 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1330 // "wave terminate should not have return values");
1331 return DAG.getNode(Opcode: AMDGPUISD::ENDPGM, DL, VT: MVT::Other, Operand: Chain);
1332}
1333
1334//===---------------------------------------------------------------------===//
1335// Target specific lowering
1336//===---------------------------------------------------------------------===//
1337
1338/// Selects the correct CCAssignFn for a given CallingConvention value.
1339CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1340 bool IsVarArg) {
1341 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1342}
1343
1344CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1345 bool IsVarArg) {
1346 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1347}
1348
1349SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1350 SelectionDAG &DAG,
1351 MachineFrameInfo &MFI,
1352 int ClobberedFI) const {
1353 SmallVector<SDValue, 8> ArgChains;
1354 int64_t FirstByte = MFI.getObjectOffset(ObjectIdx: ClobberedFI);
1355 int64_t LastByte = FirstByte + MFI.getObjectSize(ObjectIdx: ClobberedFI) - 1;
1356
1357 // Include the original chain at the beginning of the list. When this is
1358 // used by target LowerCall hooks, this helps the legalizer find the
1359 // CALLSEQ_BEGIN node.
1360 ArgChains.push_back(Elt: Chain);
1361
1362 // Add a chain value for each stack-argument load that overlaps the clobbered frame object.
1363 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1364 if (LoadSDNode *L = dyn_cast<LoadSDNode>(Val: U)) {
1365 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: L->getBasePtr())) {
1366 if (FI->getIndex() < 0) {
1367 int64_t InFirstByte = MFI.getObjectOffset(ObjectIdx: FI->getIndex());
1368 int64_t InLastByte = InFirstByte;
1369 InLastByte += MFI.getObjectSize(ObjectIdx: FI->getIndex()) - 1;
1370
1371 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1372 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1373 ArgChains.push_back(Elt: SDValue(L, 1));
1374 }
1375 }
1376 }
1377 }
1378
1379 // Build a tokenfactor for all the chains.
1380 return DAG.getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ArgChains);
1381}
1382
1383SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1384 SmallVectorImpl<SDValue> &InVals,
1385 StringRef Reason) const {
1386 SDValue Callee = CLI.Callee;
1387 SelectionDAG &DAG = CLI.DAG;
1388
1389 const Function &Fn = DAG.getMachineFunction().getFunction();
1390
1391 StringRef FuncName("<unknown>");
1392
1393 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Val&: Callee))
1394 FuncName = G->getSymbol();
1395 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee))
1396 FuncName = G->getGlobal()->getName();
1397
1398 DAG.getContext()->diagnose(
1399 DI: DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1400
1401 if (!CLI.IsTailCall) {
1402 for (ISD::InputArg &Arg : CLI.Ins)
1403 InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
1404 }
1405
1406 // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
1407 if (getTargetMachine().getTargetTriple().getArch() == Triple::r600)
1408 return CLI.Chain;
1409
1410 SDValue Chain = DAG.getCALLSEQ_START(Chain: CLI.Chain, InSize: 0, OutSize: 0, DL: CLI.DL);
1411 return DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, /*InGlue=*/Glue: SDValue(), DL: CLI.DL);
1412}
1413
1414SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1415 SmallVectorImpl<SDValue> &InVals) const {
1416 return lowerUnhandledCall(CLI, InVals, Reason: "unsupported call to function ");
1417}
1418
1419SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1420 SelectionDAG &DAG) const {
1421 const Function &Fn = DAG.getMachineFunction().getFunction();
1422
1423 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
1424 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
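  // Lower to a zero value plus the incoming chain so that compilation can
  // continue after the diagnostic has been emitted.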
1425 auto Ops = {DAG.getConstant(Val: 0, DL: SDLoc(), VT: Op.getValueType()), Op.getOperand(i: 0)};
1426 return DAG.getMergeValues(Ops, dl: SDLoc());
1427}
1428
1429SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1430 SelectionDAG &DAG) const {
1431 switch (Op.getOpcode()) {
1432 default:
1433 Op->print(OS&: errs(), G: &DAG);
1434 llvm_unreachable("Custom lowering code for this "
1435 "instruction is not implemented yet!");
1436 break;
1437 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1438 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1439 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1440 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1441 case ISD::SDIVREM:
1442 return LowerSDIVREM(Op, DAG);
1443 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1444 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1445 case ISD::FRINT: return LowerFRINT(Op, DAG);
1446 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1447 case ISD::FROUNDEVEN:
1448 return LowerFROUNDEVEN(Op, DAG);
1449 case ISD::FROUND: return LowerFROUND(Op, DAG);
1450 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1451 case ISD::FLOG2:
1452 return LowerFLOG2(Op, DAG);
1453 case ISD::FLOG:
1454 case ISD::FLOG10:
1455 return LowerFLOGCommon(Op, DAG);
1456 case ISD::FEXP:
1457 case ISD::FEXP10:
1458 return lowerFEXP(Op, DAG);
1459 case ISD::FEXP2:
1460 return lowerFEXP2(Op, DAG);
1461 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1462 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1463 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1464 case ISD::FP_TO_SINT:
1465 case ISD::FP_TO_UINT:
1466 return LowerFP_TO_INT(Op, DAG);
1467 case ISD::FP_TO_SINT_SAT:
1468 case ISD::FP_TO_UINT_SAT:
1469 return LowerFP_TO_INT_SAT(Op, DAG);
1470 case ISD::CTTZ:
1471 case ISD::CTTZ_ZERO_UNDEF:
1472 case ISD::CTLZ:
1473 case ISD::CTLZ_ZERO_UNDEF:
1474 return LowerCTLZ_CTTZ(Op, DAG);
1475 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1476 }
1477 return Op;
1478}
1479
1480void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1481 SmallVectorImpl<SDValue> &Results,
1482 SelectionDAG &DAG) const {
1483 switch (N->getOpcode()) {
1484 case ISD::SIGN_EXTEND_INREG:
1485 // Different parts of legalization seem to interpret which type of
1486 // sign_extend_inreg is the one to check for custom lowering. The extended
1487 // from type is what really matters, but some places check for custom
1488 // lowering of the result type. This results in trying to use
1489 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1490 // nothing here and let the illegal result integer be handled normally.
1491 return;
1492 case ISD::FLOG2:
1493 if (SDValue Lowered = LowerFLOG2(Op: SDValue(N, 0), DAG))
1494 Results.push_back(Elt: Lowered);
1495 return;
1496 case ISD::FLOG:
1497 case ISD::FLOG10:
1498 if (SDValue Lowered = LowerFLOGCommon(Op: SDValue(N, 0), DAG))
1499 Results.push_back(Elt: Lowered);
1500 return;
1501 case ISD::FEXP2:
1502 if (SDValue Lowered = lowerFEXP2(Op: SDValue(N, 0), DAG))
1503 Results.push_back(Elt: Lowered);
1504 return;
1505 case ISD::FEXP:
1506 case ISD::FEXP10:
1507 if (SDValue Lowered = lowerFEXP(Op: SDValue(N, 0), DAG))
1508 Results.push_back(Elt: Lowered);
1509 return;
1510 case ISD::CTLZ:
1511 case ISD::CTLZ_ZERO_UNDEF:
1512 if (auto Lowered = lowerCTLZResults(Op: SDValue(N, 0u), DAG))
1513 Results.push_back(Elt: Lowered);
1514 return;
1515 default:
1516 return;
1517 }
1518}
1519
1520SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1521 SDValue Op,
1522 SelectionDAG &DAG) const {
1523
1524 const DataLayout &DL = DAG.getDataLayout();
1525 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Val&: Op);
1526 const GlobalValue *GV = G->getGlobal();
1527
1528 if (!MFI->isModuleEntryFunction()) {
1529 auto IsNamedBarrier = AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV));
1530 if (std::optional<uint32_t> Address =
1531 AMDGPUMachineFunction::getLDSAbsoluteAddress(GV: *GV)) {
1532 if (IsNamedBarrier) {
1533 unsigned BarCnt = cast<GlobalVariable>(Val: GV)->getGlobalSize(DL) / 16;
1534 MFI->recordNumNamedBarriers(GVAddr: Address.value(), BarCnt);
1535 }
1536 return DAG.getConstant(Val: *Address, DL: SDLoc(Op), VT: Op.getValueType());
1537 } else if (IsNamedBarrier) {
1538 llvm_unreachable("named barrier should have an assigned address");
1539 }
1540 }
1541
1542 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1543 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1544 if (!MFI->isModuleEntryFunction() &&
1545 GV->getName() != "llvm.amdgcn.module.lds" &&
1546 !AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV))) {
1547 SDLoc DL(Op);
1548 const Function &Fn = DAG.getMachineFunction().getFunction();
1549 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
1550 Fn, "local memory global used by non-kernel function",
1551 DL.getDebugLoc(), DS_Warning));
1552
1553 // We currently don't have a way to correctly allocate LDS objects that
1554 // aren't directly associated with a kernel. We do force inlining of
1555 // functions that use local objects. However, if these dead functions are
1556 // not eliminated, we don't want a compile time error. Just emit a warning
1557 // and a trap, since there should be no callable path here.
1558 SDValue Trap = DAG.getNode(Opcode: ISD::TRAP, DL, VT: MVT::Other, Operand: DAG.getEntryNode());
1559 SDValue OutputChain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other,
1560 N1: Trap, N2: DAG.getRoot());
1561 DAG.setRoot(OutputChain);
1562 return DAG.getPOISON(VT: Op.getValueType());
1563 }
1564
1565 // XXX: What does the value of G->getOffset() mean?
1566 assert(G->getOffset() == 0 &&
1567 "Do not know what to do with a non-zero offset");
1568
1569 // TODO: We could emit code to handle the initialization somewhere.
1570 // We ignore the initializer for now and legalize it to allow selection.
1571 // The initializer will be diagnosed as an error during assembly emission anyway.
1572 unsigned Offset = MFI->allocateLDSGlobal(DL, GV: *cast<GlobalVariable>(Val: GV));
1573 return DAG.getConstant(Val: Offset, DL: SDLoc(Op), VT: Op.getValueType());
1574 }
1575 return SDValue();
1576}
1577
1578SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1579 SelectionDAG &DAG) const {
1580 SmallVector<SDValue, 8> Args;
1581 SDLoc SL(Op);
1582
1583 EVT VT = Op.getValueType();
1584 if (VT.getVectorElementType().getSizeInBits() < 32) {
1585 unsigned OpBitSize = Op.getOperand(i: 0).getValueType().getSizeInBits();
1586 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
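      // Bitcast each operand to i32 (or a vector of i32), collect the 32-bit
      // pieces into a single build_vector, and bitcast back to the result type.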
1587 unsigned NewNumElt = OpBitSize / 32;
1588 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1589 : EVT::getVectorVT(Context&: *DAG.getContext(),
1590 VT: MVT::i32, NumElements: NewNumElt);
1591 for (const SDUse &U : Op->ops()) {
1592 SDValue In = U.get();
1593 SDValue NewIn = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewEltVT, Operand: In);
1594 if (NewNumElt > 1)
1595 DAG.ExtractVectorElements(Op: NewIn, Args);
1596 else
1597 Args.push_back(Elt: NewIn);
1598 }
1599
1600 EVT NewVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
1601 NumElements: NewNumElt * Op.getNumOperands());
1602 SDValue BV = DAG.getBuildVector(VT: NewVT, DL: SL, Ops: Args);
1603 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: BV);
1604 }
1605 }
1606
1607 for (const SDUse &U : Op->ops())
1608 DAG.ExtractVectorElements(Op: U.get(), Args);
1609
1610 return DAG.getBuildVector(VT: Op.getValueType(), DL: SL, Ops: Args);
1611}
1612
1613SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1614 SelectionDAG &DAG) const {
1615 SDLoc SL(Op);
1616 SmallVector<SDValue, 8> Args;
1617 unsigned Start = Op.getConstantOperandVal(i: 1);
1618 EVT VT = Op.getValueType();
1619 EVT SrcVT = Op.getOperand(i: 0).getValueType();
1620
1621 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1622 unsigned NumElt = VT.getVectorNumElements();
1623 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1624 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1625
1626 // Extract 32-bit registers at a time.
1627 EVT NewSrcVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: NumSrcElt / 2);
1628 EVT NewVT = NumElt == 2
1629 ? MVT::i32
1630 : EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: NumElt / 2);
1631 SDValue Tmp = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewSrcVT, Operand: Op.getOperand(i: 0));
1632
1633 DAG.ExtractVectorElements(Op: Tmp, Args, Start: Start / 2, Count: NumElt / 2);
1634 if (NumElt == 2)
1635 Tmp = Args[0];
1636 else
1637 Tmp = DAG.getBuildVector(VT: NewVT, DL: SL, Ops: Args);
1638
1639 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Tmp);
1640 }
1641
1642 DAG.ExtractVectorElements(Op: Op.getOperand(i: 0), Args, Start,
1643 Count: VT.getVectorNumElements());
1644
1645 return DAG.getBuildVector(VT: Op.getValueType(), DL: SL, Ops: Args);
1646}
1647
1648// TODO: Handle fabs too
1649static SDValue peekFNeg(SDValue Val) {
1650 if (Val.getOpcode() == ISD::FNEG)
1651 return Val.getOperand(i: 0);
1652
1653 return Val;
1654}
1655
1656static SDValue peekFPSignOps(SDValue Val) {
1657 if (Val.getOpcode() == ISD::FNEG)
1658 Val = Val.getOperand(i: 0);
1659 if (Val.getOpcode() == ISD::FABS)
1660 Val = Val.getOperand(i: 0);
1661 if (Val.getOpcode() == ISD::FCOPYSIGN)
1662 Val = Val.getOperand(i: 0);
1663 return Val;
1664}
1665
1666SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
1667 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1668 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1669 SelectionDAG &DAG = DCI.DAG;
1670 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val&: CC)->get();
1671 switch (CCOpcode) {
1672 case ISD::SETOEQ:
1673 case ISD::SETONE:
1674 case ISD::SETUNE:
1675 case ISD::SETNE:
1676 case ISD::SETUEQ:
1677 case ISD::SETEQ:
1678 case ISD::SETFALSE:
1679 case ISD::SETFALSE2:
1680 case ISD::SETTRUE:
1681 case ISD::SETTRUE2:
1682 case ISD::SETUO:
1683 case ISD::SETO:
1684 break;
1685 case ISD::SETULE:
1686 case ISD::SETULT: {
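    // Unordered. An unordered compare is true on a NaN input, so the select
    // picks the true value; the legacy min/max return their second source
    // operand on NaN, so place that value second.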
1687 if (LHS == True)
1688 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: RHS, N2: LHS);
1689 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: LHS, N2: RHS);
1690 }
1691 case ISD::SETOLE:
1692 case ISD::SETOLT:
1693 case ISD::SETLE:
1694 case ISD::SETLT: {
1695 // Ordered. Assume ordered for undefined.
1696
1697 // Only do this after legalization to avoid interfering with other combines
1698 // which might occur.
1699 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1700 !DCI.isCalledByLegalizer())
1701 return SDValue();
1702
1703 // We need to permute the operands to get the correct NaN behavior. The
1704 // hardware's legacy min/max select their second operand when the compare with
1705 // NaN fails, so order the operands to match the select's semantics.
1706 if (LHS == True)
1707 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: LHS, N2: RHS);
1708 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: RHS, N2: LHS);
1709 }
1710 case ISD::SETUGE:
1711 case ISD::SETUGT: {
1712 if (LHS == True)
1713 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: RHS, N2: LHS);
1714 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: LHS, N2: RHS);
1715 }
1716 case ISD::SETGT:
1717 case ISD::SETGE:
1718 case ISD::SETOGE:
1719 case ISD::SETOGT: {
1720 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1721 !DCI.isCalledByLegalizer())
1722 return SDValue();
1723
1724 if (LHS == True)
1725 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: LHS, N2: RHS);
1726 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: RHS, N2: LHS);
1727 }
1728 case ISD::SETCC_INVALID:
1729 llvm_unreachable("Invalid setcc condcode!");
1730 }
1731 return SDValue();
1732}
1733
1734/// Generate Min/Max node
1735SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1736 SDValue LHS, SDValue RHS,
1737 SDValue True, SDValue False,
1738 SDValue CC,
1739 DAGCombinerInfo &DCI) const {
1740 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1741 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1742
1743 SelectionDAG &DAG = DCI.DAG;
1744
1745 // If we can't directly match this, try to see if we can fold an fneg to
1746 // match.
1747
1748 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
1749 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(Val&: False);
1750 SDValue NegTrue = peekFNeg(Val: True);
1751
1752 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1753 // fmin/fmax.
1754 //
1755 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1756 // -> fneg (fmin_legacy lhs, K)
1757 //
1758 // TODO: Use getNegatedExpression
1759 if (LHS == NegTrue && CFalse && CRHS) {
1760 APFloat NegRHS = neg(X: CRHS->getValueAPF());
1761 if (NegRHS == CFalse->getValueAPF()) {
1762 SDValue Combined =
1763 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True: NegTrue, False, CC, DCI);
1764 if (Combined)
1765 return DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: Combined);
1766 return SDValue();
1767 }
1768 }
1769
1770 return SDValue();
1771}
1772
1773std::pair<SDValue, SDValue>
1774AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1775 SDLoc SL(Op);
1776
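  // View the 64-bit value as <2 x i32>; element 0 is the low half and
  // element 1 is the high half (little-endian).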
1777 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1778
1779 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
1780 const SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
1781
1782 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: Zero);
1783 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: One);
1784
1785 return std::pair(Lo, Hi);
1786}
1787
1788SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1789 SDLoc SL(Op);
1790
1791 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1792 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
1793 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: Zero);
1794}
1795
1796SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1797 SDLoc SL(Op);
1798
1799 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1800 const SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
1801 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: One);
1802}
1803
1804// Split a vector type into two parts. The first part is a power of two vector.
1805// The second part is whatever is left over, and is a scalar if it would
1806// otherwise be a 1-vector.
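// For example, v3i32 splits into (v2i32, i32), v5i32 into (v4i32, i32), and
// v6i32 into (v4i32, v2i32).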
1807std::pair<EVT, EVT>
1808AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1809 EVT LoVT, HiVT;
1810 EVT EltVT = VT.getVectorElementType();
1811 unsigned NumElts = VT.getVectorNumElements();
1812 unsigned LoNumElts = PowerOf2Ceil(A: (NumElts + 1) / 2);
1813 LoVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: LoNumElts);
1814 HiVT = NumElts - LoNumElts == 1
1815 ? EltVT
1816 : EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NumElts - LoNumElts);
1817 return std::pair(LoVT, HiVT);
1818}
1819
1820// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1821// scalar.
1822std::pair<SDValue, SDValue>
1823AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1824 const EVT &LoVT, const EVT &HiVT,
1825 SelectionDAG &DAG) const {
1826 EVT VT = N.getValueType();
1827 assert(LoVT.getVectorNumElements() +
1828 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1829 VT.getVectorNumElements() &&
1830 "More vector elements requested than available!");
1831 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: LoVT, N1: N,
1832 N2: DAG.getVectorIdxConstant(Val: 0, DL));
1833
1834 unsigned LoNumElts = LoVT.getVectorNumElements();
1835
1836 if (HiVT.isVector()) {
1837 unsigned HiNumElts = HiVT.getVectorNumElements();
1838 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1839 // Avoid creating an extract_subvector with an index that isn't a multiple
1840 // of the result type's element count.
1841 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HiVT, N1: N,
1842 N2: DAG.getConstant(Val: LoNumElts, DL, VT: MVT::i32));
1843 return {Lo, Hi};
1844 }
1845
1846 SmallVector<SDValue, 8> Elts;
1847 DAG.ExtractVectorElements(Op: N, Args&: Elts, /*Start=*/LoNumElts,
1848 /*Count=*/HiNumElts);
1849 SDValue Hi = DAG.getBuildVector(VT: HiVT, DL, Ops: Elts);
1850 return {Lo, Hi};
1851 }
1852
1853 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: HiVT, N1: N,
1854 N2: DAG.getVectorIdxConstant(Val: LoNumElts, DL));
1855 return {Lo, Hi};
1856}
1857
1858SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1859 SelectionDAG &DAG) const {
1860 LoadSDNode *Load = cast<LoadSDNode>(Val: Op);
1861 EVT VT = Op.getValueType();
1862 SDLoc SL(Op);
1863
1864
1865 // If this is a 2 element vector, we really want to scalarize and not create
1866 // weird 1 element vectors.
1867 if (VT.getVectorNumElements() == 2) {
1868 SDValue Ops[2];
1869 std::tie(args&: Ops[0], args&: Ops[1]) = scalarizeVectorLoad(LD: Load, DAG);
1870 return DAG.getMergeValues(Ops, dl: SL);
1871 }
1872
1873 SDValue BasePtr = Load->getBasePtr();
1874 EVT MemVT = Load->getMemoryVT();
1875
1876 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1877
1878 EVT LoVT, HiVT;
1879 EVT LoMemVT, HiMemVT;
1880 SDValue Lo, Hi;
1881
1882 std::tie(args&: LoVT, args&: HiVT) = getSplitDestVTs(VT, DAG);
1883 std::tie(args&: LoMemVT, args&: HiMemVT) = getSplitDestVTs(VT: MemVT, DAG);
1884 std::tie(args&: Lo, args&: Hi) = splitVector(N: Op, DL: SL, LoVT, HiVT, DAG);
1885
1886 unsigned Size = LoMemVT.getStoreSize();
1887 Align BaseAlign = Load->getAlign();
1888 Align HiAlign = commonAlignment(A: BaseAlign, Offset: Size);
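  // The high half is loaded from BasePtr + Size, so its alignment is the
  // alignment implied by the base alignment at that offset.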
1889
1890 SDValue LoLoad = DAG.getExtLoad(
1891 ExtType: Load->getExtensionType(), dl: SL, VT: LoVT, Chain: Load->getChain(), Ptr: BasePtr, PtrInfo: SrcValue,
1892 MemVT: LoMemVT, Alignment: BaseAlign, MMOFlags: Load->getMemOperand()->getFlags(), AAInfo: Load->getAAInfo());
1893 SDValue HiPtr = DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Size));
1894 SDValue HiLoad = DAG.getExtLoad(
1895 ExtType: Load->getExtensionType(), dl: SL, VT: HiVT, Chain: Load->getChain(), Ptr: HiPtr,
1896 PtrInfo: SrcValue.getWithOffset(O: LoMemVT.getStoreSize()), MemVT: HiMemVT, Alignment: HiAlign,
1897 MMOFlags: Load->getMemOperand()->getFlags(), AAInfo: Load->getAAInfo());
1898
1899 SDValue Join;
1900 if (LoVT == HiVT) {
1901 // The vector had a power-of-two element count, so it was split evenly.
1902 Join = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT, N1: LoLoad, N2: HiLoad);
1903 } else {
1904 Join = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: SL, VT, N1: DAG.getPOISON(VT), N2: LoLoad,
1905 N3: DAG.getVectorIdxConstant(Val: 0, DL: SL));
1906 Join = DAG.getNode(
1907 Opcode: HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, DL: SL,
1908 VT, N1: Join, N2: HiLoad,
1909 N3: DAG.getVectorIdxConstant(Val: LoVT.getVectorNumElements(), DL: SL));
1910 }
1911
1912 SDValue Ops[] = {Join, DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other,
1913 N1: LoLoad.getValue(R: 1), N2: HiLoad.getValue(R: 1))};
1914
1915 return DAG.getMergeValues(Ops, dl: SL);
1916}
1917
1918SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1919 SelectionDAG &DAG) const {
1920 LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
1921 EVT VT = Op.getValueType();
1922 SDValue BasePtr = Load->getBasePtr();
1923 EVT MemVT = Load->getMemoryVT();
1924 SDLoc SL(Op);
1925 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1926 Align BaseAlign = Load->getAlign();
1927 unsigned NumElements = MemVT.getVectorNumElements();
1928
1929 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1930 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1931 if (NumElements != 3 ||
1932 (BaseAlign < Align(8) &&
1933 !SrcValue.isDereferenceable(Size: 16, C&: *DAG.getContext(), DL: DAG.getDataLayout())))
1934 return SplitVectorLoad(Op, DAG);
1935
1936 assert(NumElements == 3);
1937
1938 EVT WideVT =
1939 EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: 4);
1940 EVT WideMemVT =
1941 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(), NumElements: 4);
1942 SDValue WideLoad = DAG.getExtLoad(
1943 ExtType: Load->getExtensionType(), dl: SL, VT: WideVT, Chain: Load->getChain(), Ptr: BasePtr, PtrInfo: SrcValue,
1944 MemVT: WideMemVT, Alignment: BaseAlign, MMOFlags: Load->getMemOperand()->getFlags());
1945 return DAG.getMergeValues(
1946 Ops: {DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT, N1: WideLoad,
1947 N2: DAG.getVectorIdxConstant(Val: 0, DL: SL)),
1948 WideLoad.getValue(R: 1)},
1949 dl: SL);
1950}
1951
1952SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1953 SelectionDAG &DAG) const {
1954 StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
1955 SDValue Val = Store->getValue();
1956 EVT VT = Val.getValueType();
1957
1958 // If this is a 2 element vector, we really want to scalarize and not create
1959 // weird 1 element vectors.
1960 if (VT.getVectorNumElements() == 2)
1961 return scalarizeVectorStore(ST: Store, DAG);
1962
1963 EVT MemVT = Store->getMemoryVT();
1964 SDValue Chain = Store->getChain();
1965 SDValue BasePtr = Store->getBasePtr();
1966 SDLoc SL(Op);
1967
1968 EVT LoVT, HiVT;
1969 EVT LoMemVT, HiMemVT;
1970 SDValue Lo, Hi;
1971
1972 std::tie(args&: LoVT, args&: HiVT) = getSplitDestVTs(VT, DAG);
1973 std::tie(args&: LoMemVT, args&: HiMemVT) = getSplitDestVTs(VT: MemVT, DAG);
1974 std::tie(args&: Lo, args&: Hi) = splitVector(N: Val, DL: SL, LoVT, HiVT, DAG);
1975
1976 SDValue HiPtr = DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: LoMemVT.getStoreSize());
1977
1978 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1979 Align BaseAlign = Store->getAlign();
1980 unsigned Size = LoMemVT.getStoreSize();
1981 Align HiAlign = commonAlignment(A: BaseAlign, Offset: Size);
1982
1983 SDValue LoStore =
1984 DAG.getTruncStore(Chain, dl: SL, Val: Lo, Ptr: BasePtr, PtrInfo: SrcValue, SVT: LoMemVT, Alignment: BaseAlign,
1985 MMOFlags: Store->getMemOperand()->getFlags(), AAInfo: Store->getAAInfo());
1986 SDValue HiStore = DAG.getTruncStore(
1987 Chain, dl: SL, Val: Hi, Ptr: HiPtr, PtrInfo: SrcValue.getWithOffset(O: Size), SVT: HiMemVT, Alignment: HiAlign,
1988 MMOFlags: Store->getMemOperand()->getFlags(), AAInfo: Store->getAAInfo());
1989
1990 return DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: LoStore, N2: HiStore);
1991}
1992
1993// This is a shortcut for integer division because we have fast i32<->f32
1994 // conversions, and fast f32 reciprocal instructions. The 24-bit significand of
1995 // an f32 is enough to exactly represent up to a 24-bit signed integer.
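// For an i32 divide this requires at least 32 - 24 + 1 = 9 sign bits in each
// operand, which is what the ComputeNumSignBits checks below enforce.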
1996SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1997 bool Sign) const {
1998 SDLoc DL(Op);
1999 EVT VT = Op.getValueType();
2000 SDValue LHS = Op.getOperand(i: 0);
2001 SDValue RHS = Op.getOperand(i: 1);
2002 MVT IntVT = MVT::i32;
2003 MVT FltVT = MVT::f32;
2004
2005 unsigned LHSSignBits = DAG.ComputeNumSignBits(Op: LHS);
2006 if (LHSSignBits < 9)
2007 return SDValue();
2008
2009 unsigned RHSSignBits = DAG.ComputeNumSignBits(Op: RHS);
2010 if (RHSSignBits < 9)
2011 return SDValue();
2012
2013 unsigned BitSize = VT.getSizeInBits();
2014 unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
2015 unsigned DivBits = BitSize - SignBits;
2016 if (Sign)
2017 ++DivBits;
2018
2019 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2020 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
2021
2022 SDValue jq = DAG.getConstant(Val: 1, DL, VT: IntVT);
2023
2024 if (Sign) {
2025 // char|short jq = ia ^ ib;
2026 jq = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHS, N2: RHS);
2027
2028 // jq = jq >> (bitsize - 2)
2029 jq = DAG.getNode(Opcode: ISD::SRA, DL, VT, N1: jq,
2030 N2: DAG.getConstant(Val: BitSize - 2, DL, VT));
2031
2032 // jq = jq | 0x1
2033 jq = DAG.getNode(Opcode: ISD::OR, DL, VT, N1: jq, N2: DAG.getConstant(Val: 1, DL, VT));
2034 }
2035
2036 // int ia = (int)LHS;
2037 SDValue ia = LHS;
2038
2039 // int ib = (int)RHS;
2040 SDValue ib = RHS;
2041
2042 // float fa = (float)ia;
2043 SDValue fa = DAG.getNode(Opcode: ToFp, DL, VT: FltVT, Operand: ia);
2044
2045 // float fb = (float)ib;
2046 SDValue fb = DAG.getNode(Opcode: ToFp, DL, VT: FltVT, Operand: ib);
2047
2048 SDValue fq = DAG.getNode(Opcode: ISD::FMUL, DL, VT: FltVT,
2049 N1: fa, N2: DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT: FltVT, Operand: fb));
2050
2051 // fq = trunc(fq);
2052 fq = DAG.getNode(Opcode: ISD::FTRUNC, DL, VT: FltVT, Operand: fq);
2053
2054 // float fqneg = -fq;
2055 SDValue fqneg = DAG.getNode(Opcode: ISD::FNEG, DL, VT: FltVT, Operand: fq);
2056
2057 MachineFunction &MF = DAG.getMachineFunction();
2058
2059 bool UseFmadFtz = false;
2060 if (Subtarget->isGCN()) {
2061 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2062 UseFmadFtz =
2063 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
2064 }
2065
2066 // float fr = mad(fqneg, fb, fa);
2067 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2068 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2069 : (unsigned)ISD::FMAD;
2070 SDValue fr = DAG.getNode(Opcode: OpCode, DL, VT: FltVT, N1: fqneg, N2: fb, N3: fa);
2071
2072 // int iq = (int)fq;
2073 SDValue iq = DAG.getNode(Opcode: ToInt, DL, VT: IntVT, Operand: fq);
2074
2075 // fr = fabs(fr);
2076 fr = DAG.getNode(Opcode: ISD::FABS, DL, VT: FltVT, Operand: fr);
2077
2078 // fb = fabs(fb);
2079 fb = DAG.getNode(Opcode: ISD::FABS, DL, VT: FltVT, Operand: fb);
2080
2081 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2082
2083 // int cv = fr >= fb;
2084 SDValue cv = DAG.getSetCC(DL, VT: SetCCVT, LHS: fr, RHS: fb, Cond: ISD::SETOGE);
2085
2086 // jq = (cv ? jq : 0);
2087 jq = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: cv, N2: jq, N3: DAG.getConstant(Val: 0, DL, VT));
2088
2089 // dst = iq + jq;
2090 SDValue Div = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: iq, N2: jq);
2091
2092 // Rem needs compensation; it's easier to recompute it.
2093 SDValue Rem = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Div, N2: RHS);
2094 Rem = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: LHS, N2: Rem);
2095
2096 // Truncate to the number of bits this divide actually uses.
2097 if (Sign) {
2098 SDValue InRegSize
2099 = DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: DivBits));
2100 Div = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Div, N2: InRegSize);
2101 Rem = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Rem, N2: InRegSize);
2102 } else {
2103 SDValue TruncMask = DAG.getConstant(Val: (UINT64_C(1) << DivBits) - 1, DL, VT);
2104 Div = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Div, N2: TruncMask);
2105 Rem = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Rem, N2: TruncMask);
2106 }
2107
2108 return DAG.getMergeValues(Ops: { Div, Rem }, dl: DL);
2109}
2110
2111void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2112 SelectionDAG &DAG,
2113 SmallVectorImpl<SDValue> &Results) const {
2114 SDLoc DL(Op);
2115 EVT VT = Op.getValueType();
2116
2117 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2118
2119 EVT HalfVT = VT.getHalfSizedIntegerVT(Context&: *DAG.getContext());
2120
2121 SDValue One = DAG.getConstant(Val: 1, DL, VT: HalfVT);
2122 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: HalfVT);
2123
2124 // Hi/Lo split
2125 SDValue LHS_Lo, LHS_Hi;
2126 SDValue LHS = Op.getOperand(i: 0);
2127 std::tie(args&: LHS_Lo, args&: LHS_Hi) = DAG.SplitScalar(N: LHS, DL, LoVT: HalfVT, HiVT: HalfVT);
2128
2129 SDValue RHS_Lo, RHS_Hi;
2130 SDValue RHS = Op.getOperand(i: 1);
2131 std::tie(args&: RHS_Lo, args&: RHS_Hi) = DAG.SplitScalar(N: RHS, DL, LoVT: HalfVT, HiVT: HalfVT);
2132
2133 if (DAG.MaskedValueIsZero(Op: RHS, Mask: APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32)) &&
2134 DAG.MaskedValueIsZero(Op: LHS, Mask: APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32))) {
2135
2136 SDValue Res = DAG.getNode(Opcode: ISD::UDIVREM, DL, VTList: DAG.getVTList(VT1: HalfVT, VT2: HalfVT),
2137 N1: LHS_Lo, N2: RHS_Lo);
2138
2139 SDValue DIV = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Res.getValue(R: 0), Zero});
2140 SDValue REM = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Res.getValue(R: 1), Zero});
2141
2142 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: DIV));
2143 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: REM));
2144 return;
2145 }
2146
2147 if (isTypeLegal(VT: MVT::i64)) {
2148 // The algorithm here is based on ideas from "Software Integer Division",
2149 // Tom Rodeheffer, August 2008.
2150
2151 MachineFunction &MF = DAG.getMachineFunction();
2152 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2153
2154 // Compute denominator reciprocal.
2155 unsigned FMAD =
2156 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2157 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2158 ? (unsigned)ISD::FMAD
2159 : (unsigned)AMDGPUISD::FMAD_FTZ;
2160
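    // Magic f32 constants: 0x4f800000 = 2^32, 0x5f7ffffc is just below 2^64,
    // 0x2f800000 = 2^-32, and 0xcf800000 = -2^32. Together they build Rcp64,
    // a 64-bit fixed-point estimate of 2^64 / RHS from the f32 reciprocal.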
2161 SDValue Cvt_Lo = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: MVT::f32, Operand: RHS_Lo);
2162 SDValue Cvt_Hi = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: MVT::f32, Operand: RHS_Hi);
2163 SDValue Mad1 = DAG.getNode(Opcode: FMAD, DL, VT: MVT::f32, N1: Cvt_Hi,
2164 N2: DAG.getConstantFP(Val: APInt(32, 0x4f800000).bitsToFloat(), DL, VT: MVT::f32),
2165 N3: Cvt_Lo);
2166 SDValue Rcp = DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT: MVT::f32, Operand: Mad1);
2167 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f32, N1: Rcp,
2168 N2: DAG.getConstantFP(Val: APInt(32, 0x5f7ffffc).bitsToFloat(), DL, VT: MVT::f32));
2169 SDValue Mul2 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f32, N1: Mul1,
2170 N2: DAG.getConstantFP(Val: APInt(32, 0x2f800000).bitsToFloat(), DL, VT: MVT::f32));
2171 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL, VT: MVT::f32, Operand: Mul2);
2172 SDValue Mad2 = DAG.getNode(Opcode: FMAD, DL, VT: MVT::f32, N1: Trunc,
2173 N2: DAG.getConstantFP(Val: APInt(32, 0xcf800000).bitsToFloat(), DL, VT: MVT::f32),
2174 N3: Mul1);
2175 SDValue Rcp_Lo = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL, VT: HalfVT, Operand: Mad2);
2176 SDValue Rcp_Hi = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL, VT: HalfVT, Operand: Trunc);
2177 SDValue Rcp64 = DAG.getBitcast(VT,
2178 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Rcp_Lo, Rcp_Hi}));
2179
2180 SDValue Zero64 = DAG.getConstant(Val: 0, DL, VT);
2181 SDValue One64 = DAG.getConstant(Val: 1, DL, VT);
2182 SDValue Zero1 = DAG.getConstant(Val: 0, DL, VT: MVT::i1);
2183 SDVTList HalfCarryVT = DAG.getVTList(VT1: HalfVT, VT2: MVT::i1);
2184
2185 // First round of UNR (Unsigned integer Newton-Raphson).
2186 SDValue Neg_RHS = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero64, N2: RHS);
2187 SDValue Mullo1 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Neg_RHS, N2: Rcp64);
2188 SDValue Mulhi1 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Rcp64, N2: Mullo1);
2189 SDValue Mulhi1_Lo, Mulhi1_Hi;
2190 std::tie(args&: Mulhi1_Lo, args&: Mulhi1_Hi) =
2191 DAG.SplitScalar(N: Mulhi1, DL, LoVT: HalfVT, HiVT: HalfVT);
2192 SDValue Add1_Lo = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Rcp_Lo,
2193 N2: Mulhi1_Lo, N3: Zero1);
2194 SDValue Add1_Hi = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Rcp_Hi,
2195 N2: Mulhi1_Hi, N3: Add1_Lo.getValue(R: 1));
2196 SDValue Add1 = DAG.getBitcast(VT,
2197 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Add1_Lo, Add1_Hi}));
2198
2199 // Second round of UNR.
2200 SDValue Mullo2 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Neg_RHS, N2: Add1);
2201 SDValue Mulhi2 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Add1, N2: Mullo2);
2202 SDValue Mulhi2_Lo, Mulhi2_Hi;
2203 std::tie(args&: Mulhi2_Lo, args&: Mulhi2_Hi) =
2204 DAG.SplitScalar(N: Mulhi2, DL, LoVT: HalfVT, HiVT: HalfVT);
2205 SDValue Add2_Lo = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Add1_Lo,
2206 N2: Mulhi2_Lo, N3: Zero1);
2207 SDValue Add2_Hi = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Add1_Hi,
2208 N2: Mulhi2_Hi, N3: Add2_Lo.getValue(R: 1));
2209 SDValue Add2 = DAG.getBitcast(VT,
2210 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Add2_Lo, Add2_Hi}));
2211
2212 SDValue Mulhi3 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: LHS, N2: Add2);
2213
2214 SDValue Mul3 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: RHS, N2: Mulhi3);
2215
2216 SDValue Mul3_Lo, Mul3_Hi;
2217 std::tie(args&: Mul3_Lo, args&: Mul3_Hi) = DAG.SplitScalar(N: Mul3, DL, LoVT: HalfVT, HiVT: HalfVT);
2218 SDValue Sub1_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: LHS_Lo,
2219 N2: Mul3_Lo, N3: Zero1);
2220 SDValue Sub1_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: LHS_Hi,
2221 N2: Mul3_Hi, N3: Sub1_Lo.getValue(R: 1));
2222 SDValue Sub1_Mi = DAG.getNode(Opcode: ISD::SUB, DL, VT: HalfVT, N1: LHS_Hi, N2: Mul3_Hi);
2223 SDValue Sub1 = DAG.getBitcast(VT,
2224 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub1_Lo, Sub1_Hi}));
2225
2226 SDValue MinusOne = DAG.getConstant(Val: 0xffffffffu, DL, VT: HalfVT);
2227 SDValue C1 = DAG.getSelectCC(DL, LHS: Sub1_Hi, RHS: RHS_Hi, True: MinusOne, False: Zero,
2228 Cond: ISD::SETUGE);
2229 SDValue C2 = DAG.getSelectCC(DL, LHS: Sub1_Lo, RHS: RHS_Lo, True: MinusOne, False: Zero,
2230 Cond: ISD::SETUGE);
2231 SDValue C3 = DAG.getSelectCC(DL, LHS: Sub1_Hi, RHS: RHS_Hi, True: C2, False: C1, Cond: ISD::SETEQ);
2232
2233 // TODO: Here and below, portions of the code could be enclosed in if/endif
2234 // blocks. Currently the control flow is unconditional and we have 4 selects
2235 // after the potential endif to substitute for PHIs.
2236
2237 // if C3 != 0 ...
2238 SDValue Sub2_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub1_Lo,
2239 N2: RHS_Lo, N3: Zero1);
2240 SDValue Sub2_Mi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub1_Mi,
2241 N2: RHS_Hi, N3: Sub1_Lo.getValue(R: 1));
2242 SDValue Sub2_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Mi,
2243 N2: Zero, N3: Sub2_Lo.getValue(R: 1));
2244 SDValue Sub2 = DAG.getBitcast(VT,
2245 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub2_Lo, Sub2_Hi}));
2246
2247 SDValue Add3 = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Mulhi3, N2: One64);
2248
2249 SDValue C4 = DAG.getSelectCC(DL, LHS: Sub2_Hi, RHS: RHS_Hi, True: MinusOne, False: Zero,
2250 Cond: ISD::SETUGE);
2251 SDValue C5 = DAG.getSelectCC(DL, LHS: Sub2_Lo, RHS: RHS_Lo, True: MinusOne, False: Zero,
2252 Cond: ISD::SETUGE);
2253 SDValue C6 = DAG.getSelectCC(DL, LHS: Sub2_Hi, RHS: RHS_Hi, True: C5, False: C4, Cond: ISD::SETEQ);
2254
2255 // if (C6 != 0)
2256 SDValue Add4 = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Add3, N2: One64);
2257
2258 SDValue Sub3_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Lo,
2259 N2: RHS_Lo, N3: Zero1);
2260 SDValue Sub3_Mi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Mi,
2261 N2: RHS_Hi, N3: Sub2_Lo.getValue(R: 1));
2262 SDValue Sub3_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub3_Mi,
2263 N2: Zero, N3: Sub3_Lo.getValue(R: 1));
2264 SDValue Sub3 = DAG.getBitcast(VT,
2265 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub3_Lo, Sub3_Hi}));
2266
2267 // endif C6
2268 // endif C3
2269
2270 SDValue Sel1 = DAG.getSelectCC(DL, LHS: C6, RHS: Zero, True: Add4, False: Add3, Cond: ISD::SETNE);
2271 SDValue Div = DAG.getSelectCC(DL, LHS: C3, RHS: Zero, True: Sel1, False: Mulhi3, Cond: ISD::SETNE);
2272
2273 SDValue Sel2 = DAG.getSelectCC(DL, LHS: C6, RHS: Zero, True: Sub3, False: Sub2, Cond: ISD::SETNE);
2274 SDValue Rem = DAG.getSelectCC(DL, LHS: C3, RHS: Zero, True: Sel2, False: Sub1, Cond: ISD::SETNE);
2275
2276 Results.push_back(Elt: Div);
2277 Results.push_back(Elt: Rem);
2278
2279 return;
2280 }
2281
2282 // R600 expansion.
2283 // Get speculative values.
2284 SDValue DIV_Part = DAG.getNode(Opcode: ISD::UDIV, DL, VT: HalfVT, N1: LHS_Hi, N2: RHS_Lo);
2285 SDValue REM_Part = DAG.getNode(Opcode: ISD::UREM, DL, VT: HalfVT, N1: LHS_Hi, N2: RHS_Lo);
2286
2287 SDValue REM_Lo = DAG.getSelectCC(DL, LHS: RHS_Hi, RHS: Zero, True: REM_Part, False: LHS_Hi, Cond: ISD::SETEQ);
2288 SDValue REM = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {REM_Lo, Zero});
2289 REM = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: REM);
2290
2291 SDValue DIV_Hi = DAG.getSelectCC(DL, LHS: RHS_Hi, RHS: Zero, True: DIV_Part, False: Zero, Cond: ISD::SETEQ);
2292 SDValue DIV_Lo = Zero;
2293
2294 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2295
2296 for (unsigned i = 0; i < halfBitWidth; ++i) {
2297 const unsigned bitPos = halfBitWidth - i - 1;
2298 SDValue POS = DAG.getConstant(Val: bitPos, DL, VT: HalfVT);
2299 // Get value of high bit
2300 SDValue HBit = DAG.getNode(Opcode: ISD::SRL, DL, VT: HalfVT, N1: LHS_Lo, N2: POS);
2301 HBit = DAG.getNode(Opcode: ISD::AND, DL, VT: HalfVT, N1: HBit, N2: One);
2302 HBit = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: HBit);
2303
2304 // Shift
2305 REM = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: REM, N2: DAG.getConstant(Val: 1, DL, VT));
2306 // Add LHS high bit
2307 REM = DAG.getNode(Opcode: ISD::OR, DL, VT, N1: REM, N2: HBit);
2308
2309 SDValue BIT = DAG.getConstant(Val: 1ULL << bitPos, DL, VT: HalfVT);
2310 SDValue realBIT = DAG.getSelectCC(DL, LHS: REM, RHS, True: BIT, False: Zero, Cond: ISD::SETUGE);
2311
2312 DIV_Lo = DAG.getNode(Opcode: ISD::OR, DL, VT: HalfVT, N1: DIV_Lo, N2: realBIT);
2313
2314 // Update REM
2315 SDValue REM_sub = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: REM, N2: RHS);
2316 REM = DAG.getSelectCC(DL, LHS: REM, RHS, True: REM_sub, False: REM, Cond: ISD::SETUGE);
2317 }
2318
2319 SDValue DIV = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {DIV_Lo, DIV_Hi});
2320 DIV = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: DIV);
2321 Results.push_back(Elt: DIV);
2322 Results.push_back(Elt: REM);
2323}
2324
2325SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2326 SelectionDAG &DAG) const {
2327 SDLoc DL(Op);
2328 EVT VT = Op.getValueType();
2329
2330 if (VT == MVT::i64) {
2331 SmallVector<SDValue, 2> Results;
2332 LowerUDIVREM64(Op, DAG, Results);
2333 return DAG.getMergeValues(Ops: Results, dl: DL);
2334 }
2335
2336 if (VT == MVT::i32) {
2337 if (SDValue Res = LowerDIVREM24(Op, DAG, Sign: false))
2338 return Res;
2339 }
2340
2341 SDValue X = Op.getOperand(i: 0);
2342 SDValue Y = Op.getOperand(i: 1);
2343
2344 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2345 // algorithm used here.
2346
2347 // Initial estimate of inv(y).
2348 SDValue Z = DAG.getNode(Opcode: AMDGPUISD::URECIP, DL, VT, Operand: Y);
2349
2350 // One round of UNR.
2351 SDValue NegY = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), N2: Y);
2352 SDValue NegYZ = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: NegY, N2: Z);
2353 Z = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Z,
2354 N2: DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Z, N2: NegYZ));
2355
2356 // Quotient/remainder estimate.
2357 SDValue Q = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: X, N2: Z);
2358 SDValue R =
2359 DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: X, N2: DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Q, N2: Y));
2360
2361 // First quotient/remainder refinement.
2362 EVT CCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2363 SDValue One = DAG.getConstant(Val: 1, DL, VT);
2364 SDValue Cond = DAG.getSetCC(DL, VT: CCVT, LHS: R, RHS: Y, Cond: ISD::SETUGE);
2365 Q = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2366 N2: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Q, N2: One), N3: Q);
2367 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2368 N2: DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: R, N2: Y), N3: R);
2369
2370 // Second quotient/remainder refinement.
2371 Cond = DAG.getSetCC(DL, VT: CCVT, LHS: R, RHS: Y, Cond: ISD::SETUGE);
2372 Q = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2373 N2: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Q, N2: One), N3: Q);
2374 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2375 N2: DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: R, N2: Y), N3: R);
2376
2377 return DAG.getMergeValues(Ops: {Q, R}, dl: DL);
2378}
2379
2380SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2381 SelectionDAG &DAG) const {
2382 SDLoc DL(Op);
2383 EVT VT = Op.getValueType();
2384
2385 SDValue LHS = Op.getOperand(i: 0);
2386 SDValue RHS = Op.getOperand(i: 1);
2387
2388 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
2389 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2390
2391 if (VT == MVT::i32) {
2392 if (SDValue Res = LowerDIVREM24(Op, DAG, Sign: true))
2393 return Res;
2394 }
2395
2396 if (VT == MVT::i64 &&
2397 DAG.ComputeNumSignBits(Op: LHS) > 32 &&
2398 DAG.ComputeNumSignBits(Op: RHS) > 32) {
2399 EVT HalfVT = VT.getHalfSizedIntegerVT(Context&: *DAG.getContext());
2400
2401 // Hi/Lo split
2402 SDValue LHS_Lo = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL, VT: HalfVT, N1: LHS, N2: Zero);
2403 SDValue RHS_Lo = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL, VT: HalfVT, N1: RHS, N2: Zero);
2404 SDValue DIVREM = DAG.getNode(Opcode: ISD::SDIVREM, DL, VTList: DAG.getVTList(VT1: HalfVT, VT2: HalfVT),
2405 N1: LHS_Lo, N2: RHS_Lo);
2406 SDValue Res[2] = {
2407 DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: DIVREM.getValue(R: 0)),
2408 DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: DIVREM.getValue(R: 1))
2409 };
2410 return DAG.getMergeValues(Ops: Res, dl: DL);
2411 }
2412
2413 SDValue LHSign = DAG.getSelectCC(DL, LHS, RHS: Zero, True: NegOne, False: Zero, Cond: ISD::SETLT);
2414 SDValue RHSign = DAG.getSelectCC(DL, LHS: RHS, RHS: Zero, True: NegOne, False: Zero, Cond: ISD::SETLT);
2415 SDValue DSign = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHSign, N2: RHSign);
2416 SDValue RSign = LHSign; // Remainder sign is the same as LHS
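  // Take absolute values using the two's-complement identity
  // (x + sign) ^ sign, where sign is -1 for negative x and 0 otherwise.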
2417
2418 LHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: LHS, N2: LHSign);
2419 RHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: RHSign);
2420
2421 LHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHS, N2: LHSign);
2422 RHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: RHS, N2: RHSign);
2423
2424 SDValue Div = DAG.getNode(Opcode: ISD::UDIVREM, DL, VTList: DAG.getVTList(VT1: VT, VT2: VT), N1: LHS, N2: RHS);
2425 SDValue Rem = Div.getValue(R: 1);
2426
2427 Div = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: Div, N2: DSign);
2428 Rem = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: Rem, N2: RSign);
2429
2430 Div = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Div, N2: DSign);
2431 Rem = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Rem, N2: RSign);
2432
2433 SDValue Res[2] = {
2434 Div,
2435 Rem
2436 };
2437 return DAG.getMergeValues(Ops: Res, dl: DL);
2438}
2439
2440SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2441 SDLoc SL(Op);
2442 SDValue Src = Op.getOperand(i: 0);
2443
2444 // result = trunc(src)
2445 // if (src > 0.0 && src != result)
2446 // result += 1.0
2447
2448 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: MVT::f64, Operand: Src);
2449
2450 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT: MVT::f64);
2451 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f64);
2452
2453 EVT SetCCVT =
2454 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2455
2456 SDValue Lt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Zero, Cond: ISD::SETOGT);
2457 SDValue NeTrunc = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Trunc, Cond: ISD::SETONE);
2458 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SetCCVT, N1: Lt0, N2: NeTrunc);
2459
2460 SDValue Add = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f64, N1: And, N2: One, N3: Zero);
2461 // TODO: Should this propagate fast-math-flags?
2462 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Trunc, N2: Add);
2463}
2464
2465static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2466 SelectionDAG &DAG) {
2467 const unsigned FractBits = 52;
2468 const unsigned ExpBits = 11;
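  // The f64 biased exponent is the 11 bits starting at bit 52, i.e. starting at
  // bit 20 of the high 32-bit word; extract it and subtract the bias (1023).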
2469
2470 SDValue ExpPart = DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT: MVT::i32,
2471 N1: Hi,
2472 N2: DAG.getConstant(Val: FractBits - 32, DL: SL, VT: MVT::i32),
2473 N3: DAG.getConstant(Val: ExpBits, DL: SL, VT: MVT::i32));
2474 SDValue Exp = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: ExpPart,
2475 N2: DAG.getConstant(Val: 1023, DL: SL, VT: MVT::i32));
2476
2477 return Exp;
2478}
2479
2480SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2481 SDLoc SL(Op);
2482 SDValue Src = Op.getOperand(i: 0);
2483
2484 assert(Op.getValueType() == MVT::f64);
2485
2486 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
2487
2488 // Extract the upper half, since this is where we will find the sign and
2489 // exponent.
2490 SDValue Hi = getHiHalf64(Op: Src, DAG);
2491
2492 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2493
2494 const unsigned FractBits = 52;
2495
2496 // Extract the sign bit.
2497 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, DL: SL, VT: MVT::i32);
2498 SDValue SignBit = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: Hi, N2: SignBitMask);
2499
2500 // Extend back to 64-bits.
2501 SDValue SignBit64 = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Zero, SignBit});
2502 SignBit64 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: SignBit64);
2503
2504 SDValue BcInt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Src);
2505 const SDValue FractMask
2506 = DAG.getConstant(Val: (UINT64_C(1) << FractBits) - 1, DL: SL, VT: MVT::i64);
2507
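  // For a non-negative exponent Exp, the low (52 - Exp) significand bits are
  // fractional; shift the fraction mask right by Exp and clear those bits.
  // Out-of-range exponents are handled by the selects below.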
2508 SDValue Shr = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i64, N1: FractMask, N2: Exp);
2509 SDValue Not = DAG.getNOT(DL: SL, Val: Shr, VT: MVT::i64);
2510 SDValue Tmp0 = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i64, N1: BcInt, N2: Not);
2511
2512 EVT SetCCVT =
2513 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::i32);
2514
2515 const SDValue FiftyOne = DAG.getConstant(Val: FractBits - 1, DL: SL, VT: MVT::i32);
2516
2517 SDValue ExpLt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Exp, RHS: Zero, Cond: ISD::SETLT);
2518 SDValue ExpGt51 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Exp, RHS: FiftyOne, Cond: ISD::SETGT);
2519
2520 SDValue Tmp1 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: ExpLt0, N2: SignBit64, N3: Tmp0);
2521 SDValue Tmp2 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: ExpGt51, N2: BcInt, N3: Tmp1);
2522
2523 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f64, Operand: Tmp2);
2524}
2525
2526SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2527 SelectionDAG &DAG) const {
2528 SDLoc SL(Op);
2529 SDValue Src = Op.getOperand(i: 0);
2530
2531 assert(Op.getValueType() == MVT::f64);
2532
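  // Adding and then subtracting 2^52 (with the sign of the input) forces the
  // f64 significand to round the value to the nearest integer (ties to even).
  // Magnitudes above 0x1.fffffffffffffp+51 are already integral and are
  // returned unchanged.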
2533 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2534 SDValue C1 = DAG.getConstantFP(Val: C1Val, DL: SL, VT: MVT::f64);
2535 SDValue CopySign = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT: MVT::f64, N1: C1, N2: Src);
2536
2537 // TODO: Should this propagate fast-math-flags?
2538
2539 SDValue Tmp1 = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Src, N2: CopySign);
2540 SDValue Tmp2 = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT: MVT::f64, N1: Tmp1, N2: CopySign);
2541
2542 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: MVT::f64, Operand: Src);
2543
2544 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2545 SDValue C2 = DAG.getConstantFP(Val: C2Val, DL: SL, VT: MVT::f64);
2546
2547 EVT SetCCVT =
2548 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2549 SDValue Cond = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Fabs, RHS: C2, Cond: ISD::SETOGT);
2550
2551 return DAG.getSelect(DL: SL, VT: MVT::f64, Cond, LHS: Src, RHS: Tmp2);
2552}
2553
2554SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2555 SelectionDAG &DAG) const {
2556 // FNEARBYINT and FRINT are the same, except in their handling of FP
2557 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2558 // rint, so just treat them as equivalent.
2559 return DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SDLoc(Op), VT: Op.getValueType(),
2560 Operand: Op.getOperand(i: 0));
2561}
2562
2563SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2564 auto VT = Op.getValueType();
2565 auto Arg = Op.getOperand(i: 0u);
2566 return DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SDLoc(Op), VT, Operand: Arg);
2567}
2568
2569// XXX - May require not supporting f32 denormals?
2570
2571// Don't handle v2f16. The extra instructions to scalarize and repack around the
2572// compare and vselect end up producing worse code than scalarizing the whole
2573// operation.
2574SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2575 SDLoc SL(Op);
2576 SDValue X = Op.getOperand(i: 0);
2577 EVT VT = Op.getValueType();
2578
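  // round(x) = trunc(x) + copysign(|x - trunc(x)| >= 0.5 ? 1.0 : 0.0, x).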
2579 SDValue T = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: X);
2580
2581 // TODO: Should this propagate fast-math-flags?
2582
2583 SDValue Diff = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: X, N2: T);
2584
2585 SDValue AbsDiff = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: Diff);
2586
2587 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2588 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2589
2590 EVT SetCCVT =
2591 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2592
2593 const SDValue Half = DAG.getConstantFP(Val: 0.5, DL: SL, VT);
2594 SDValue Cmp = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsDiff, RHS: Half, Cond: ISD::SETOGE);
2595 SDValue OneOrZeroFP = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Cmp, N2: One, N3: Zero);
2596
2597 SDValue SignedOffset = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT, N1: OneOrZeroFP, N2: X);
2598 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: T, N2: SignedOffset);
2599}
2600
2601SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2602 SDLoc SL(Op);
2603 SDValue Src = Op.getOperand(i: 0);
2604
2605 // result = trunc(src);
2606 // if (src < 0.0 && src != result)
2607 // result += -1.0.
2608
2609 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: MVT::f64, Operand: Src);
2610
2611 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT: MVT::f64);
2612 const SDValue NegOne = DAG.getConstantFP(Val: -1.0, DL: SL, VT: MVT::f64);
2613
2614 EVT SetCCVT =
2615 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2616
2617 SDValue Lt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Zero, Cond: ISD::SETOLT);
2618 SDValue NeTrunc = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Trunc, Cond: ISD::SETONE);
2619 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SetCCVT, N1: Lt0, N2: NeTrunc);
2620
2621 SDValue Add = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f64, N1: And, N2: NegOne, N3: Zero);
2622 // TODO: Should this propagate fast-math-flags?
2623 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Trunc, N2: Add);
2624}
2625
2626/// Return true if it's known that \p Src can never be an f32 denormal value.
2627static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2628 switch (Src.getOpcode()) {
2629 case ISD::FP_EXTEND:
2630 return Src.getOperand(i: 0).getValueType() == MVT::f16;
2631 case ISD::FP16_TO_FP:
2632 case ISD::FFREXP:
2633 case ISD::FSQRT:
2634 case AMDGPUISD::LOG:
2635 case AMDGPUISD::EXP:
2636 return true;
2637 case ISD::INTRINSIC_WO_CHAIN: {
2638 unsigned IntrinsicID = Src.getConstantOperandVal(i: 0);
2639 switch (IntrinsicID) {
2640 case Intrinsic::amdgcn_frexp_mant:
2641 case Intrinsic::amdgcn_log:
2642 case Intrinsic::amdgcn_log_clamp:
2643 case Intrinsic::amdgcn_exp2:
2644 case Intrinsic::amdgcn_sqrt:
2645 return true;
2646 default:
2647 return false;
2648 }
2649 }
2650 default:
2651 return false;
2652 }
2653
2654 llvm_unreachable("covered opcode switch");
2655}
2656
2657bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
2658 SDNodeFlags Flags) {
2659 return Flags.hasApproximateFuncs();
2660}
2661
2662bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
2663 SDValue Src,
2664 SDNodeFlags Flags) {
2665 return !valueIsKnownNeverF32Denorm(Src) &&
2666 DAG.getMachineFunction()
2667 .getDenormalMode(FPType: APFloat::IEEEsingle())
2668 .Input != DenormalMode::PreserveSign;
2669}
2670
2671SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2672 SDValue Src,
2673 SDNodeFlags Flags) const {
2674 SDLoc SL(Src);
2675 EVT VT = Src.getValueType();
2676 const fltSemantics &Semantics = VT.getFltSemantics();
2677 SDValue SmallestNormal =
2678 DAG.getConstantFP(Val: APFloat::getSmallestNormalized(Sem: Semantics), DL: SL, VT);
2679
2680 // Want to scale denormals up, but negatives and 0 work just as well on the
2681 // scaled path.
2682 SDValue IsLtSmallestNormal = DAG.getSetCC(
2683 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Src,
2684 RHS: SmallestNormal, Cond: ISD::SETOLT);
2685
2686 return IsLtSmallestNormal;
2687}
2688
2689SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2690 SDNodeFlags Flags) const {
2691 SDLoc SL(Src);
2692 EVT VT = Src.getValueType();
2693 const fltSemantics &Semantics = VT.getFltSemantics();
2694 SDValue Inf = DAG.getConstantFP(Val: APFloat::getInf(Sem: Semantics), DL: SL, VT);
2695
2696 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: Src, Flags);
2697 SDValue IsFinite = DAG.getSetCC(
2698 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Fabs,
2699 RHS: Inf, Cond: ISD::SETOLT);
2700 return IsFinite;
2701}
2702
2703/// If denormal handling is required return the scaled input to FLOG2, and the
2704/// check for denormal range. Otherwise, return null values.
2705std::pair<SDValue, SDValue>
2706AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
2707 SDValue Src, SDNodeFlags Flags) const {
2708 if (!needsDenormHandlingF32(DAG, Src, Flags))
2709 return {};
2710
2711 MVT VT = MVT::f32;
2712 const fltSemantics &Semantics = APFloat::IEEEsingle();
2713 SDValue SmallestNormal =
2714 DAG.getConstantFP(Val: APFloat::getSmallestNormalized(Sem: Semantics), DL: SL, VT);
2715
2716 SDValue IsLtSmallestNormal = DAG.getSetCC(
2717 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Src,
2718 RHS: SmallestNormal, Cond: ISD::SETOLT);
2719
2720 SDValue Scale32 = DAG.getConstantFP(Val: 0x1.0p+32, DL: SL, VT);
2721 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2722 SDValue ScaleFactor =
2723 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLtSmallestNormal, N2: Scale32, N3: One, Flags);
2724
2725 SDValue ScaledInput = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Src, N2: ScaleFactor, Flags);
2726 return {ScaledInput, IsLtSmallestNormal};
2727}
2728
2729SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2730 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2731 // If we have to handle denormals, scale up the input and adjust the result.
2732
2733 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2734 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
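  // This works because log2(x * 2^32) == log2(x) + 32, so subtracting 32 on
  // the scaled path recovers the correct result.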
2735
2736 SDLoc SL(Op);
2737 EVT VT = Op.getValueType();
2738 SDValue Src = Op.getOperand(i: 0);
2739 SDNodeFlags Flags = Op->getFlags();
2740
2741 if (VT == MVT::f16) {
2742 // Nothing in half is a denormal when promoted to f32.
2743 assert(!isTypeLegal(VT));
2744 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src, Flags);
2745 SDValue Log = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT: MVT::f32, Operand: Ext, Flags);
2746 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Log,
2747 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
2748 }
2749
2750 auto [ScaledInput, IsLtSmallestNormal] =
2751 getScaledLogInput(DAG, SL, Src, Flags);
2752 if (!ScaledInput)
2753 return DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: Src, Flags);
2754
2755 SDValue Log2 = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: ScaledInput, Flags);
2756
2757 SDValue ThirtyTwo = DAG.getConstantFP(Val: 32.0, DL: SL, VT);
2758 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2759 SDValue ResultOffset =
2760 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLtSmallestNormal, N2: ThirtyTwo, N3: Zero);
2761 return DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: Log2, N2: ResultOffset, Flags);
2762}
2763
2764static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2765 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2766 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: Y, Flags);
2767 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Mul, N2: C, Flags);
2768}
2769
2770SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2771 SelectionDAG &DAG) const {
2772 SDValue X = Op.getOperand(i: 0);
2773 EVT VT = Op.getValueType();
2774 SDNodeFlags Flags = Op->getFlags();
2775 SDLoc DL(Op);
2776 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2777 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2778
2779 const auto &Options = getTargetMachine().Options;
2780 if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
2781
2782 if (VT == MVT::f16 && !isTypeLegal(VT: MVT::f16)) {
2783 // Log and multiply in f32 is good enough for f16.
2784 X = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: X, Flags);
2785 }
2786
2787 SDValue Lowered = LowerFLOGUnsafe(Op: X, SL: DL, DAG, IsLog10, Flags);
2788 if (VT == MVT::f16 && !isTypeLegal(VT: MVT::f16)) {
2789 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Lowered,
2790 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32), Flags);
2791 }
2792
2793 return Lowered;
2794 }
2795
2796 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL: DL, Src: X, Flags);
2797 if (ScaledInput)
2798 X = ScaledInput;
2799
2800 SDValue Y = DAG.getNode(Opcode: AMDGPUISD::LOG, DL, VT, Operand: X, Flags);
2801
2802 SDValue R;
2803 if (Subtarget->hasFastFMAF32()) {
2804 // c + cc is ln(2)/ln(10) to more than 49 bits
2805 const float c_log10 = 0x1.344134p-2f;
2806 const float cc_log10 = 0x1.09f79ep-26f;
2807
2808 // c + cc is ln(2) to more than 49 bits
2809 const float c_log = 0x1.62e42ep-1f;
2810 const float cc_log = 0x1.efa39ep-25f;
2811
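  // Multiply Y by the constant in roughly double precision: FMA0 recovers the
  // rounding error of the initial Y*c product, FMA1 folds in the low part cc,
  // and the final add applies the combined correction to R.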
2812 SDValue C = DAG.getConstantFP(Val: IsLog10 ? c_log10 : c_log, DL, VT);
2813 SDValue CC = DAG.getConstantFP(Val: IsLog10 ? cc_log10 : cc_log, DL, VT);
2814 // This adds correction terms for which contraction may lead to an increase
2815 // in the error of the approximation, so disable it.
2816 Flags.setAllowContract(false);
2817 R = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Y, N2: C, Flags);
2818 SDValue NegR = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: R, Flags);
2819 SDValue FMA0 = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: Y, N2: C, N3: NegR, Flags);
2820 SDValue FMA1 = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: Y, N2: CC, N3: FMA0, Flags);
2821 R = DAG.getNode(Opcode: ISD::FADD, DL, VT, N1: R, N2: FMA1, Flags);
2822 } else {
2823 // ch+ct is ln(2)/ln(10) to more than 36 bits
2824 const float ch_log10 = 0x1.344000p-2f;
2825 const float ct_log10 = 0x1.3509f6p-18f;
2826
2827 // ch + ct is ln(2) to more than 36 bits
2828 const float ch_log = 0x1.62e000p-1f;
2829 const float ct_log = 0x1.0bfbe8p-15f;
2830
2831 SDValue CH = DAG.getConstantFP(Val: IsLog10 ? ch_log10 : ch_log, DL, VT);
2832 SDValue CT = DAG.getConstantFP(Val: IsLog10 ? ct_log10 : ct_log, DL, VT);
2833
2834 SDValue YAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Y);
2835 SDValue MaskConst = DAG.getConstant(Val: 0xfffff000, DL, VT: MVT::i32);
2836 SDValue YHInt = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: YAsInt, N2: MaskConst);
2837 SDValue YH = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: YHInt);
2838 SDValue YT = DAG.getNode(Opcode: ISD::FSUB, DL, VT, N1: Y, N2: YH, Flags);
2839 // This adds correction terms for which contraction may lead to an increase
2840 // in the error of the approximation, so disable it.
2841 Flags.setAllowContract(false);
2842 SDValue YTCT = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: YT, N2: CT, Flags);
2843 SDValue Mad0 = getMad(DAG, SL: DL, VT, X: YH, Y: CT, C: YTCT, Flags);
2844 SDValue Mad1 = getMad(DAG, SL: DL, VT, X: YT, Y: CH, C: Mad0, Flags);
2845 R = getMad(DAG, SL: DL, VT, X: YH, Y: CH, C: Mad1);
2846 }
2847
2848 const bool IsFiniteOnly =
2849 (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && Flags.hasNoInfs();
2850
2851 // TODO: Check if known finite from source value.
2852 if (!IsFiniteOnly) {
2853 SDValue IsFinite = getIsFinite(DAG, Src: Y, Flags);
2854 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsFinite, N2: R, N3: Y, Flags);
2855 }
2856
2857 if (IsScaled) {
2858 SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL, VT);
2859 SDValue ShiftK =
2860 DAG.getConstantFP(Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2861 SDValue Shift =
2862 DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsScaled, N2: ShiftK, N3: Zero, Flags);
2863 R = DAG.getNode(Opcode: ISD::FSUB, DL, VT, N1: R, N2: Shift, Flags);
2864 }
2865
2866 return R;
2867}
2868
2869SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2870 return LowerFLOGCommon(Op, DAG);
2871}
2872
2873 // Do f32 fast math expansion for flog or flog10. This is accurate enough for a
2874 // promoted f16 operation.
2875SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2876 SelectionDAG &DAG, bool IsLog10,
2877 SDNodeFlags Flags) const {
2878 EVT VT = Src.getValueType();
2879 unsigned LogOp =
2880 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2881
2882 double Log2BaseInverted =
2883 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2884
2885 if (VT == MVT::f32) {
2886 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2887 if (ScaledInput) {
2888 SDValue LogSrc = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: ScaledInput, Flags);
2889 SDValue ScaledResultOffset =
2890 DAG.getConstantFP(Val: -32.0 * Log2BaseInverted, DL: SL, VT);
2891
2892 SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL: SL, VT);
2893
2894 SDValue ResultOffset = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsScaled,
2895 N2: ScaledResultOffset, N3: Zero, Flags);
2896
2897 SDValue Log2Inv = DAG.getConstantFP(Val: Log2BaseInverted, DL: SL, VT);
2898
2899 if (Subtarget->hasFastFMAF32())
2900 return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: LogSrc, N2: Log2Inv, N3: ResultOffset,
2901 Flags);
2902 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LogSrc, N2: Log2Inv, Flags);
2903 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Mul, N2: ResultOffset);
2904 }
2905 }
2906
2907 SDValue Log2Operand = DAG.getNode(Opcode: LogOp, DL: SL, VT, Operand: Src, Flags);
2908 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Val: Log2BaseInverted, DL: SL, VT);
2909
2910 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Log2Operand, N2: Log2BaseInvertedOperand,
2911 Flags);
2912}
2913
2914SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2915 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2916 // If we have to handle denormals, scale up the input and adjust the result.
2917
2918 SDLoc SL(Op);
2919 EVT VT = Op.getValueType();
2920 SDValue Src = Op.getOperand(i: 0);
2921 SDNodeFlags Flags = Op->getFlags();
2922
2923 if (VT == MVT::f16) {
2924 // Nothing in half is a denormal when promoted to f32.
2925 assert(!isTypeLegal(MVT::f16));
2926 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src, Flags);
2927 SDValue Log = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT: MVT::f32, Operand: Ext, Flags);
2928 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Log,
2929 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
2930 }
2931
2932 assert(VT == MVT::f32);
2933
2934 if (!needsDenormHandlingF32(DAG, Src, Flags))
2935 return DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT: MVT::f32, Operand: Src, Flags);
2936
2937 // bool needs_scaling = x < -0x1.f80000p+6f;
2938 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
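  //
  // Adding 64 to a very negative input keeps the hardware exp2 away from the
  // denormal result range; multiplying by 2^-64 afterwards undoes the shift,
  // since exp2(x + 64) * 2^-64 == exp2(x).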
2939
2940 // -nextafter(128.0, -1)
2941 SDValue RangeCheckConst = DAG.getConstantFP(Val: -0x1.f80000p+6f, DL: SL, VT);
2942
2943 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2944
2945 SDValue NeedsScaling =
2946 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: RangeCheckConst, Cond: ISD::SETOLT);
2947
2948 SDValue SixtyFour = DAG.getConstantFP(Val: 0x1.0p+6f, DL: SL, VT);
2949 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2950
2951 SDValue AddOffset =
2952 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: SixtyFour, N3: Zero);
2953
2954 SDValue AddInput = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Src, N2: AddOffset, Flags);
2955 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: AddInput, Flags);
2956
2957 SDValue TwoExpNeg64 = DAG.getConstantFP(Val: 0x1.0p-64f, DL: SL, VT);
2958 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2959 SDValue ResultScale =
2960 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: TwoExpNeg64, N3: One);
2961
2962 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2, N2: ResultScale, Flags);
2963}
2964
2965SDValue AMDGPUTargetLowering::lowerFEXPUnsafeImpl(SDValue X, const SDLoc &SL,
2966 SelectionDAG &DAG,
2967 SDNodeFlags Flags,
2968 bool IsExp10) const {
2969 // exp(x) -> exp2(M_LOG2E_F * x);
2970 // exp10(x) -> exp2(log2(10) * x);
2971 EVT VT = X.getValueType();
2972 SDValue Const =
2973 DAG.getConstantFP(Val: IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, DL: SL, VT);
2974
2975 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: Const, Flags);
2976 return DAG.getNode(Opcode: VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2977 : (unsigned)ISD::FEXP2,
2978 DL: SL, VT, Operand: Mul, Flags);
2979}
2980
2981SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2982 SelectionDAG &DAG,
2983 SDNodeFlags Flags) const {
2984 EVT VT = X.getValueType();
2985 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, Src: X, Flags))
2986 return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);
2987
2988 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2989
2990 SDValue Threshold = DAG.getConstantFP(Val: -0x1.5d58a0p+6f, DL: SL, VT);
2991 SDValue NeedsScaling = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: Threshold, Cond: ISD::SETOLT);
2992
2993 SDValue ScaleOffset = DAG.getConstantFP(Val: 0x1.0p+6f, DL: SL, VT);
2994
2995 SDValue ScaledX = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: X, N2: ScaleOffset, Flags);
2996
2997 SDValue AdjustedX =
2998 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: ScaledX, N3: X);
2999
3000 const SDValue Log2E = DAG.getConstantFP(Val: numbers::log2e, DL: SL, VT);
3001 SDValue ExpInput = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: Log2E, Flags);
3002
3003 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: ExpInput, Flags);
3004
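  // The result scale factor 0x1.969d48p-93f is exp(-64), undoing the 64 that
  // was added to the input on the scaled path.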
3005 SDValue ResultScaleFactor = DAG.getConstantFP(Val: 0x1.969d48p-93f, DL: SL, VT);
3006 SDValue AdjustedResult =
3007 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2, N2: ResultScaleFactor, Flags);
3008
3009 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: AdjustedResult, N3: Exp2,
3010 Flags);
3011}
3012
3013 /// Emit an approximate-functions appropriate lowering for exp10. Inf/NaN
3014 /// should still be handled correctly.
3015SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
3016 SelectionDAG &DAG,
3017 SDNodeFlags Flags) const {
3018 const EVT VT = X.getValueType();
3019
3020 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
3021 : static_cast<unsigned>(ISD::FEXP2);
3022
3023 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, Src: X, Flags)) {
3024 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3025 SDValue K0 = DAG.getConstantFP(Val: 0x1.a92000p+1f, DL: SL, VT);
3026 SDValue K1 = DAG.getConstantFP(Val: 0x1.4f0978p-11f, DL: SL, VT);
3027
3028 SDValue Mul0 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: K0, Flags);
3029 SDValue Exp2_0 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul0, Flags);
3030 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: K1, Flags);
3031 SDValue Exp2_1 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul1, Flags);
3032 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2_0, N2: Exp2_1);
3033 }
3034
3035 // bool s = x < -0x1.2f7030p+5f;
3036 // x += s ? 0x1.0p+5f : 0.0f;
3037 // exp10 = exp2(x * 0x1.a92000p+1f) *
3038 // exp2(x * 0x1.4f0978p-11f) *
3039 // (s ? 0x1.9f623ep-107f : 1.0f);
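  //
  // The constants 0x1.a92000p+1f and 0x1.4f0978p-11f are a high/low split of
  // log2(10), so exp2(x*K0) * exp2(x*K1) == exp2(x*log2(10)) == 10^x while
  // keeping extra effective precision in the x*log2(10) product.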
3040
3041 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
3042
3043 SDValue Threshold = DAG.getConstantFP(Val: -0x1.2f7030p+5f, DL: SL, VT);
3044 SDValue NeedsScaling = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: Threshold, Cond: ISD::SETOLT);
3045
3046 SDValue ScaleOffset = DAG.getConstantFP(Val: 0x1.0p+5f, DL: SL, VT);
3047 SDValue ScaledX = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: X, N2: ScaleOffset, Flags);
3048 SDValue AdjustedX =
3049 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: ScaledX, N3: X);
3050
3051 SDValue K0 = DAG.getConstantFP(Val: 0x1.a92000p+1f, DL: SL, VT);
3052 SDValue K1 = DAG.getConstantFP(Val: 0x1.4f0978p-11f, DL: SL, VT);
3053
3054 SDValue Mul0 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: K0, Flags);
3055 SDValue Exp2_0 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul0, Flags);
3056 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: K1, Flags);
3057 SDValue Exp2_1 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul1, Flags);
3058
3059 SDValue MulExps = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2_0, N2: Exp2_1, Flags);
3060
3061 SDValue ResultScaleFactor = DAG.getConstantFP(Val: 0x1.9f623ep-107f, DL: SL, VT);
3062 SDValue AdjustedResult =
3063 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: MulExps, N2: ResultScaleFactor, Flags);
3064
3065 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: AdjustedResult, N3: MulExps,
3066 Flags);
3067}
3068
3069SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
3070 EVT VT = Op.getValueType();
3071 SDLoc SL(Op);
3072 SDValue X = Op.getOperand(i: 0);
3073 SDNodeFlags Flags = Op->getFlags();
3074 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3075
3076 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3077 // library behavior. Also, is known-not-daz source sufficient?
3078 if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
3079 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3080 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3081 }
3082
3083 if (VT.getScalarType() == MVT::f16) {
3084 if (VT.isVector())
3085 return SDValue();
3086
3087 // Nothing in half is a denormal when promoted to f32.
3088 //
3089 // exp(f16 x) ->
3090 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3091 //
3092 // exp10(f16 x) ->
3093 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
3094 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: X, Flags);
3095 SDValue Lowered = lowerFEXPUnsafeImpl(X: Ext, SL, DAG, Flags, IsExp10);
3096 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Lowered,
3097 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
3098 }
3099
3100 assert(VT == MVT::f32);
3101
3102 // Algorithm:
3103 //
3104 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3105 //
3106 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3107 // n = 64*m + j, 0 <= j < 64
3108 //
3109 // e^x = 2^((64*m + j + f)/64)
3110 // = (2^m) * (2^(j/64)) * 2^(f/64)
3111 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3112 //
3113 // f = x*(64/ln(2)) - n
3114 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3115 //
3116 // e^x = (2^m) * (2^(j/64)) * e^r
3117 //
3118 // (2^(j/64)) is precomputed
3119 //
3120 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3121 // e^r = 1 + q
3122 //
3123 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3124 //
3125 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
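  //
  // In the code below the 2^(j/64) table is not materialized: PH (plus the low
  // part PL) holds an extended-precision product x*log2(e) (or x*log2(10) for
  // exp10), E is its nearest integer, the hardware exp2 evaluates the
  // fractional part, and FLDEXP applies the final 2^E scaling.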
3126 SDNodeFlags FlagsNoContract = Flags;
3127 FlagsNoContract.setAllowContract(false);
3128
3129 SDValue PH, PL;
3130 if (Subtarget->hasFastFMAF32()) {
3131 const float c_exp = numbers::log2ef;
3132 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3133 const float c_exp10 = 0x1.a934f0p+1f;
3134 const float cc_exp10 = 0x1.2f346ep-24f;
3135
3136 SDValue C = DAG.getConstantFP(Val: IsExp10 ? c_exp10 : c_exp, DL: SL, VT);
3137 SDValue CC = DAG.getConstantFP(Val: IsExp10 ? cc_exp10 : cc_exp, DL: SL, VT);
3138
3139 PH = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: C, Flags);
3140 SDValue NegPH = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: PH, Flags);
3141 SDValue FMA0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: X, N2: C, N3: NegPH, Flags);
3142 PL = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: X, N2: CC, N3: FMA0, Flags);
3143 } else {
3144 const float ch_exp = 0x1.714000p+0f;
3145 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3146
3147 const float ch_exp10 = 0x1.a92000p+1f;
3148 const float cl_exp10 = 0x1.4f0978p-11f;
3149
3150 SDValue CH = DAG.getConstantFP(Val: IsExp10 ? ch_exp10 : ch_exp, DL: SL, VT);
3151 SDValue CL = DAG.getConstantFP(Val: IsExp10 ? cl_exp10 : cl_exp, DL: SL, VT);
3152
3153 SDValue XAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: X);
3154 SDValue MaskConst = DAG.getConstant(Val: 0xfffff000, DL: SL, VT: MVT::i32);
3155 SDValue XHAsInt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: XAsInt, N2: MaskConst);
3156 SDValue XH = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: XHAsInt);
3157 SDValue XL = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: X, N2: XH, Flags);
3158
3159 PH = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: XH, N2: CH, Flags);
3160
3161 SDValue XLCL = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: XL, N2: CL, Flags);
3162 SDValue Mad0 = getMad(DAG, SL, VT, X: XL, Y: CH, C: XLCL, Flags);
3163 PL = getMad(DAG, SL, VT, X: XH, Y: CL, C: Mad0, Flags);
3164 }
3165
3166 SDValue E = DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SL, VT, Operand: PH, Flags);
3167
3168 // It is unsafe to contract this fsub into the PH multiply.
3169 SDValue PHSubE = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: PH, N2: E, Flags: FlagsNoContract);
3170
3171 SDValue A = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: PHSubE, N2: PL, Flags);
3172 SDValue IntE = DAG.getNode(Opcode: ISD::FP_TO_SINT, DL: SL, VT: MVT::i32, Operand: E);
3173 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: A, Flags);
3174
3175 SDValue R = DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT, N1: Exp2, N2: IntE, Flags);
3176
3177 SDValue UnderflowCheckConst =
3178 DAG.getConstantFP(Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, DL: SL, VT);
3179
3180 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
3181 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
3182 SDValue Underflow =
3183 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: UnderflowCheckConst, Cond: ISD::SETOLT);
3184
3185 R = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Underflow, N2: Zero, N3: R);
3186
3187 if (!Flags.hasNoInfs()) {
3188 SDValue OverflowCheckConst =
3189 DAG.getConstantFP(Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, DL: SL, VT);
3190 SDValue Overflow =
3191 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: OverflowCheckConst, Cond: ISD::SETOGT);
3192 SDValue Inf =
3193 DAG.getConstantFP(Val: APFloat::getInf(Sem: APFloat::IEEEsingle()), DL: SL, VT);
3194 R = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Overflow, N2: Inf, N3: R);
3195 }
3196
3197 return R;
3198}
3199
3200static bool isCtlzOpc(unsigned Opc) {
3201 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3202}
3203
3204static bool isCttzOpc(unsigned Opc) {
3205 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3206}
3207
3208SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3209 SelectionDAG &DAG) const {
3210 auto SL = SDLoc(Op);
3211 auto Opc = Op.getOpcode();
3212 auto Arg = Op.getOperand(i: 0u);
3213 auto ResultVT = Op.getValueType();
3214
3215 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3216 return {};
3217
3218 assert(isCtlzOpc(Opc));
3219 assert(ResultVT == Arg.getValueType());
3220
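  // Promote the i8/i16 ctlz to a 32-bit one. For the zero-undef form, shift
  // the value into the top bits so the leading-zero count is unchanged; for
  // the general form, zero-extend and subtract the 32 - NumBits extra leading
  // zeros introduced by the extension.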
3221 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3222 SDValue NumExtBits = DAG.getConstant(Val: 32u - NumBits, DL: SL, VT: MVT::i32);
3223 SDValue NewOp;
3224
3225 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3226 NewOp = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Arg);
3227 NewOp = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: NewOp, N2: NumExtBits);
3228 NewOp = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, Operand: NewOp);
3229 } else {
3230 NewOp = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Arg);
3231 NewOp = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, Operand: NewOp);
3232 NewOp = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: NewOp, N2: NumExtBits);
3233 }
3234
3235 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ResultVT, Operand: NewOp);
3236}
3237
3238SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3239 SDLoc SL(Op);
3240 SDValue Src = Op.getOperand(i: 0);
3241
3242 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3243 bool Ctlz = isCtlzOpc(Opc: Op.getOpcode());
3244 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3245
3246 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3247 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3248 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3249
3250 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3251 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3252 // (cttz hi:lo) -> (umin (ffbl src), 32)
3253 // (ctlz_zero_undef src) -> (ffbh src)
3254 // (cttz_zero_undef src) -> (ffbl src)
3255
3256 // The 64-bit scalar version produces a 32-bit result:
3257 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3258 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3259 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3260 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3261 SDValue NewOpr = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Src);
3262 if (!ZeroUndef) {
3263 const SDValue ConstVal = DAG.getConstant(
3264 Val: Op.getValueType().getScalarSizeInBits(), DL: SL, VT: MVT::i32);
3265 NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewOpr, N2: ConstVal);
3266 }
3267 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: Src.getValueType(), Operand: NewOpr);
3268 }
3269
3270 SDValue Lo, Hi;
3271 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3272
3273 SDValue OprLo = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Lo);
3274 SDValue OprHi = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Hi);
3275
3276 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3277 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3278 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3279 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3280
3281 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3282 const SDValue Const32 = DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32);
3283 if (Ctlz)
3284 OprLo = DAG.getNode(Opcode: AddOpc, DL: SL, VT: MVT::i32, N1: OprLo, N2: Const32);
3285 else
3286 OprHi = DAG.getNode(Opcode: AddOpc, DL: SL, VT: MVT::i32, N1: OprHi, N2: Const32);
3287
3288 SDValue NewOpr;
3289 NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: OprLo, N2: OprHi);
3290 if (!ZeroUndef) {
3291 const SDValue Const64 = DAG.getConstant(Val: 64, DL: SL, VT: MVT::i32);
3292 NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewOpr, N2: Const64);
3293 }
3294
3295 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i64, Operand: NewOpr);
3296}
3297
3298SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3299 bool Signed) const {
3300 // The regular method of converting a 64-bit integer to float roughly consists
3301 // of 2 steps: normalization and rounding. In fact, after normalization, the
3302 // conversion from a 64-bit integer to a float is essentially the same as the
3303 // one from a 32-bit integer. The only difference is that it has more
3304 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3305 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3306 // converted into the correct float number. The basic steps for the unsigned
3307 // conversion are illustrated in the following pseudo code:
3308 //
3309 // f32 uitofp(i64 u) {
3310 // i32 hi, lo = split(u);
3311 // // Only count the leading zeros in hi as we have native support of the
3312 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3313 // // reduced to a 32-bit one automatically.
3314 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3315 // u <<= shamt;
3316 // hi, lo = split(u);
3317 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3318 // // convert it as a 32-bit integer and scale the result back.
3319 // return uitofp(hi) * 2^(32 - shamt);
3320 // }
3321 //
3322 // The signed conversion follows the same principle but uses 'ffbh_i32' to
3323 // count its sign bits instead. If 'ffbh_i32' is not available, the absolute
3324 // value is converted instead, followed by negation based on the sign bit.
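  //
  // For example, with u = 2^40 + 1: shamt = clz(2^8) = 23, the shifted hi is
  // 2^31 with a nonzero lo setting the sticky bit, and the result is
  // uitofp(2^31 | 1) * 2^(32 - 23), which rounds to 2^40 as expected.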
3325
3326 SDLoc SL(Op);
3327 SDValue Src = Op.getOperand(i: 0);
3328
3329 SDValue Lo, Hi;
3330 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3331 SDValue Sign;
3332 SDValue ShAmt;
3333 if (Signed && Subtarget->isGCN()) {
3334 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3335 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3336 // account. That is, the maximal shift is
3337 // - 32 if Lo and Hi have opposite signs;
3338 // - 33 if Lo and Hi have the same sign.
3339 //
3340 // Or, MaxShAmt = 33 + OppositeSign, where
3341 //
3342 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3343 // - -1 if Lo and Hi have opposite signs; and
3344 // - 0 otherwise.
3345 //
3346 // All in all, ShAmt is calculated as
3347 //
3348 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3349 //
3350 // or
3351 //
3352 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3353 //
3354 // to reduce the critical path.
3355 SDValue OppositeSign = DAG.getNode(
3356 Opcode: ISD::SRA, DL: SL, VT: MVT::i32, N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: Lo, N2: Hi),
3357 N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
3358 SDValue MaxShAmt =
3359 DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32),
3360 N2: OppositeSign);
3361 // Count the leading sign bits.
3362 ShAmt = DAG.getNode(Opcode: AMDGPUISD::FFBH_I32, DL: SL, VT: MVT::i32, Operand: Hi);
3363 // Different from unsigned conversion, the shift should be one bit less to
3364 // preserve the sign bit.
3365 ShAmt = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: ShAmt,
3366 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
3367 ShAmt = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: ShAmt, N2: MaxShAmt);
3368 } else {
3369 if (Signed) {
3370 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3371 // absolute value first.
3372 Sign = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i64, N1: Src,
3373 N2: DAG.getConstant(Val: 63, DL: SL, VT: MVT::i64));
3374 SDValue Abs =
3375 DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i64,
3376 N1: DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i64, N1: Src, N2: Sign), N2: Sign);
3377 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Abs, DAG);
3378 }
3379 // Count the leading zeros.
3380 ShAmt = DAG.getNode(Opcode: ISD::CTLZ, DL: SL, VT: MVT::i32, Operand: Hi);
3381 // The shift amount for signed integers is [0, 32].
3382 }
3383 // Normalize the given 64-bit integer.
3384 SDValue Norm = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i64, N1: Src, N2: ShAmt);
3385 // Split it again.
3386 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Norm, DAG);
3387 // Calculate the adjust bit for rounding.
3388 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3389 SDValue Adjust = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32,
3390 N1: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32), N2: Lo);
3391 // Get the 32-bit normalized integer.
3392 Norm = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Hi, N2: Adjust);
3393 // Convert the normalized 32-bit integer into f32.
3394
3395 bool UseLDEXP = isOperationLegal(Op: ISD::FLDEXP, VT: MVT::f32);
3396 unsigned Opc = Signed && UseLDEXP ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3397 SDValue FVal = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::f32, Operand: Norm);
3398
3399 // Finally, need to scale back the converted floating number as the original
3400 // 64-bit integer is converted as a 32-bit one.
3401 ShAmt = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32),
3402 N2: ShAmt);
3403 // On GCN, use LDEXP directly.
3404 if (UseLDEXP)
3405 return DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT: MVT::f32, N1: FVal, N2: ShAmt);
3406
3407 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3408 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3409 // exponent is enough to avoid overflowing into the sign bit.
3410 SDValue Exp = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: ShAmt,
3411 N2: DAG.getConstant(Val: 23, DL: SL, VT: MVT::i32));
3412 SDValue IVal =
3413 DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32,
3414 N1: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: FVal), N2: Exp);
3415 if (Signed) {
3416 // Set the sign bit.
3417 Sign = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32,
3418 N1: DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Sign),
3419 N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
3420 IVal = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: IVal, N2: Sign);
3421 }
3422 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: IVal);
3423}
3424
3425SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3426 bool Signed) const {
3427 SDLoc SL(Op);
3428 SDValue Src = Op.getOperand(i: 0);
3429
3430 SDValue Lo, Hi;
3431 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3432
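  // The 64-bit value is hi * 2^32 + lo; both halves convert to f64 exactly, so
  // the only rounding happens in the final FADD.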
3433 SDValue CvtHi = DAG.getNode(Opcode: Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3434 DL: SL, VT: MVT::f64, Operand: Hi);
3435
3436 SDValue CvtLo = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL: SL, VT: MVT::f64, Operand: Lo);
3437
3438 SDValue LdExp = DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT: MVT::f64, N1: CvtHi,
3439 N2: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32));
3440 // TODO: Should this propagate fast-math-flags?
3441 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: LdExp, N2: CvtLo);
3442}
3443
3444SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3445 SelectionDAG &DAG) const {
3446 // TODO: Factor out code common with LowerSINT_TO_FP.
3447 EVT DestVT = Op.getValueType();
3448 SDValue Src = Op.getOperand(i: 0);
3449 EVT SrcVT = Src.getValueType();
3450
3451 if (SrcVT == MVT::i16) {
3452 if (DestVT == MVT::f16)
3453 return Op;
3454 SDLoc DL(Op);
3455
3456 // Promote src to i32
3457 SDValue Ext = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: Src);
3458 return DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: DestVT, Operand: Ext);
3459 }
3460
3461 if (DestVT == MVT::bf16) {
3462 SDLoc SL(Op);
3463 SDValue ToF32 = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL: SL, VT: MVT::f32, Operand: Src);
3464 SDValue FPRoundFlag = DAG.getIntPtrConstant(Val: 0, DL: SL, /*isTarget=*/true);
3465 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::bf16, N1: ToF32, N2: FPRoundFlag);
3466 }
3467
3468 if (SrcVT != MVT::i64)
3469 return Op;
3470
3471 if (DestVT == MVT::f16 && isTypeLegal(VT: MVT::f16)) {
3472 SDLoc DL(Op);
3473
3474 SDValue IntToFp32 = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: MVT::f32, Operand: Src);
3475 SDValue FPRoundFlag =
3476 DAG.getIntPtrConstant(Val: 0, DL: SDLoc(Op), /*isTarget=*/true);
3477 SDValue FPRound =
3478 DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: IntToFp32, N2: FPRoundFlag);
3479
3480 return FPRound;
3481 }
3482
3483 if (DestVT == MVT::f32)
3484 return LowerINT_TO_FP32(Op, DAG, Signed: false);
3485
3486 assert(DestVT == MVT::f64);
3487 return LowerINT_TO_FP64(Op, DAG, Signed: false);
3488}
3489
3490SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3491 SelectionDAG &DAG) const {
3492 EVT DestVT = Op.getValueType();
3493
3494 SDValue Src = Op.getOperand(i: 0);
3495 EVT SrcVT = Src.getValueType();
3496
3497 if (SrcVT == MVT::i16) {
3498 if (DestVT == MVT::f16)
3499 return Op;
3500
3501 SDLoc DL(Op);
3502 // Promote src to i32
3503 SDValue Ext = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: MVT::i32, Operand: Src);
3504 return DAG.getNode(Opcode: ISD::SINT_TO_FP, DL, VT: DestVT, Operand: Ext);
3505 }
3506
3507 if (DestVT == MVT::bf16) {
3508 SDLoc SL(Op);
3509 SDValue ToF32 = DAG.getNode(Opcode: ISD::SINT_TO_FP, DL: SL, VT: MVT::f32, Operand: Src);
3510 SDValue FPRoundFlag = DAG.getIntPtrConstant(Val: 0, DL: SL, /*isTarget=*/true);
3511 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::bf16, N1: ToF32, N2: FPRoundFlag);
3512 }
3513
3514 if (SrcVT != MVT::i64)
3515 return Op;
3516
3517 // TODO: Factor out code common with LowerUINT_TO_FP.
3518
3519 if (DestVT == MVT::f16 && isTypeLegal(VT: MVT::f16)) {
3520 SDLoc DL(Op);
3521 SDValue Src = Op.getOperand(i: 0);
3522
3523 SDValue IntToFp32 = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: MVT::f32, Operand: Src);
3524 SDValue FPRoundFlag =
3525 DAG.getIntPtrConstant(Val: 0, DL: SDLoc(Op), /*isTarget=*/true);
3526 SDValue FPRound =
3527 DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: IntToFp32, N2: FPRoundFlag);
3528
3529 return FPRound;
3530 }
3531
3532 if (DestVT == MVT::f32)
3533 return LowerINT_TO_FP32(Op, DAG, Signed: true);
3534
3535 assert(DestVT == MVT::f64);
3536 return LowerINT_TO_FP64(Op, DAG, Signed: true);
3537}
3538
3539SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3540 bool Signed) const {
3541 SDLoc SL(Op);
3542
3543 SDValue Src = Op.getOperand(i: 0);
3544 EVT SrcVT = Src.getValueType();
3545
3546 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3547
3548 // The basic idea of converting a floating point number into a pair of 32-bit
3549 // integers is illustrated as follows:
3550 //
3551 // tf := trunc(val);
3552 // hif := floor(tf * 2^-32);
3553 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3554 // hi := fptoi(hif);
3555 // lo := fptoi(lof);
3556 //
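  // For example, for an f64 val = 2^33 + 7: tf = val, hif = floor(val * 2^-32)
  // = 2 and lof = val - 2 * 2^32 = 7, giving hi = 2 and lo = 7, i.e. the i64
  // value 2^33 + 7.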
3557 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: SrcVT, Operand: Src);
3558 SDValue Sign;
3559 if (Signed && SrcVT == MVT::f32) {
3560 // However, a 32-bit floating point number has only a 23-bit mantissa, which
3561 // is not enough to hold all the significant bits of `lof` if val is
3562 // negative. To avoid the loss of precision, we take the absolute value
3563 // after truncating and flip the result back based on the original
3564 // signedness.
3565 Sign = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i32,
3566 N1: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Trunc),
3567 N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
3568 Trunc = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: SrcVT, Operand: Trunc);
3569 }
3570
3571 SDValue K0, K1;
3572 if (SrcVT == MVT::f64) {
3573 K0 = DAG.getConstantFP(
3574 Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), DL: SL,
3575 VT: SrcVT);
3576 K1 = DAG.getConstantFP(
3577 Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), DL: SL,
3578 VT: SrcVT);
3579 } else {
3580 K0 = DAG.getConstantFP(
3581 Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), DL: SL, VT: SrcVT);
3582 K1 = DAG.getConstantFP(
3583 Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), DL: SL, VT: SrcVT);
3584 }
3585 // TODO: Should this propagate fast-math-flags?
3586 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: SrcVT, N1: Trunc, N2: K0);
3587
3588 SDValue FloorMul = DAG.getNode(Opcode: ISD::FFLOOR, DL: SL, VT: SrcVT, Operand: Mul);
3589
3590 SDValue Fma = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: SrcVT, N1: FloorMul, N2: K1, N3: Trunc);
3591
3592 SDValue Hi = DAG.getNode(Opcode: (Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3593 : ISD::FP_TO_UINT,
3594 DL: SL, VT: MVT::i32, Operand: FloorMul);
3595 SDValue Lo = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL: SL, VT: MVT::i32, Operand: Fma);
3596
3597 SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64,
3598 Operand: DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Lo, Hi}));
3599
3600 if (Signed && SrcVT == MVT::f32) {
3601 assert(Sign);
3602 // Flip the result based on the signedness, which is either all 0s or 1s.
3603 Sign = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64,
3604 Operand: DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Sign, Sign}));
3605 // r := xor(r, sign) - sign;
3606 Result =
3607 DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i64,
3608 N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i64, N1: Result, N2: Sign), N2: Sign);
3609 }
3610
3611 return Result;
3612}
3613
3614SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3615 SDLoc DL(Op);
3616 SDValue N0 = Op.getOperand(i: 0);
3617
3618 // Convert to target node to get known bits
3619 if (N0.getValueType() == MVT::f32)
3620 return DAG.getNode(Opcode: AMDGPUISD::FP_TO_FP16, DL, VT: Op.getValueType(), Operand: N0);
3621
3622 if (Op->getFlags().hasApproximateFuncs()) {
3623 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3624 return SDValue();
3625 }
3626
3627 return LowerF64ToF16Safe(Src: N0, DL, DAG);
3628}
3629
3630 // Returns the f64 -> f16 conversion result as an i32 node.
3631SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL,
3632 SelectionDAG &DAG) const {
3633 assert(Src.getSimpleValueType() == MVT::f64);
3634
3635 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3636 // TODO: We can generate better code for True16.
3637 const unsigned ExpMask = 0x7ff;
3638 const unsigned ExpBiasf64 = 1023;
3639 const unsigned ExpBiasf16 = 15;
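  // f64 has an 11-bit exponent (mask 0x7ff) biased by 1023 and a 52-bit
  // mantissa; f16 has a 5-bit exponent biased by 15 and a 10-bit mantissa, so
  // the code below rebiases the exponent and rounds the mantissa by hand.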
3640 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
3641 SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
3642 SDValue U = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: Src);
3643 SDValue UH = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64, N1: U,
3644 N2: DAG.getConstant(Val: 32, DL, VT: MVT::i64));
3645 UH = DAG.getZExtOrTrunc(Op: UH, DL, VT: MVT::i32);
3646 U = DAG.getZExtOrTrunc(Op: U, DL, VT: MVT::i32);
3647 SDValue E = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3648 N2: DAG.getConstant(Val: 20, DL, VT: MVT::i64));
3649 E = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: E,
3650 N2: DAG.getConstant(Val: ExpMask, DL, VT: MVT::i32));
3651 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3652 // add the f16 bias (15) to get the biased exponent for the f16 format.
3653 E = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: E,
3654 N2: DAG.getConstant(Val: -ExpBiasf64 + ExpBiasf16, DL, VT: MVT::i32));
3655
3656 SDValue M = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3657 N2: DAG.getConstant(Val: 8, DL, VT: MVT::i32));
3658 M = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: M,
3659 N2: DAG.getConstant(Val: 0xffe, DL, VT: MVT::i32));
3660
3661 SDValue MaskedSig = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: UH,
3662 N2: DAG.getConstant(Val: 0x1ff, DL, VT: MVT::i32));
3663 MaskedSig = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: MaskedSig, N2: U);
3664
3665 SDValue Lo40Set = DAG.getSelectCC(DL, LHS: MaskedSig, RHS: Zero, True: Zero, False: One, Cond: ISD::SETEQ);
3666 M = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M, N2: Lo40Set);
3667
3668 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3669 SDValue I = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32,
3670 N1: DAG.getSelectCC(DL, LHS: M, RHS: Zero, True: DAG.getConstant(Val: 0x0200, DL, VT: MVT::i32),
3671 False: Zero, Cond: ISD::SETNE), N2: DAG.getConstant(Val: 0x7c00, DL, VT: MVT::i32));
3672
3673 // N = M | (E << 12);
3674 SDValue N = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M,
3675 N2: DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: E,
3676 N2: DAG.getConstant(Val: 12, DL, VT: MVT::i32)));
3677
3678 // B = clamp(1-E, 0, 13);
3679 SDValue OneSubExp = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i32,
3680 N1: One, N2: E);
3681 SDValue B = DAG.getNode(Opcode: ISD::SMAX, DL, VT: MVT::i32, N1: OneSubExp, N2: Zero);
3682 B = DAG.getNode(Opcode: ISD::SMIN, DL, VT: MVT::i32, N1: B,
3683 N2: DAG.getConstant(Val: 13, DL, VT: MVT::i32));
3684
3685 SDValue SigSetHigh = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M,
3686 N2: DAG.getConstant(Val: 0x1000, DL, VT: MVT::i32));
3687
3688 SDValue D = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: SigSetHigh, N2: B);
3689 SDValue D0 = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: D, N2: B);
3690 SDValue D1 = DAG.getSelectCC(DL, LHS: D0, RHS: SigSetHigh, True: One, False: Zero, Cond: ISD::SETNE);
3691 D = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: D, N2: D1);
3692
3693 SDValue V = DAG.getSelectCC(DL, LHS: E, RHS: One, True: D, False: N, Cond: ISD::SETLT);
3694 SDValue VLow3 = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: V,
3695 N2: DAG.getConstant(Val: 0x7, DL, VT: MVT::i32));
3696 V = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: V,
3697 N2: DAG.getConstant(Val: 2, DL, VT: MVT::i32));
3698 SDValue V0 = DAG.getSelectCC(DL, LHS: VLow3, RHS: DAG.getConstant(Val: 3, DL, VT: MVT::i32),
3699 True: One, False: Zero, Cond: ISD::SETEQ);
3700 SDValue V1 = DAG.getSelectCC(DL, LHS: VLow3, RHS: DAG.getConstant(Val: 5, DL, VT: MVT::i32),
3701 True: One, False: Zero, Cond: ISD::SETGT);
3702 V1 = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: V0, N2: V1);
3703 V = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: V, N2: V1);
3704
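  // E > 30 overflows f16's finite exponent range, so clamp to infinity
  // (0x7c00). E == 1039 (i.e. 2047 - 1023 + 15) means the f64 exponent field
  // was all ones, so the input was Inf/NaN and I computed above is used.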
3705 V = DAG.getSelectCC(DL, LHS: E, RHS: DAG.getConstant(Val: 30, DL, VT: MVT::i32),
3706 True: DAG.getConstant(Val: 0x7c00, DL, VT: MVT::i32), False: V, Cond: ISD::SETGT);
3707 V = DAG.getSelectCC(DL, LHS: E, RHS: DAG.getConstant(Val: 1039, DL, VT: MVT::i32),
3708 True: I, False: V, Cond: ISD::SETEQ);
3709
3710 // Extract the sign bit.
3711 SDValue Sign = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3712 N2: DAG.getConstant(Val: 16, DL, VT: MVT::i32));
3713 Sign = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: Sign,
3714 N2: DAG.getConstant(Val: 0x8000, DL, VT: MVT::i32));
3715
3716 return DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Sign, N2: V);
3717}
3718
3719SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3720 SelectionDAG &DAG) const {
3721 SDValue Src = Op.getOperand(i: 0);
3722 unsigned OpOpcode = Op.getOpcode();
3723 EVT SrcVT = Src.getValueType();
3724 EVT DestVT = Op.getValueType();
3725
3726 // Will be selected natively
3727 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3728 return Op;
3729
3730 if (SrcVT == MVT::bf16) {
3731 SDLoc DL(Op);
3732 SDValue PromotedSrc = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: Src);
3733 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DestVT, Operand: PromotedSrc);
3734 }
3735
3736 // Promote i16 to i32
3737 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3738 SDLoc DL(Op);
3739
3740 SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, Operand: Src);
3741 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToInt32);
3742 }
3743
3744 if (DestVT != MVT::i64)
3745 return Op;
3746
3747 if (SrcVT == MVT::f16 ||
3748 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3749 SDLoc DL(Op);
3750
3751 SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, Operand: Src);
3752 unsigned Ext =
3753 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3754 return DAG.getNode(Opcode: Ext, DL, VT: MVT::i64, Operand: FpToInt32);
3755 }
3756
3757 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3758 return LowerFP_TO_INT64(Op, DAG, Signed: OpOpcode == ISD::FP_TO_SINT);
3759
3760 return SDValue();
3761}
3762
3763SDValue AMDGPUTargetLowering::LowerFP_TO_INT_SAT(const SDValue Op,
3764 SelectionDAG &DAG) const {
3765 SDValue Src = Op.getOperand(i: 0);
3766 unsigned OpOpcode = Op.getOpcode();
3767 EVT SrcVT = Src.getValueType();
3768 EVT DstVT = Op.getValueType();
3769 SDValue SatVTOp = Op.getNode()->getOperand(Num: 1);
3770 EVT SatVT = cast<VTSDNode>(Val&: SatVTOp)->getVT();
3771 SDLoc DL(Op);
3772
3773 uint64_t DstWidth = DstVT.getScalarSizeInBits();
3774 uint64_t SatWidth = SatVT.getScalarSizeInBits();
3775 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
3776
3777 // Will be selected natively
3778 if (DstVT == MVT::i32 && SatWidth == DstWidth &&
3779 (SrcVT == MVT::f32 || SrcVT == MVT::f64))
3780 return Op;
3781
3782 const SDValue Int32VT = DAG.getValueType(MVT::i32);
3783
3784 // Perform all saturation at i32 and truncate
3785 if (SatWidth < DstWidth) {
3786 const uint64_t Int32Width = 32;
3787 SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, N1: Src, N2: Int32VT);
3788 SDValue Int32SatVal;
3789
3790 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
3791 SDValue MinConst = DAG.getConstant(
3792 Val: APInt::getSignedMaxValue(numBits: SatWidth).sext(width: Int32Width), DL, VT: MVT::i32);
3793 SDValue MaxConst = DAG.getConstant(
3794 Val: APInt::getSignedMinValue(numBits: SatWidth).sext(width: Int32Width), DL, VT: MVT::i32);
3795 SDValue MinVal =
3796 DAG.getNode(Opcode: ISD::SMIN, DL, VT: MVT::i32, N1: FpToInt32, N2: MinConst);
3797 Int32SatVal = DAG.getNode(Opcode: ISD::SMAX, DL, VT: MVT::i32, N1: MinVal, N2: MaxConst);
3798 } else {
3799 SDValue MinConst = DAG.getConstant(
3800 Val: APInt::getMaxValue(numBits: SatWidth).zext(width: Int32Width), DL, VT: MVT::i32);
3801 Int32SatVal = DAG.getNode(Opcode: ISD::UMIN, DL, VT: MVT::i32, N1: FpToInt32, N2: MinConst);
3802 }
3803
3804 if (DstWidth == Int32Width)
3805 return Int32SatVal;
3806 if (DstWidth < Int32Width)
3807 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: Int32SatVal);
3808
3809 // DstWidth > Int32Width
3810 const unsigned Ext =
3811 OpOpcode == ISD::FP_TO_SINT_SAT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3812 return DAG.getNode(Opcode: Ext, DL, VT: DstVT, Operand: FpToInt32);
3813 }
3814
3815 // SatWidth == DstWidth
3816
3817 // Saturate at i32 for i64 dst and 16b src (will invoke f16 promotion below)
3818 if (DstVT == MVT::i64 &&
3819 (SrcVT == MVT::f16 || SrcVT == MVT::bf16 ||
3820 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP))) {
3821 return DAG.getNode(Opcode: OpOpcode, DL, VT: DstVT, N1: Src, N2: Int32VT);
3822 }
3823
3824 // Promote f16/bf16 src to f32
3825 if (SrcVT == MVT::f16 || SrcVT == MVT::bf16) {
3826 SDValue PromotedSrc = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: Src);
3827 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: PromotedSrc, N2: SatVTOp);
3828 }
3829
3830 // Promote sub-i32 dst to i32 with sub-i32 saturation
3831 if (DstWidth < 32) {
3832 // Note: this triggers the SatWidth < DstWidth path above to generate a
3833 // saturated truncate by requesting an MVT::i32 destination with SatWidth < 32.
3834 SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, N1: Src, N2: SatVTOp);
3835 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: FpToInt32);
3836 }
3837
3838 // TODO: can we implement i64 dst for f32/f64?
3839
3840 return SDValue();
3841}
3842
3843SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3844 SelectionDAG &DAG) const {
3845 EVT ExtraVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
3846 MVT VT = Op.getSimpleValueType();
3847 MVT ScalarVT = VT.getScalarType();
3848
3849 assert(VT.isVector());
3850
3851 SDValue Src = Op.getOperand(i: 0);
3852 SDLoc DL(Op);
3853
3854 // TODO: Don't scalarize on Evergreen?
3855 unsigned NElts = VT.getVectorNumElements();
3856 SmallVector<SDValue, 8> Args;
3857 DAG.ExtractVectorElements(Op: Src, Args, Start: 0, Count: NElts);
3858
3859 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3860 for (unsigned I = 0; I < NElts; ++I)
3861 Args[I] = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: ScalarVT, N1: Args[I], N2: VTOp);
3862
3863 return DAG.getBuildVector(VT, DL, Ops: Args);
3864}
3865
3866//===----------------------------------------------------------------------===//
3867// Custom DAG optimizations
3868//===----------------------------------------------------------------------===//
3869
3870static bool isU24(SDValue Op, SelectionDAG &DAG) {
3871 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3872}
3873
3874static bool isI24(SDValue Op, SelectionDAG &DAG) {
3875 EVT VT = Op.getValueType();
3876 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3877 // as unsigned 24-bit values.
3878 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3879}
3880
3881static SDValue simplifyMul24(SDNode *Node24,
3882 TargetLowering::DAGCombinerInfo &DCI) {
3883 SelectionDAG &DAG = DCI.DAG;
3884 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3885 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3886
3887 SDValue LHS = IsIntrin ? Node24->getOperand(Num: 1) : Node24->getOperand(Num: 0);
3888 SDValue RHS = IsIntrin ? Node24->getOperand(Num: 2) : Node24->getOperand(Num: 1);
3889 unsigned NewOpcode = Node24->getOpcode();
3890 if (IsIntrin) {
3891 unsigned IID = Node24->getConstantOperandVal(Num: 0);
3892 switch (IID) {
3893 case Intrinsic::amdgcn_mul_i24:
3894 NewOpcode = AMDGPUISD::MUL_I24;
3895 break;
3896 case Intrinsic::amdgcn_mul_u24:
3897 NewOpcode = AMDGPUISD::MUL_U24;
3898 break;
3899 case Intrinsic::amdgcn_mulhi_i24:
3900 NewOpcode = AMDGPUISD::MULHI_I24;
3901 break;
3902 case Intrinsic::amdgcn_mulhi_u24:
3903 NewOpcode = AMDGPUISD::MULHI_U24;
3904 break;
3905 default:
3906 llvm_unreachable("Expected 24-bit mul intrinsic");
3907 }
3908 }
3909
3910 APInt Demanded = APInt::getLowBitsSet(numBits: LHS.getValueSizeInBits(), loBitsSet: 24);
3911
3912 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3913 // the operands to have other uses, but will only perform simplifications that
3914 // involve bypassing some nodes for this user.
3915 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(Op: LHS, DemandedBits: Demanded, DAG);
3916 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(Op: RHS, DemandedBits: Demanded, DAG);
3917 if (DemandedLHS || DemandedRHS)
3918 return DAG.getNode(Opcode: NewOpcode, DL: SDLoc(Node24), VTList: Node24->getVTList(),
3919 N1: DemandedLHS ? DemandedLHS : LHS,
3920 N2: DemandedRHS ? DemandedRHS : RHS);
3921
3922 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3923 // operands if this node is the only user.
3924 if (TLI.SimplifyDemandedBits(Op: LHS, DemandedBits: Demanded, DCI))
3925 return SDValue(Node24, 0);
3926 if (TLI.SimplifyDemandedBits(Op: RHS, DemandedBits: Demanded, DCI))
3927 return SDValue(Node24, 0);
3928
3929 return SDValue();
3930}
3931
3932template <typename IntTy>
3933static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3934 uint32_t Width, const SDLoc &DL) {
3935 if (Width + Offset < 32) {
3936 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3937 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3938 if constexpr (std::is_signed_v<IntTy>) {
3939 return DAG.getSignedConstant(Val: Result, DL, VT: MVT::i32);
3940 } else {
3941 return DAG.getConstant(Result, DL, MVT::i32);
3942 }
3943 }
3944
3945 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3946}
3947
3948static bool hasVolatileUser(SDNode *Val) {
3949 for (SDNode *U : Val->users()) {
3950 if (MemSDNode *M = dyn_cast<MemSDNode>(Val: U)) {
3951 if (M->isVolatile())
3952 return true;
3953 }
3954 }
3955
3956 return false;
3957}
3958
3959bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3960 // i32 vectors are the canonical memory type.
3961 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3962 return false;
3963
3964 if (!VT.isByteSized())
3965 return false;
3966
3967 unsigned Size = VT.getStoreSize();
3968
3969 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3970 return false;
3971
3972 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3973 return false;
3974
3975 return true;
3976}
3977
3978// Replace load of an illegal type with a bitcast from a load of a friendlier
3979// type.
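// For instance (illustrative only), a v4i8 load would become an i32 load
// followed by a bitcast back to v4i8, since i32 is the canonical memory type.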
3980SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3981 DAGCombinerInfo &DCI) const {
3982 if (!DCI.isBeforeLegalize())
3983 return SDValue();
3984
3985 LoadSDNode *LN = cast<LoadSDNode>(Val: N);
3986 if (!LN->isSimple() || !ISD::isNormalLoad(N: LN) || hasVolatileUser(Val: LN))
3987 return SDValue();
3988
3989 SDLoc SL(N);
3990 SelectionDAG &DAG = DCI.DAG;
3991 EVT VT = LN->getMemoryVT();
3992
3993 unsigned Size = VT.getStoreSize();
3994 Align Alignment = LN->getAlign();
3995 if (Alignment < Size && isTypeLegal(VT)) {
3996 unsigned IsFast;
3997 unsigned AS = LN->getAddressSpace();
3998
3999 // Expand unaligned loads earlier than legalization. Due to visitation order
4000 // problems during legalization, the emitted instructions to pack and unpack
4001 // the bytes again are not eliminated in the case of an unaligned copy.
4002 if (!allowsMisalignedMemoryAccesses(
4003 VT, AddrSpace: AS, Alignment, Flags: LN->getMemOperand()->getFlags(), &IsFast)) {
4004 if (VT.isVector())
4005 return SplitVectorLoad(Op: SDValue(LN, 0), DAG);
4006
4007 SDValue Ops[2];
4008 std::tie(args&: Ops[0], args&: Ops[1]) = expandUnalignedLoad(LD: LN, DAG);
4009
4010 return DAG.getMergeValues(Ops, dl: SDLoc(N));
4011 }
4012
4013 if (!IsFast)
4014 return SDValue();
4015 }
4016
4017 if (!shouldCombineMemoryType(VT))
4018 return SDValue();
4019
4020 EVT NewVT = getEquivalentMemType(Ctx&: *DAG.getContext(), VT);
4021
4022 SDValue NewLoad
4023 = DAG.getLoad(VT: NewVT, dl: SL, Chain: LN->getChain(),
4024 Ptr: LN->getBasePtr(), MMO: LN->getMemOperand());
4025
4026 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewLoad);
4027 DCI.CombineTo(N, Res0: BC, Res1: NewLoad.getValue(R: 1));
4028 return SDValue(N, 0);
4029}
4030
4031// Replace store of an illegal type with a store of a bitcast to a friendlier
4032// type.
4033SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
4034 DAGCombinerInfo &DCI) const {
4035 if (!DCI.isBeforeLegalize())
4036 return SDValue();
4037
4038 StoreSDNode *SN = cast<StoreSDNode>(Val: N);
4039 if (!SN->isSimple() || !ISD::isNormalStore(N: SN))
4040 return SDValue();
4041
4042 EVT VT = SN->getMemoryVT();
4043 unsigned Size = VT.getStoreSize();
4044
4045 SDLoc SL(N);
4046 SelectionDAG &DAG = DCI.DAG;
4047 Align Alignment = SN->getAlign();
4048 if (Alignment < Size && isTypeLegal(VT)) {
4049 unsigned IsFast;
4050 unsigned AS = SN->getAddressSpace();
4051
4052 // Expand unaligned stores earlier than legalization. Due to visitation
4053 // order problems during legalization, the emitted instructions to pack and
4054 // unpack the bytes again are not eliminated in the case of an unaligned
4055 // copy.
4056 if (!allowsMisalignedMemoryAccesses(
4057 VT, AddrSpace: AS, Alignment, Flags: SN->getMemOperand()->getFlags(), &IsFast)) {
4058 if (VT.isVector())
4059 return SplitVectorStore(Op: SDValue(SN, 0), DAG);
4060
4061 return expandUnalignedStore(ST: SN, DAG);
4062 }
4063
4064 if (!IsFast)
4065 return SDValue();
4066 }
4067
4068 if (!shouldCombineMemoryType(VT))
4069 return SDValue();
4070
4071 EVT NewVT = getEquivalentMemType(Ctx&: *DAG.getContext(), VT);
4072 SDValue Val = SN->getValue();
4073
4074 //DCI.AddToWorklist(Val.getNode());
4075
4076 bool OtherUses = !Val.hasOneUse();
4077 SDValue CastVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: Val);
4078 if (OtherUses) {
4079 SDValue CastBack = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: CastVal);
4080 DAG.ReplaceAllUsesOfValueWith(From: Val, To: CastBack);
4081 }
4082
4083 return DAG.getStore(Chain: SN->getChain(), dl: SL, Val: CastVal,
4084 Ptr: SN->getBasePtr(), MMO: SN->getMemOperand());
4085}
4086
4087// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
4088// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
4089// issues.
4090SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
4091 DAGCombinerInfo &DCI) const {
4092 SelectionDAG &DAG = DCI.DAG;
4093 SDValue N0 = N->getOperand(Num: 0);
4094
4095 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
4096 // (vt2 (truncate (assertzext vt0:x, vt1)))
4097 if (N0.getOpcode() == ISD::TRUNCATE) {
4098 SDValue N1 = N->getOperand(Num: 1);
4099 EVT ExtVT = cast<VTSDNode>(Val&: N1)->getVT();
4100 SDLoc SL(N);
4101
4102 SDValue Src = N0.getOperand(i: 0);
4103 EVT SrcVT = Src.getValueType();
4104 if (SrcVT.bitsGE(VT: ExtVT)) {
4105 SDValue NewInReg = DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: SrcVT, N1: Src, N2: N1);
4106 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: N->getValueType(ResNo: 0), Operand: NewInReg);
4107 }
4108 }
4109
4110 return SDValue();
4111}
4112
4113SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
4114 SDNode *N, DAGCombinerInfo &DCI) const {
4115 unsigned IID = N->getConstantOperandVal(Num: 0);
4116 switch (IID) {
4117 case Intrinsic::amdgcn_mul_i24:
4118 case Intrinsic::amdgcn_mul_u24:
4119 case Intrinsic::amdgcn_mulhi_i24:
4120 case Intrinsic::amdgcn_mulhi_u24:
4121 return simplifyMul24(Node24: N, DCI);
4122 case Intrinsic::amdgcn_fract:
4123 case Intrinsic::amdgcn_rsq:
4124 case Intrinsic::amdgcn_rcp_legacy:
4125 case Intrinsic::amdgcn_rsq_legacy:
4126 case Intrinsic::amdgcn_rsq_clamp:
4127 case Intrinsic::amdgcn_tanh:
4128 case Intrinsic::amdgcn_prng_b32: {
4129 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
4130 SDValue Src = N->getOperand(Num: 1);
4131 return Src.isUndef() ? Src : SDValue();
4132 }
4133 case Intrinsic::amdgcn_frexp_exp: {
4134 // frexp_exp (fneg x) -> frexp_exp x
4135 // frexp_exp (fabs x) -> frexp_exp x
4136 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4137 SDValue Src = N->getOperand(Num: 1);
4138 SDValue PeekSign = peekFPSignOps(Val: Src);
4139 if (PeekSign == Src)
4140 return SDValue();
4141 return SDValue(DCI.DAG.UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), Op2: PeekSign),
4142 0);
4143 }
4144 default:
4145 return SDValue();
4146 }
4147}
4148
4149/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4150/// binary operation \p Opc to it with the corresponding constant operands.
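/// For example (illustration only), with \p Opc = ISD::AND and the constant
/// 0x0000ffff00000000, this produces (and lo_32(LHS), 0) and
/// (and hi_32(LHS), 0xffff), rebuilt as a v2i32 and bitcast back to i64.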
4151SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
4152 DAGCombinerInfo &DCI, const SDLoc &SL,
4153 unsigned Opc, SDValue LHS,
4154 uint32_t ValLo, uint32_t ValHi) const {
4155 SelectionDAG &DAG = DCI.DAG;
4156 SDValue Lo, Hi;
4157 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: LHS, DAG);
4158
4159 SDValue LoRHS = DAG.getConstant(Val: ValLo, DL: SL, VT: MVT::i32);
4160 SDValue HiRHS = DAG.getConstant(Val: ValHi, DL: SL, VT: MVT::i32);
4161
4162 SDValue LoAnd = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, N1: Lo, N2: LoRHS);
4163 SDValue HiAnd = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, N1: Hi, N2: HiRHS);
4164
4165 // Re-visit the ands. It's possible we eliminated one of them and it could
4166 // simplify the vector.
4167 DCI.AddToWorklist(N: Lo.getNode());
4168 DCI.AddToWorklist(N: Hi.getNode());
4169
4170 SDValue Vec = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {LoAnd, HiAnd});
4171 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
4172}
4173
4174SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
4175 DAGCombinerInfo &DCI) const {
4176 EVT VT = N->getValueType(ResNo: 0);
4177 SDValue LHS = N->getOperand(Num: 0);
4178 SDValue RHS = N->getOperand(Num: 1);
4179 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
4180 SDLoc SL(N);
4181 SelectionDAG &DAG = DCI.DAG;
4182
4183 unsigned RHSVal;
4184 if (CRHS) {
4185 RHSVal = CRHS->getZExtValue();
4186 if (!RHSVal)
4187 return LHS;
4188
4189 switch (LHS->getOpcode()) {
4190 default:
4191 break;
4192 case ISD::ZERO_EXTEND:
4193 case ISD::SIGN_EXTEND:
4194 case ISD::ANY_EXTEND: {
4195 SDValue X = LHS->getOperand(Num: 0);
4196
4197 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4198 isOperationLegal(Op: ISD::BUILD_VECTOR, VT: MVT::v2i16)) {
4199 // Prefer build_vector as the canonical form if packed types are legal.
4200 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
4201 SDValue Vec = DAG.getBuildVector(
4202 VT: MVT::v2i16, DL: SL,
4203 Ops: {DAG.getConstant(Val: 0, DL: SL, VT: MVT::i16), LHS->getOperand(Num: 0)});
4204 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Vec);
4205 }
4206
4207 // shl (ext x) => zext (shl x), if shift does not overflow int
4208 if (VT != MVT::i64)
4209 break;
4210 KnownBits Known = DAG.computeKnownBits(Op: X);
4211 unsigned LZ = Known.countMinLeadingZeros();
4212 if (LZ < RHSVal)
4213 break;
4214 EVT XVT = X.getValueType();
4215 SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: XVT, N1: X, N2: SDValue(CRHS, 0));
4216 return DAG.getZExtOrTrunc(Op: Shl, DL: SL, VT);
4217 }
4218 }
4219 }
4220
4221 if (VT.getScalarType() != MVT::i64)
4222 return SDValue();
4223
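// For C >= 32 (mirroring the SRA/SRL cases below; shown here for reference):
//   i64 (shl x, C) -> (build_pair 0, (shl lo_32(x), C - 32))
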
4224 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4225 // common case, splitting this into a move and a 32-bit shift is faster and
4226 // the same code size.
4227 KnownBits Known = DAG.computeKnownBits(Op: RHS);
4228
4229 EVT ElementType = VT.getScalarType();
4230 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
4231 EVT TargetType = VT.changeElementType(Context&: *DAG.getContext(), EltVT: TargetScalarType);
4232
4233 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4234 return SDValue();
4235 SDValue ShiftAmt;
4236
4237 if (CRHS) {
4238 ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
4239 VT: TargetType);
4240 } else {
4241 SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
4242 const SDValue ShiftMask =
4243 DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
4244 // This AND instruction will clamp out of bounds shift values.
4245 // It will also be removed during later instruction selection.
4246 ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
4247 }
4248
4249 SDValue Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: LHS);
4250 SDValue NewShift =
4251 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: TargetType, N1: Lo, N2: ShiftAmt, Flags: N->getFlags());
4252
4253 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: TargetScalarType);
4254 SDValue Vec;
4255
4256 if (VT.isVector()) {
4257 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
4258 unsigned NElts = TargetType.getVectorNumElements();
4259 SmallVector<SDValue, 8> HiOps;
4260 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4261
4262 DAG.ExtractVectorElements(Op: NewShift, Args&: HiOps, Start: 0, Count: NElts);
4263 for (unsigned I = 0; I != NElts; ++I)
4264 HiAndLoOps[2 * I + 1] = HiOps[I];
4265 Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
4266 } else {
4267 EVT ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
4268 Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {Zero, NewShift});
4269 }
4270 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
4271}
4272
4273SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4274 DAGCombinerInfo &DCI) const {
4275 SDValue RHS = N->getOperand(Num: 1);
4276 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
4277 EVT VT = N->getValueType(ResNo: 0);
4278 SDValue LHS = N->getOperand(Num: 0);
4279 SelectionDAG &DAG = DCI.DAG;
4280 SDLoc SL(N);
4281
4282 if (VT.getScalarType() != MVT::i64)
4283 return SDValue();
4284
4285 // For C >= 32
4286 // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), (sra hi_32(x), 31))
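// e.g. (illustration) i64 (sra x, 40)
//   -> (build_pair (sra hi_32(x), 8), (sra hi_32(x), 31))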
4287
4288 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4289 // common case, splitting this into a move and a 32-bit shift is faster and
4290 // the same code size.
4291 KnownBits Known = DAG.computeKnownBits(Op: RHS);
4292
4293 EVT ElementType = VT.getScalarType();
4294 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
4295 EVT TargetType = VT.changeElementType(Context&: *DAG.getContext(), EltVT: TargetScalarType);
4296
4297 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4298 return SDValue();
4299
4300 SDValue ShiftFullAmt =
4301 DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
4302 SDValue ShiftAmt;
4303 if (CRHS) {
4304 unsigned RHSVal = CRHS->getZExtValue();
4305 ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
4306 VT: TargetType);
4307 } else if (Known.getMinValue().getZExtValue() ==
4308 (ElementType.getSizeInBits() - 1)) {
4309 ShiftAmt = ShiftFullAmt;
4310 } else {
4311 SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
4312 const SDValue ShiftMask =
4313 DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
4314 // This AND instruction will clamp out of bounds shift values.
4315 // It will also be removed during later instruction selection.
4316 ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
4317 }
4318
4319 EVT ConcatType;
4320 SDValue Hi;
4321 SDLoc LHSSL(LHS);
4322 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4323 if (VT.isVector()) {
4324 unsigned NElts = TargetType.getVectorNumElements();
4325 ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
4326 SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
4327 SmallVector<SDValue, 8> HiOps(NElts);
4328 SmallVector<SDValue, 16> HiAndLoOps;
4329
4330 DAG.ExtractVectorElements(Op: SplitLHS, Args&: HiAndLoOps, Start: 0, Count: NElts * 2);
4331 for (unsigned I = 0; I != NElts; ++I) {
4332 HiOps[I] = HiAndLoOps[2 * I + 1];
4333 }
4334 Hi = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: LHSSL, VT: TargetType, Ops: HiOps);
4335 } else {
4336 const SDValue One = DAG.getConstant(Val: 1, DL: LHSSL, VT: TargetScalarType);
4337 ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
4338 SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
4339 Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: LHSSL, VT: TargetType, N1: SplitLHS, N2: One);
4340 }
4341
4342 KnownBits KnownLHS = DAG.computeKnownBits(Op: LHS);
4343 SDValue HiShift;
4344 if (KnownLHS.isNegative()) {
4345 HiShift = DAG.getAllOnesConstant(DL: SL, VT: TargetType);
4346 } else {
4347 Hi = DAG.getFreeze(V: Hi);
4348 HiShift = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: TargetType, N1: Hi, N2: ShiftFullAmt);
4349 }
4350 SDValue NewShift =
4351 DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: TargetType, N1: Hi, N2: ShiftAmt, Flags: N->getFlags());
4352
4353 SDValue Vec;
4354 if (VT.isVector()) {
4355 unsigned NElts = TargetType.getVectorNumElements();
4356 SmallVector<SDValue, 8> HiOps;
4357 SmallVector<SDValue, 8> LoOps;
4358 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);
4359
4360 DAG.ExtractVectorElements(Op: HiShift, Args&: HiOps, Start: 0, Count: NElts);
4361 DAG.ExtractVectorElements(Op: NewShift, Args&: LoOps, Start: 0, Count: NElts);
4362 for (unsigned I = 0; I != NElts; ++I) {
4363 HiAndLoOps[2 * I + 1] = HiOps[I];
4364 HiAndLoOps[2 * I] = LoOps[I];
4365 }
4366 Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
4367 } else {
4368 Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {NewShift, HiShift});
4369 }
4370 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
4371}
4372
4373SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4374 DAGCombinerInfo &DCI) const {
4375 SDValue RHS = N->getOperand(Num: 1);
4376 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
4377 EVT VT = N->getValueType(ResNo: 0);
4378 SDValue LHS = N->getOperand(Num: 0);
4379 SelectionDAG &DAG = DCI.DAG;
4380 SDLoc SL(N);
4381 unsigned RHSVal;
4382
4383 if (CRHS) {
4384 RHSVal = CRHS->getZExtValue();
4385
4386 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
4387 // this improves the ability to match BFE patterns in isel.
4388 if (LHS.getOpcode() == ISD::AND) {
4389 if (auto *Mask = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1))) {
4390 unsigned MaskIdx, MaskLen;
4391 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4392 MaskIdx == RHSVal) {
4393 return DAG.getNode(Opcode: ISD::AND, DL: SL, VT,
4394 N1: DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: LHS.getOperand(i: 0),
4395 N2: N->getOperand(Num: 1)),
4396 N2: DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: LHS.getOperand(i: 1),
4397 N2: N->getOperand(Num: 1)));
4398 }
4399 }
4400 }
4401 }
4402
4403 if (VT.getScalarType() != MVT::i64)
4404 return SDValue();
4405
4406 // for C >= 32
4407 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
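// e.g. (illustration) i64 (srl x, 48) -> (build_pair (srl hi_32(x), 16), 0)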
4408
4409 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4410 // common case, splitting this into a move and a 32-bit shift is faster and
4411 // the same code size.
4412 KnownBits Known = DAG.computeKnownBits(Op: RHS);
4413
4414 EVT ElementType = VT.getScalarType();
4415 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
4416 EVT TargetType = VT.changeElementType(Context&: *DAG.getContext(), EltVT: TargetScalarType);
4417
4418 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4419 return SDValue();
4420
4421 SDValue ShiftAmt;
4422 if (CRHS) {
4423 ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
4424 VT: TargetType);
4425 } else {
4426 SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
4427 const SDValue ShiftMask =
4428 DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
4429 // This AND instruction will clamp out of bounds shift values.
4430 // It will also be removed during later instruction selection.
4431 ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
4432 }
4433
4434 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: TargetScalarType);
4435 EVT ConcatType;
4436 SDValue Hi;
4437 SDLoc LHSSL(LHS);
4438 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4439 if (VT.isVector()) {
4440 unsigned NElts = TargetType.getVectorNumElements();
4441 ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
4442 SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
4443 SmallVector<SDValue, 8> HiOps(NElts);
4444 SmallVector<SDValue, 16> HiAndLoOps;
4445
4446 DAG.ExtractVectorElements(Op: SplitLHS, Args&: HiAndLoOps, /*Start=*/0, Count: NElts * 2);
4447 for (unsigned I = 0; I != NElts; ++I)
4448 HiOps[I] = HiAndLoOps[2 * I + 1];
4449 Hi = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: LHSSL, VT: TargetType, Ops: HiOps);
4450 } else {
4451 const SDValue One = DAG.getConstant(Val: 1, DL: LHSSL, VT: TargetScalarType);
4452 ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
4453 SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
4454 Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: LHSSL, VT: TargetType, N1: SplitLHS, N2: One);
4455 }
4456
4457 SDValue NewShift =
4458 DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: TargetType, N1: Hi, N2: ShiftAmt, Flags: N->getFlags());
4459
4460 SDValue Vec;
4461 if (VT.isVector()) {
4462 unsigned NElts = TargetType.getVectorNumElements();
4463 SmallVector<SDValue, 8> LoOps;
4464 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4465
4466 DAG.ExtractVectorElements(Op: NewShift, Args&: LoOps, Start: 0, Count: NElts);
4467 for (unsigned I = 0; I != NElts; ++I)
4468 HiAndLoOps[2 * I] = LoOps[I];
4469 Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
4470 } else {
4471 Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {NewShift, Zero});
4472 }
4473 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
4474}
4475
4476SDValue AMDGPUTargetLowering::performTruncateCombine(
4477 SDNode *N, DAGCombinerInfo &DCI) const {
4478 SDLoc SL(N);
4479 SelectionDAG &DAG = DCI.DAG;
4480 EVT VT = N->getValueType(ResNo: 0);
4481 SDValue Src = N->getOperand(Num: 0);
4482
4483 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4484 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4485 SDValue Vec = Src.getOperand(i: 0);
4486 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4487 SDValue Elt0 = Vec.getOperand(i: 0);
4488 EVT EltVT = Elt0.getValueType();
4489 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4490 if (EltVT.isFloatingPoint()) {
4491 Elt0 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL,
4492 VT: EltVT.changeTypeToInteger(), Operand: Elt0);
4493 }
4494
4495 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Elt0);
4496 }
4497 }
4498 }
4499
4500 // Equivalent of above for accessing the high element of a vector as an
4501 // integer operation.
4502 // trunc (srl (bitcast (build_vector x, y)), 16) -> trunc (bitcast y)
4503 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4504 if (auto *K = isConstOrConstSplat(N: Src.getOperand(i: 1))) {
4505 SDValue BV = stripBitcast(Val: Src.getOperand(i: 0));
4506 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4507 EVT SrcEltVT = BV.getOperand(i: 0).getValueType();
4508 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4509 unsigned BitIndex = K->getZExtValue();
4510 unsigned PartIndex = BitIndex / SrcEltSize;
4511
4512 if (PartIndex * SrcEltSize == BitIndex &&
4513 PartIndex < BV.getNumOperands()) {
4514 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4515 SDValue SrcElt =
4516 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: SrcEltVT.changeTypeToInteger(),
4517 Operand: BV.getOperand(i: PartIndex));
4518 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: SrcElt);
4519 }
4520 }
4521 }
4522 }
4523 }
4524
4525 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4526 //
4527 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4528 // i16 (trunc (srl (i32 (trunc x)), K))
4529 if (VT.getScalarSizeInBits() < 32) {
4530 EVT SrcVT = Src.getValueType();
4531 if (SrcVT.getScalarSizeInBits() > 32 &&
4532 (Src.getOpcode() == ISD::SRL ||
4533 Src.getOpcode() == ISD::SRA ||
4534 Src.getOpcode() == ISD::SHL)) {
4535 SDValue Amt = Src.getOperand(i: 1);
4536 KnownBits Known = DAG.computeKnownBits(Op: Amt);
4537
4538 // - For left shifts, do the transform as long as the shift
4539 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4540 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4541 // losing information stored in the high bits when truncating.
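// e.g. (illustration) for an i16 result, a 64-bit srl by K <= 16 is safe on
// i32, since the i16 result only needs bits [K, K+15] of x.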
4542 const unsigned MaxCstSize =
4543 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4544 if (Known.getMaxValue().ule(RHS: MaxCstSize)) {
4545 EVT MidVT = VT.isVector() ?
4546 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
4547 NumElements: VT.getVectorNumElements()) : MVT::i32;
4548
4549 EVT NewShiftVT = getShiftAmountTy(LHSTy: MidVT, DL: DAG.getDataLayout());
4550 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MidVT,
4551 Operand: Src.getOperand(i: 0));
4552 DCI.AddToWorklist(N: Trunc.getNode());
4553
4554 if (Amt.getValueType() != NewShiftVT) {
4555 Amt = DAG.getZExtOrTrunc(Op: Amt, DL: SL, VT: NewShiftVT);
4556 DCI.AddToWorklist(N: Amt.getNode());
4557 }
4558
4559 SDValue ShrunkShift = DAG.getNode(Opcode: Src.getOpcode(), DL: SL, VT: MidVT,
4560 N1: Trunc, N2: Amt);
4561 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: ShrunkShift);
4562 }
4563 }
4564 }
4565
4566 return SDValue();
4567}
4568
4569// We need to specifically handle i64 mul here to avoid unnecessary conversion
4570// instructions. If we only match on the legalized i64 mul expansion,
4571// SimplifyDemandedBits will be unable to remove them because there will be
4572// multiple uses due to the separate mul + mulh[su].
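// Roughly, a 64-bit multiply of 24-bit inputs becomes
//   (build_pair (mul_u24 x, y), (mulhi_u24 x, y))
// (or the _i24 forms when signed), so both halves share the same operands.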
4573static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4574 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4575 if (Size <= 32) {
4576 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4577 return DAG.getNode(Opcode: MulOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4578 }
4579
4580 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4581 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4582
4583 SDValue MulLo = DAG.getNode(Opcode: MulLoOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4584 SDValue MulHi = DAG.getNode(Opcode: MulHiOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4585
4586 return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SL, VT: MVT::i64, N1: MulLo, N2: MulHi);
4587}
4588
4589/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4590/// return SDValue().
4591static SDValue getAddOneOp(const SDNode *V) {
4592 if (V->getOpcode() != ISD::ADD)
4593 return SDValue();
4594
4595 return isOneConstant(V: V->getOperand(Num: 1)) ? V->getOperand(Num: 0) : SDValue();
4596}
4597
4598SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4599 DAGCombinerInfo &DCI) const {
4600 assert(N->getOpcode() == ISD::MUL);
4601 EVT VT = N->getValueType(ResNo: 0);
4602
4603 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4604 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4605 // unnecessarily). isDivergent() is used as an approximation of whether the
4606 // value is in an SGPR.
4607 if (!N->isDivergent())
4608 return SDValue();
4609
4610 unsigned Size = VT.getSizeInBits();
4611 if (VT.isVector() || Size > 64)
4612 return SDValue();
4613
4614 SelectionDAG &DAG = DCI.DAG;
4615 SDLoc DL(N);
4616
4617 SDValue N0 = N->getOperand(Num: 0);
4618 SDValue N1 = N->getOperand(Num: 1);
4619
4620 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4621 // matching.
4622
4623 // mul x, (add y, 1) -> add (mul x, y), x
4624 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4625 SDValue AddOp = getAddOneOp(V: V.getNode());
4626 if (!AddOp)
4627 return SDValue();
4628
4629 if (V.hasOneUse() || all_of(Range: V->users(), P: [](const SDNode *U) -> bool {
4630 return U->getOpcode() == ISD::MUL;
4631 }))
4632 return AddOp;
4633
4634 return SDValue();
4635 };
4636
4637 // FIXME: The selection pattern is not properly checking for commuted
4638 // operands, so we have to place the mul in the LHS
4639 if (SDValue MulOper = IsFoldableAdd(N0)) {
4640 SDValue MulVal = DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1, N2: MulOper);
4641 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: MulVal, N2: N1);
4642 }
4643
4644 if (SDValue MulOper = IsFoldableAdd(N1)) {
4645 SDValue MulVal = DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1: N0, N2: MulOper);
4646 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: MulVal, N2: N0);
4647 }
4648
4649 // There are i16 integer mul/mad.
4650 if (isTypeLegal(VT: MVT::i16) && VT.getScalarType().bitsLE(VT: MVT::i16))
4651 return SDValue();
4652
4653 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4654 // in the source into any_extends if the result of the mul is truncated. Since
4655 // we can assume the high bits are whatever we want, use the underlying value
4656 // to keep the unknown high bits from interfering.
4657 if (N0.getOpcode() == ISD::ANY_EXTEND)
4658 N0 = N0.getOperand(i: 0);
4659
4660 if (N1.getOpcode() == ISD::ANY_EXTEND)
4661 N1 = N1.getOperand(i: 0);
4662
4663 SDValue Mul;
4664
4665 if (Subtarget->hasMulU24() && isU24(Op: N0, DAG) && isU24(Op: N1, DAG)) {
4666 N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4667 N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4668 Mul = getMul24(DAG, SL: DL, N0, N1, Size, Signed: false);
4669 } else if (Subtarget->hasMulI24() && isI24(Op: N0, DAG) && isI24(Op: N1, DAG)) {
4670 N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4671 N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4672 Mul = getMul24(DAG, SL: DL, N0, N1, Size, Signed: true);
4673 } else {
4674 return SDValue();
4675 }
4676
4677 // We need to use sext even for MUL_U24, because MUL_U24 is used
4678 // for signed multiply of 8 and 16-bit types.
4679 return DAG.getSExtOrTrunc(Op: Mul, DL, VT);
4680}
4681
4682SDValue
4683AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4684 DAGCombinerInfo &DCI) const {
4685 if (N->getValueType(ResNo: 0) != MVT::i32)
4686 return SDValue();
4687
4688 SelectionDAG &DAG = DCI.DAG;
4689 SDLoc DL(N);
4690
4691 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4692 SDValue N0 = N->getOperand(Num: 0);
4693 SDValue N1 = N->getOperand(Num: 1);
4694
4695 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4696 // in the source into any_extends if the result of the mul is truncated. Since
4697 // we can assume the high bits are whatever we want, use the underlying value
4698 // to keep the unknown high bits from interfering.
4699 if (N0.getOpcode() == ISD::ANY_EXTEND)
4700 N0 = N0.getOperand(i: 0);
4701 if (N1.getOpcode() == ISD::ANY_EXTEND)
4702 N1 = N1.getOperand(i: 0);
4703
4704 // Try to use two fast 24-bit multiplies (one for each half of the result)
4705 // instead of one slow extending multiply.
4706 unsigned LoOpcode = 0;
4707 unsigned HiOpcode = 0;
4708 if (Signed) {
4709 if (Subtarget->hasMulI24() && isI24(Op: N0, DAG) && isI24(Op: N1, DAG)) {
4710 N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4711 N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4712 LoOpcode = AMDGPUISD::MUL_I24;
4713 HiOpcode = AMDGPUISD::MULHI_I24;
4714 }
4715 } else {
4716 if (Subtarget->hasMulU24() && isU24(Op: N0, DAG) && isU24(Op: N1, DAG)) {
4717 N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4718 N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4719 LoOpcode = AMDGPUISD::MUL_U24;
4720 HiOpcode = AMDGPUISD::MULHI_U24;
4721 }
4722 }
4723 if (!LoOpcode)
4724 return SDValue();
4725
4726 SDValue Lo = DAG.getNode(Opcode: LoOpcode, DL, VT: MVT::i32, N1: N0, N2: N1);
4727 SDValue Hi = DAG.getNode(Opcode: HiOpcode, DL, VT: MVT::i32, N1: N0, N2: N1);
4728 DCI.CombineTo(N, Res0: Lo, Res1: Hi);
4729 return SDValue(N, 0);
4730}
4731
4732SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4733 DAGCombinerInfo &DCI) const {
4734 EVT VT = N->getValueType(ResNo: 0);
4735
4736 if (!Subtarget->hasMulI24() || VT.isVector())
4737 return SDValue();
4738
4739 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4740 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4741 // unnecessarily). isDivergent() is used as an approximation of whether the
4742 // value is in an SGPR.
4743 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4744 // valu op anyway)
4745 if (Subtarget->hasSMulHi() && !N->isDivergent())
4746 return SDValue();
4747
4748 SelectionDAG &DAG = DCI.DAG;
4749 SDLoc DL(N);
4750
4751 SDValue N0 = N->getOperand(Num: 0);
4752 SDValue N1 = N->getOperand(Num: 1);
4753
4754 if (!isI24(Op: N0, DAG) || !isI24(Op: N1, DAG))
4755 return SDValue();
4756
4757 N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4758 N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4759
4760 SDValue Mulhi = DAG.getNode(Opcode: AMDGPUISD::MULHI_I24, DL, VT: MVT::i32, N1: N0, N2: N1);
4761 DCI.AddToWorklist(N: Mulhi.getNode());
4762 return DAG.getSExtOrTrunc(Op: Mulhi, DL, VT);
4763}
4764
4765SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4766 DAGCombinerInfo &DCI) const {
4767 EVT VT = N->getValueType(ResNo: 0);
4768
4769 if (VT.isVector() || VT.getSizeInBits() > 32 || !Subtarget->hasMulU24())
4770 return SDValue();
4771
4772 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4773 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4774 // unnecessarily). isDivergent() is used as an approximation of whether the
4775 // value is in an SGPR.
4776 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4777 // valu op anyway)
4778 if (!N->isDivergent() && Subtarget->hasSMulHi())
4779 return SDValue();
4780
4781 SelectionDAG &DAG = DCI.DAG;
4782 SDLoc DL(N);
4783
4784 SDValue N0 = N->getOperand(Num: 0);
4785 SDValue N1 = N->getOperand(Num: 1);
4786
4787 if (!isU24(Op: N0, DAG) || !isU24(Op: N1, DAG))
4788 return SDValue();
4789
4790 N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4791 N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4792
4793 SDValue Mulhi = DAG.getNode(Opcode: AMDGPUISD::MULHI_U24, DL, VT: MVT::i32, N1: N0, N2: N1);
4794 DCI.AddToWorklist(N: Mulhi.getNode());
4795 return DAG.getZExtOrTrunc(Op: Mulhi, DL, VT);
4796}
4797
4798SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4799 SDValue Op,
4800 const SDLoc &DL,
4801 unsigned Opc) const {
4802 EVT VT = Op.getValueType();
4803 if (VT.bitsGT(VT: MVT::i32))
4804 return SDValue();
4805
4806 if (VT != MVT::i32)
4807 Op = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: Op);
4808
4809 SDValue FFBX = DAG.getNode(Opcode: Opc, DL, VT: MVT::i32, Operand: Op);
4810 if (VT != MVT::i32)
4811 FFBX = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: FFBX);
4812
4813 return FFBX;
4814}
4815
4816// The native instructions return -1 on 0 input. Optimize out a select that
4817// produces -1 on 0.
4818//
4819// TODO: If zero is not undef, we could also do this if the output is compared
4820// against the bitwidth.
4821//
4822// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4823SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4824 SDValue LHS, SDValue RHS,
4825 DAGCombinerInfo &DCI) const {
4826 if (!isNullConstant(V: Cond.getOperand(i: 1)))
4827 return SDValue();
4828
4829 SelectionDAG &DAG = DCI.DAG;
4830 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val: Cond.getOperand(i: 2))->get();
4831 SDValue CmpLHS = Cond.getOperand(i: 0);
4832
4833 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4834 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4835 if (CCOpcode == ISD::SETEQ &&
4836 (isCtlzOpc(Opc: RHS.getOpcode()) || isCttzOpc(Opc: RHS.getOpcode())) &&
4837 RHS.getOperand(i: 0) == CmpLHS && isAllOnesConstant(V: LHS)) {
4838 unsigned Opc =
4839 isCttzOpc(Opc: RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4840 return getFFBX_U32(DAG, Op: CmpLHS, DL: SL, Opc);
4841 }
4842
4843 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4844 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4845 if (CCOpcode == ISD::SETNE &&
4846 (isCtlzOpc(Opc: LHS.getOpcode()) || isCttzOpc(Opc: LHS.getOpcode())) &&
4847 LHS.getOperand(i: 0) == CmpLHS && isAllOnesConstant(V: RHS)) {
4848 unsigned Opc =
4849 isCttzOpc(Opc: LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4850
4851 return getFFBX_U32(DAG, Op: CmpLHS, DL: SL, Opc);
4852 }
4853
4854 return SDValue();
4855}
4856
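// Distribute a unary FP operation (fneg or fabs here) through a select:
//   (select c, (op x), (op y)) -> (op (select c, x, y))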
4857static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4858 unsigned Op,
4859 const SDLoc &SL,
4860 SDValue Cond,
4861 SDValue N1,
4862 SDValue N2) {
4863 SelectionDAG &DAG = DCI.DAG;
4864 EVT VT = N1.getValueType();
4865
4866 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Cond,
4867 N2: N1.getOperand(i: 0), N3: N2.getOperand(i: 0));
4868 DCI.AddToWorklist(N: NewSelect.getNode());
4869 return DAG.getNode(Opcode: Op, DL: SL, VT, Operand: NewSelect);
4870}
4871
4872// Pull a free FP operation out of a select so it may fold into uses.
4873//
4874// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4875// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4876//
4877// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4878// select c, (fabs x), +k -> fabs (select c, x, k)
4879SDValue
4880AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
4881 SDValue N) const {
4882 SelectionDAG &DAG = DCI.DAG;
4883 SDValue Cond = N.getOperand(i: 0);
4884 SDValue LHS = N.getOperand(i: 1);
4885 SDValue RHS = N.getOperand(i: 2);
4886
4887 EVT VT = N.getValueType();
4888 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4889 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4890 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N: N.getNode()))
4891 return SDValue();
4892
4893 return distributeOpThroughSelect(DCI, Op: LHS.getOpcode(),
4894 SL: SDLoc(N), Cond, N1: LHS, N2: RHS);
4895 }
4896
4897 bool Inv = false;
4898 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4899 std::swap(a&: LHS, b&: RHS);
4900 Inv = true;
4901 }
4902
4903 // TODO: Support vector constants.
4904 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
4905 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4906 !selectSupportsSourceMods(N: N.getNode())) {
4907 SDLoc SL(N);
4908 // If one side is an fneg/fabs and the other is a constant, we can push the
4909 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4910 SDValue NewLHS = LHS.getOperand(i: 0);
4911 SDValue NewRHS = RHS;
4912
4913 // Careful: if the neg can be folded up, don't try to pull it back down.
4914 bool ShouldFoldNeg = true;
4915
4916 if (NewLHS.hasOneUse()) {
4917 unsigned Opc = NewLHS.getOpcode();
4918 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(N: NewLHS.getNode()))
4919 ShouldFoldNeg = false;
4920 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4921 ShouldFoldNeg = false;
4922 }
4923
4924 if (ShouldFoldNeg) {
4925 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4926 return SDValue();
4927
4928 // We're going to be forced to use a source modifier anyway, there's no
4929 // point to pulling the negate out unless we can get a size reduction by
4930 // negating the constant.
4931 //
4932 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4933 // about cheaper constants.
4934 if (NewLHS.getOpcode() == ISD::FABS &&
4935 getConstantNegateCost(C: CRHS) != NegatibleCost::Cheaper)
4936 return SDValue();
4937
4938 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N: N.getNode()))
4939 return SDValue();
4940
4941 if (LHS.getOpcode() == ISD::FNEG)
4942 NewRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
4943
4944 if (Inv)
4945 std::swap(a&: NewLHS, b&: NewRHS);
4946
4947 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT,
4948 N1: Cond, N2: NewLHS, N3: NewRHS);
4949 DCI.AddToWorklist(N: NewSelect.getNode());
4950 return DAG.getNode(Opcode: LHS.getOpcode(), DL: SL, VT, Operand: NewSelect);
4951 }
4952 }
4953
4954 return SDValue();
4955}
4956
4957SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4958 DAGCombinerInfo &DCI) const {
4959 if (SDValue Folded = foldFreeOpFromSelect(DCI, N: SDValue(N, 0)))
4960 return Folded;
4961
4962 SDValue Cond = N->getOperand(Num: 0);
4963 if (Cond.getOpcode() != ISD::SETCC)
4964 return SDValue();
4965
4966 EVT VT = N->getValueType(ResNo: 0);
4967 SDValue LHS = Cond.getOperand(i: 0);
4968 SDValue RHS = Cond.getOperand(i: 1);
4969 SDValue CC = Cond.getOperand(i: 2);
4970
4971 SDValue True = N->getOperand(Num: 1);
4972 SDValue False = N->getOperand(Num: 2);
4973
4974 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4975 SelectionDAG &DAG = DCI.DAG;
4976 if (DAG.isConstantValueOfAnyType(N: True) &&
4977 !DAG.isConstantValueOfAnyType(N: False)) {
4978 // Swap cmp + select pair to move constant to false input.
4979 // This will allow using VOPC cndmasks more often.
4980 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4981
4982 SDLoc SL(N);
4983 ISD::CondCode NewCC =
4984 getSetCCInverse(Operation: cast<CondCodeSDNode>(Val&: CC)->get(), Type: LHS.getValueType());
4985
4986 SDValue NewCond = DAG.getSetCC(DL: SL, VT: Cond.getValueType(), LHS, RHS, Cond: NewCC);
4987 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NewCond, N2: False, N3: True);
4988 }
4989
4990 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4991 SDValue MinMax
4992 = combineFMinMaxLegacy(DL: SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4993 // Revisit this node so we can catch min3/max3/med3 patterns.
4994 //DCI.AddToWorklist(MinMax.getNode());
4995 return MinMax;
4996 }
4997 }
4998
4999 // There's no reason to not do this if the condition has other uses.
5000 return performCtlz_CttzCombine(SL: SDLoc(N), Cond, LHS: True, RHS: False, DCI);
5001}
5002
5003static bool isInv2Pi(const APFloat &APF) {
5004 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
5005 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
5006 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
5007
5008 return APF.bitwiseIsEqual(RHS: KF16) ||
5009 APF.bitwiseIsEqual(RHS: KF32) ||
5010 APF.bitwiseIsEqual(RHS: KF64);
5011}
5012
5013 // +0.0 and (where supported) +1.0 / (2.0 * pi) have inline immediates, but
5014 // their negated forms do not, so there is an additional cost to negate them.
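// For example (illustration only): negating -0.0 yields +0.0, which is an
// inline immediate (Cheaper), while negating +0.0 yields -0.0, which is not
// (Expensive); everything else is Neutral.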
5015TargetLowering::NegatibleCost
5016AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
5017 if (C->isZero())
5018 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5019
5020 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(APF: C->getValueAPF()))
5021 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5022
5023 return NegatibleCost::Neutral;
5024}
5025
5026bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
5027 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
5028 return getConstantNegateCost(C) == NegatibleCost::Expensive;
5029 return false;
5030}
5031
5032bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
5033 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
5034 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
5035 return false;
5036}
5037
5038static unsigned inverseMinMax(unsigned Opc) {
5039 switch (Opc) {
5040 case ISD::FMAXNUM:
5041 return ISD::FMINNUM;
5042 case ISD::FMINNUM:
5043 return ISD::FMAXNUM;
5044 case ISD::FMAXNUM_IEEE:
5045 return ISD::FMINNUM_IEEE;
5046 case ISD::FMINNUM_IEEE:
5047 return ISD::FMAXNUM_IEEE;
5048 case ISD::FMAXIMUM:
5049 return ISD::FMINIMUM;
5050 case ISD::FMINIMUM:
5051 return ISD::FMAXIMUM;
5052 case ISD::FMAXIMUMNUM:
5053 return ISD::FMINIMUMNUM;
5054 case ISD::FMINIMUMNUM:
5055 return ISD::FMAXIMUMNUM;
5056 case AMDGPUISD::FMAX_LEGACY:
5057 return AMDGPUISD::FMIN_LEGACY;
5058 case AMDGPUISD::FMIN_LEGACY:
5059 return AMDGPUISD::FMAX_LEGACY;
5060 default:
5061 llvm_unreachable("invalid min/max opcode");
5062 }
5063}
5064
5065/// \return true if it's profitable to try to push an fneg into its source
5066/// instruction.
5067bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
5068 // If the input has multiple uses and we can either fold the negate down, or
5069 // the other uses cannot, give up. This both prevents unprofitable
5070 // transformations and infinite loops: we won't repeatedly try to fold around
5071 // a negate that has no 'good' form.
5072 if (N0.hasOneUse()) {
5073 // This may be able to fold into the source, but at a code size cost. Don't
5074 // fold if the fold into the user is free.
5075 if (allUsesHaveSourceMods(N, CostThreshold: 0))
5076 return false;
5077 } else {
5078 if (fnegFoldsIntoOp(N: N0.getNode()) &&
5079 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N: N0.getNode())))
5080 return false;
5081 }
5082
5083 return true;
5084}
5085
5086SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
5087 DAGCombinerInfo &DCI) const {
5088 SelectionDAG &DAG = DCI.DAG;
5089 SDValue N0 = N->getOperand(Num: 0);
5090 EVT VT = N->getValueType(ResNo: 0);
5091
5092 unsigned Opc = N0.getOpcode();
5093
5094 if (!shouldFoldFNegIntoSrc(N, N0))
5095 return SDValue();
5096
5097 SDLoc SL(N);
5098 switch (Opc) {
5099 case ISD::FADD: {
5100 if (!mayIgnoreSignedZero(Op: N0) && !N->getFlags().hasNoSignedZeros())
5101 return SDValue();
5102
5103 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
5104 SDValue LHS = N0.getOperand(i: 0);
5105 SDValue RHS = N0.getOperand(i: 1);
5106
5107 if (LHS.getOpcode() != ISD::FNEG)
5108 LHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS);
5109 else
5110 LHS = LHS.getOperand(i: 0);
5111
5112 if (RHS.getOpcode() != ISD::FNEG)
5113 RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
5114 else
5115 RHS = RHS.getOperand(i: 0);
5116
5117 SDValue Res = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: LHS, N2: RHS, Flags: N0->getFlags());
5118 if (Res.getOpcode() != ISD::FADD)
5119 return SDValue(); // Op got folded away.
5120 if (!N0.hasOneUse())
5121 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
5122 return Res;
5123 }
5124 case ISD::FMUL:
5125 case AMDGPUISD::FMUL_LEGACY: {
5126 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
5127 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
5128 SDValue LHS = N0.getOperand(i: 0);
5129 SDValue RHS = N0.getOperand(i: 1);
5130
5131 if (LHS.getOpcode() == ISD::FNEG)
5132 LHS = LHS.getOperand(i: 0);
5133 else if (RHS.getOpcode() == ISD::FNEG)
5134 RHS = RHS.getOperand(i: 0);
5135 else
5136 RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
5137
5138 SDValue Res = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: LHS, N2: RHS, Flags: N0->getFlags());
5139 if (Res.getOpcode() != Opc)
5140 return SDValue(); // Op got folded away.
5141 if (!N0.hasOneUse())
5142 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
5143 return Res;
5144 }
5145 case ISD::FMA:
5146 case ISD::FMAD: {
5147 // TODO: handle llvm.amdgcn.fma.legacy
5148 if (!mayIgnoreSignedZero(Op: N0) && !N->getFlags().hasNoSignedZeros())
5149 return SDValue();
5150
5151 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
5152 SDValue LHS = N0.getOperand(i: 0);
5153 SDValue MHS = N0.getOperand(i: 1);
5154 SDValue RHS = N0.getOperand(i: 2);
5155
5156 if (LHS.getOpcode() == ISD::FNEG)
5157 LHS = LHS.getOperand(i: 0);
5158 else if (MHS.getOpcode() == ISD::FNEG)
5159 MHS = MHS.getOperand(i: 0);
5160 else
5161 MHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: MHS);
5162
5163 if (RHS.getOpcode() != ISD::FNEG)
5164 RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
5165 else
5166 RHS = RHS.getOperand(i: 0);
5167
5168 SDValue Res = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: LHS, N2: MHS, N3: RHS);
5169 if (Res.getOpcode() != Opc)
5170 return SDValue(); // Op got folded away.
5171 if (!N0.hasOneUse())
5172 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
5173 return Res;
5174 }
5175 case ISD::FMAXNUM:
5176 case ISD::FMINNUM:
5177 case ISD::FMAXNUM_IEEE:
5178 case ISD::FMINNUM_IEEE:
5179 case ISD::FMINIMUM:
5180 case ISD::FMAXIMUM:
5181 case ISD::FMINIMUMNUM:
5182 case ISD::FMAXIMUMNUM:
5183 case AMDGPUISD::FMAX_LEGACY:
5184 case AMDGPUISD::FMIN_LEGACY: {
5185 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
5186 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
5187 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
5188 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
5189
5190 SDValue LHS = N0.getOperand(i: 0);
5191 SDValue RHS = N0.getOperand(i: 1);
5192
5193 // 0 doesn't have a negated inline immediate.
5194 // TODO: This constant check should be generalized to other operations.
5195 if (isConstantCostlierToNegate(N: RHS))
5196 return SDValue();
5197
5198 SDValue NegLHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS);
5199 SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
5200 unsigned Opposite = inverseMinMax(Opc);
5201
5202 SDValue Res = DAG.getNode(Opcode: Opposite, DL: SL, VT, N1: NegLHS, N2: NegRHS, Flags: N0->getFlags());
5203 if (Res.getOpcode() != Opposite)
5204 return SDValue(); // Op got folded away.
5205 if (!N0.hasOneUse())
5206 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
5207 return Res;
5208 }
5209 case AMDGPUISD::FMED3: {
5210 SDValue Ops[3];
5211 for (unsigned I = 0; I < 3; ++I)
5212 Ops[I] = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: N0->getOperand(Num: I), Flags: N0->getFlags());
5213
5214 SDValue Res = DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT, Ops, Flags: N0->getFlags());
5215 if (Res.getOpcode() != AMDGPUISD::FMED3)
5216 return SDValue(); // Op got folded away.
5217
5218 if (!N0.hasOneUse()) {
5219 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res);
5220 DAG.ReplaceAllUsesWith(From: N0, To: Neg);
5221
5222 for (SDNode *U : Neg->users())
5223 DCI.AddToWorklist(N: U);
5224 }
5225
5226 return Res;
5227 }
5228 case ISD::FP_EXTEND:
5229 case ISD::FTRUNC:
5230 case ISD::FRINT:
5231 case ISD::FNEARBYINT: // XXX - Should fround be handled?
5232 case ISD::FROUNDEVEN:
5233 case ISD::FSIN:
5234 case ISD::FCANONICALIZE:
5235 case AMDGPUISD::RCP:
5236 case AMDGPUISD::RCP_LEGACY:
5237 case AMDGPUISD::RCP_IFLAG:
5238 case AMDGPUISD::SIN_HW: {
5239 SDValue CvtSrc = N0.getOperand(i: 0);
5240 if (CvtSrc.getOpcode() == ISD::FNEG) {
5241 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
5242 // (fneg (rcp (fneg x))) -> (rcp x)
5243 return DAG.getNode(Opcode: Opc, DL: SL, VT, Operand: CvtSrc.getOperand(i: 0));
5244 }
5245
5246 if (!N0.hasOneUse())
5247 return SDValue();
5248
5249 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
5250 // (fneg (rcp x)) -> (rcp (fneg x))
5251 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: CvtSrc.getValueType(), Operand: CvtSrc);
5252 return DAG.getNode(Opcode: Opc, DL: SL, VT, Operand: Neg, Flags: N0->getFlags());
5253 }
5254 case ISD::FP_ROUND: {
5255 SDValue CvtSrc = N0.getOperand(i: 0);
5256
5257 if (CvtSrc.getOpcode() == ISD::FNEG) {
5258 // (fneg (fp_round (fneg x))) -> (fp_round x)
5259 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT,
5260 N1: CvtSrc.getOperand(i: 0), N2: N0.getOperand(i: 1));
5261 }
5262
5263 if (!N0.hasOneUse())
5264 return SDValue();
5265
5266 // (fneg (fp_round x)) -> (fp_round (fneg x))
5267 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: CvtSrc.getValueType(), Operand: CvtSrc);
5268 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Neg, N2: N0.getOperand(i: 1));
5269 }
5270 case ISD::FP16_TO_FP: {
5271 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
5272 // f16, but legalization of f16 fneg ends up pulling it out of the source.
5273 // Put the fneg back as a legal source operation that can be matched later.
5274 SDLoc SL(N);
5275
5276 SDValue Src = N0.getOperand(i: 0);
5277 EVT SrcVT = Src.getValueType();
5278
5279 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
5280 SDValue IntFNeg = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: SrcVT, N1: Src,
5281 N2: DAG.getConstant(Val: 0x8000, DL: SL, VT: SrcVT));
5282 return DAG.getNode(Opcode: ISD::FP16_TO_FP, DL: SL, VT: N->getValueType(ResNo: 0), Operand: IntFNeg);
5283 }
5284 case ISD::SELECT: {
5285 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5286 // TODO: Invert conditions of foldFreeOpFromSelect
5287 return SDValue();
5288 }
5289 case ISD::BITCAST: {
5290 SDLoc SL(N);
5291 SDValue BCSrc = N0.getOperand(i: 0);
5292 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5293 SDValue HighBits = BCSrc.getOperand(i: BCSrc.getNumOperands() - 1);
5294 if (HighBits.getValueType().getSizeInBits() != 32 ||
5295 !fnegFoldsIntoOp(N: HighBits.getNode()))
5296 return SDValue();
5297
5298 // f64 fneg only really needs to operate on the high half of the
5299 // register, so try to force it to an f32 operation to help make use of
5300 // source modifiers.
5301 //
5303 // fneg (f64 (bitcast (build_vector x, y))) ->
5304 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5305 // (fneg (bitcast i32:y to f32)))
5306
5307 SDValue CastHi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: HighBits);
5308 SDValue NegHi = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: CastHi);
5309 SDValue CastBack =
5310 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: HighBits.getValueType(), Operand: NegHi);
5311
5312 SmallVector<SDValue, 8> Ops(BCSrc->ops());
5313 Ops.back() = CastBack;
5314 DCI.AddToWorklist(N: NegHi.getNode());
5315 SDValue Build =
5316 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: BCSrc.getValueType(), Ops);
5317 SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Build);
5318
5319 if (!N0.hasOneUse())
5320 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Result));
5321 return Result;
5322 }
5323
5324 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5325 BCSrc.hasOneUse()) {
5326 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
5327 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
5328
5329 // TODO: Cast back result for multiple uses is beneficial in some cases.
5330
5331 SDValue LHS =
5332 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: BCSrc.getOperand(i: 1));
5333 SDValue RHS =
5334 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: BCSrc.getOperand(i: 2));
5335
5336 SDValue NegLHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: LHS);
5337 SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: RHS);
5338
5339 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f32, N1: BCSrc.getOperand(i: 0), N2: NegLHS,
5340 N3: NegRHS);
5341 }
5342
5343 return SDValue();
5344 }
5345 default:
5346 return SDValue();
5347 }
5348}
5349
5350SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5351 DAGCombinerInfo &DCI) const {
5352 SelectionDAG &DAG = DCI.DAG;
5353 SDValue N0 = N->getOperand(Num: 0);
5354
5355 if (!N0.hasOneUse())
5356 return SDValue();
5357
5358 switch (N0.getOpcode()) {
5359 case ISD::FP16_TO_FP: {
5360 assert(!isTypeLegal(MVT::f16) && "should only see if f16 is illegal");
5361 SDLoc SL(N);
5362 SDValue Src = N0.getOperand(i: 0);
5363 EVT SrcVT = Src.getValueType();
5364
5365 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5366 SDValue IntFAbs = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SrcVT, N1: Src,
5367 N2: DAG.getConstant(Val: 0x7fff, DL: SL, VT: SrcVT));
5368 return DAG.getNode(Opcode: ISD::FP16_TO_FP, DL: SL, VT: N->getValueType(ResNo: 0), Operand: IntFAbs);
5369 }
5370 default:
5371 return SDValue();
5372 }
5373}
5374
5375SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5376 DAGCombinerInfo &DCI) const {
5377 const auto *CFP = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: 0));
5378 if (!CFP)
5379 return SDValue();
5380
5381 // XXX - Should this flush denormals?
5382 const APFloat &Val = CFP->getValueAPF();
5383 APFloat One(Val.getSemantics(), "1.0");
5384 return DCI.DAG.getConstantFP(Val: One / Val, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
5385}
5386
5387SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5388 DAGCombinerInfo &DCI) const {
5389 SelectionDAG &DAG = DCI.DAG;
5390 SDLoc DL(N);
5391
5392 switch(N->getOpcode()) {
5393 default:
5394 break;
5395 case ISD::BITCAST: {
5396 EVT DestVT = N->getValueType(ResNo: 0);
5397
5398 // Push casts through vector builds. This helps avoid emitting a large
5399 // number of copies when materializing floating point vector constants.
5400 //
5401 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5402 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5403 if (DestVT.isVector()) {
5404 SDValue Src = N->getOperand(Num: 0);
5405 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5406 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5407 isOperationLegal(Op: ISD::BUILD_VECTOR, VT: DestVT))) {
5408 EVT SrcVT = Src.getValueType();
5409 unsigned NElts = DestVT.getVectorNumElements();
5410
5411 if (SrcVT.getVectorNumElements() == NElts) {
5412 EVT DestEltVT = DestVT.getVectorElementType();
5413
5414 SmallVector<SDValue, 8> CastedElts;
5415 SDLoc SL(N);
5416 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5417 SDValue Elt = Src.getOperand(i: I);
5418 CastedElts.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: DestEltVT, Operand: Elt));
5419 }
5420
5421 return DAG.getBuildVector(VT: DestVT, DL: SL, Ops: CastedElts);
5422 }
5423 }
5424 }
5425
5426 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5427 break;
5428
5429 // Fold bitcasts of constants.
5430 //
5431 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5432 // TODO: Generalize and move to DAGCombiner
5433 SDValue Src = N->getOperand(Num: 0);
5434 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Src)) {
5435 SDLoc SL(N);
5436 uint64_t CVal = C->getZExtValue();
5437 SDValue BV = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32,
5438 N1: DAG.getConstant(Val: Lo_32(Value: CVal), DL: SL, VT: MVT::i32),
5439 N2: DAG.getConstant(Val: Hi_32(Value: CVal), DL: SL, VT: MVT::i32));
5440 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: DestVT, Operand: BV);
5441 }
5442
5443 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Src)) {
5444 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5445 SDLoc SL(N);
5446 uint64_t CVal = Val.getZExtValue();
5447 SDValue Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32,
5448 N1: DAG.getConstant(Val: Lo_32(Value: CVal), DL: SL, VT: MVT::i32),
5449 N2: DAG.getConstant(Val: Hi_32(Value: CVal), DL: SL, VT: MVT::i32));
5450
5451 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: DestVT, Operand: Vec);
5452 }
5453
5454 break;
5455 }
5456 case ISD::SHL:
5457 case ISD::SRA:
5458 case ISD::SRL: {
5459 // Range metadata can be invalidated when loads are converted to legal types
5460 // (e.g. v2i64 -> v4i32).
5461 // Try to convert vector shl/sra/srl before type legalization so that range
5462 // metadata can be utilized.
5463 if (!(N->getValueType(ResNo: 0).isVector() &&
5464 DCI.getDAGCombineLevel() == BeforeLegalizeTypes) &&
5465 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5466 break;
5467 if (N->getOpcode() == ISD::SHL)
5468 return performShlCombine(N, DCI);
5469 if (N->getOpcode() == ISD::SRA)
5470 return performSraCombine(N, DCI);
5471 return performSrlCombine(N, DCI);
5472 }
5473 case ISD::TRUNCATE:
5474 return performTruncateCombine(N, DCI);
5475 case ISD::MUL:
5476 return performMulCombine(N, DCI);
5477 case AMDGPUISD::MUL_U24:
5478 case AMDGPUISD::MUL_I24: {
5479 if (SDValue Simplified = simplifyMul24(Node24: N, DCI))
5480 return Simplified;
5481 break;
5482 }
5483 case AMDGPUISD::MULHI_I24:
5484 case AMDGPUISD::MULHI_U24:
5485 return simplifyMul24(Node24: N, DCI);
5486 case ISD::SMUL_LOHI:
5487 case ISD::UMUL_LOHI:
5488 return performMulLoHiCombine(N, DCI);
5489 case ISD::MULHS:
5490 return performMulhsCombine(N, DCI);
5491 case ISD::MULHU:
5492 return performMulhuCombine(N, DCI);
5493 case ISD::SELECT:
5494 return performSelectCombine(N, DCI);
5495 case ISD::FNEG:
5496 return performFNegCombine(N, DCI);
5497 case ISD::FABS:
5498 return performFAbsCombine(N, DCI);
5499 case AMDGPUISD::BFE_I32:
5500 case AMDGPUISD::BFE_U32: {
5501 assert(!N->getValueType(0).isVector() &&
5502 "Vector handling of BFE not implemented");
5503 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 2));
5504 if (!Width)
5505 break;
5506
5507 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5508 if (WidthVal == 0)
5509 return DAG.getConstant(Val: 0, DL, VT: MVT::i32);
5510
5511 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
5512 if (!Offset)
5513 break;
5514
5515 SDValue BitsFrom = N->getOperand(Num: 0);
5516 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5517
5518 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5519
5520 if (OffsetVal == 0) {
5521 // This is already sign / zero extended, so try to fold away extra BFEs.
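      // A signed field of width W sign-extends from bit W-1, giving at least
      // 32 - W + 1 known sign bits; an unsigned field leaves 32 - W high bits
      // that all equal zero.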
5522 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5523
5524 unsigned OpSignBits = DAG.ComputeNumSignBits(Op: BitsFrom);
5525 if (OpSignBits >= SignBits)
5526 return BitsFrom;
5527
5528 EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WidthVal);
5529 if (Signed) {
5530 // This is a sign_extend_inreg. Replace it to take advantage of existing
5531 // DAG Combines. If not eliminated, we will match back to BFE during
5532 // selection.
5533
5534        // TODO: The sext_inreg of extended types ends up expanded during
5535        // legalization, although we could handle them in a single BFE.
5536 return DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: MVT::i32, N1: BitsFrom,
5537 N2: DAG.getValueType(SmallVT));
5538 }
5539
5540 return DAG.getZeroExtendInReg(Op: BitsFrom, DL, VT: SmallVT);
5541 }
5542
5543 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(Val&: BitsFrom)) {
5544 if (Signed) {
5545 return constantFoldBFE<int32_t>(DAG,
5546 Src0: CVal->getSExtValue(),
5547 Offset: OffsetVal,
5548 Width: WidthVal,
5549 DL);
5550 }
5551
5552 return constantFoldBFE<uint32_t>(DAG,
5553 Src0: CVal->getZExtValue(),
5554 Offset: OffsetVal,
5555 Width: WidthVal,
5556 DL);
5557 }
5558
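    // When the field runs through bit 31, the extract degenerates to a plain
    // shift right by the offset (arithmetic for BFE_I32, logical for BFE_U32).
    // The offset-16, width-16 form is left alone on subtargets with SDWA.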
5559 if ((OffsetVal + WidthVal) >= 32 &&
5560 !(OffsetVal == 16 && WidthVal == 16 && Subtarget->hasSDWA())) {
5561 SDValue ShiftVal = DAG.getConstant(Val: OffsetVal, DL, VT: MVT::i32);
5562 return DAG.getNode(Opcode: Signed ? ISD::SRA : ISD::SRL, DL, VT: MVT::i32,
5563 N1: BitsFrom, N2: ShiftVal);
5564 }
5565
5566 if (BitsFrom.hasOneUse()) {
5567 APInt Demanded = APInt::getBitsSet(numBits: 32,
5568 loBit: OffsetVal,
5569 hiBit: OffsetVal + WidthVal);
5570
5571 KnownBits Known;
5572 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5573 !DCI.isBeforeLegalizeOps());
5574 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5575 if (TLI.ShrinkDemandedConstant(Op: BitsFrom, DemandedBits: Demanded, TLO) ||
5576 TLI.SimplifyDemandedBits(Op: BitsFrom, DemandedBits: Demanded, Known, TLO)) {
5577 DCI.CommitTargetLoweringOpt(TLO);
5578 }
5579 }
5580
5581 break;
5582 }
5583 case ISD::LOAD:
5584 return performLoadCombine(N, DCI);
5585 case ISD::STORE:
5586 return performStoreCombine(N, DCI);
5587 case AMDGPUISD::RCP:
5588 case AMDGPUISD::RCP_IFLAG:
5589 return performRcpCombine(N, DCI);
5590 case ISD::AssertZext:
5591 case ISD::AssertSext:
5592 return performAssertSZExtCombine(N, DCI);
5593 case ISD::INTRINSIC_WO_CHAIN:
5594 return performIntrinsicWOChainCombine(N, DCI);
5595 case AMDGPUISD::FMAD_FTZ: {
5596 SDValue N0 = N->getOperand(Num: 0);
5597 SDValue N1 = N->getOperand(Num: 1);
5598 SDValue N2 = N->getOperand(Num: 2);
5599 EVT VT = N->getValueType(ResNo: 0);
5600
5601    // FMAD_FTZ is an FMAD that flushes denormals to zero.
5602    // We flush the inputs, the intermediate product, and the output.
5603 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(Val&: N0);
5604 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(Val&: N1);
5605 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(Val&: N2);
5606 if (N0CFP && N1CFP && N2CFP) {
5607 const auto FTZ = [](const APFloat &V) {
5608 if (V.isDenormal()) {
5609 APFloat Zero(V.getSemantics(), 0);
5610 return V.isNegative() ? -Zero : Zero;
5611 }
5612 return V;
5613 };
5614
5615 APFloat V0 = FTZ(N0CFP->getValueAPF());
5616 APFloat V1 = FTZ(N1CFP->getValueAPF());
5617 APFloat V2 = FTZ(N2CFP->getValueAPF());
5618 V0.multiply(RHS: V1, RM: APFloat::rmNearestTiesToEven);
5619 V0 = FTZ(V0);
5620 V0.add(RHS: V2, RM: APFloat::rmNearestTiesToEven);
5621 return DAG.getConstantFP(Val: FTZ(V0), DL, VT);
5622 }
5623 break;
5624 }
5625 }
5626 return SDValue();
5627}
5628
5629//===----------------------------------------------------------------------===//
5630// Helper functions
5631//===----------------------------------------------------------------------===//
5632
5633SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5634 const TargetRegisterClass *RC,
5635 Register Reg, EVT VT,
5636 const SDLoc &SL,
5637 bool RawReg) const {
5638 MachineFunction &MF = DAG.getMachineFunction();
5639 MachineRegisterInfo &MRI = MF.getRegInfo();
5640 Register VReg;
5641
5642 if (!MRI.isLiveIn(Reg)) {
5643 VReg = MRI.createVirtualRegister(RegClass: RC);
5644 MRI.addLiveIn(Reg, vreg: VReg);
5645 } else {
5646 VReg = MRI.getLiveInVirtReg(PReg: Reg);
5647 }
5648
5649 if (RawReg)
5650 return DAG.getRegister(Reg: VReg, VT);
5651
5652 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: VReg, VT);
5653}
5654
5655// This may be called multiple times, and nothing prevents creating multiple
5656// objects at the same offset. See if we already defined this object.
5657static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5658 int64_t Offset) {
5659 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5660 if (MFI.getObjectOffset(ObjectIdx: I) == Offset) {
5661 assert(MFI.getObjectSize(I) == Size);
5662 return I;
5663 }
5664 }
5665
5666 return MFI.CreateFixedObject(Size, SPOffset: Offset, IsImmutable: true);
5667}
5668
5669SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5670 EVT VT,
5671 const SDLoc &SL,
5672 int64_t Offset) const {
5673 MachineFunction &MF = DAG.getMachineFunction();
5674 MachineFrameInfo &MFI = MF.getFrameInfo();
5675 int FI = getOrCreateFixedStackObject(MFI, Size: VT.getStoreSize(), Offset);
5676
5677 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5678 SDValue Ptr = DAG.getFrameIndex(FI, VT: MVT::i32);
5679
5680 return DAG.getLoad(VT, dl: SL, Chain: DAG.getEntryNode(), Ptr, PtrInfo: SrcPtrInfo, Alignment: Align(4),
5681 MMOFlags: MachineMemOperand::MODereferenceable |
5682 MachineMemOperand::MOInvariant);
5683}
5684
5685SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5686 const SDLoc &SL,
5687 SDValue Chain,
5688 SDValue ArgVal,
5689 int64_t Offset) const {
5690 MachineFunction &MF = DAG.getMachineFunction();
5691 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5692 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5693
5694 SDValue Ptr = DAG.getConstant(Val: Offset, DL: SL, VT: MVT::i32);
5695 // Stores to the argument stack area are relative to the stack pointer.
5696 SDValue SP =
5697 DAG.getCopyFromReg(Chain, dl: SL, Reg: Info->getStackPtrOffsetReg(), VT: MVT::i32);
5698 Ptr = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: SP, N2: Ptr);
5699 SDValue Store = DAG.getStore(Chain, dl: SL, Val: ArgVal, Ptr, PtrInfo: DstInfo, Alignment: Align(4),
5700 MMOFlags: MachineMemOperand::MODereferenceable);
5701 return Store;
5702}
5703
5704SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5705 const TargetRegisterClass *RC,
5706 EVT VT, const SDLoc &SL,
5707 const ArgDescriptor &Arg) const {
5708 assert(Arg && "Attempting to load missing argument");
5709
5710 SDValue V = Arg.isRegister() ?
5711 CreateLiveInRegister(DAG, RC, Reg: Arg.getRegister(), VT, SL) :
5712 loadStackInputValue(DAG, VT, SL, Offset: Arg.getStackOffset());
5713
5714 if (!Arg.isMasked())
5715 return V;
5716
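  // A masked argument shares its register with other values (e.g. packed
  // workitem IDs); shift the field down to bit 0 and mask off the rest.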
5717 unsigned Mask = Arg.getMask();
5718 unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);
5719 V = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: V,
5720 N2: DAG.getShiftAmountConstant(Val: Shift, VT, DL: SL));
5721 return DAG.getNode(Opcode: ISD::AND, DL: SL, VT, N1: V,
5722 N2: DAG.getConstant(Val: Mask >> Shift, DL: SL, VT));
5723}
5724
5725uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5726 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5727 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5728 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5729 uint64_t ArgOffset =
5730 alignTo(Size: ExplicitKernArgSize, A: Alignment) + ExplicitArgOffset;
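  // Implicit arguments follow the explicit kernel argument block, with the
  // explicit size rounded up to the implicit-argument alignment.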
5731 switch (Param) {
5732 case FIRST_IMPLICIT:
5733 return ArgOffset;
5734 case PRIVATE_BASE:
5735 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5736 case SHARED_BASE:
5737 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5738 case QUEUE_PTR:
5739 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5740 }
5741 llvm_unreachable("unexpected implicit parameter type");
5742}
5743
5744uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5745 const MachineFunction &MF, const ImplicitParameter Param) const {
5746 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5747 return getImplicitParameterOffset(ExplicitKernArgSize: MFI->getExplicitKernArgSize(), Param);
5748}
5749
5750SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5751 SelectionDAG &DAG, int Enabled,
5752 int &RefinementSteps,
5753 bool &UseOneConstNR,
5754 bool Reciprocal) const {
5755 EVT VT = Operand.getValueType();
5756
5757 if (VT == MVT::f32) {
5758 RefinementSteps = 0;
5759 return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc(Operand), VT, Operand);
5760 }
5761
5762  // TODO: There is also an f64 rsq instruction, but the documentation is less
5763  // clear about its precision.
5764
5765 return SDValue();
5766}
5767
5768SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5769 SelectionDAG &DAG, int Enabled,
5770 int &RefinementSteps) const {
5771 EVT VT = Operand.getValueType();
5772
5773 if (VT == MVT::f32) {
5774 // Reciprocal, < 1 ulp error.
5775 //
5776 // This reciprocal approximation converges to < 0.5 ulp error with one
5777    // Newton-Raphson step performed with two fused multiply-adds (FMAs).
5778
5779 RefinementSteps = 0;
5780 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SDLoc(Operand), VT, Operand);
5781 }
5782
5783  // TODO: There is also an f64 rcp instruction, but the documentation is less
5784  // clear about its precision.
5785
5786 return SDValue();
5787}
5788
5789static unsigned workitemIntrinsicDim(unsigned ID) {
5790 switch (ID) {
5791 case Intrinsic::amdgcn_workitem_id_x:
5792 return 0;
5793 case Intrinsic::amdgcn_workitem_id_y:
5794 return 1;
5795 case Intrinsic::amdgcn_workitem_id_z:
5796 return 2;
5797 default:
5798 llvm_unreachable("not a workitem intrinsic");
5799 }
5800}
5801
5802void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5803 const SDValue Op, KnownBits &Known,
5804 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5805
5806 Known.resetAll(); // Don't know anything.
5807
5808 unsigned Opc = Op.getOpcode();
5809
5810 switch (Opc) {
5811 default:
5812 break;
5813 case AMDGPUISD::CARRY:
5814 case AMDGPUISD::BORROW: {
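    // CARRY and BORROW produce either 0 or 1, so the upper 31 bits are zero.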
5815 Known.Zero = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 31);
5816 break;
5817 }
5818
5819 case AMDGPUISD::BFE_I32:
5820 case AMDGPUISD::BFE_U32: {
5821 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
5822 if (!CWidth)
5823 return;
5824
5825 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5826
5827 if (Opc == AMDGPUISD::BFE_U32)
5828 Known.Zero = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 32 - Width);
5829
5830 break;
5831 }
5832 case AMDGPUISD::FP_TO_FP16: {
5833 unsigned BitWidth = Known.getBitWidth();
5834
5835 // High bits are zero.
5836 Known.Zero = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - 16);
5837 break;
5838 }
5839 case AMDGPUISD::MUL_U24:
5840 case AMDGPUISD::MUL_I24: {
5841 KnownBits LHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
5842 KnownBits RHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
5843 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5844 RHSKnown.countMinTrailingZeros();
5845 Known.Zero.setLowBits(std::min(a: TrailZ, b: 32u));
5846    // Skip the extra checks if all bits are already known to be zero.
5847 if (TrailZ >= 32)
5848 break;
5849
5850 // Truncate to 24 bits.
5851 LHSKnown = LHSKnown.trunc(BitWidth: 24);
5852 RHSKnown = RHSKnown.trunc(BitWidth: 24);
5853
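    // For the signed form, if the 24-bit product cannot overflow 32 bits, the
    // high bits are copies of the sign, which is known when both operands'
    // signs are known; for the unsigned form, bits at and above the sum of the
    // operands' active bit counts are zero.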
5854 if (Opc == AMDGPUISD::MUL_I24) {
5855 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5856 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5857 unsigned MaxValBits = LHSValBits + RHSValBits;
5858 if (MaxValBits > 32)
5859 break;
5860 unsigned SignBits = 32 - MaxValBits + 1;
5861 bool LHSNegative = LHSKnown.isNegative();
5862 bool LHSNonNegative = LHSKnown.isNonNegative();
5863 bool LHSPositive = LHSKnown.isStrictlyPositive();
5864 bool RHSNegative = RHSKnown.isNegative();
5865 bool RHSNonNegative = RHSKnown.isNonNegative();
5866 bool RHSPositive = RHSKnown.isStrictlyPositive();
5867
5868 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5869 Known.Zero.setHighBits(SignBits);
5870 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5871 Known.One.setHighBits(SignBits);
5872 } else {
5873 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5874 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5875 unsigned MaxValBits = LHSValBits + RHSValBits;
5876 if (MaxValBits >= 32)
5877 break;
5878 Known.Zero.setBitsFrom(MaxValBits);
5879 }
5880 break;
5881 }
5882 case AMDGPUISD::PERM: {
5883 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
5884 if (!CMask)
5885 return;
5886
5887 KnownBits LHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
5888 KnownBits RHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
5889 unsigned Sel = CMask->getZExtValue();
5890
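    // Each selector byte picks one output byte: values 0-3 take a byte from
    // the second operand, 4-6 take a byte from the first, 0x0c produces 0x00,
    // and values above 0x0c produce 0xff; anything else is left unknown here.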
5891 for (unsigned I = 0; I < 32; I += 8) {
5892 unsigned SelBits = Sel & 0xff;
5893 if (SelBits < 4) {
5894 SelBits *= 8;
5895 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5896 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5897 } else if (SelBits < 7) {
5898 SelBits = (SelBits & 3) * 8;
5899 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5900 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5901 } else if (SelBits == 0x0c) {
5902 Known.Zero |= 0xFFull << I;
5903 } else if (SelBits > 0x0c) {
5904 Known.One |= 0xFFull << I;
5905 }
5906 Sel >>= 8;
5907 }
5908 break;
5909 }
5910 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5911 Known.Zero.setHighBits(24);
5912 break;
5913 }
5914 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5915 Known.Zero.setHighBits(16);
5916 break;
5917 }
5918 case AMDGPUISD::LDS: {
5919 auto *GA = cast<GlobalAddressSDNode>(Val: Op.getOperand(i: 0).getNode());
5920 Align Alignment = GA->getGlobal()->getPointerAlignment(DL: DAG.getDataLayout());
5921
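    // The LDS offset is assumed to fit in 16 bits and to respect the global's
    // alignment.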
5922 Known.Zero.setHighBits(16);
5923 Known.Zero.setLowBits(Log2(A: Alignment));
5924 break;
5925 }
5926 case AMDGPUISD::SMIN3:
5927 case AMDGPUISD::SMAX3:
5928 case AMDGPUISD::SMED3:
5929 case AMDGPUISD::UMIN3:
5930 case AMDGPUISD::UMAX3:
5931 case AMDGPUISD::UMED3: {
5932 KnownBits Known2 = DAG.computeKnownBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
5933 if (Known2.isUnknown())
5934 break;
5935
5936 KnownBits Known1 = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
5937 if (Known1.isUnknown())
5938 break;
5939
5940 KnownBits Known0 = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
5941 if (Known0.isUnknown())
5942 break;
5943
5944    // TODO: Handle LeadZero/LeadOne as in the generic UMIN/UMAX handling.
5945 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5946 Known.One = Known0.One & Known1.One & Known2.One;
5947 break;
5948 }
5949 case ISD::INTRINSIC_WO_CHAIN: {
5950 unsigned IID = Op.getConstantOperandVal(i: 0);
5951 switch (IID) {
5952 case Intrinsic::amdgcn_workitem_id_x:
5953 case Intrinsic::amdgcn_workitem_id_y:
5954 case Intrinsic::amdgcn_workitem_id_z: {
5955 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5956 Kernel: DAG.getMachineFunction().getFunction(), Dimension: workitemIntrinsicDim(ID: IID));
5957 Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue));
5958 break;
5959 }
5960 default:
5961 break;
5962 }
5963 }
5964 }
5965}
5966
5967unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
5968 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5969 unsigned Depth) const {
5970 switch (Op.getOpcode()) {
5971 case AMDGPUISD::BFE_I32: {
5972 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
5973 if (!Width)
5974 return 1;
5975
5976 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5977 if (!isNullConstant(V: Op.getOperand(i: 1)))
5978 return SignBits;
5979
5980 // TODO: Could probably figure something out with non-0 offsets.
5981 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
5982 return std::max(a: SignBits, b: Op0SignBits);
5983 }
5984
5985 case AMDGPUISD::BFE_U32: {
5986 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
5987 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5988 }
5989
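  // CARRY and BORROW are 0 or 1, so 31 high bits match the sign bit.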
5990 case AMDGPUISD::CARRY:
5991 case AMDGPUISD::BORROW:
5992 return 31;
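  // Sign-extending byte/short loads replicate bit 7/15 upward (25/17 sign
  // bits); zero-extending forms have 24/16 known-zero high bits.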
5993 case AMDGPUISD::BUFFER_LOAD_BYTE:
5994 return 25;
5995 case AMDGPUISD::BUFFER_LOAD_SHORT:
5996 return 17;
5997 case AMDGPUISD::BUFFER_LOAD_UBYTE:
5998 return 24;
5999 case AMDGPUISD::BUFFER_LOAD_USHORT:
6000 return 16;
6001 case AMDGPUISD::FP_TO_FP16:
6002 return 16;
6003 case AMDGPUISD::SMIN3:
6004 case AMDGPUISD::SMAX3:
6005 case AMDGPUISD::SMED3:
6006 case AMDGPUISD::UMIN3:
6007 case AMDGPUISD::UMAX3:
6008 case AMDGPUISD::UMED3: {
6009 unsigned Tmp2 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
6010 if (Tmp2 == 1)
6011 return 1; // Early out.
6012
6013 unsigned Tmp1 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
6014 if (Tmp1 == 1)
6015 return 1; // Early out.
6016
6017 unsigned Tmp0 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
6018 if (Tmp0 == 1)
6019 return 1; // Early out.
6020
6021 return std::min(l: {Tmp0, Tmp1, Tmp2});
6022 }
6023 default:
6024 return 1;
6025 }
6026}
6027
6028unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
6029 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
6030 const MachineRegisterInfo &MRI, unsigned Depth) const {
6031 const MachineInstr *MI = MRI.getVRegDef(Reg: R);
6032 if (!MI)
6033 return 1;
6034
6035 // TODO: Check range metadata on MMO.
6036 switch (MI->getOpcode()) {
6037 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
6038 return 25;
6039 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
6040 return 17;
6041 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
6042 return 24;
6043 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
6044 return 16;
6045 case AMDGPU::G_AMDGPU_SMED3:
6046 case AMDGPU::G_AMDGPU_UMED3: {
6047 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
6048 unsigned Tmp2 = Analysis.computeNumSignBits(R: Src2, DemandedElts, Depth: Depth + 1);
6049 if (Tmp2 == 1)
6050 return 1;
6051 unsigned Tmp1 = Analysis.computeNumSignBits(R: Src1, DemandedElts, Depth: Depth + 1);
6052 if (Tmp1 == 1)
6053 return 1;
6054 unsigned Tmp0 = Analysis.computeNumSignBits(R: Src0, DemandedElts, Depth: Depth + 1);
6055 if (Tmp0 == 1)
6056 return 1;
6057 return std::min(l: {Tmp0, Tmp1, Tmp2});
6058 }
6059 default:
6060 return 1;
6061 }
6062}
6063
6064bool AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode(
6065 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6066 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
6067 unsigned Opcode = Op.getOpcode();
6068 switch (Opcode) {
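  // Given non-poison operands, BFE produces a well-defined 32-bit result and
  // never introduces undef or poison.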
6069 case AMDGPUISD::BFE_I32:
6070 case AMDGPUISD::BFE_U32:
6071 return false;
6072 }
6073 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
6074 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
6075}
6076
6077bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
6078 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
6079 unsigned Depth) const {
6080 unsigned Opcode = Op.getOpcode();
6081 switch (Opcode) {
6082 case AMDGPUISD::FMIN_LEGACY:
6083 case AMDGPUISD::FMAX_LEGACY: {
6084 if (SNaN)
6085 return true;
6086
6087    // TODO: Checking that just one of the operands is never a NaN may be
6088    // enough, but which one?
6089 return false;
6090 }
6091 case AMDGPUISD::FMUL_LEGACY:
6092 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
6093 if (SNaN)
6094 return true;
6095 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1) &&
6096 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1);
6097 }
6098 case AMDGPUISD::FMED3:
6099 case AMDGPUISD::FMIN3:
6100 case AMDGPUISD::FMAX3:
6101 case AMDGPUISD::FMINIMUM3:
6102 case AMDGPUISD::FMAXIMUM3:
6103 case AMDGPUISD::FMAD_FTZ: {
6104 if (SNaN)
6105 return true;
6106 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1) &&
6107 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
6108 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1);
6109 }
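  // Converting an unsigned byte to f32 always yields a finite value in
  // [0, 255], never a NaN.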
6110 case AMDGPUISD::CVT_F32_UBYTE0:
6111 case AMDGPUISD::CVT_F32_UBYTE1:
6112 case AMDGPUISD::CVT_F32_UBYTE2:
6113 case AMDGPUISD::CVT_F32_UBYTE3:
6114 return true;
6115
6116 case AMDGPUISD::RCP:
6117 case AMDGPUISD::RSQ:
6118 case AMDGPUISD::RCP_LEGACY:
6119 case AMDGPUISD::RSQ_CLAMP: {
6120 if (SNaN)
6121 return true;
6122
6123    // TODO: Needs an is-known-positive check.
6124 return false;
6125 }
6126 case ISD::FLDEXP:
6127 case AMDGPUISD::FRACT: {
6128 if (SNaN)
6129 return true;
6130 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1);
6131 }
6132 case AMDGPUISD::DIV_SCALE:
6133 case AMDGPUISD::DIV_FMAS:
6134 case AMDGPUISD::DIV_FIXUP:
6135 // TODO: Refine on operands.
6136 return SNaN;
6137 case AMDGPUISD::SIN_HW:
6138 case AMDGPUISD::COS_HW: {
6139    // TODO: Needs a check for infinity.
6140 return SNaN;
6141 }
6142 case ISD::INTRINSIC_WO_CHAIN: {
6143 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
6144 // TODO: Handle more intrinsics
6145 switch (IntrinsicID) {
6146 case Intrinsic::amdgcn_cubeid:
6147 case Intrinsic::amdgcn_cvt_off_f32_i4:
6148 return true;
6149
6150 case Intrinsic::amdgcn_frexp_mant: {
6151 if (SNaN)
6152 return true;
6153 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1);
6154 }
6155 case Intrinsic::amdgcn_cvt_pkrtz: {
6156 if (SNaN)
6157 return true;
6158 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
6159 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1);
6160 }
6161 case Intrinsic::amdgcn_rcp:
6162 case Intrinsic::amdgcn_rsq:
6163 case Intrinsic::amdgcn_rcp_legacy:
6164 case Intrinsic::amdgcn_rsq_legacy:
6165 case Intrinsic::amdgcn_rsq_clamp:
6166 case Intrinsic::amdgcn_tanh: {
6167 if (SNaN)
6168 return true;
6169
6170      // TODO: Needs an is-known-positive check.
6171 return false;
6172 }
6173 case Intrinsic::amdgcn_trig_preop:
6174 case Intrinsic::amdgcn_fdot2:
6175 // TODO: Refine on operand
6176 return SNaN;
6177 case Intrinsic::amdgcn_fma_legacy:
6178 if (SNaN)
6179 return true;
6180 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
6181 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1) &&
6182 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 3), SNaN, Depth: Depth + 1);
6183 default:
6184 return false;
6185 }
6186 }
6187 default:
6188 return false;
6189 }
6190}
6191
6192bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
6193 Register N0, Register N1) const {
6194 return MRI.hasOneNonDBGUse(RegNo: N0); // FIXME: handle regbanks
6195}
6196