1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPUMachineFunction.h"
19#include "AMDGPUMemoryUtils.h"
20#include "SIMachineFunctionInfo.h"
21#include "llvm/CodeGen/Analysis.h"
22#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
23#include "llvm/CodeGen/MachineFrameInfo.h"
24#include "llvm/IR/DiagnosticInfo.h"
25#include "llvm/IR/IntrinsicsAMDGPU.h"
26#include "llvm/Support/CommandLine.h"
27#include "llvm/Support/KnownBits.h"
28#include "llvm/Target/TargetMachine.h"
29
30using namespace llvm;
31
32#include "AMDGPUGenCallingConv.inc"
33
34static cl::opt<bool> AMDGPUBypassSlowDiv(
35 "amdgpu-bypass-slow-div",
36 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
37 cl::init(Val: true));
38
39// Find a larger type to do a load / store of a vector with.
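// For example, a 64-bit value maps to v2i32 and a 96-bit vector to v3i32;
// sizes above 32 bits that are not a multiple of 32 are returned unchanged.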
40EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
41 unsigned StoreSize = VT.getStoreSizeInBits();
42 if (StoreSize <= 32)
43 return EVT::getIntegerVT(Context&: Ctx, BitWidth: StoreSize);
44
45 if (StoreSize % 32 == 0)
46 return EVT::getVectorVT(Context&: Ctx, VT: MVT::i32, NumElements: StoreSize / 32);
47
48 return VT;
49}
50
51unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
52 return DAG.computeKnownBits(Op).countMaxActiveBits();
53}
54
55unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
58 return DAG.ComputeMaxSignificantBits(Op);
59}
60
61AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
62 const AMDGPUSubtarget &STI)
63 : TargetLowering(TM), Subtarget(&STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
66 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
67 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
68 MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;
69
70 // Enable ganging up loads and stores in the memcpy DAG lowering.
71 MaxGluedStoresPerMemcpy = 16;
72
73 // Lower floating point store/load to integer store/load to reduce the number
74 // of patterns in tablegen.
75 setOperationAction(Op: ISD::LOAD, VT: MVT::f32, Action: Promote);
76 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f32, DestVT: MVT::i32);
77
78 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f32, Action: Promote);
79 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
80
81 setOperationAction(Op: ISD::LOAD, VT: MVT::v3f32, Action: Promote);
82 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
83
84 setOperationAction(Op: ISD::LOAD, VT: MVT::v4f32, Action: Promote);
85 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
86
87 setOperationAction(Op: ISD::LOAD, VT: MVT::v5f32, Action: Promote);
88 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
89
90 setOperationAction(Op: ISD::LOAD, VT: MVT::v6f32, Action: Promote);
91 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
92
93 setOperationAction(Op: ISD::LOAD, VT: MVT::v7f32, Action: Promote);
94 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
95
96 setOperationAction(Op: ISD::LOAD, VT: MVT::v8f32, Action: Promote);
97 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f32, DestVT: MVT::v8i32);
98
99 setOperationAction(Op: ISD::LOAD, VT: MVT::v9f32, Action: Promote);
100 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
101
102 setOperationAction(Op: ISD::LOAD, VT: MVT::v10f32, Action: Promote);
103 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
104
105 setOperationAction(Op: ISD::LOAD, VT: MVT::v11f32, Action: Promote);
106 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
107
108 setOperationAction(Op: ISD::LOAD, VT: MVT::v12f32, Action: Promote);
109 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
110
111 setOperationAction(Op: ISD::LOAD, VT: MVT::v16f32, Action: Promote);
112 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f32, DestVT: MVT::v16i32);
113
114 setOperationAction(Op: ISD::LOAD, VT: MVT::v32f32, Action: Promote);
115 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32f32, DestVT: MVT::v32i32);
116
117 setOperationAction(Op: ISD::LOAD, VT: MVT::i64, Action: Promote);
118 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::i64, DestVT: MVT::v2i32);
119
120 setOperationAction(Op: ISD::LOAD, VT: MVT::v2i64, Action: Promote);
121 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2i64, DestVT: MVT::v4i32);
122
123 setOperationAction(Op: ISD::LOAD, VT: MVT::f64, Action: Promote);
124 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f64, DestVT: MVT::v2i32);
125
126 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f64, Action: Promote);
127 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f64, DestVT: MVT::v4i32);
128
129 setOperationAction(Op: ISD::LOAD, VT: MVT::v3i64, Action: Promote);
130 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3i64, DestVT: MVT::v6i32);
131
132 setOperationAction(Op: ISD::LOAD, VT: MVT::v4i64, Action: Promote);
133 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4i64, DestVT: MVT::v8i32);
134
135 setOperationAction(Op: ISD::LOAD, VT: MVT::v3f64, Action: Promote);
136 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v3f64, DestVT: MVT::v6i32);
137
138 setOperationAction(Op: ISD::LOAD, VT: MVT::v4f64, Action: Promote);
139 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f64, DestVT: MVT::v8i32);
140
141 setOperationAction(Op: ISD::LOAD, VT: MVT::v8i64, Action: Promote);
142 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8i64, DestVT: MVT::v16i32);
143
144 setOperationAction(Op: ISD::LOAD, VT: MVT::v8f64, Action: Promote);
145 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f64, DestVT: MVT::v16i32);
146
147 setOperationAction(Op: ISD::LOAD, VT: MVT::v16i64, Action: Promote);
148 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16i64, DestVT: MVT::v32i32);
149
150 setOperationAction(Op: ISD::LOAD, VT: MVT::v16f64, Action: Promote);
151 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f64, DestVT: MVT::v32i32);
152
153 setOperationAction(Op: ISD::LOAD, VT: MVT::i128, Action: Promote);
154 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::i128, DestVT: MVT::v4i32);
155
  // TODO: Would be better to handle these as directly legal.
157 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f32, Action: Promote);
158 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f32, DestVT: MVT::i32);
159
160 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f64, Action: Promote);
161 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f64, DestVT: MVT::i64);
162
163 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::f16, Action: Promote);
164 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::f16, DestVT: MVT::i16);
165
166 setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::bf16, Action: Promote);
167 AddPromotedToType(Opc: ISD::ATOMIC_LOAD, OrigVT: MVT::bf16, DestVT: MVT::i16);
168
169 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f32, Action: Promote);
170 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f32, DestVT: MVT::i32);
171
172 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f64, Action: Promote);
173 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f64, DestVT: MVT::i64);
174
175 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::f16, Action: Promote);
176 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::f16, DestVT: MVT::i16);
177
178 setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::bf16, Action: Promote);
179 AddPromotedToType(Opc: ISD::ATOMIC_STORE, OrigVT: MVT::bf16, DestVT: MVT::i16);
180
181 // There are no 64-bit extloads. These should be done as a 32-bit extload and
182 // an extension to 64-bit.
183 for (MVT VT : MVT::integer_valuetypes())
184 setLoadExtAction(ExtTypes: {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, ValVT: MVT::i64, MemVT: VT,
185 Action: Expand);
186
187 for (MVT VT : MVT::integer_valuetypes()) {
188 if (VT == MVT::i64)
189 continue;
190
191 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
192 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i1, Action: Promote);
193 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i8, Action: Legal);
194 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i16, Action: Legal);
195 setLoadExtAction(ExtType: Op, ValVT: VT, MemVT: MVT::i32, Action: Expand);
196 }
197 }
198
199 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
200 for (auto MemVT :
201 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
202 setLoadExtAction(ExtTypes: {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, ValVT: VT, MemVT,
203 Action: Expand);
204
205 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f32, MemVT: MVT::f16, Action: Expand);
206 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f32, MemVT: MVT::bf16, Action: Expand);
207 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f32, MemVT: MVT::v2f16, Action: Expand);
208 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f32, MemVT: MVT::v2bf16, Action: Expand);
209 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f32, MemVT: MVT::v3f16, Action: Expand);
210 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f32, MemVT: MVT::v3bf16, Action: Expand);
211 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f32, MemVT: MVT::v4f16, Action: Expand);
212 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f32, MemVT: MVT::v4bf16, Action: Expand);
213 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f32, MemVT: MVT::v8f16, Action: Expand);
214 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f32, MemVT: MVT::v8bf16, Action: Expand);
215 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f32, MemVT: MVT::v16f16, Action: Expand);
216 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f32, MemVT: MVT::v16bf16, Action: Expand);
217 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v32f32, MemVT: MVT::v32f16, Action: Expand);
218 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v32f32, MemVT: MVT::v32bf16, Action: Expand);
219
220 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f32, Action: Expand);
221 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2f32, Action: Expand);
222 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3f32, Action: Expand);
223 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4f32, Action: Expand);
224 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8f32, Action: Expand);
225 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16f32, Action: Expand);
226
227 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::f16, Action: Expand);
228 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::f64, MemVT: MVT::bf16, Action: Expand);
229 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2f16, Action: Expand);
230 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v2f64, MemVT: MVT::v2bf16, Action: Expand);
231 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3f16, Action: Expand);
232 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v3f64, MemVT: MVT::v3bf16, Action: Expand);
233 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4f16, Action: Expand);
234 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4f64, MemVT: MVT::v4bf16, Action: Expand);
235 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8f16, Action: Expand);
236 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v8f64, MemVT: MVT::v8bf16, Action: Expand);
237 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16f16, Action: Expand);
238 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v16f64, MemVT: MVT::v16bf16, Action: Expand);
239
240 setOperationAction(Op: ISD::STORE, VT: MVT::f32, Action: Promote);
241 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f32, DestVT: MVT::i32);
242
243 setOperationAction(Op: ISD::STORE, VT: MVT::v2f32, Action: Promote);
244 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
245
246 setOperationAction(Op: ISD::STORE, VT: MVT::v3f32, Action: Promote);
247 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
248
249 setOperationAction(Op: ISD::STORE, VT: MVT::v4f32, Action: Promote);
250 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
251
252 setOperationAction(Op: ISD::STORE, VT: MVT::v5f32, Action: Promote);
253 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
254
255 setOperationAction(Op: ISD::STORE, VT: MVT::v6f32, Action: Promote);
256 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
257
258 setOperationAction(Op: ISD::STORE, VT: MVT::v7f32, Action: Promote);
259 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
260
261 setOperationAction(Op: ISD::STORE, VT: MVT::v8f32, Action: Promote);
262 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f32, DestVT: MVT::v8i32);
263
264 setOperationAction(Op: ISD::STORE, VT: MVT::v9f32, Action: Promote);
265 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
266
267 setOperationAction(Op: ISD::STORE, VT: MVT::v10f32, Action: Promote);
268 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
269
270 setOperationAction(Op: ISD::STORE, VT: MVT::v11f32, Action: Promote);
271 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
272
273 setOperationAction(Op: ISD::STORE, VT: MVT::v12f32, Action: Promote);
274 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
275
276 setOperationAction(Op: ISD::STORE, VT: MVT::v16f32, Action: Promote);
277 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f32, DestVT: MVT::v16i32);
278
279 setOperationAction(Op: ISD::STORE, VT: MVT::v32f32, Action: Promote);
280 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32f32, DestVT: MVT::v32i32);
281
282 setOperationAction(Op: ISD::STORE, VT: MVT::i64, Action: Promote);
283 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::i64, DestVT: MVT::v2i32);
284
285 setOperationAction(Op: ISD::STORE, VT: MVT::v2i64, Action: Promote);
286 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2i64, DestVT: MVT::v4i32);
287
288 setOperationAction(Op: ISD::STORE, VT: MVT::f64, Action: Promote);
289 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f64, DestVT: MVT::v2i32);
290
291 setOperationAction(Op: ISD::STORE, VT: MVT::v2f64, Action: Promote);
292 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f64, DestVT: MVT::v4i32);
293
294 setOperationAction(Op: ISD::STORE, VT: MVT::v3i64, Action: Promote);
295 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3i64, DestVT: MVT::v6i32);
296
297 setOperationAction(Op: ISD::STORE, VT: MVT::v3f64, Action: Promote);
298 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v3f64, DestVT: MVT::v6i32);
299
300 setOperationAction(Op: ISD::STORE, VT: MVT::v4i64, Action: Promote);
301 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i64, DestVT: MVT::v8i32);
302
303 setOperationAction(Op: ISD::STORE, VT: MVT::v4f64, Action: Promote);
304 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f64, DestVT: MVT::v8i32);
305
306 setOperationAction(Op: ISD::STORE, VT: MVT::v8i64, Action: Promote);
307 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8i64, DestVT: MVT::v16i32);
308
309 setOperationAction(Op: ISD::STORE, VT: MVT::v8f64, Action: Promote);
310 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f64, DestVT: MVT::v16i32);
311
312 setOperationAction(Op: ISD::STORE, VT: MVT::v16i64, Action: Promote);
313 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16i64, DestVT: MVT::v32i32);
314
315 setOperationAction(Op: ISD::STORE, VT: MVT::v16f64, Action: Promote);
316 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f64, DestVT: MVT::v32i32);
317
318 setOperationAction(Op: ISD::STORE, VT: MVT::i128, Action: Promote);
319 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::i128, DestVT: MVT::v4i32);
320
321 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i1, Action: Expand);
322 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i8, Action: Expand);
323 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i16, Action: Expand);
324 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i32, Action: Expand);
325
326 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i1, Action: Expand);
327 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i8, Action: Expand);
328 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i16, Action: Expand);
329 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i32, Action: Expand);
330
331 setTruncStoreAction(ValVT: MVT::f32, MemVT: MVT::bf16, Action: Expand);
332 setTruncStoreAction(ValVT: MVT::f32, MemVT: MVT::f16, Action: Expand);
333 setTruncStoreAction(ValVT: MVT::v2f32, MemVT: MVT::v2bf16, Action: Expand);
334 setTruncStoreAction(ValVT: MVT::v2f32, MemVT: MVT::v2f16, Action: Expand);
335 setTruncStoreAction(ValVT: MVT::v3f32, MemVT: MVT::v3bf16, Action: Expand);
336 setTruncStoreAction(ValVT: MVT::v3f32, MemVT: MVT::v3f16, Action: Expand);
337 setTruncStoreAction(ValVT: MVT::v4f32, MemVT: MVT::v4bf16, Action: Expand);
338 setTruncStoreAction(ValVT: MVT::v4f32, MemVT: MVT::v4f16, Action: Expand);
339 setTruncStoreAction(ValVT: MVT::v8f32, MemVT: MVT::v8bf16, Action: Expand);
340 setTruncStoreAction(ValVT: MVT::v8f32, MemVT: MVT::v8f16, Action: Expand);
341 setTruncStoreAction(ValVT: MVT::v16f32, MemVT: MVT::v16bf16, Action: Expand);
342 setTruncStoreAction(ValVT: MVT::v16f32, MemVT: MVT::v16f16, Action: Expand);
343 setTruncStoreAction(ValVT: MVT::v32f32, MemVT: MVT::v32bf16, Action: Expand);
344 setTruncStoreAction(ValVT: MVT::v32f32, MemVT: MVT::v32f16, Action: Expand);
345
346 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::bf16, Action: Expand);
347 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f16, Action: Expand);
348 setTruncStoreAction(ValVT: MVT::f64, MemVT: MVT::f32, Action: Expand);
349
350 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2f32, Action: Expand);
351 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2bf16, Action: Expand);
352 setTruncStoreAction(ValVT: MVT::v2f64, MemVT: MVT::v2f16, Action: Expand);
353
354 setTruncStoreAction(ValVT: MVT::v3i32, MemVT: MVT::v3i8, Action: Expand);
355
356 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i32, Action: Expand);
357 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i16, Action: Expand);
358 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i8, Action: Expand);
359 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i1, Action: Expand);
360 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3f32, Action: Expand);
361 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3bf16, Action: Expand);
362 setTruncStoreAction(ValVT: MVT::v3f64, MemVT: MVT::v3f16, Action: Expand);
363
364 setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i32, Action: Expand);
365 setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i16, Action: Expand);
366 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4f32, Action: Expand);
367 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4bf16, Action: Expand);
368 setTruncStoreAction(ValVT: MVT::v4f64, MemVT: MVT::v4f16, Action: Expand);
369
370 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8f32, Action: Expand);
371 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8bf16, Action: Expand);
372 setTruncStoreAction(ValVT: MVT::v8f64, MemVT: MVT::v8f16, Action: Expand);
373
374 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16f32, Action: Expand);
375 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16bf16, Action: Expand);
376 setTruncStoreAction(ValVT: MVT::v16f64, MemVT: MVT::v16f16, Action: Expand);
377 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i16, Action: Expand);
379 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i8, Action: Expand);
381 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i1, Action: Expand);
382
383 setOperationAction(Ops: ISD::Constant, VTs: {MVT::i32, MVT::i64}, Action: Legal);
384 setOperationAction(Ops: ISD::ConstantFP, VTs: {MVT::f32, MVT::f64}, Action: Legal);
385
386 setOperationAction(Ops: {ISD::BR_JT, ISD::BRIND}, VT: MVT::Other, Action: Expand);
387
388 // For R600, this is totally unsupported, just custom lower to produce an
389 // error.
390 setOperationAction(Op: ISD::DYNAMIC_STACKALLOC, VT: MVT::i32, Action: Custom);
391
392 // Library functions. These default to Expand, but we have instructions
393 // for them.
394 setOperationAction(Ops: {ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
395 ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
396 VT: MVT::f32, Action: Legal);
397
398 setOperationAction(Op: ISD::FLOG2, VT: MVT::f32, Action: Custom);
399 setOperationAction(Ops: ISD::FROUND, VTs: {MVT::f32, MVT::f64}, Action: Custom);
400 setOperationAction(Ops: {ISD::LROUND, ISD::LLROUND},
401 VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Expand);
402
403 setOperationAction(
404 Ops: {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, VT: MVT::f32,
405 Action: Custom);
406
407 setOperationAction(Ops: ISD::FNEARBYINT, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Custom);
408
409 setOperationAction(Ops: ISD::FRINT, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Custom);
410
411 setOperationAction(Ops: {ISD::LRINT, ISD::LLRINT}, VTs: {MVT::f16, MVT::f32, MVT::f64},
412 Action: Expand);
413
414 setOperationAction(Ops: ISD::FREM, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Custom);
415
416 if (Subtarget->has16BitInsts())
417 setOperationAction(Ops: ISD::IS_FPCLASS, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Legal);
418 else {
419 setOperationAction(Ops: ISD::IS_FPCLASS, VTs: {MVT::f32, MVT::f64}, Action: Legal);
420 setOperationAction(Ops: {ISD::FLOG2, ISD::FEXP2}, VT: MVT::f16, Action: Custom);
421 }
422
423 setOperationAction(Ops: {ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, VT: MVT::f16,
424 Action: Custom);
425
426 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
427 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
428 // default unless marked custom/legal.
429 setOperationAction(Ops: ISD::IS_FPCLASS,
430 VTs: {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
431 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
432 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
433 MVT::v16f64},
434 Action: Custom);
435
436 if (isTypeLegal(VT: MVT::f16))
437 setOperationAction(Ops: ISD::IS_FPCLASS,
438 VTs: {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16},
439 Action: Custom);
440
441 // Expand to fneg + fadd.
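  // i.e. (fsub x, y) is lowered as (fadd x, (fneg y)); the fneg is typically
  // folded into a source modifier on the add.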
442 setOperationAction(Op: ISD::FSUB, VT: MVT::f64, Action: Expand);
443
444 setOperationAction(Ops: ISD::CONCAT_VECTORS,
445 VTs: {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
446 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
447 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
448 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
449 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
450 Action: Custom);
451
452 setOperationAction(
453 Ops: ISD::EXTRACT_SUBVECTOR,
454 VTs: {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
455 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
456 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
457 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
458 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
459 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
460 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
461 Action: Custom);
462
463 setOperationAction(Op: ISD::FP16_TO_FP, VT: MVT::f64, Action: Expand);
464 setOperationAction(Ops: ISD::FP_TO_FP16, VTs: {MVT::f64, MVT::f32}, Action: Custom);
465
466 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
467 for (MVT VT : ScalarIntVTs) {
468 // These should use [SU]DIVREM, so set them to expand
469 setOperationAction(Ops: {ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
470 Action: Expand);
471
472 // GPU does not have divrem function for signed or unsigned.
473 setOperationAction(Ops: {ISD::SDIVREM, ISD::UDIVREM}, VT, Action: Custom);
474
475 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
476 setOperationAction(Ops: {ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Action: Expand);
477
478 setOperationAction(Ops: {ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Action: Expand);
479
480 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
481 setOperationAction(Ops: {ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Action: Legal);
482 }
483
484 // The hardware supports 32-bit FSHR, but not FSHL.
485 setOperationAction(Op: ISD::FSHR, VT: MVT::i32, Action: Legal);
486
487 // The hardware supports 32-bit ROTR, but not ROTL.
488 setOperationAction(Ops: ISD::ROTL, VTs: {MVT::i32, MVT::i64}, Action: Expand);
489 setOperationAction(Op: ISD::ROTR, VT: MVT::i64, Action: Expand);
490
491 setOperationAction(Ops: {ISD::MULHU, ISD::MULHS}, VT: MVT::i16, Action: Expand);
492
493 setOperationAction(Ops: {ISD::MUL, ISD::MULHU, ISD::MULHS}, VT: MVT::i64, Action: Expand);
494 setOperationAction(
495 Ops: {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
496 VT: MVT::i64, Action: Custom);
497 setOperationAction(Op: ISD::SELECT_CC, VT: MVT::i64, Action: Expand);
498
499 setOperationAction(Ops: {ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, VT: MVT::i32,
500 Action: Legal);
501
502 setOperationAction(
503 Ops: {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
504 VT: MVT::i64, Action: Custom);
505
506 for (auto VT : {MVT::i8, MVT::i16})
507 setOperationAction(Ops: {ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Action: Custom);
508
509 static const MVT::SimpleValueType VectorIntTypes[] = {
510 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
511 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
512
513 for (MVT VT : VectorIntTypes) {
514 // Expand the following operations for the current type by default.
515 setOperationAction(Ops: {ISD::ADD, ISD::AND, ISD::FP_TO_SINT,
516 ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU,
517 ISD::MULHS, ISD::OR, ISD::SHL,
518 ISD::SRA, ISD::SRL, ISD::ROTL,
519 ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP,
520 ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV,
521 ISD::SREM, ISD::UREM, ISD::SMUL_LOHI,
522 ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM,
523 ISD::SELECT, ISD::VSELECT, ISD::SELECT_CC,
524 ISD::XOR, ISD::BSWAP, ISD::CTPOP,
525 ISD::CTTZ, ISD::CTLZ, ISD::VECTOR_SHUFFLE,
526 ISD::SETCC, ISD::ADDRSPACECAST},
527 VT, Action: Expand);
528 }
529
530 static const MVT::SimpleValueType FloatVectorTypes[] = {
531 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
532 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
533
534 for (MVT VT : FloatVectorTypes) {
535 setOperationAction(
536 Ops: {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
537 ISD::FADD, ISD::FCEIL, ISD::FCOS,
538 ISD::FDIV, ISD::FEXP2, ISD::FEXP,
539 ISD::FEXP10, ISD::FLOG2, ISD::FREM,
540 ISD::FLOG, ISD::FLOG10, ISD::FPOW,
541 ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
542 ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
543 ISD::FSQRT, ISD::FSIN, ISD::FSUB,
544 ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
545 ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC,
546 ISD::FCANONICALIZE, ISD::FROUNDEVEN},
547 VT, Action: Expand);
548 }
549
  // This causes an unrolled select operation to be used rather than expansion
  // with bit operations. This is in general better, but the alternative using
  // BFI instructions may be better if the select sources are SGPRs.
553 setOperationAction(Op: ISD::SELECT, VT: MVT::v2f32, Action: Promote);
554 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2f32, DestVT: MVT::v2i32);
555
556 setOperationAction(Op: ISD::SELECT, VT: MVT::v3f32, Action: Promote);
557 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v3f32, DestVT: MVT::v3i32);
558
559 setOperationAction(Op: ISD::SELECT, VT: MVT::v4f32, Action: Promote);
560 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v4f32, DestVT: MVT::v4i32);
561
562 setOperationAction(Op: ISD::SELECT, VT: MVT::v5f32, Action: Promote);
563 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v5f32, DestVT: MVT::v5i32);
564
565 setOperationAction(Op: ISD::SELECT, VT: MVT::v6f32, Action: Promote);
566 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v6f32, DestVT: MVT::v6i32);
567
568 setOperationAction(Op: ISD::SELECT, VT: MVT::v7f32, Action: Promote);
569 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v7f32, DestVT: MVT::v7i32);
570
571 setOperationAction(Op: ISD::SELECT, VT: MVT::v9f32, Action: Promote);
572 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v9f32, DestVT: MVT::v9i32);
573
574 setOperationAction(Op: ISD::SELECT, VT: MVT::v10f32, Action: Promote);
575 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v10f32, DestVT: MVT::v10i32);
576
577 setOperationAction(Op: ISD::SELECT, VT: MVT::v11f32, Action: Promote);
578 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v11f32, DestVT: MVT::v11i32);
579
580 setOperationAction(Op: ISD::SELECT, VT: MVT::v12f32, Action: Promote);
581 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v12f32, DestVT: MVT::v12i32);
582
583 setSchedulingPreference(Sched::RegPressure);
584 setJumpIsExpensive(true);
585
586 // FIXME: This is only partially true. If we have to do vector compares, any
587 // SGPR pair can be a condition register. If we have a uniform condition, we
588 // are better off doing SALU operations, where there is only one SCC. For now,
589 // we don't have a way of knowing during instruction selection if a condition
590 // will be uniform and we always use vector compares. Assume we are using
591 // vector compares until that is fixed.
592 setHasMultipleConditionRegisters(true);
593
594 setMinCmpXchgSizeInBits(32);
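  // Atomic cmpxchg on types narrower than 32 bits is widened by AtomicExpand
  // into a masked 32-bit operation.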
595 setSupportsUnalignedAtomics(false);
596
597 PredictableSelectIsExpensive = false;
598
  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors of more than 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we usually have to split the loads.
  // When we can more precisely specify load legality per address space, we
  // should be able to make FindBetterChain/MergeConsecutiveStores smarter so
  // that they can figure out what to do in 2 iterations without all N > 4
  // stores on the same chain.
607 GatherAllAliasesMaxDepth = 16;
608
609 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
610 // about these during lowering.
611 MaxStoresPerMemcpy = 0xffffffff;
612 MaxStoresPerMemmove = 0xffffffff;
613 MaxStoresPerMemset = 0xffffffff;
614
615 // The expansion for 64-bit division is enormous.
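  // With the bypass, a 64-bit division whose operands dynamically fit in 32
  // bits is dispatched at run time to the much cheaper 32-bit expansion.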
616 if (AMDGPUBypassSlowDiv)
617 addBypassSlowDiv(SlowBitWidth: 64, FastBitWidth: 32);
618
619 setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
620 ISD::SRA, ISD::SRL,
621 ISD::TRUNCATE, ISD::MUL,
622 ISD::SMUL_LOHI, ISD::UMUL_LOHI,
623 ISD::MULHU, ISD::MULHS,
624 ISD::SELECT, ISD::SELECT_CC,
625 ISD::STORE, ISD::FADD,
626 ISD::FSUB, ISD::FNEG,
627 ISD::FABS, ISD::AssertZext,
628 ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
629
630 setMaxAtomicSizeInBitsSupported(64);
631 setMaxDivRemBitWidthSupported(64);
632 setMaxLargeFPConvertBitWidthSupported(64);
633}
634
635bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
636 if (getTargetMachine().Options.NoSignedZerosFPMath)
637 return true;
638
639 const auto Flags = Op.getNode()->getFlags();
640 if (Flags.hasNoSignedZeros())
641 return true;
642
643 return false;
644}
645
646//===----------------------------------------------------------------------===//
647// Target Information
648//===----------------------------------------------------------------------===//
649
650LLVM_READNONE
651static bool fnegFoldsIntoOpcode(unsigned Opc) {
652 switch (Opc) {
653 case ISD::FADD:
654 case ISD::FSUB:
655 case ISD::FMUL:
656 case ISD::FMA:
657 case ISD::FMAD:
658 case ISD::FMINNUM:
659 case ISD::FMAXNUM:
660 case ISD::FMINNUM_IEEE:
661 case ISD::FMAXNUM_IEEE:
662 case ISD::FMINIMUM:
663 case ISD::FMAXIMUM:
664 case ISD::FMINIMUMNUM:
665 case ISD::FMAXIMUMNUM:
666 case ISD::SELECT:
667 case ISD::FSIN:
668 case ISD::FTRUNC:
669 case ISD::FRINT:
670 case ISD::FNEARBYINT:
671 case ISD::FROUNDEVEN:
672 case ISD::FCANONICALIZE:
673 case AMDGPUISD::RCP:
674 case AMDGPUISD::RCP_LEGACY:
675 case AMDGPUISD::RCP_IFLAG:
676 case AMDGPUISD::SIN_HW:
677 case AMDGPUISD::FMUL_LEGACY:
678 case AMDGPUISD::FMIN_LEGACY:
679 case AMDGPUISD::FMAX_LEGACY:
680 case AMDGPUISD::FMED3:
681 // TODO: handle llvm.amdgcn.fma.legacy
682 return true;
683 case ISD::BITCAST:
684 llvm_unreachable("bitcast is special cased");
685 default:
686 return false;
687 }
688}
689
690static bool fnegFoldsIntoOp(const SDNode *N) {
691 unsigned Opc = N->getOpcode();
692 if (Opc == ISD::BITCAST) {
693 // TODO: Is there a benefit to checking the conditions performFNegCombine
694 // does? We don't for the other cases.
695 SDValue BCSrc = N->getOperand(Num: 0);
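    // e.g. an f64 assembled from two 32-bit halves: the fneg only needs to be
    // applied to the high element, which holds the sign bit.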
696 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
697 return BCSrc.getNumOperands() == 2 &&
698 BCSrc.getOperand(i: 1).getValueSizeInBits() == 32;
699 }
700
701 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
702 }
703
704 return fnegFoldsIntoOpcode(Opc);
705}
706
/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
710LLVM_READONLY
711static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
712 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
713 VT == MVT::f64;
714}
715
/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
/// the given type when used for ISD::SELECT.
718LLVM_READONLY
719static bool selectSupportsSourceMods(const SDNode *N) {
720 // TODO: Only applies if select will be vector
721 return N->getValueType(ResNo: 0) == MVT::f32;
722}
723
724// Most FP instructions support source modifiers, but this could be refined
725// slightly.
726LLVM_READONLY
727static bool hasSourceMods(const SDNode *N) {
728 if (isa<MemSDNode>(Val: N))
729 return false;
730
731 switch (N->getOpcode()) {
732 case ISD::CopyToReg:
733 case ISD::FDIV:
734 case ISD::FREM:
735 case ISD::INLINEASM:
736 case ISD::INLINEASM_BR:
737 case AMDGPUISD::DIV_SCALE:
738 case ISD::INTRINSIC_W_CHAIN:
739
740 // TODO: Should really be looking at the users of the bitcast. These are
741 // problematic because bitcasts are used to legalize all stores to integer
742 // types.
743 case ISD::BITCAST:
744 return false;
745 case ISD::INTRINSIC_WO_CHAIN: {
746 switch (N->getConstantOperandVal(Num: 0)) {
747 case Intrinsic::amdgcn_interp_p1:
748 case Intrinsic::amdgcn_interp_p2:
749 case Intrinsic::amdgcn_interp_mov:
750 case Intrinsic::amdgcn_interp_p1_f16:
751 case Intrinsic::amdgcn_interp_p2_f16:
752 return false;
753 default:
754 return true;
755 }
756 }
757 case ISD::SELECT:
758 return selectSupportsSourceMods(N);
759 default:
760 return true;
761 }
762}
763
764bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
765 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, so for
  // them a source modifier is truly free. If there are multiple users and each
  // one would have to switch to a VOP3 encoding to take the modifier, there
  // will be a net code size increase. Try to avoid increasing code size unless
  // we know it will save on the instruction count.
771 unsigned NumMayIncreaseSize = 0;
772 MVT VT = N->getValueType(ResNo: 0).getScalarType().getSimpleVT();
773
774 assert(!N->use_empty());
775
776 // XXX - Should this limit number of uses to check?
777 for (const SDNode *U : N->users()) {
778 if (!hasSourceMods(N: U))
779 return false;
780
781 if (!opMustUseVOP3Encoding(N: U, VT)) {
782 if (++NumMayIncreaseSize > CostThreshold)
783 return false;
784 }
785 }
786
787 return true;
788}
789
790EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
791 ISD::NodeType ExtendKind) const {
792 assert(!VT.isVector() && "only scalar expected");
793
794 // Round to the next multiple of 32-bits.
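  // For example, an i16 return value is extended to i32 and an i48 to i64.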
795 unsigned Size = VT.getSizeInBits();
796 if (Size <= 32)
797 return MVT::i32;
798 return EVT::getIntegerVT(Context, BitWidth: 32 * ((Size + 31) / 32));
799}
800
801unsigned AMDGPUTargetLowering::getVectorIdxWidth(const DataLayout &) const {
802 return 32;
803}
804
805bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
806 return true;
807}
808
// The backend supports 32- and 64-bit floating-point immediates, and 16-bit
// ones when the subtarget has 16-bit instructions.
// FIXME: Why are we reporting vectors of FP immediates as legal?
811bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
812 bool ForCodeSize) const {
813 EVT ScalarVT = VT.getScalarType();
814 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
815 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
816}
817
818// We don't want to shrink f64 / f32 constants.
819bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
820 EVT ScalarVT = VT.getScalarType();
821 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
822}
823
824bool AMDGPUTargetLowering::shouldReduceLoadWidth(
825 SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
826 std::optional<unsigned> ByteOffset) const {
827 // TODO: This may be worth removing. Check regression tests for diffs.
828 if (!TargetLoweringBase::shouldReduceLoadWidth(Load: N, ExtTy, NewVT, ByteOffset))
829 return false;
830
831 unsigned NewSize = NewVT.getStoreSizeInBits();
832
833 // If we are reducing to a 32-bit load or a smaller multi-dword load,
834 // this is always better.
835 if (NewSize >= 32)
836 return true;
837
838 EVT OldVT = N->getValueType(ResNo: 0);
839 unsigned OldSize = OldVT.getStoreSizeInBits();
840
841 MemSDNode *MN = cast<MemSDNode>(Val: N);
842 unsigned AS = MN->getAddressSpace();
843 // Do not shrink an aligned scalar load to sub-dword.
844 // Scalar engine cannot do sub-dword loads.
845 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
846 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
847 (AS == AMDGPUAS::CONSTANT_ADDRESS ||
848 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
849 (isa<LoadSDNode>(Val: N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
850 MN->isInvariant())) &&
851 AMDGPU::isUniformMMO(MMO: MN->getMemOperand()))
852 return false;
853
854 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
855 // extloads, so doing one requires using a buffer_load. In cases where we
856 // still couldn't use a scalar load, using the wider load shouldn't really
857 // hurt anything.
858
859 // If the old size already had to be an extload, there's no harm in continuing
860 // to reduce the width.
861 return (OldSize < 32);
862}
863
864bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
865 const SelectionDAG &DAG,
866 const MachineMemOperand &MMO) const {
867
868 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
869
870 if (LoadTy.getScalarType() == MVT::i32)
871 return false;
872
873 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
874 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
875
876 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
877 return false;
878
879 unsigned Fast = 0;
880 return allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
881 VT: CastTy, MMO, Fast: &Fast) &&
882 Fast;
883}
884
// SI+ has instructions for cttz / ctlz on 32-bit values. This is probably also
// profitable for 64-bit, even with the expansion, since it is generally good
// to speculate these operations.
888bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
889 return true;
890}
891
892bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
893 return true;
894}
895
896bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
897 switch (N->getOpcode()) {
898 case ISD::EntryToken:
899 case ISD::TokenFactor:
900 return true;
901 case ISD::INTRINSIC_WO_CHAIN: {
902 unsigned IntrID = N->getConstantOperandVal(Num: 0);
903 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
904 }
905 case ISD::INTRINSIC_W_CHAIN: {
906 unsigned IntrID = N->getConstantOperandVal(Num: 1);
907 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
908 }
909 case ISD::LOAD:
910 if (cast<LoadSDNode>(Val: N)->getMemOperand()->getAddrSpace() ==
911 AMDGPUAS::CONSTANT_ADDRESS_32BIT)
912 return true;
913 return false;
914 case AMDGPUISD::SETCC: // ballot-style instruction
915 return true;
916 }
917 return false;
918}
919
920SDValue AMDGPUTargetLowering::getNegatedExpression(
921 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
922 NegatibleCost &Cost, unsigned Depth) const {
923
924 switch (Op.getOpcode()) {
925 case ISD::FMA:
926 case ISD::FMAD: {
927 // Negating a fma is not free if it has users without source mods.
928 if (!allUsesHaveSourceMods(N: Op.getNode()))
929 return SDValue();
930 break;
931 }
932 case AMDGPUISD::RCP: {
933 SDValue Src = Op.getOperand(i: 0);
934 EVT VT = Op.getValueType();
935 SDLoc SL(Op);
936
937 SDValue NegSrc = getNegatedExpression(Op: Src, DAG, LegalOperations,
938 ForCodeSize, Cost, Depth: Depth + 1);
939 if (NegSrc)
940 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: NegSrc, Flags: Op->getFlags());
941 return SDValue();
942 }
943 default:
944 break;
945 }
946
947 return TargetLowering::getNegatedExpression(Op, DAG, LegalOps: LegalOperations,
948 OptForSize: ForCodeSize, Cost, Depth);
949}
950
951//===---------------------------------------------------------------------===//
952// Target Properties
953//===---------------------------------------------------------------------===//
954
955bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
956 assert(VT.isFloatingPoint());
957
958 // Packed operations do not have a fabs modifier.
959 return VT == MVT::f32 || VT == MVT::f64 ||
960 (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
961}
962
963bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
964 assert(VT.isFloatingPoint());
965 // Report this based on the end legalized type.
966 VT = VT.getScalarType();
967 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
968}
969
bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
971 unsigned NumElem,
972 unsigned AS) const {
973 return true;
974}
975
976bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
977 // There are few operations which truly have vector input operands. Any vector
978 // operation is going to involve operations on each component, and a
979 // build_vector will be a copy per element, so it always makes sense to use a
980 // build_vector input in place of the extracted element to avoid a copy into a
981 // super register.
982 //
983 // We should probably only do this if all users are extracts only, but this
984 // should be the common case.
985 return true;
986}
987
988bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
989 // Truncate is just accessing a subregister.
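  // e.g. truncating i64 to i32 just reads the low 32-bit subregister of the
  // 64-bit register pair.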
990
991 unsigned SrcSize = Source.getSizeInBits();
992 unsigned DestSize = Dest.getSizeInBits();
993
  return DestSize < SrcSize && DestSize % 32 == 0;
995}
996
997bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
998 // Truncate is just accessing a subregister.
999
1000 unsigned SrcSize = Source->getScalarSizeInBits();
1001 unsigned DestSize = Dest->getScalarSizeInBits();
1002
  if (DestSize == 16 && Subtarget->has16BitInsts())
1004 return SrcSize >= 32;
1005
1006 return DestSize < SrcSize && DestSize % 32 == 0;
1007}
1008
1009bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
1010 unsigned SrcSize = Src->getScalarSizeInBits();
1011 unsigned DestSize = Dest->getScalarSizeInBits();
1012
1013 if (SrcSize == 16 && Subtarget->has16BitInsts())
1014 return DestSize >= 32;
1015
1016 return SrcSize == 32 && DestSize == 64;
1017}
1018
1019bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For
  // all practical purposes, the extra mov 0 to load a 64-bit value is free. As
  // used, this will enable reducing 64-bit operations to 32-bit ones, which is
  // always good.
1024
1025 if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;
1027
1028 return Src == MVT::i32 && Dest == MVT::i64;
1029}
1030
1031bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
1032 EVT DestVT) const {
1033 switch (N->getOpcode()) {
1034 case ISD::ADD:
1035 case ISD::SUB:
1036 case ISD::SHL:
1037 case ISD::SRL:
1038 case ISD::SRA:
1039 case ISD::AND:
1040 case ISD::OR:
1041 case ISD::XOR:
1042 case ISD::MUL:
1043 case ISD::SETCC:
1044 case ISD::SELECT:
1045 case ISD::SMIN:
1046 case ISD::SMAX:
1047 case ISD::UMIN:
1048 case ISD::UMAX:
1049 if (Subtarget->has16BitInsts() &&
1050 (!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) {
1051 // Don't narrow back down to i16 if promoted to i32 already.
1052 if (!N->isDivergent() && DestVT.isInteger() &&
1053 DestVT.getScalarSizeInBits() > 1 &&
1054 DestVT.getScalarSizeInBits() <= 16 &&
1055 SrcVT.getScalarSizeInBits() > 16) {
1056 return false;
1057 }
1058 }
1059 return true;
1060 default:
1061 break;
1062 }
1063
1064 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1065 // limited number of native 64-bit operations. Shrinking an operation to fit
1066 // in a single 32-bit register should always be helpful. As currently used,
1067 // this is much less general than the name suggests, and is only used in
1068 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1069 // not profitable, and may actually be harmful.
1070 if (isa<LoadSDNode>(Val: N))
1071 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1072
1073 return true;
1074}
1075
1076bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
1077 const SDNode* N, CombineLevel Level) const {
1078 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1079 N->getOpcode() == ISD::SRL) &&
1080 "Expected shift op");
1081
1082 SDValue ShiftLHS = N->getOperand(Num: 0);
1083 if (!ShiftLHS->hasOneUse())
1084 return false;
1085
1086 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1087 !ShiftLHS.getOperand(i: 0)->hasOneUse())
1088 return false;
1089
1090 // Always commute pre-type legalization and right shifts.
1091 // We're looking for shl(or(x,y),z) patterns.
1092 if (Level < CombineLevel::AfterLegalizeTypes ||
1093 N->getOpcode() != ISD::SHL || N->getOperand(Num: 0).getOpcode() != ISD::OR)
1094 return true;
1095
  // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
1097 if (N->getValueType(ResNo: 0) == MVT::i32 && N->hasOneUse() &&
1098 (N->user_begin()->getOpcode() == ISD::SRA ||
1099 N->user_begin()->getOpcode() == ISD::SRL))
1100 return false;
1101
1102 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1103 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1104 if (LHS.getOpcode() != ISD::SHL)
1105 return false;
1106 auto *RHSLd = dyn_cast<LoadSDNode>(Val&: RHS);
1107 auto *LHS0 = dyn_cast<LoadSDNode>(Val: LHS.getOperand(i: 0));
1108 auto *LHS1 = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
1109 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1110 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1111 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1112 };
1113 SDValue LHS = N->getOperand(Num: 0).getOperand(i: 0);
1114 SDValue RHS = N->getOperand(Num: 0).getOperand(i: 1);
1115 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1116}
1117
1118//===---------------------------------------------------------------------===//
1119// TargetLowering Callbacks
1120//===---------------------------------------------------------------------===//
1121
1122CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
1123 bool IsVarArg) {
1124 switch (CC) {
1125 case CallingConv::AMDGPU_VS:
1126 case CallingConv::AMDGPU_GS:
1127 case CallingConv::AMDGPU_PS:
1128 case CallingConv::AMDGPU_CS:
1129 case CallingConv::AMDGPU_HS:
1130 case CallingConv::AMDGPU_ES:
1131 case CallingConv::AMDGPU_LS:
1132 return CC_AMDGPU;
1133 case CallingConv::AMDGPU_CS_Chain:
1134 case CallingConv::AMDGPU_CS_ChainPreserve:
1135 return CC_AMDGPU_CS_CHAIN;
1136 case CallingConv::C:
1137 case CallingConv::Fast:
1138 case CallingConv::Cold:
1139 return CC_AMDGPU_Func;
1140 case CallingConv::AMDGPU_Gfx:
1141 return CC_SI_Gfx;
1142 case CallingConv::AMDGPU_KERNEL:
1143 case CallingConv::SPIR_KERNEL:
1144 default:
1145 reportFatalUsageError(reason: "unsupported calling convention for call");
1146 }
1147}
1148
1149CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
1150 bool IsVarArg) {
1151 switch (CC) {
1152 case CallingConv::AMDGPU_KERNEL:
1153 case CallingConv::SPIR_KERNEL:
1154 llvm_unreachable("kernels should not be handled here");
1155 case CallingConv::AMDGPU_VS:
1156 case CallingConv::AMDGPU_GS:
1157 case CallingConv::AMDGPU_PS:
1158 case CallingConv::AMDGPU_CS:
1159 case CallingConv::AMDGPU_CS_Chain:
1160 case CallingConv::AMDGPU_CS_ChainPreserve:
1161 case CallingConv::AMDGPU_HS:
1162 case CallingConv::AMDGPU_ES:
1163 case CallingConv::AMDGPU_LS:
1164 return RetCC_SI_Shader;
1165 case CallingConv::AMDGPU_Gfx:
1166 return RetCC_SI_Gfx;
1167 case CallingConv::C:
1168 case CallingConv::Fast:
1169 case CallingConv::Cold:
1170 return RetCC_AMDGPU_Func;
1171 default:
1172 reportFatalUsageError(reason: "unsupported calling convention");
1173 }
1174}
1175
/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original type sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments().
1182
1183/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1184/// input values across multiple registers. Each item in the Ins array
1185/// represents a single value that will be stored in registers. Ins[x].VT is
1186/// the value type of the value that will be stored in the register, so
1187/// whatever SDNode we lower the argument to needs to be this type.
1188///
1189/// In order to correctly lower the arguments we need to know the size of each
1190/// argument. Since Ins[x].VT gives us the size of the register that will
1191/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1192/// for the original function argument so that we can deduce the correct memory
1193/// type to use for Ins[x]. In most cases the correct memory type will be
1194/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1195/// we have a kernel argument of type v8i8, this argument will be split into
1196/// 8 parts and each part will be represented by its own item in the Ins array.
1197/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1198/// the argument before it was split. From this, we deduce that the memory type
1199/// for each individual part is i8. We pass the memory type as LocVT to the
1200/// calling convention analysis function and the register type (Ins[x].VT) as
1201/// the ValVT.
1202void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
1203 CCState &State,
1204 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1205 const MachineFunction &MF = State.getMachineFunction();
1206 const Function &Fn = MF.getFunction();
1207 LLVMContext &Ctx = Fn.getParent()->getContext();
1208 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1209 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1210 CallingConv::ID CC = Fn.getCallingConv();
1211
1212 Align MaxAlign = Align(1);
1213 uint64_t ExplicitArgOffset = 0;
1214 const DataLayout &DL = Fn.getDataLayout();
1215
1216 unsigned InIndex = 0;
1217
1218 for (const Argument &Arg : Fn.args()) {
1219 const bool IsByRef = Arg.hasByRefAttr();
1220 Type *BaseArgTy = Arg.getType();
1221 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1222 Align Alignment = DL.getValueOrABITypeAlignment(
1223 Alignment: IsByRef ? Arg.getParamAlign() : std::nullopt, Ty: MemArgTy);
1224 MaxAlign = std::max(a: Alignment, b: MaxAlign);
1225 uint64_t AllocSize = DL.getTypeAllocSize(Ty: MemArgTy);
1226
1227 uint64_t ArgOffset = alignTo(Size: ExplicitArgOffset, A: Alignment) + ExplicitOffset;
1228 ExplicitArgOffset = alignTo(Size: ExplicitArgOffset, A: Alignment) + AllocSize;
1229
1230 // We're basically throwing away everything passed into us and starting over
1231 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1232 // to us as computed in Ins.
1233 //
1234 // We also need to figure out what type legalization is trying to do to get
1235 // the correct memory offsets.
1236
1237 SmallVector<EVT, 16> ValueVTs;
1238 SmallVector<uint64_t, 16> Offsets;
1239 ComputeValueVTs(TLI: *this, DL, Ty: BaseArgTy, ValueVTs, FixedOffsets: &Offsets, StartingOffset: ArgOffset);
1240
1241 for (unsigned Value = 0, NumValues = ValueVTs.size();
1242 Value != NumValues; ++Value) {
1243 uint64_t BasePartOffset = Offsets[Value];
1244
1245 EVT ArgVT = ValueVTs[Value];
1246 EVT MemVT = ArgVT;
1247 MVT RegisterVT = getRegisterTypeForCallingConv(Context&: Ctx, CC, VT: ArgVT);
1248 unsigned NumRegs = getNumRegistersForCallingConv(Context&: Ctx, CC, VT: ArgVT);
1249
1250 if (NumRegs == 1) {
1251 // This argument is not split, so the IR type is the memory type.
1252 if (ArgVT.isExtended()) {
1253 // We have an extended type, like i24, so we should just use the
1254 // register type.
1255 MemVT = RegisterVT;
1256 } else {
1257 MemVT = ArgVT;
1258 }
1259 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1260 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1261 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1262 // We have a vector value which has been split into a vector with
1263 // the same scalar type, but fewer elements. This should handle
1264 // all the floating-point vector types.
1265 MemVT = RegisterVT;
1266 } else if (ArgVT.isVector() &&
1267 ArgVT.getVectorNumElements() == NumRegs) {
1268 // This arg has been split so that each element is stored in a separate
1269 // register.
1270 MemVT = ArgVT.getScalarType();
1271 } else if (ArgVT.isExtended()) {
1272 // We have an extended type, like i65.
1273 MemVT = RegisterVT;
1274 } else {
1275 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1276 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1277 if (RegisterVT.isInteger()) {
1278 MemVT = EVT::getIntegerVT(Context&: State.getContext(), BitWidth: MemoryBits);
1279 } else if (RegisterVT.isVector()) {
1280 assert(!RegisterVT.getScalarType().isFloatingPoint());
1281 unsigned NumElements = RegisterVT.getVectorNumElements();
1282 assert(MemoryBits % NumElements == 0);
1283 // This vector type has been split into another vector type with
1284 // a different elements size.
1285 EVT ScalarVT = EVT::getIntegerVT(Context&: State.getContext(),
1286 BitWidth: MemoryBits / NumElements);
1287 MemVT = EVT::getVectorVT(Context&: State.getContext(), VT: ScalarVT, NumElements);
1288 } else {
1289 llvm_unreachable("cannot deduce memory type.");
1290 }
1291 }
1292
1293 // Convert one element vectors to scalar.
1294 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1295 MemVT = MemVT.getScalarType();
1296
1297 // Round up vec3/vec5 argument.
1298 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1299 MemVT = MemVT.getPow2VectorType(Context&: State.getContext());
1300 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1301 MemVT = MemVT.getRoundIntegerType(Context&: State.getContext());
1302 }
1303
1304 unsigned PartOffset = 0;
1305 for (unsigned i = 0; i != NumRegs; ++i) {
1306 State.addLoc(V: CCValAssign::getCustomMem(ValNo: InIndex++, ValVT: RegisterVT,
1307 Offset: BasePartOffset + PartOffset,
1308 LocVT: MemVT.getSimpleVT(),
1309 HTP: CCValAssign::Full));
1310 PartOffset += MemVT.getStoreSize();
1311 }
1312 }
1313 }
1314}
1315
1316SDValue AMDGPUTargetLowering::LowerReturn(
1317 SDValue Chain, CallingConv::ID CallConv,
1318 bool isVarArg,
1319 const SmallVectorImpl<ISD::OutputArg> &Outs,
1320 const SmallVectorImpl<SDValue> &OutVals,
1321 const SDLoc &DL, SelectionDAG &DAG) const {
1322 // FIXME: Fails for r600 tests
1323 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1324 // "wave terminate should not have return values");
1325 return DAG.getNode(Opcode: AMDGPUISD::ENDPGM, DL, VT: MVT::Other, Operand: Chain);
1326}
1327
1328//===---------------------------------------------------------------------===//
1329// Target specific lowering
1330//===---------------------------------------------------------------------===//
1331
1332/// Selects the correct CCAssignFn for a given CallingConvention value.
1333CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1334 bool IsVarArg) {
1335 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1336}
1337
1338CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1339 bool IsVarArg) {
1340 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1341}
1342
1343SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1344 SelectionDAG &DAG,
1345 MachineFrameInfo &MFI,
1346 int ClobberedFI) const {
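// Collect the chains of loads already emitted from stack slots that overlap
// the slot about to be clobbered (ClobberedFI), so that a store to that slot
// cannot be reordered ahead of those loads.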
1347 SmallVector<SDValue, 8> ArgChains;
1348 int64_t FirstByte = MFI.getObjectOffset(ObjectIdx: ClobberedFI);
1349 int64_t LastByte = FirstByte + MFI.getObjectSize(ObjectIdx: ClobberedFI) - 1;
1350
1351 // Include the original chain at the beginning of the list. When this is
1352 // used by target LowerCall hooks, this helps legalize find the
1353 // CALLSEQ_BEGIN node.
1354 ArgChains.push_back(Elt: Chain);
1355
// Add a chain value for each load of an incoming stack argument that
// overlaps the clobbered frame object.
1357 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1358 if (LoadSDNode *L = dyn_cast<LoadSDNode>(Val: U)) {
1359 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: L->getBasePtr())) {
1360 if (FI->getIndex() < 0) {
1361 int64_t InFirstByte = MFI.getObjectOffset(ObjectIdx: FI->getIndex());
1362 int64_t InLastByte = InFirstByte;
1363 InLastByte += MFI.getObjectSize(ObjectIdx: FI->getIndex()) - 1;
1364
1365 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1366 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1367 ArgChains.push_back(Elt: SDValue(L, 1));
1368 }
1369 }
1370 }
1371 }
1372
1373 // Build a tokenfactor for all the chains.
1374 return DAG.getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ArgChains);
1375}
1376
1377SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1378 SmallVectorImpl<SDValue> &InVals,
1379 StringRef Reason) const {
1380 SDValue Callee = CLI.Callee;
1381 SelectionDAG &DAG = CLI.DAG;
1382
1383 const Function &Fn = DAG.getMachineFunction().getFunction();
1384
1385 StringRef FuncName("<unknown>");
1386
1387 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Val&: Callee))
1388 FuncName = G->getSymbol();
1389 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee))
1390 FuncName = G->getGlobal()->getName();
1391
1392 DAG.getContext()->diagnose(
1393 DI: DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1394
1395 if (!CLI.IsTailCall) {
1396 for (ISD::InputArg &Arg : CLI.Ins)
1397 InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
1398 }
1399
1400 return DAG.getEntryNode();
1401}
1402
1403SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1404 SmallVectorImpl<SDValue> &InVals) const {
1405 return lowerUnhandledCall(CLI, InVals, Reason: "unsupported call to function ");
1406}
1407
1408SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1409 SelectionDAG &DAG) const {
1410 const Function &Fn = DAG.getMachineFunction().getFunction();
1411
1412 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
1413 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
1414 auto Ops = {DAG.getConstant(Val: 0, DL: SDLoc(), VT: Op.getValueType()), Op.getOperand(i: 0)};
1415 return DAG.getMergeValues(Ops, dl: SDLoc());
1416}
1417
1418SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1419 SelectionDAG &DAG) const {
1420 switch (Op.getOpcode()) {
1421 default:
1422 Op->print(OS&: errs(), G: &DAG);
1423 llvm_unreachable("Custom lowering code for this "
1424 "instruction is not implemented yet!");
1425 break;
1426 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1427 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1428 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1429 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1430 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1431 case ISD::FREM: return LowerFREM(Op, DAG);
1432 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1433 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1434 case ISD::FRINT: return LowerFRINT(Op, DAG);
1435 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1436 case ISD::FROUNDEVEN:
1437 return LowerFROUNDEVEN(Op, DAG);
1438 case ISD::FROUND: return LowerFROUND(Op, DAG);
1439 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1440 case ISD::FLOG2:
1441 return LowerFLOG2(Op, DAG);
1442 case ISD::FLOG:
1443 case ISD::FLOG10:
1444 return LowerFLOGCommon(Op, DAG);
1445 case ISD::FEXP:
1446 case ISD::FEXP10:
1447 return lowerFEXP(Op, DAG);
1448 case ISD::FEXP2:
1449 return lowerFEXP2(Op, DAG);
1450 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1451 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1452 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1453 case ISD::FP_TO_SINT:
1454 case ISD::FP_TO_UINT:
1455 return LowerFP_TO_INT(Op, DAG);
1456 case ISD::CTTZ:
1457 case ISD::CTTZ_ZERO_UNDEF:
1458 case ISD::CTLZ:
1459 case ISD::CTLZ_ZERO_UNDEF:
1460 return LowerCTLZ_CTTZ(Op, DAG);
1461 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1462 }
1463 return Op;
1464}
1465
1466void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1467 SmallVectorImpl<SDValue> &Results,
1468 SelectionDAG &DAG) const {
1469 switch (N->getOpcode()) {
1470 case ISD::SIGN_EXTEND_INREG:
1471 // Different parts of legalization seem to interpret which type of
1472 // sign_extend_inreg is the one to check for custom lowering. The extended
1473 // from type is what really matters, but some places check for custom
1474 // lowering of the result type. This results in trying to use
1475 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1476 // nothing here and let the illegal result integer be handled normally.
1477 return;
1478 case ISD::FLOG2:
1479 if (SDValue Lowered = LowerFLOG2(Op: SDValue(N, 0), DAG))
1480 Results.push_back(Elt: Lowered);
1481 return;
1482 case ISD::FLOG:
1483 case ISD::FLOG10:
1484 if (SDValue Lowered = LowerFLOGCommon(Op: SDValue(N, 0), DAG))
1485 Results.push_back(Elt: Lowered);
1486 return;
1487 case ISD::FEXP2:
1488 if (SDValue Lowered = lowerFEXP2(Op: SDValue(N, 0), DAG))
1489 Results.push_back(Elt: Lowered);
1490 return;
1491 case ISD::FEXP:
1492 case ISD::FEXP10:
1493 if (SDValue Lowered = lowerFEXP(Op: SDValue(N, 0), DAG))
1494 Results.push_back(Elt: Lowered);
1495 return;
1496 case ISD::CTLZ:
1497 case ISD::CTLZ_ZERO_UNDEF:
1498 if (auto Lowered = lowerCTLZResults(Op: SDValue(N, 0u), DAG))
1499 Results.push_back(Elt: Lowered);
1500 return;
1501 default:
1502 return;
1503 }
1504}
1505
1506SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1507 SDValue Op,
1508 SelectionDAG &DAG) const {
1509
1510 const DataLayout &DL = DAG.getDataLayout();
1511 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Val&: Op);
1512 const GlobalValue *GV = G->getGlobal();
1513
1514 if (!MFI->isModuleEntryFunction()) {
1515 if (std::optional<uint32_t> Address =
1516 AMDGPUMachineFunction::getLDSAbsoluteAddress(GV: *GV)) {
1517 return DAG.getConstant(Val: *Address, DL: SDLoc(Op), VT: Op.getValueType());
1518 }
1519 }
1520
1521 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1522 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1523 if (!MFI->isModuleEntryFunction() &&
1524 GV->getName() != "llvm.amdgcn.module.lds" &&
1525 !AMDGPU::isNamedBarrier(GV: *cast<GlobalVariable>(Val: GV))) {
1526 SDLoc DL(Op);
1527 const Function &Fn = DAG.getMachineFunction().getFunction();
1528 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
1529 Fn, "local memory global used by non-kernel function",
1530 DL.getDebugLoc(), DS_Warning));
1531
1532 // We currently don't have a way to correctly allocate LDS objects that
1533 // aren't directly associated with a kernel. We do force inlining of
1534 // functions that use local objects. However, if these dead functions are
1535 // not eliminated, we don't want a compile time error. Just emit a warning
1536 // and a trap, since there should be no callable path here.
1537 SDValue Trap = DAG.getNode(Opcode: ISD::TRAP, DL, VT: MVT::Other, Operand: DAG.getEntryNode());
1538 SDValue OutputChain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other,
1539 N1: Trap, N2: DAG.getRoot());
1540 DAG.setRoot(OutputChain);
1541 return DAG.getPOISON(VT: Op.getValueType());
1542 }
1543
1544 // XXX: What does the value of G->getOffset() mean?
1545 assert(G->getOffset() == 0 &&
"Do not know what to do with a non-zero offset");
1547
1548 // TODO: We could emit code to handle the initialization somewhere.
1549 // We ignore the initializer for now and legalize it to allow selection.
// The initializer will be diagnosed as an error during assembly emission
// anyway.
1551 unsigned Offset = MFI->allocateLDSGlobal(DL, GV: *cast<GlobalVariable>(Val: GV));
1552 return DAG.getConstant(Val: Offset, DL: SDLoc(Op), VT: Op.getValueType());
1553 }
1554 return SDValue();
1555}
1556
1557SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1558 SelectionDAG &DAG) const {
1559 SmallVector<SDValue, 8> Args;
1560 SDLoc SL(Op);
1561
1562 EVT VT = Op.getValueType();
1563 if (VT.getVectorElementType().getSizeInBits() < 32) {
1564 unsigned OpBitSize = Op.getOperand(i: 0).getValueType().getSizeInBits();
1565 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
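// Bitcast each operand to i32 (or a vector of i32), gather those 32-bit
// pieces, and bitcast the combined build_vector back to VT, so the
// concatenation is done with 32-bit elements.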
1566 unsigned NewNumElt = OpBitSize / 32;
1567 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1568 : EVT::getVectorVT(Context&: *DAG.getContext(),
1569 VT: MVT::i32, NumElements: NewNumElt);
1570 for (const SDUse &U : Op->ops()) {
1571 SDValue In = U.get();
1572 SDValue NewIn = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewEltVT, Operand: In);
1573 if (NewNumElt > 1)
1574 DAG.ExtractVectorElements(Op: NewIn, Args);
1575 else
1576 Args.push_back(Elt: NewIn);
1577 }
1578
1579 EVT NewVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
1580 NumElements: NewNumElt * Op.getNumOperands());
1581 SDValue BV = DAG.getBuildVector(VT: NewVT, DL: SL, Ops: Args);
1582 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: BV);
1583 }
1584 }
1585
1586 for (const SDUse &U : Op->ops())
1587 DAG.ExtractVectorElements(Op: U.get(), Args);
1588
1589 return DAG.getBuildVector(VT: Op.getValueType(), DL: SL, Ops: Args);
1590}
1591
1592SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1593 SelectionDAG &DAG) const {
1594 SDLoc SL(Op);
1595 SmallVector<SDValue, 8> Args;
1596 unsigned Start = Op.getConstantOperandVal(i: 1);
1597 EVT VT = Op.getValueType();
1598 EVT SrcVT = Op.getOperand(i: 0).getValueType();
1599
1600 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1601 unsigned NumElt = VT.getVectorNumElements();
1602 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1603 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1604
1605 // Extract 32-bit registers at a time.
1606 EVT NewSrcVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: NumSrcElt / 2);
1607 EVT NewVT = NumElt == 2
1608 ? MVT::i32
1609 : EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: NumElt / 2);
1610 SDValue Tmp = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewSrcVT, Operand: Op.getOperand(i: 0));
1611
1612 DAG.ExtractVectorElements(Op: Tmp, Args, Start: Start / 2, Count: NumElt / 2);
1613 if (NumElt == 2)
1614 Tmp = Args[0];
1615 else
1616 Tmp = DAG.getBuildVector(VT: NewVT, DL: SL, Ops: Args);
1617
1618 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Tmp);
1619 }
1620
1621 DAG.ExtractVectorElements(Op: Op.getOperand(i: 0), Args, Start,
1622 Count: VT.getVectorNumElements());
1623
1624 return DAG.getBuildVector(VT: Op.getValueType(), DL: SL, Ops: Args);
1625}
1626
1627// TODO: Handle fabs too
1628static SDValue peekFNeg(SDValue Val) {
1629 if (Val.getOpcode() == ISD::FNEG)
1630 return Val.getOperand(i: 0);
1631
1632 return Val;
1633}
1634
1635static SDValue peekFPSignOps(SDValue Val) {
1636 if (Val.getOpcode() == ISD::FNEG)
1637 Val = Val.getOperand(i: 0);
1638 if (Val.getOpcode() == ISD::FABS)
1639 Val = Val.getOperand(i: 0);
1640 if (Val.getOpcode() == ISD::FCOPYSIGN)
1641 Val = Val.getOperand(i: 0);
1642 return Val;
1643}
1644
1645SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
1646 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1647 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1648 SelectionDAG &DAG = DCI.DAG;
1649 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val&: CC)->get();
1650 switch (CCOpcode) {
1651 case ISD::SETOEQ:
1652 case ISD::SETONE:
1653 case ISD::SETUNE:
1654 case ISD::SETNE:
1655 case ISD::SETUEQ:
1656 case ISD::SETEQ:
1657 case ISD::SETFALSE:
1658 case ISD::SETFALSE2:
1659 case ISD::SETTRUE:
1660 case ISD::SETTRUE2:
1661 case ISD::SETUO:
1662 case ISD::SETO:
1663 break;
1664 case ISD::SETULE:
1665 case ISD::SETULT: {
1666 if (LHS == True)
1667 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: RHS, N2: LHS);
1668 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: LHS, N2: RHS);
1669 }
1670 case ISD::SETOLE:
1671 case ISD::SETOLT:
1672 case ISD::SETLE:
1673 case ISD::SETLT: {
1674 // Ordered. Assume ordered for undefined.
1675
1676 // Only do this after legalization to avoid interfering with other combines
1677 // which might occur.
1678 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1679 !DCI.isCalledByLegalizer())
1680 return SDValue();
1681
// We need to permute the operands to get the correct NaN behavior: the legacy
// min/max ops return their second operand when the compare fails on a NaN, so
// pick the operand order to match the compare the hardware performs.
1685 if (LHS == True)
1686 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: LHS, N2: RHS);
1687 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: RHS, N2: LHS);
1688 }
1689 case ISD::SETUGE:
1690 case ISD::SETUGT: {
1691 if (LHS == True)
1692 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: RHS, N2: LHS);
1693 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: LHS, N2: RHS);
1694 }
1695 case ISD::SETGT:
1696 case ISD::SETGE:
1697 case ISD::SETOGE:
1698 case ISD::SETOGT: {
1699 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1700 !DCI.isCalledByLegalizer())
1701 return SDValue();
1702
1703 if (LHS == True)
1704 return DAG.getNode(Opcode: AMDGPUISD::FMAX_LEGACY, DL, VT, N1: LHS, N2: RHS);
1705 return DAG.getNode(Opcode: AMDGPUISD::FMIN_LEGACY, DL, VT, N1: RHS, N2: LHS);
1706 }
1707 case ISD::SETCC_INVALID:
1708 llvm_unreachable("Invalid setcc condcode!");
1709 }
1710 return SDValue();
1711}
1712
1713/// Generate Min/Max node
1714SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1715 SDValue LHS, SDValue RHS,
1716 SDValue True, SDValue False,
1717 SDValue CC,
1718 DAGCombinerInfo &DCI) const {
1719 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1720 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1721
1722 SelectionDAG &DAG = DCI.DAG;
1723
1724 // If we can't directly match this, try to see if we can fold an fneg to
1725 // match.
1726
1727 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
1728 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(Val&: False);
1729 SDValue NegTrue = peekFNeg(Val: True);
1730
1731 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1732 // fmin/fmax.
1733 //
1734 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1735 // -> fneg (fmin_legacy lhs, K)
1736 //
1737 // TODO: Use getNegatedExpression
1738 if (LHS == NegTrue && CFalse && CRHS) {
1739 APFloat NegRHS = neg(X: CRHS->getValueAPF());
1740 if (NegRHS == CFalse->getValueAPF()) {
1741 SDValue Combined =
1742 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True: NegTrue, False, CC, DCI);
1743 if (Combined)
1744 return DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: Combined);
1745 return SDValue();
1746 }
1747 }
1748
1749 return SDValue();
1750}
1751
1752std::pair<SDValue, SDValue>
1753AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1754 SDLoc SL(Op);
1755
1756 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1757
1758 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
1759 const SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
1760
1761 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: Zero);
1762 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: One);
1763
1764 return std::pair(Lo, Hi);
1765}
1766
1767SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1768 SDLoc SL(Op);
1769
1770 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1771 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
1772 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: Zero);
1773}
1774
1775SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1776 SDLoc SL(Op);
1777
1778 SDValue Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op);
1779 const SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
1780 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Vec, N2: One);
1781}
1782
// Split a vector type into two parts. The first part has a power-of-two
// number of elements.
1784// The second part is whatever is left over, and is a scalar if it would
1785// otherwise be a 1-vector.
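// For example, a 3-element vector splits into <2 x T> plus a T scalar, a
// 5-element vector into <4 x T> plus a T scalar, and a 6-element vector into
// <4 x T> plus <2 x T>.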
1786std::pair<EVT, EVT>
1787AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1788 EVT LoVT, HiVT;
1789 EVT EltVT = VT.getVectorElementType();
1790 unsigned NumElts = VT.getVectorNumElements();
1791 unsigned LoNumElts = PowerOf2Ceil(A: (NumElts + 1) / 2);
1792 LoVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: LoNumElts);
1793 HiVT = NumElts - LoNumElts == 1
1794 ? EltVT
1795 : EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NumElts - LoNumElts);
1796 return std::pair(LoVT, HiVT);
1797}
1798
1799// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1800// scalar.
1801std::pair<SDValue, SDValue>
1802AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1803 const EVT &LoVT, const EVT &HiVT,
1804 SelectionDAG &DAG) const {
1805 assert(LoVT.getVectorNumElements() +
1806 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1807 N.getValueType().getVectorNumElements() &&
1808 "More vector elements requested than available!");
1809 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: LoVT, N1: N,
1810 N2: DAG.getVectorIdxConstant(Val: 0, DL));
1811 SDValue Hi = DAG.getNode(
1812 Opcode: HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1813 VT: HiVT, N1: N, N2: DAG.getVectorIdxConstant(Val: LoVT.getVectorNumElements(), DL));
1814 return std::pair(Lo, Hi);
1815}
1816
1817SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1818 SelectionDAG &DAG) const {
1819 LoadSDNode *Load = cast<LoadSDNode>(Val: Op);
1820 EVT VT = Op.getValueType();
1821 SDLoc SL(Op);
1822
1823
1824 // If this is a 2 element vector, we really want to scalarize and not create
1825 // weird 1 element vectors.
1826 if (VT.getVectorNumElements() == 2) {
1827 SDValue Ops[2];
1828 std::tie(args&: Ops[0], args&: Ops[1]) = scalarizeVectorLoad(LD: Load, DAG);
1829 return DAG.getMergeValues(Ops, dl: SL);
1830 }
1831
1832 SDValue BasePtr = Load->getBasePtr();
1833 EVT MemVT = Load->getMemoryVT();
1834
1835 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1836
1837 EVT LoVT, HiVT;
1838 EVT LoMemVT, HiMemVT;
1839 SDValue Lo, Hi;
1840
1841 std::tie(args&: LoVT, args&: HiVT) = getSplitDestVTs(VT, DAG);
1842 std::tie(args&: LoMemVT, args&: HiMemVT) = getSplitDestVTs(VT: MemVT, DAG);
1843 std::tie(args&: Lo, args&: Hi) = splitVector(N: Op, DL: SL, LoVT, HiVT, DAG);
1844
1845 unsigned Size = LoMemVT.getStoreSize();
1846 Align BaseAlign = Load->getAlign();
1847 Align HiAlign = commonAlignment(A: BaseAlign, Offset: Size);
1848
1849 SDValue LoLoad = DAG.getExtLoad(ExtType: Load->getExtensionType(), dl: SL, VT: LoVT,
1850 Chain: Load->getChain(), Ptr: BasePtr, PtrInfo: SrcValue, MemVT: LoMemVT,
1851 Alignment: BaseAlign, MMOFlags: Load->getMemOperand()->getFlags());
1852 SDValue HiPtr = DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Size));
1853 SDValue HiLoad =
1854 DAG.getExtLoad(ExtType: Load->getExtensionType(), dl: SL, VT: HiVT, Chain: Load->getChain(),
1855 Ptr: HiPtr, PtrInfo: SrcValue.getWithOffset(O: LoMemVT.getStoreSize()),
1856 MemVT: HiMemVT, Alignment: HiAlign, MMOFlags: Load->getMemOperand()->getFlags());
1857
1858 SDValue Join;
1859 if (LoVT == HiVT) {
// This is the case where the vector element count is a power of two, so it
// was split evenly.
1861 Join = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT, N1: LoLoad, N2: HiLoad);
1862 } else {
1863 Join = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL: SL, VT, N1: DAG.getPOISON(VT), N2: LoLoad,
1864 N3: DAG.getVectorIdxConstant(Val: 0, DL: SL));
1865 Join = DAG.getNode(
1866 Opcode: HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, DL: SL,
1867 VT, N1: Join, N2: HiLoad,
1868 N3: DAG.getVectorIdxConstant(Val: LoVT.getVectorNumElements(), DL: SL));
1869 }
1870
1871 SDValue Ops[] = {Join, DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other,
1872 N1: LoLoad.getValue(R: 1), N2: HiLoad.getValue(R: 1))};
1873
1874 return DAG.getMergeValues(Ops, dl: SL);
1875}
1876
1877SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1878 SelectionDAG &DAG) const {
1879 LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
1880 EVT VT = Op.getValueType();
1881 SDValue BasePtr = Load->getBasePtr();
1882 EVT MemVT = Load->getMemoryVT();
1883 SDLoc SL(Op);
1884 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1885 Align BaseAlign = Load->getAlign();
1886 unsigned NumElements = MemVT.getVectorNumElements();
1887
1888 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1889 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1890 if (NumElements != 3 ||
1891 (BaseAlign < Align(8) &&
1892 !SrcValue.isDereferenceable(Size: 16, C&: *DAG.getContext(), DL: DAG.getDataLayout())))
1893 return SplitVectorLoad(Op, DAG);
1894
1895 assert(NumElements == 3);
1896
1897 EVT WideVT =
1898 EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: 4);
1899 EVT WideMemVT =
1900 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(), NumElements: 4);
1901 SDValue WideLoad = DAG.getExtLoad(
1902 ExtType: Load->getExtensionType(), dl: SL, VT: WideVT, Chain: Load->getChain(), Ptr: BasePtr, PtrInfo: SrcValue,
1903 MemVT: WideMemVT, Alignment: BaseAlign, MMOFlags: Load->getMemOperand()->getFlags());
1904 return DAG.getMergeValues(
1905 Ops: {DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT, N1: WideLoad,
1906 N2: DAG.getVectorIdxConstant(Val: 0, DL: SL)),
1907 WideLoad.getValue(R: 1)},
1908 dl: SL);
1909}
1910
1911SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1912 SelectionDAG &DAG) const {
1913 StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
1914 SDValue Val = Store->getValue();
1915 EVT VT = Val.getValueType();
1916
1917 // If this is a 2 element vector, we really want to scalarize and not create
1918 // weird 1 element vectors.
1919 if (VT.getVectorNumElements() == 2)
1920 return scalarizeVectorStore(ST: Store, DAG);
1921
1922 EVT MemVT = Store->getMemoryVT();
1923 SDValue Chain = Store->getChain();
1924 SDValue BasePtr = Store->getBasePtr();
1925 SDLoc SL(Op);
1926
1927 EVT LoVT, HiVT;
1928 EVT LoMemVT, HiMemVT;
1929 SDValue Lo, Hi;
1930
1931 std::tie(args&: LoVT, args&: HiVT) = getSplitDestVTs(VT, DAG);
1932 std::tie(args&: LoMemVT, args&: HiMemVT) = getSplitDestVTs(VT: MemVT, DAG);
1933 std::tie(args&: Lo, args&: Hi) = splitVector(N: Val, DL: SL, LoVT, HiVT, DAG);
1934
1935 SDValue HiPtr = DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: LoMemVT.getStoreSize());
1936
1937 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1938 Align BaseAlign = Store->getAlign();
1939 unsigned Size = LoMemVT.getStoreSize();
1940 Align HiAlign = commonAlignment(A: BaseAlign, Offset: Size);
1941
1942 SDValue LoStore =
1943 DAG.getTruncStore(Chain, dl: SL, Val: Lo, Ptr: BasePtr, PtrInfo: SrcValue, SVT: LoMemVT, Alignment: BaseAlign,
1944 MMOFlags: Store->getMemOperand()->getFlags());
1945 SDValue HiStore =
1946 DAG.getTruncStore(Chain, dl: SL, Val: Hi, Ptr: HiPtr, PtrInfo: SrcValue.getWithOffset(O: Size),
1947 SVT: HiMemVT, Alignment: HiAlign, MMOFlags: Store->getMemOperand()->getFlags());
1948
1949 return DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: LoStore, N2: HiStore);
1950}
1951
1952// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The 24-bit significand
// of an f32 is enough to represent integers of up to 24 bits exactly.
1955SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1956 bool Sign) const {
1957 SDLoc DL(Op);
1958 EVT VT = Op.getValueType();
1959 SDValue LHS = Op.getOperand(i: 0);
1960 SDValue RHS = Op.getOperand(i: 1);
1961 MVT IntVT = MVT::i32;
1962 MVT FltVT = MVT::f32;
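// Require at least 9 sign bits on each operand so that, for an i32 divide, at
// most 24 bits (including the sign) are significant, which the 24-bit f32
// significand can represent exactly.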
1963
1964 unsigned LHSSignBits = DAG.ComputeNumSignBits(Op: LHS);
1965 if (LHSSignBits < 9)
1966 return SDValue();
1967
1968 unsigned RHSSignBits = DAG.ComputeNumSignBits(Op: RHS);
1969 if (RHSSignBits < 9)
1970 return SDValue();
1971
1972 unsigned BitSize = VT.getSizeInBits();
1973 unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
1974 unsigned DivBits = BitSize - SignBits;
1975 if (Sign)
1976 ++DivBits;
1977
1978 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1979 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1980
1981 SDValue jq = DAG.getConstant(Val: 1, DL, VT: IntVT);
1982
1983 if (Sign) {
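// jq evaluates to +1 when the operand signs match and to -1 when they differ;
// it is the one-step quotient correction conditionally applied further below.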
1984 // char|short jq = ia ^ ib;
1985 jq = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHS, N2: RHS);
1986
1987 // jq = jq >> (bitsize - 2)
1988 jq = DAG.getNode(Opcode: ISD::SRA, DL, VT, N1: jq,
1989 N2: DAG.getConstant(Val: BitSize - 2, DL, VT));
1990
1991 // jq = jq | 0x1
1992 jq = DAG.getNode(Opcode: ISD::OR, DL, VT, N1: jq, N2: DAG.getConstant(Val: 1, DL, VT));
1993 }
1994
1995 // int ia = (int)LHS;
1996 SDValue ia = LHS;
1997
// int ib = (int)RHS;
1999 SDValue ib = RHS;
2000
2001 // float fa = (float)ia;
2002 SDValue fa = DAG.getNode(Opcode: ToFp, DL, VT: FltVT, Operand: ia);
2003
2004 // float fb = (float)ib;
2005 SDValue fb = DAG.getNode(Opcode: ToFp, DL, VT: FltVT, Operand: ib);
2006
2007 SDValue fq = DAG.getNode(Opcode: ISD::FMUL, DL, VT: FltVT,
2008 N1: fa, N2: DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT: FltVT, Operand: fb));
2009
2010 // fq = trunc(fq);
2011 fq = DAG.getNode(Opcode: ISD::FTRUNC, DL, VT: FltVT, Operand: fq);
2012
2013 // float fqneg = -fq;
2014 SDValue fqneg = DAG.getNode(Opcode: ISD::FNEG, DL, VT: FltVT, Operand: fq);
2015
2016 MachineFunction &MF = DAG.getMachineFunction();
2017
2018 bool UseFmadFtz = false;
2019 if (Subtarget->isGCN()) {
2020 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2021 UseFmadFtz =
2022 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
2023 }
2024
2025 // float fr = mad(fqneg, fb, fa);
2026 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2027 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2028 : (unsigned)ISD::FMAD;
2029 SDValue fr = DAG.getNode(Opcode: OpCode, DL, VT: FltVT, N1: fqneg, N2: fb, N3: fa);
2030
2031 // int iq = (int)fq;
2032 SDValue iq = DAG.getNode(Opcode: ToInt, DL, VT: IntVT, Operand: fq);
2033
2034 // fr = fabs(fr);
2035 fr = DAG.getNode(Opcode: ISD::FABS, DL, VT: FltVT, Operand: fr);
2036
2037 // fb = fabs(fb);
2038 fb = DAG.getNode(Opcode: ISD::FABS, DL, VT: FltVT, Operand: fb);
2039
2040 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2041
2042 // int cv = fr >= fb;
2043 SDValue cv = DAG.getSetCC(DL, VT: SetCCVT, LHS: fr, RHS: fb, Cond: ISD::SETOGE);
2044
2045 // jq = (cv ? jq : 0);
2046 jq = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: cv, N2: jq, N3: DAG.getConstant(Val: 0, DL, VT));
2047
2048 // dst = iq + jq;
2049 SDValue Div = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: iq, N2: jq);
2050
// Rem needs compensation; it's easier to recompute it:
2052 SDValue Rem = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Div, N2: RHS);
2053 Rem = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: LHS, N2: Rem);
2054
// Truncate to the number of bits this divide actually produces.
2056 if (Sign) {
2057 SDValue InRegSize
2058 = DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: DivBits));
2059 Div = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Div, N2: InRegSize);
2060 Rem = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT, N1: Rem, N2: InRegSize);
2061 } else {
2062 SDValue TruncMask = DAG.getConstant(Val: (UINT64_C(1) << DivBits) - 1, DL, VT);
2063 Div = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Div, N2: TruncMask);
2064 Rem = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Rem, N2: TruncMask);
2065 }
2066
2067 return DAG.getMergeValues(Ops: { Div, Rem }, dl: DL);
2068}
2069
2070void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2071 SelectionDAG &DAG,
2072 SmallVectorImpl<SDValue> &Results) const {
2073 SDLoc DL(Op);
2074 EVT VT = Op.getValueType();
2075
2076 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2077
2078 EVT HalfVT = VT.getHalfSizedIntegerVT(Context&: *DAG.getContext());
2079
2080 SDValue One = DAG.getConstant(Val: 1, DL, VT: HalfVT);
2081 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: HalfVT);
2082
// HiLo split
2084 SDValue LHS_Lo, LHS_Hi;
2085 SDValue LHS = Op.getOperand(i: 0);
2086 std::tie(args&: LHS_Lo, args&: LHS_Hi) = DAG.SplitScalar(N: LHS, DL, LoVT: HalfVT, HiVT: HalfVT);
2087
2088 SDValue RHS_Lo, RHS_Hi;
2089 SDValue RHS = Op.getOperand(i: 1);
2090 std::tie(args&: RHS_Lo, args&: RHS_Hi) = DAG.SplitScalar(N: RHS, DL, LoVT: HalfVT, HiVT: HalfVT);
2091
2092 if (DAG.MaskedValueIsZero(Op: RHS, Mask: APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32)) &&
2093 DAG.MaskedValueIsZero(Op: LHS, Mask: APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32))) {
2094
2095 SDValue Res = DAG.getNode(Opcode: ISD::UDIVREM, DL, VTList: DAG.getVTList(VT1: HalfVT, VT2: HalfVT),
2096 N1: LHS_Lo, N2: RHS_Lo);
2097
2098 SDValue DIV = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Res.getValue(R: 0), Zero});
2099 SDValue REM = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Res.getValue(R: 1), Zero});
2100
2101 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: DIV));
2102 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: REM));
2103 return;
2104 }
2105
2106 if (isTypeLegal(VT: MVT::i64)) {
2107 // The algorithm here is based on ideas from "Software Integer Division",
2108 // Tom Rodeheffer, August 2008.
2109
2110 MachineFunction &MF = DAG.getMachineFunction();
2111 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2112
2113 // Compute denominator reciprocal.
2114 unsigned FMAD =
2115 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2116 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2117 ? (unsigned)ISD::FMAD
2118 : (unsigned)AMDGPUISD::FMAD_FTZ;
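// The f32 constants below are bit patterns for powers of two: 0x4f800000 is
// 2^32, 0x2f800000 is 2^-32 and 0xcf800000 is -2^32; 0x5f7ffffc is just below
// 2^64. Together they effectively form a first estimate of 2^64 / RHS, split
// into Rcp_Hi:Rcp_Lo, which the Newton-Raphson rounds below then refine.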
2119
2120 SDValue Cvt_Lo = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: MVT::f32, Operand: RHS_Lo);
2121 SDValue Cvt_Hi = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: MVT::f32, Operand: RHS_Hi);
2122 SDValue Mad1 = DAG.getNode(Opcode: FMAD, DL, VT: MVT::f32, N1: Cvt_Hi,
2123 N2: DAG.getConstantFP(Val: APInt(32, 0x4f800000).bitsToFloat(), DL, VT: MVT::f32),
2124 N3: Cvt_Lo);
2125 SDValue Rcp = DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT: MVT::f32, Operand: Mad1);
2126 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f32, N1: Rcp,
2127 N2: DAG.getConstantFP(Val: APInt(32, 0x5f7ffffc).bitsToFloat(), DL, VT: MVT::f32));
2128 SDValue Mul2 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f32, N1: Mul1,
2129 N2: DAG.getConstantFP(Val: APInt(32, 0x2f800000).bitsToFloat(), DL, VT: MVT::f32));
2130 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL, VT: MVT::f32, Operand: Mul2);
2131 SDValue Mad2 = DAG.getNode(Opcode: FMAD, DL, VT: MVT::f32, N1: Trunc,
2132 N2: DAG.getConstantFP(Val: APInt(32, 0xcf800000).bitsToFloat(), DL, VT: MVT::f32),
2133 N3: Mul1);
2134 SDValue Rcp_Lo = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL, VT: HalfVT, Operand: Mad2);
2135 SDValue Rcp_Hi = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL, VT: HalfVT, Operand: Trunc);
2136 SDValue Rcp64 = DAG.getBitcast(VT,
2137 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Rcp_Lo, Rcp_Hi}));
2138
2139 SDValue Zero64 = DAG.getConstant(Val: 0, DL, VT);
2140 SDValue One64 = DAG.getConstant(Val: 1, DL, VT);
2141 SDValue Zero1 = DAG.getConstant(Val: 0, DL, VT: MVT::i1);
2142 SDVTList HalfCarryVT = DAG.getVTList(VT1: HalfVT, VT2: MVT::i1);
2143
2144 // First round of UNR (Unsigned integer Newton-Raphson).
2145 SDValue Neg_RHS = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero64, N2: RHS);
2146 SDValue Mullo1 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Neg_RHS, N2: Rcp64);
2147 SDValue Mulhi1 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Rcp64, N2: Mullo1);
2148 SDValue Mulhi1_Lo, Mulhi1_Hi;
2149 std::tie(args&: Mulhi1_Lo, args&: Mulhi1_Hi) =
2150 DAG.SplitScalar(N: Mulhi1, DL, LoVT: HalfVT, HiVT: HalfVT);
2151 SDValue Add1_Lo = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Rcp_Lo,
2152 N2: Mulhi1_Lo, N3: Zero1);
2153 SDValue Add1_Hi = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Rcp_Hi,
2154 N2: Mulhi1_Hi, N3: Add1_Lo.getValue(R: 1));
2155 SDValue Add1 = DAG.getBitcast(VT,
2156 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Add1_Lo, Add1_Hi}));
2157
2158 // Second round of UNR.
2159 SDValue Mullo2 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Neg_RHS, N2: Add1);
2160 SDValue Mulhi2 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Add1, N2: Mullo2);
2161 SDValue Mulhi2_Lo, Mulhi2_Hi;
2162 std::tie(args&: Mulhi2_Lo, args&: Mulhi2_Hi) =
2163 DAG.SplitScalar(N: Mulhi2, DL, LoVT: HalfVT, HiVT: HalfVT);
2164 SDValue Add2_Lo = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Add1_Lo,
2165 N2: Mulhi2_Lo, N3: Zero1);
2166 SDValue Add2_Hi = DAG.getNode(Opcode: ISD::UADDO_CARRY, DL, VTList: HalfCarryVT, N1: Add1_Hi,
2167 N2: Mulhi2_Hi, N3: Add2_Lo.getValue(R: 1));
2168 SDValue Add2 = DAG.getBitcast(VT,
2169 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Add2_Lo, Add2_Hi}));
2170
2171 SDValue Mulhi3 = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: LHS, N2: Add2);
2172
2173 SDValue Mul3 = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: RHS, N2: Mulhi3);
2174
2175 SDValue Mul3_Lo, Mul3_Hi;
2176 std::tie(args&: Mul3_Lo, args&: Mul3_Hi) = DAG.SplitScalar(N: Mul3, DL, LoVT: HalfVT, HiVT: HalfVT);
2177 SDValue Sub1_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: LHS_Lo,
2178 N2: Mul3_Lo, N3: Zero1);
2179 SDValue Sub1_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: LHS_Hi,
2180 N2: Mul3_Hi, N3: Sub1_Lo.getValue(R: 1));
2181 SDValue Sub1_Mi = DAG.getNode(Opcode: ISD::SUB, DL, VT: HalfVT, N1: LHS_Hi, N2: Mul3_Hi);
2182 SDValue Sub1 = DAG.getBitcast(VT,
2183 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub1_Lo, Sub1_Hi}));
2184
2185 SDValue MinusOne = DAG.getConstant(Val: 0xffffffffu, DL, VT: HalfVT);
2186 SDValue C1 = DAG.getSelectCC(DL, LHS: Sub1_Hi, RHS: RHS_Hi, True: MinusOne, False: Zero,
2187 Cond: ISD::SETUGE);
2188 SDValue C2 = DAG.getSelectCC(DL, LHS: Sub1_Lo, RHS: RHS_Lo, True: MinusOne, False: Zero,
2189 Cond: ISD::SETUGE);
2190 SDValue C3 = DAG.getSelectCC(DL, LHS: Sub1_Hi, RHS: RHS_Hi, True: C2, False: C1, Cond: ISD::SETEQ);
2191
// TODO: Here and below, portions of the code could be enclosed in if/endif
// blocks. Currently the control flow is unconditional and we have 4 selects
// after the potential endif to substitute for PHIs.
2195
2196 // if C3 != 0 ...
2197 SDValue Sub2_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub1_Lo,
2198 N2: RHS_Lo, N3: Zero1);
2199 SDValue Sub2_Mi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub1_Mi,
2200 N2: RHS_Hi, N3: Sub1_Lo.getValue(R: 1));
2201 SDValue Sub2_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Mi,
2202 N2: Zero, N3: Sub2_Lo.getValue(R: 1));
2203 SDValue Sub2 = DAG.getBitcast(VT,
2204 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub2_Lo, Sub2_Hi}));
2205
2206 SDValue Add3 = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Mulhi3, N2: One64);
2207
2208 SDValue C4 = DAG.getSelectCC(DL, LHS: Sub2_Hi, RHS: RHS_Hi, True: MinusOne, False: Zero,
2209 Cond: ISD::SETUGE);
2210 SDValue C5 = DAG.getSelectCC(DL, LHS: Sub2_Lo, RHS: RHS_Lo, True: MinusOne, False: Zero,
2211 Cond: ISD::SETUGE);
2212 SDValue C6 = DAG.getSelectCC(DL, LHS: Sub2_Hi, RHS: RHS_Hi, True: C5, False: C4, Cond: ISD::SETEQ);
2213
2214 // if (C6 != 0)
2215 SDValue Add4 = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Add3, N2: One64);
2216
2217 SDValue Sub3_Lo = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Lo,
2218 N2: RHS_Lo, N3: Zero1);
2219 SDValue Sub3_Mi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub2_Mi,
2220 N2: RHS_Hi, N3: Sub2_Lo.getValue(R: 1));
2221 SDValue Sub3_Hi = DAG.getNode(Opcode: ISD::USUBO_CARRY, DL, VTList: HalfCarryVT, N1: Sub3_Mi,
2222 N2: Zero, N3: Sub3_Lo.getValue(R: 1));
2223 SDValue Sub3 = DAG.getBitcast(VT,
2224 V: DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Sub3_Lo, Sub3_Hi}));
2225
2226 // endif C6
2227 // endif C3
2228
2229 SDValue Sel1 = DAG.getSelectCC(DL, LHS: C6, RHS: Zero, True: Add4, False: Add3, Cond: ISD::SETNE);
2230 SDValue Div = DAG.getSelectCC(DL, LHS: C3, RHS: Zero, True: Sel1, False: Mulhi3, Cond: ISD::SETNE);
2231
2232 SDValue Sel2 = DAG.getSelectCC(DL, LHS: C6, RHS: Zero, True: Sub3, False: Sub2, Cond: ISD::SETNE);
2233 SDValue Rem = DAG.getSelectCC(DL, LHS: C3, RHS: Zero, True: Sel2, False: Sub1, Cond: ISD::SETNE);
2234
2235 Results.push_back(Elt: Div);
2236 Results.push_back(Elt: Rem);
2237
2238 return;
2239 }
2240
// r600 expansion.
// Get speculative values.
2243 SDValue DIV_Part = DAG.getNode(Opcode: ISD::UDIV, DL, VT: HalfVT, N1: LHS_Hi, N2: RHS_Lo);
2244 SDValue REM_Part = DAG.getNode(Opcode: ISD::UREM, DL, VT: HalfVT, N1: LHS_Hi, N2: RHS_Lo);
2245
2246 SDValue REM_Lo = DAG.getSelectCC(DL, LHS: RHS_Hi, RHS: Zero, True: REM_Part, False: LHS_Hi, Cond: ISD::SETEQ);
2247 SDValue REM = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {REM_Lo, Zero});
2248 REM = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: REM);
2249
2250 SDValue DIV_Hi = DAG.getSelectCC(DL, LHS: RHS_Hi, RHS: Zero, True: DIV_Part, False: Zero, Cond: ISD::SETEQ);
2251 SDValue DIV_Lo = Zero;
2252
2253 const unsigned halfBitWidth = HalfVT.getSizeInBits();
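// Restoring long division, one bit per iteration: shift the next bit of
// LHS_Lo into REM, and whenever REM >= RHS subtract RHS and set the
// corresponding bit of DIV_Lo.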
2254
2255 for (unsigned i = 0; i < halfBitWidth; ++i) {
2256 const unsigned bitPos = halfBitWidth - i - 1;
2257 SDValue POS = DAG.getConstant(Val: bitPos, DL, VT: HalfVT);
2258 // Get value of high bit
2259 SDValue HBit = DAG.getNode(Opcode: ISD::SRL, DL, VT: HalfVT, N1: LHS_Lo, N2: POS);
2260 HBit = DAG.getNode(Opcode: ISD::AND, DL, VT: HalfVT, N1: HBit, N2: One);
2261 HBit = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT, Operand: HBit);
2262
2263 // Shift
2264 REM = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: REM, N2: DAG.getConstant(Val: 1, DL, VT));
2265 // Add LHS high bit
2266 REM = DAG.getNode(Opcode: ISD::OR, DL, VT, N1: REM, N2: HBit);
2267
2268 SDValue BIT = DAG.getConstant(Val: 1ULL << bitPos, DL, VT: HalfVT);
2269 SDValue realBIT = DAG.getSelectCC(DL, LHS: REM, RHS, True: BIT, False: Zero, Cond: ISD::SETUGE);
2270
2271 DIV_Lo = DAG.getNode(Opcode: ISD::OR, DL, VT: HalfVT, N1: DIV_Lo, N2: realBIT);
2272
2273 // Update REM
2274 SDValue REM_sub = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: REM, N2: RHS);
2275 REM = DAG.getSelectCC(DL, LHS: REM, RHS, True: REM_sub, False: REM, Cond: ISD::SETUGE);
2276 }
2277
2278 SDValue DIV = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {DIV_Lo, DIV_Hi});
2279 DIV = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: DIV);
2280 Results.push_back(Elt: DIV);
2281 Results.push_back(Elt: REM);
2282}
2283
2284SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2285 SelectionDAG &DAG) const {
2286 SDLoc DL(Op);
2287 EVT VT = Op.getValueType();
2288
2289 if (VT == MVT::i64) {
2290 SmallVector<SDValue, 2> Results;
2291 LowerUDIVREM64(Op, DAG, Results);
2292 return DAG.getMergeValues(Ops: Results, dl: DL);
2293 }
2294
2295 if (VT == MVT::i32) {
2296 if (SDValue Res = LowerDIVREM24(Op, DAG, Sign: false))
2297 return Res;
2298 }
2299
2300 SDValue X = Op.getOperand(i: 0);
2301 SDValue Y = Op.getOperand(i: 1);
2302
2303 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2304 // algorithm used here.
2305
2306 // Initial estimate of inv(y).
2307 SDValue Z = DAG.getNode(Opcode: AMDGPUISD::URECIP, DL, VT, Operand: Y);
2308
2309 // One round of UNR.
2310 SDValue NegY = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), N2: Y);
2311 SDValue NegYZ = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: NegY, N2: Z);
2312 Z = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Z,
2313 N2: DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: Z, N2: NegYZ));
2314
2315 // Quotient/remainder estimate.
2316 SDValue Q = DAG.getNode(Opcode: ISD::MULHU, DL, VT, N1: X, N2: Z);
2317 SDValue R =
2318 DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: X, N2: DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: Q, N2: Y));
2319
2320 // First quotient/remainder refinement.
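// The refinements only ever adjust upward: each one conditionally adds 1 to Q
// and subtracts Y from R while R is still >= Y, so the initial estimate is
// assumed never to overshoot.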
2321 EVT CCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2322 SDValue One = DAG.getConstant(Val: 1, DL, VT);
2323 SDValue Cond = DAG.getSetCC(DL, VT: CCVT, LHS: R, RHS: Y, Cond: ISD::SETUGE);
2324 Q = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2325 N2: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Q, N2: One), N3: Q);
2326 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2327 N2: DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: R, N2: Y), N3: R);
2328
2329 // Second quotient/remainder refinement.
2330 Cond = DAG.getSetCC(DL, VT: CCVT, LHS: R, RHS: Y, Cond: ISD::SETUGE);
2331 Q = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2332 N2: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: Q, N2: One), N3: Q);
2333 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: Cond,
2334 N2: DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: R, N2: Y), N3: R);
2335
2336 return DAG.getMergeValues(Ops: {Q, R}, dl: DL);
2337}
2338
2339SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2340 SelectionDAG &DAG) const {
2341 SDLoc DL(Op);
2342 EVT VT = Op.getValueType();
2343
2344 SDValue LHS = Op.getOperand(i: 0);
2345 SDValue RHS = Op.getOperand(i: 1);
2346
2347 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
2348 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2349
2350 if (VT == MVT::i32) {
2351 if (SDValue Res = LowerDIVREM24(Op, DAG, Sign: true))
2352 return Res;
2353 }
2354
2355 if (VT == MVT::i64 &&
2356 DAG.ComputeNumSignBits(Op: LHS) > 32 &&
2357 DAG.ComputeNumSignBits(Op: RHS) > 32) {
2358 EVT HalfVT = VT.getHalfSizedIntegerVT(Context&: *DAG.getContext());
2359
// HiLo split
2361 SDValue LHS_Lo = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL, VT: HalfVT, N1: LHS, N2: Zero);
2362 SDValue RHS_Lo = DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL, VT: HalfVT, N1: RHS, N2: Zero);
2363 SDValue DIVREM = DAG.getNode(Opcode: ISD::SDIVREM, DL, VTList: DAG.getVTList(VT1: HalfVT, VT2: HalfVT),
2364 N1: LHS_Lo, N2: RHS_Lo);
2365 SDValue Res[2] = {
2366 DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: DIVREM.getValue(R: 0)),
2367 DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: DIVREM.getValue(R: 1))
2368 };
2369 return DAG.getMergeValues(Ops: Res, dl: DL);
2370 }
2371
2372 SDValue LHSign = DAG.getSelectCC(DL, LHS, RHS: Zero, True: NegOne, False: Zero, Cond: ISD::SETLT);
2373 SDValue RHSign = DAG.getSelectCC(DL, LHS: RHS, RHS: Zero, True: NegOne, False: Zero, Cond: ISD::SETLT);
2374 SDValue DSign = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHSign, N2: RHSign);
2375 SDValue RSign = LHSign; // Remainder sign is the same as LHS
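// Compute |LHS| and |RHS| by conditional negation: when the sign mask is all
// ones, (x + (-1)) ^ (-1) == -x; when it is zero, the add and xor are no-ops.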
2376
2377 LHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: LHS, N2: LHSign);
2378 RHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: RHSign);
2379
2380 LHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: LHS, N2: LHSign);
2381 RHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: RHS, N2: RHSign);
2382
2383 SDValue Div = DAG.getNode(Opcode: ISD::UDIVREM, DL, VTList: DAG.getVTList(VT1: VT, VT2: VT), N1: LHS, N2: RHS);
2384 SDValue Rem = Div.getValue(R: 1);
2385
2386 Div = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: Div, N2: DSign);
2387 Rem = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: Rem, N2: RSign);
2388
2389 Div = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Div, N2: DSign);
2390 Rem = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Rem, N2: RSign);
2391
2392 SDValue Res[2] = {
2393 Div,
2394 Rem
2395 };
2396 return DAG.getMergeValues(Ops: Res, dl: DL);
2397}
2398
2399// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2400SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2401 SDLoc SL(Op);
2402 EVT VT = Op.getValueType();
2403 auto Flags = Op->getFlags();
2404 SDValue X = Op.getOperand(i: 0);
2405 SDValue Y = Op.getOperand(i: 1);
2406
2407 SDValue Div = DAG.getNode(Opcode: ISD::FDIV, DL: SL, VT, N1: X, N2: Y, Flags);
2408 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: Div, Flags);
2409 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Trunc, Flags);
2410 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2411 return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Neg, N2: Y, N3: X, Flags);
2412}
2413
2414SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2415 SDLoc SL(Op);
2416 SDValue Src = Op.getOperand(i: 0);
2417
2418 // result = trunc(src)
2419 // if (src > 0.0 && src != result)
2420 // result += 1.0
2421
2422 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: MVT::f64, Operand: Src);
2423
2424 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT: MVT::f64);
2425 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f64);
2426
2427 EVT SetCCVT =
2428 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2429
2430 SDValue Lt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Zero, Cond: ISD::SETOGT);
2431 SDValue NeTrunc = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Trunc, Cond: ISD::SETONE);
2432 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SetCCVT, N1: Lt0, N2: NeTrunc);
2433
2434 SDValue Add = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f64, N1: And, N2: One, N3: Zero);
2435 // TODO: Should this propagate fast-math-flags?
2436 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Trunc, N2: Add);
2437}
2438
2439static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2440 SelectionDAG &DAG) {
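// The f64 exponent field occupies bits [62:52] of the value, i.e. bits
// [30:20] of the high 32-bit word, and is biased by 1023.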
2441 const unsigned FractBits = 52;
2442 const unsigned ExpBits = 11;
2443
2444 SDValue ExpPart = DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT: MVT::i32,
2445 N1: Hi,
2446 N2: DAG.getConstant(Val: FractBits - 32, DL: SL, VT: MVT::i32),
2447 N3: DAG.getConstant(Val: ExpBits, DL: SL, VT: MVT::i32));
2448 SDValue Exp = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: ExpPart,
2449 N2: DAG.getConstant(Val: 1023, DL: SL, VT: MVT::i32));
2450
2451 return Exp;
2452}
2453
2454SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2455 SDLoc SL(Op);
2456 SDValue Src = Op.getOperand(i: 0);
2457
2458 assert(Op.getValueType() == MVT::f64);
2459
2460 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
2461
2462 // Extract the upper half, since this is where we will find the sign and
2463 // exponent.
2464 SDValue Hi = getHiHalf64(Op: Src, DAG);
2465
2466 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2467
2468 const unsigned FractBits = 52;
2469
2470 // Extract the sign bit.
2471 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, DL: SL, VT: MVT::i32);
2472 SDValue SignBit = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: Hi, N2: SignBitMask);
2473
2474 // Extend back to 64-bits.
2475 SDValue SignBit64 = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Zero, SignBit});
2476 SignBit64 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: SignBit64);
2477
2478 SDValue BcInt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Src);
2479 const SDValue FractMask
2480 = DAG.getConstant(Val: (UINT64_C(1) << FractBits) - 1, DL: SL, VT: MVT::i64);
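// Shifting FractMask right (arithmetically) by Exp leaves ones in exactly the
// fractional bit positions for that exponent; ANDing with the complement
// clears them. Exp < 0 means |Src| < 1.0, so only the sign is kept, and
// Exp > 51 means Src is already an integer and is passed through unchanged.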
2481
2482 SDValue Shr = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i64, N1: FractMask, N2: Exp);
2483 SDValue Not = DAG.getNOT(DL: SL, Val: Shr, VT: MVT::i64);
2484 SDValue Tmp0 = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i64, N1: BcInt, N2: Not);
2485
2486 EVT SetCCVT =
2487 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::i32);
2488
2489 const SDValue FiftyOne = DAG.getConstant(Val: FractBits - 1, DL: SL, VT: MVT::i32);
2490
2491 SDValue ExpLt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Exp, RHS: Zero, Cond: ISD::SETLT);
2492 SDValue ExpGt51 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Exp, RHS: FiftyOne, Cond: ISD::SETGT);
2493
2494 SDValue Tmp1 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: ExpLt0, N2: SignBit64, N3: Tmp0);
2495 SDValue Tmp2 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: ExpGt51, N2: BcInt, N3: Tmp1);
2496
2497 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f64, Operand: Tmp2);
2498}
2499
2500SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2501 SelectionDAG &DAG) const {
2502 SDLoc SL(Op);
2503 SDValue Src = Op.getOperand(i: 0);
2504
2505 assert(Op.getValueType() == MVT::f64);
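// Round to nearest even by adding and then subtracting 2^52 with the sign of
// the input: at magnitude 2^52 every representable f64 is an integer, so the
// FADD discards the fractional bits under the default round-to-nearest-even
// mode and the FSUB restores the magnitude. Inputs whose magnitude already
// exceeds 0x1.fffffffffffffp+51 are integers and are returned unchanged.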
2506
2507 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2508 SDValue C1 = DAG.getConstantFP(Val: C1Val, DL: SL, VT: MVT::f64);
2509 SDValue CopySign = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT: MVT::f64, N1: C1, N2: Src);
2510
2511 // TODO: Should this propagate fast-math-flags?
2512
2513 SDValue Tmp1 = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Src, N2: CopySign);
2514 SDValue Tmp2 = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT: MVT::f64, N1: Tmp1, N2: CopySign);
2515
2516 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: MVT::f64, Operand: Src);
2517
2518 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2519 SDValue C2 = DAG.getConstantFP(Val: C2Val, DL: SL, VT: MVT::f64);
2520
2521 EVT SetCCVT =
2522 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2523 SDValue Cond = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Fabs, RHS: C2, Cond: ISD::SETOGT);
2524
2525 return DAG.getSelect(DL: SL, VT: MVT::f64, Cond, LHS: Src, RHS: Tmp2);
2526}
2527
2528SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2529 SelectionDAG &DAG) const {
2530 // FNEARBYINT and FRINT are the same, except in their handling of FP
2531 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2532 // rint, so just treat them as equivalent.
2533 return DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SDLoc(Op), VT: Op.getValueType(),
2534 Operand: Op.getOperand(i: 0));
2535}
2536
2537SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2538 auto VT = Op.getValueType();
2539 auto Arg = Op.getOperand(i: 0u);
2540 return DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SDLoc(Op), VT, Operand: Arg);
2541}
2542
2543// XXX - May require not supporting f32 denormals?
2544
2545// Don't handle v2f16. The extra instructions to scalarize and repack around the
2546// compare and vselect end up producing worse code than scalarizing the whole
2547// operation.
2548SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2549 SDLoc SL(Op);
2550 SDValue X = Op.getOperand(i: 0);
2551 EVT VT = Op.getValueType();
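// Round half away from zero:
//   result = trunc(x) + copysign((|x - trunc(x)| >= 0.5) ? 1.0 : 0.0, x)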
2552
2553 SDValue T = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: X);
2554
2555 // TODO: Should this propagate fast-math-flags?
2556
2557 SDValue Diff = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: X, N2: T);
2558
2559 SDValue AbsDiff = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: Diff);
2560
2561 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2562 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2563
2564 EVT SetCCVT =
2565 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2566
2567 const SDValue Half = DAG.getConstantFP(Val: 0.5, DL: SL, VT);
2568 SDValue Cmp = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsDiff, RHS: Half, Cond: ISD::SETOGE);
2569 SDValue OneOrZeroFP = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Cmp, N2: One, N3: Zero);
2570
2571 SDValue SignedOffset = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT, N1: OneOrZeroFP, N2: X);
2572 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: T, N2: SignedOffset);
2573}
2574
2575SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2576 SDLoc SL(Op);
2577 SDValue Src = Op.getOperand(i: 0);
2578
2579 // result = trunc(src);
2580 // if (src < 0.0 && src != result)
2581 // result += -1.0.
2582
2583 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: MVT::f64, Operand: Src);
2584
2585 const SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT: MVT::f64);
2586 const SDValue NegOne = DAG.getConstantFP(Val: -1.0, DL: SL, VT: MVT::f64);
2587
2588 EVT SetCCVT =
2589 getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT: MVT::f64);
2590
2591 SDValue Lt0 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Zero, Cond: ISD::SETOLT);
2592 SDValue NeTrunc = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: Trunc, Cond: ISD::SETONE);
2593 SDValue And = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SetCCVT, N1: Lt0, N2: NeTrunc);
2594
2595 SDValue Add = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f64, N1: And, N2: NegOne, N3: Zero);
2596 // TODO: Should this propagate fast-math-flags?
2597 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: Trunc, N2: Add);
2598}
2599
2600/// Return true if it's known that \p Src can never be an f32 denormal value.
2601static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2602 switch (Src.getOpcode()) {
2603 case ISD::FP_EXTEND:
2604 return Src.getOperand(i: 0).getValueType() == MVT::f16;
2605 case ISD::FP16_TO_FP:
2606 case ISD::FFREXP:
2607 return true;
2608 case ISD::INTRINSIC_WO_CHAIN: {
2609 unsigned IntrinsicID = Src.getConstantOperandVal(i: 0);
2610 switch (IntrinsicID) {
2611 case Intrinsic::amdgcn_frexp_mant:
2612 return true;
2613 default:
2614 return false;
2615 }
2616 }
2617 default:
2618 return false;
2619 }
2620
2621 llvm_unreachable("covered opcode switch");
2622}
2623
2624bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
2625 SDNodeFlags Flags) {
2626 if (Flags.hasApproximateFuncs())
2627 return true;
2628 auto &Options = DAG.getTarget().Options;
2629 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2630}
2631
2632bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
2633 SDValue Src,
2634 SDNodeFlags Flags) {
2635 return !valueIsKnownNeverF32Denorm(Src) &&
2636 DAG.getMachineFunction()
2637 .getDenormalMode(FPType: APFloat::IEEEsingle())
2638 .Input != DenormalMode::PreserveSign;
2639}
2640
2641SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2642 SDValue Src,
2643 SDNodeFlags Flags) const {
2644 SDLoc SL(Src);
2645 EVT VT = Src.getValueType();
2646 const fltSemantics &Semantics = VT.getFltSemantics();
2647 SDValue SmallestNormal =
2648 DAG.getConstantFP(Val: APFloat::getSmallestNormalized(Sem: Semantics), DL: SL, VT);
2649
2650 // Want to scale denormals up, but negatives and 0 work just as well on the
2651 // scaled path.
2652 SDValue IsLtSmallestNormal = DAG.getSetCC(
2653 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Src,
2654 RHS: SmallestNormal, Cond: ISD::SETOLT);
2655
2656 return IsLtSmallestNormal;
2657}
2658
2659SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2660 SDNodeFlags Flags) const {
2661 SDLoc SL(Src);
2662 EVT VT = Src.getValueType();
2663 const fltSemantics &Semantics = VT.getFltSemantics();
2664 SDValue Inf = DAG.getConstantFP(Val: APFloat::getInf(Sem: Semantics), DL: SL, VT);
2665
2666 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: Src, Flags);
2667 SDValue IsFinite = DAG.getSetCC(
2668 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Fabs,
2669 RHS: Inf, Cond: ISD::SETOLT);
2670 return IsFinite;
2671}
2672
/// If denormal handling is required, return the scaled input to FLOG2 and the
/// check for the denormal range. Otherwise, return null values.
2675std::pair<SDValue, SDValue>
2676AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
2677 SDValue Src, SDNodeFlags Flags) const {
2678 if (!needsDenormHandlingF32(DAG, Src, Flags))
2679 return {};
2680
2681 MVT VT = MVT::f32;
2682 const fltSemantics &Semantics = APFloat::IEEEsingle();
2683 SDValue SmallestNormal =
2684 DAG.getConstantFP(Val: APFloat::getSmallestNormalized(Sem: Semantics), DL: SL, VT);
2685
2686 SDValue IsLtSmallestNormal = DAG.getSetCC(
2687 DL: SL, VT: getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT), LHS: Src,
2688 RHS: SmallestNormal, Cond: ISD::SETOLT);
2689
2690 SDValue Scale32 = DAG.getConstantFP(Val: 0x1.0p+32, DL: SL, VT);
2691 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2692 SDValue ScaleFactor =
2693 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLtSmallestNormal, N2: Scale32, N3: One, Flags);
2694
2695 SDValue ScaledInput = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Src, N2: ScaleFactor, Flags);
2696 return {ScaledInput, IsLtSmallestNormal};
2697}
2698
2699SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2700 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2701 // If we have to handle denormals, scale up the input and adjust the result.
2702
2703 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2704 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
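  // e.g. for a denormal x = 0x1.0p-130f, the scaled input 0x1.0p-98f is
  // normal, v_log_f32 returns about -98.0, and subtracting 32.0 recovers
  // log2(x) = -130.0.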
2705
2706 SDLoc SL(Op);
2707 EVT VT = Op.getValueType();
2708 SDValue Src = Op.getOperand(i: 0);
2709 SDNodeFlags Flags = Op->getFlags();
2710
2711 if (VT == MVT::f16) {
2712 // Nothing in half is a denormal when promoted to f32.
2713 assert(!Subtarget->has16BitInsts());
2714 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src, Flags);
2715 SDValue Log = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT: MVT::f32, Operand: Ext, Flags);
2716 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Log,
2717 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
2718 }
2719
2720 auto [ScaledInput, IsLtSmallestNormal] =
2721 getScaledLogInput(DAG, SL, Src, Flags);
2722 if (!ScaledInput)
2723 return DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: Src, Flags);
2724
2725 SDValue Log2 = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: ScaledInput, Flags);
2726
2727 SDValue ThirtyTwo = DAG.getConstantFP(Val: 32.0, DL: SL, VT);
2728 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2729 SDValue ResultOffset =
2730 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLtSmallestNormal, N2: ThirtyTwo, N3: Zero);
2731 return DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: Log2, N2: ResultOffset, Flags);
2732}
2733
2734static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2735 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2736 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: Y, Flags);
2737 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Mul, N2: C, Flags);
2738}
2739
2740SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2741 SelectionDAG &DAG) const {
2742 SDValue X = Op.getOperand(i: 0);
2743 EVT VT = Op.getValueType();
2744 SDNodeFlags Flags = Op->getFlags();
2745 SDLoc DL(Op);
2746
2747 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2748 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2749
2750 const auto &Options = getTargetMachine().Options;
2751 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2752 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2753
2754 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2755 // Log and multiply in f32 is good enough for f16.
2756 X = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: X, Flags);
2757 }
2758
2759 SDValue Lowered = LowerFLOGUnsafe(Op: X, SL: DL, DAG, IsLog10, Flags);
2760 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2761 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Lowered,
2762 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32), Flags);
2763 }
2764
2765 return Lowered;
2766 }
2767
2768 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL: DL, Src: X, Flags);
2769 if (ScaledInput)
2770 X = ScaledInput;
2771
2772 SDValue Y = DAG.getNode(Opcode: AMDGPUISD::LOG, DL, VT, Operand: X, Flags);
2773
2774 SDValue R;
2775 if (Subtarget->hasFastFMAF32()) {
    // c + cc is ln(2)/ln(10) to more than 49 bits
2777 const float c_log10 = 0x1.344134p-2f;
2778 const float cc_log10 = 0x1.09f79ep-26f;
2779
2780 // c + cc is ln(2) to more than 49 bits
2781 const float c_log = 0x1.62e42ep-1f;
2782 const float cc_log = 0x1.efa39ep-25f;
2783
2784 SDValue C = DAG.getConstantFP(Val: IsLog10 ? c_log10 : c_log, DL, VT);
2785 SDValue CC = DAG.getConstantFP(Val: IsLog10 ? cc_log10 : cc_log, DL, VT);
2786
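    // Compute Y * (C + CC) in roughly double the working precision: R is the
    // rounded product Y*C, fma(Y, C, -R) recovers its rounding error, which is
    // then folded in with Y*CC and added back onto R.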
2787 R = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Y, N2: C, Flags);
2788 SDValue NegR = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: R, Flags);
2789 SDValue FMA0 = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: Y, N2: C, N3: NegR, Flags);
2790 SDValue FMA1 = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: Y, N2: CC, N3: FMA0, Flags);
2791 R = DAG.getNode(Opcode: ISD::FADD, DL, VT, N1: R, N2: FMA1, Flags);
2792 } else {
2793 // ch+ct is ln(2)/ln(10) to more than 36 bits
2794 const float ch_log10 = 0x1.344000p-2f;
2795 const float ct_log10 = 0x1.3509f6p-18f;
2796
2797 // ch + ct is ln(2) to more than 36 bits
2798 const float ch_log = 0x1.62e000p-1f;
2799 const float ct_log = 0x1.0bfbe8p-15f;
2800
2801 SDValue CH = DAG.getConstantFP(Val: IsLog10 ? ch_log10 : ch_log, DL, VT);
2802 SDValue CT = DAG.getConstantFP(Val: IsLog10 ? ct_log10 : ct_log, DL, VT);
2803
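    // Without fast FMA, split Y into a high part YH (low 12 mantissa bits
    // cleared) and a tail YT = Y - YH. CH also has its low mantissa bits
    // cleared, so the product YH*CH is exact, and the smaller partial products
    // are accumulated into it first.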
2804 SDValue YAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Y);
2805 SDValue MaskConst = DAG.getConstant(Val: 0xfffff000, DL, VT: MVT::i32);
2806 SDValue YHInt = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: YAsInt, N2: MaskConst);
2807 SDValue YH = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: YHInt);
2808 SDValue YT = DAG.getNode(Opcode: ISD::FSUB, DL, VT, N1: Y, N2: YH, Flags);
2809
2810 SDValue YTCT = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: YT, N2: CT, Flags);
2811 SDValue Mad0 = getMad(DAG, SL: DL, VT, X: YH, Y: CT, C: YTCT, Flags);
2812 SDValue Mad1 = getMad(DAG, SL: DL, VT, X: YT, Y: CH, C: Mad0, Flags);
2813 R = getMad(DAG, SL: DL, VT, X: YH, Y: CH, C: Mad1);
2814 }
2815
2816 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2817 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2818
2819 // TODO: Check if known finite from source value.
2820 if (!IsFiniteOnly) {
2821 SDValue IsFinite = getIsFinite(DAG, Src: Y, Flags);
2822 R = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsFinite, N2: R, N3: Y, Flags);
2823 }
2824
2825 if (IsScaled) {
2826 SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL, VT);
2827 SDValue ShiftK =
2828 DAG.getConstantFP(Val: IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2829 SDValue Shift =
2830 DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsScaled, N2: ShiftK, N3: Zero, Flags);
2831 R = DAG.getNode(Opcode: ISD::FSUB, DL, VT, N1: R, N2: Shift, Flags);
2832 }
2833
2834 return R;
2835}
2836
2837SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2838 return LowerFLOGCommon(Op, DAG);
2839}
2840
// Do f32 fast math expansion for flog2 or flog10. This is accurate enough
// for a promoted f16 operation.
2843SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2844 SelectionDAG &DAG, bool IsLog10,
2845 SDNodeFlags Flags) const {
2846 EVT VT = Src.getValueType();
2847 unsigned LogOp =
2848 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2849
2850 double Log2BaseInverted =
2851 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2852
2853 if (VT == MVT::f32) {
2854 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2855 if (ScaledInput) {
2856 SDValue LogSrc = DAG.getNode(Opcode: AMDGPUISD::LOG, DL: SL, VT, Operand: ScaledInput, Flags);
2857 SDValue ScaledResultOffset =
2858 DAG.getConstantFP(Val: -32.0 * Log2BaseInverted, DL: SL, VT);
2859
2860 SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL: SL, VT);
2861
2862 SDValue ResultOffset = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsScaled,
2863 N2: ScaledResultOffset, N3: Zero, Flags);
2864
2865 SDValue Log2Inv = DAG.getConstantFP(Val: Log2BaseInverted, DL: SL, VT);
2866
2867 if (Subtarget->hasFastFMAF32())
2868 return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: LogSrc, N2: Log2Inv, N3: ResultOffset,
2869 Flags);
2870 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LogSrc, N2: Log2Inv, Flags);
2871 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Mul, N2: ResultOffset);
2872 }
2873 }
2874
2875 SDValue Log2Operand = DAG.getNode(Opcode: LogOp, DL: SL, VT, Operand: Src, Flags);
2876 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Val: Log2BaseInverted, DL: SL, VT);
2877
2878 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Log2Operand, N2: Log2BaseInvertedOperand,
2879 Flags);
2880}
2881
2882SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2883 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2884 // If we have to handle denormals, scale up the input and adjust the result.
2885
2886 SDLoc SL(Op);
2887 EVT VT = Op.getValueType();
2888 SDValue Src = Op.getOperand(i: 0);
2889 SDNodeFlags Flags = Op->getFlags();
2890
2891 if (VT == MVT::f16) {
2892 // Nothing in half is a denormal when promoted to f32.
2893 assert(!Subtarget->has16BitInsts());
2894 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src, Flags);
2895 SDValue Log = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT: MVT::f32, Operand: Ext, Flags);
2896 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Log,
2897 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
2898 }
2899
2900 assert(VT == MVT::f32);
2901
2902 if (!needsDenormHandlingF32(DAG, Src, Flags))
2903 return DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT: MVT::f32, Operand: Src, Flags);
2904
2905 // bool needs_scaling = x < -0x1.f80000p+6f;
2906 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
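  // e.g. for x = -140.0 the exact result 0x1.0p-140f is a denormal; instead
  // v_exp_f32 evaluates 2^(-140 + 64) = 0x1.0p-76f and the final multiply by
  // 0x1.0p-64f, which does respect the denormal mode, produces the denormal
  // result.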
2907
  // -126.0, the smallest x for which exp2(x) still gives a normal f32 result.
2909 SDValue RangeCheckConst = DAG.getConstantFP(Val: -0x1.f80000p+6f, DL: SL, VT);
2910
2911 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2912
2913 SDValue NeedsScaling =
2914 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: Src, RHS: RangeCheckConst, Cond: ISD::SETOLT);
2915
2916 SDValue SixtyFour = DAG.getConstantFP(Val: 0x1.0p+6f, DL: SL, VT);
2917 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
2918
2919 SDValue AddOffset =
2920 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: SixtyFour, N3: Zero);
2921
2922 SDValue AddInput = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: Src, N2: AddOffset, Flags);
2923 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: AddInput, Flags);
2924
2925 SDValue TwoExpNeg64 = DAG.getConstantFP(Val: 0x1.0p-64f, DL: SL, VT);
2926 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
2927 SDValue ResultScale =
2928 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: TwoExpNeg64, N3: One);
2929
2930 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2, N2: ResultScale, Flags);
2931}
2932
2933SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2934 SelectionDAG &DAG,
2935 SDNodeFlags Flags) const {
2936 EVT VT = X.getValueType();
2937 const SDValue Log2E = DAG.getConstantFP(Val: numbers::log2e, DL: SL, VT);
2938
2939 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, Src: X, Flags)) {
2940 // exp2(M_LOG2E_F * f);
2941 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: Log2E, Flags);
2942 return DAG.getNode(Opcode: VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2943 : (unsigned)ISD::FEXP2,
2944 DL: SL, VT, Operand: Mul, Flags);
2945 }
2946
2947 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2948
2949 SDValue Threshold = DAG.getConstantFP(Val: -0x1.5d58a0p+6f, DL: SL, VT);
2950 SDValue NeedsScaling = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: Threshold, Cond: ISD::SETOLT);
2951
2952 SDValue ScaleOffset = DAG.getConstantFP(Val: 0x1.0p+6f, DL: SL, VT);
2953
2954 SDValue ScaledX = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: X, N2: ScaleOffset, Flags);
2955
2956 SDValue AdjustedX =
2957 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: ScaledX, N3: X);
2958
2959 SDValue ExpInput = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: Log2E, Flags);
2960
2961 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: ExpInput, Flags);
2962
2963 SDValue ResultScaleFactor = DAG.getConstantFP(Val: 0x1.969d48p-93f, DL: SL, VT);
2964 SDValue AdjustedResult =
2965 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2, N2: ResultScaleFactor, Flags);
2966
2967 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: AdjustedResult, N3: Exp2,
2968 Flags);
2969}
2970
2971/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
2972/// handled correctly.
2973SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
2974 SelectionDAG &DAG,
2975 SDNodeFlags Flags) const {
2976 const EVT VT = X.getValueType();
2977 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
2978 : static_cast<unsigned>(ISD::FEXP2);
2979
2980 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, Src: X, Flags)) {
2981 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
2982 SDValue K0 = DAG.getConstantFP(Val: 0x1.a92000p+1f, DL: SL, VT);
2983 SDValue K1 = DAG.getConstantFP(Val: 0x1.4f0978p-11f, DL: SL, VT);
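    // K0 + K1 approximates log2(10) (~3.3219281), so the two exp2 factors
    // multiply out to exp2(x * log2(10)) == exp10(x); splitting the constant
    // keeps more of the precision a single rounded x * log2(10) product would
    // lose.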
2984
2985 SDValue Mul0 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: K0, Flags);
2986 SDValue Exp2_0 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul0, Flags);
2987 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: K1, Flags);
2988 SDValue Exp2_1 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul1, Flags);
2989 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2_0, N2: Exp2_1);
2990 }
2991
2992 // bool s = x < -0x1.2f7030p+5f;
2993 // x += s ? 0x1.0p+5f : 0.0f;
2994 // exp10 = exp2(x * 0x1.a92000p+1f) *
2995 // exp2(x * 0x1.4f0978p-11f) *
2996 // (s ? 0x1.9f623ep-107f : 1.0f);
2997
2998 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
2999
3000 SDValue Threshold = DAG.getConstantFP(Val: -0x1.2f7030p+5f, DL: SL, VT);
3001 SDValue NeedsScaling = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: Threshold, Cond: ISD::SETOLT);
3002
3003 SDValue ScaleOffset = DAG.getConstantFP(Val: 0x1.0p+5f, DL: SL, VT);
3004 SDValue ScaledX = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: X, N2: ScaleOffset, Flags);
3005 SDValue AdjustedX =
3006 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: ScaledX, N3: X);
3007
3008 SDValue K0 = DAG.getConstantFP(Val: 0x1.a92000p+1f, DL: SL, VT);
3009 SDValue K1 = DAG.getConstantFP(Val: 0x1.4f0978p-11f, DL: SL, VT);
3010
3011 SDValue Mul0 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: K0, Flags);
3012 SDValue Exp2_0 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul0, Flags);
3013 SDValue Mul1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: AdjustedX, N2: K1, Flags);
3014 SDValue Exp2_1 = DAG.getNode(Opcode: Exp2Op, DL: SL, VT, Operand: Mul1, Flags);
3015
3016 SDValue MulExps = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: Exp2_0, N2: Exp2_1, Flags);
3017
3018 SDValue ResultScaleFactor = DAG.getConstantFP(Val: 0x1.9f623ep-107f, DL: SL, VT);
3019 SDValue AdjustedResult =
3020 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: MulExps, N2: ResultScaleFactor, Flags);
3021
3022 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NeedsScaling, N2: AdjustedResult, N3: MulExps,
3023 Flags);
3024}
3025
3026SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
3027 EVT VT = Op.getValueType();
3028 SDLoc SL(Op);
3029 SDValue X = Op.getOperand(i: 0);
3030 SDNodeFlags Flags = Op->getFlags();
3031 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3032
3033 if (VT.getScalarType() == MVT::f16) {
3034 // v_exp_f16 (fmul x, log2e)
3035 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
3036 return lowerFEXPUnsafe(X, SL, DAG, Flags);
3037
3038 if (VT.isVector())
3039 return SDValue();
3040
3041 // exp(f16 x) ->
3042 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3043
3044 // Nothing in half is a denormal when promoted to f32.
3045 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: X, Flags);
3046 SDValue Lowered = lowerFEXPUnsafe(X: Ext, SL, DAG, Flags);
3047 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Lowered,
3048 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
3049 }
3050
3051 assert(VT == MVT::f32);
3052
3053 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3054 // library behavior. Also, is known-not-daz source sufficient?
3055 if (allowApproxFunc(DAG, Flags)) {
3056 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3057 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3058 }
3059
3060 // Algorithm:
3061 //
3062 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3063 //
3064 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3065 // n = 64*m + j, 0 <= j < 64
3066 //
3067 // e^x = 2^((64*m + j + f)/64)
3068 // = (2^m) * (2^(j/64)) * 2^(f/64)
3069 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3070 //
3071 // f = x*(64/ln(2)) - n
3072 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3073 //
3074 // e^x = (2^m) * (2^(j/64)) * e^r
3075 //
3076 // (2^(j/64)) is precomputed
3077 //
3078 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3079 // e^r = 1 + q
3080 //
3081 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3082 //
3083 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
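  //
  // In the code below, PH + PL approximates x * log2(e) (or x * log2(10) for
  // exp10) in extended precision, E = roundeven(PH) plays the role of n, the
  // reduced argument A = (PH - E) + PL is fed to v_exp_f32, and the final
  // ldexp applies the 2^E scaling.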
3084 SDNodeFlags FlagsNoContract = Flags;
3085 FlagsNoContract.setAllowContract(false);
3086
3087 SDValue PH, PL;
3088 if (Subtarget->hasFastFMAF32()) {
3089 const float c_exp = numbers::log2ef;
3090 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3091 const float c_exp10 = 0x1.a934f0p+1f;
3092 const float cc_exp10 = 0x1.2f346ep-24f;
3093
3094 SDValue C = DAG.getConstantFP(Val: IsExp10 ? c_exp10 : c_exp, DL: SL, VT);
3095 SDValue CC = DAG.getConstantFP(Val: IsExp10 ? cc_exp10 : cc_exp, DL: SL, VT);
3096
3097 PH = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: C, Flags);
3098 SDValue NegPH = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: PH, Flags);
3099 SDValue FMA0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: X, N2: C, N3: NegPH, Flags);
3100 PL = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: X, N2: CC, N3: FMA0, Flags);
3101 } else {
3102 const float ch_exp = 0x1.714000p+0f;
3103 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3104
3105 const float ch_exp10 = 0x1.a92000p+1f;
3106 const float cl_exp10 = 0x1.4f0978p-11f;
3107
3108 SDValue CH = DAG.getConstantFP(Val: IsExp10 ? ch_exp10 : ch_exp, DL: SL, VT);
3109 SDValue CL = DAG.getConstantFP(Val: IsExp10 ? cl_exp10 : cl_exp, DL: SL, VT);
3110
3111 SDValue XAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: X);
3112 SDValue MaskConst = DAG.getConstant(Val: 0xfffff000, DL: SL, VT: MVT::i32);
3113 SDValue XHAsInt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: XAsInt, N2: MaskConst);
3114 SDValue XH = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: XHAsInt);
3115 SDValue XL = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: X, N2: XH, Flags);
3116
3117 PH = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: XH, N2: CH, Flags);
3118
3119 SDValue XLCL = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: XL, N2: CL, Flags);
3120 SDValue Mad0 = getMad(DAG, SL, VT, X: XL, Y: CH, C: XLCL, Flags);
3121 PL = getMad(DAG, SL, VT, X: XH, Y: CL, C: Mad0, Flags);
3122 }
3123
3124 SDValue E = DAG.getNode(Opcode: ISD::FROUNDEVEN, DL: SL, VT, Operand: PH, Flags);
3125
3126 // It is unsafe to contract this fsub into the PH multiply.
3127 SDValue PHSubE = DAG.getNode(Opcode: ISD::FSUB, DL: SL, VT, N1: PH, N2: E, Flags: FlagsNoContract);
3128
3129 SDValue A = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: PHSubE, N2: PL, Flags);
3130 SDValue IntE = DAG.getNode(Opcode: ISD::FP_TO_SINT, DL: SL, VT: MVT::i32, Operand: E);
3131 SDValue Exp2 = DAG.getNode(Opcode: AMDGPUISD::EXP, DL: SL, VT, Operand: A, Flags);
3132
3133 SDValue R = DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT, N1: Exp2, N2: IntE, Flags);
3134
3135 SDValue UnderflowCheckConst =
3136 DAG.getConstantFP(Val: IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, DL: SL, VT);
3137
3138 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Context&: *DAG.getContext(), VT);
3139 SDValue Zero = DAG.getConstantFP(Val: 0.0, DL: SL, VT);
3140 SDValue Underflow =
3141 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: UnderflowCheckConst, Cond: ISD::SETOLT);
3142
3143 R = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Underflow, N2: Zero, N3: R);
3144 const auto &Options = getTargetMachine().Options;
3145
3146 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3147 SDValue OverflowCheckConst =
3148 DAG.getConstantFP(Val: IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, DL: SL, VT);
3149 SDValue Overflow =
3150 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: X, RHS: OverflowCheckConst, Cond: ISD::SETOGT);
3151 SDValue Inf =
3152 DAG.getConstantFP(Val: APFloat::getInf(Sem: APFloat::IEEEsingle()), DL: SL, VT);
3153 R = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Overflow, N2: Inf, N3: R);
3154 }
3155
3156 return R;
3157}
3158
3159static bool isCtlzOpc(unsigned Opc) {
3160 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3161}
3162
3163static bool isCttzOpc(unsigned Opc) {
3164 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3165}
3166
3167SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3168 SelectionDAG &DAG) const {
3169 auto SL = SDLoc(Op);
3170 auto Opc = Op.getOpcode();
3171 auto Arg = Op.getOperand(i: 0u);
3172 auto ResultVT = Op.getValueType();
3173
3174 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3175 return {};
3176
3177 assert(isCtlzOpc(Opc));
3178 assert(ResultVT == Arg.getValueType());
3179
3180 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3181 SDValue NumExtBits = DAG.getConstant(Val: 32u - NumBits, DL: SL, VT: MVT::i32);
3182 SDValue NewOp;
3183
3184 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3185 NewOp = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Arg);
3186 NewOp = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: NewOp, N2: NumExtBits);
3187 NewOp = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, Operand: NewOp);
3188 } else {
3189 NewOp = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Arg);
3190 NewOp = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, Operand: NewOp);
3191 NewOp = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: NewOp, N2: NumExtBits);
3192 }
3193
3194 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ResultVT, Operand: NewOp);
3195}
3196
3197SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3198 SDLoc SL(Op);
3199 SDValue Src = Op.getOperand(i: 0);
3200
3201 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3202 bool Ctlz = isCtlzOpc(Opc: Op.getOpcode());
3203 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3204
3205 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3206 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3207 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3208
3209 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3210 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3211 // (cttz hi:lo) -> (umin (ffbl src), 32)
3212 // (ctlz_zero_undef src) -> (ffbh src)
3213 // (cttz_zero_undef src) -> (ffbl src)
3214
    // The 64-bit scalar version produces a 32-bit result:
3216 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3217 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3218 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3219 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3220 SDValue NewOpr = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Src);
3221 if (!ZeroUndef) {
3222 const SDValue ConstVal = DAG.getConstant(
3223 Val: Op.getValueType().getScalarSizeInBits(), DL: SL, VT: MVT::i32);
3224 NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewOpr, N2: ConstVal);
3225 }
3226 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: Src.getValueType(), Operand: NewOpr);
3227 }
3228
3229 SDValue Lo, Hi;
3230 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3231
3232 SDValue OprLo = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Lo);
3233 SDValue OprHi = DAG.getNode(Opcode: NewOpc, DL: SL, VT: MVT::i32, Operand: Hi);
3234
3235 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3236 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3237 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3238 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
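  //
  // FFBH_U32 / FFBL_B32 return -1 for a zero input, which the uaddsat and the
  // final umin with 64 absorb: e.g. for ctlz of 0x000000000000ffff, ffbh(Hi)
  // is -1 while uaddsat(ffbh(Lo), 32) = 16 + 32 = 48, so the result is 48.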
3239
3240 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3241 const SDValue Const32 = DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32);
3242 if (Ctlz)
3243 OprLo = DAG.getNode(Opcode: AddOpc, DL: SL, VT: MVT::i32, N1: OprLo, N2: Const32);
3244 else
3245 OprHi = DAG.getNode(Opcode: AddOpc, DL: SL, VT: MVT::i32, N1: OprHi, N2: Const32);
3246
3247 SDValue NewOpr;
3248 NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: OprLo, N2: OprHi);
3249 if (!ZeroUndef) {
3250 const SDValue Const64 = DAG.getConstant(Val: 64, DL: SL, VT: MVT::i32);
3251 NewOpr = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewOpr, N2: Const64);
3252 }
3253
3254 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i64, Operand: NewOpr);
3255}
3256
3257SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3258 bool Signed) const {
  // The regular method of converting a 64-bit integer to float roughly
  // consists of 2 steps: normalization and rounding. After normalization, the
3261 // conversion from a 64-bit integer to a float is essentially the same as the
3262 // one from a 32-bit integer. The only difference is that it has more
3263 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3264 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3265 // converted into the correct float number. The basic steps for the unsigned
3266 // conversion are illustrated in the following pseudo code:
3267 //
3268 // f32 uitofp(i64 u) {
3269 // i32 hi, lo = split(u);
3270 // // Only count the leading zeros in hi as we have native support of the
3271 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3272 // // reduced to a 32-bit one automatically.
3273 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3274 // u <<= shamt;
3275 // hi, lo = split(u);
3276 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3277 // // convert it as a 32-bit integer and scale the result back.
3278 // return uitofp(hi) * 2^(32 - shamt);
3279 // }
3280 //
3281 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3282 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
  // converted instead, followed by negation based on its sign bit.
3284
3285 SDLoc SL(Op);
3286 SDValue Src = Op.getOperand(i: 0);
3287
3288 SDValue Lo, Hi;
3289 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
3290 SDValue Sign;
3291 SDValue ShAmt;
3292 if (Signed && Subtarget->isGCN()) {
3293 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3294 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3295 // account. That is, the maximal shift is
3296 // - 32 if Lo and Hi have opposite signs;
3297 // - 33 if Lo and Hi have the same sign.
3298 //
3299 // Or, MaxShAmt = 33 + OppositeSign, where
3300 //
3301 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3302 // - -1 if Lo and Hi have opposite signs; and
3303 // - 0 otherwise.
3304 //
3305 // All in all, ShAmt is calculated as
3306 //
3307 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3308 //
3309 // or
3310 //
3311 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3312 //
3313 // to reduce the critical path.
3314 SDValue OppositeSign = DAG.getNode(
3315 Opcode: ISD::SRA, DL: SL, VT: MVT::i32, N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: Lo, N2: Hi),
3316 N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
3317 SDValue MaxShAmt =
3318 DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32),
3319 N2: OppositeSign);
3320 // Count the leading sign bits.
3321 ShAmt = DAG.getNode(Opcode: AMDGPUISD::FFBH_I32, DL: SL, VT: MVT::i32, Operand: Hi);
3322 // Different from unsigned conversion, the shift should be one bit less to
3323 // preserve the sign bit.
3324 ShAmt = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: ShAmt,
3325 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
3326 ShAmt = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: ShAmt, N2: MaxShAmt);
3327 } else {
3328 if (Signed) {
3329 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3330 // absolute value first.
3331 Sign = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i64, N1: Src,
3332 N2: DAG.getConstant(Val: 63, DL: SL, VT: MVT::i64));
3333 SDValue Abs =
3334 DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i64,
3335 N1: DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i64, N1: Src, N2: Sign), N2: Sign);
3336 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Abs, DAG);
3337 }
3338 // Count the leading zeros.
3339 ShAmt = DAG.getNode(Opcode: ISD::CTLZ, DL: SL, VT: MVT::i32, Operand: Hi);
3340 // The shift amount for signed integers is [0, 32].
3341 }
3342 // Normalize the given 64-bit integer.
3343 SDValue Norm = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i64, N1: Src, N2: ShAmt);
3344 // Split it again.
3345 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Norm, DAG);
3346 // Calculate the adjust bit for rounding.
3347 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3348 SDValue Adjust = DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32,
3349 N1: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32), N2: Lo);
3350 // Get the 32-bit normalized integer.
3351 Norm = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Hi, N2: Adjust);
3352 // Convert the normalized 32-bit integer into f32.
3353 unsigned Opc =
3354 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3355 SDValue FVal = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::f32, Operand: Norm);
3356
  // Finally, we need to scale the converted floating-point number back, as
  // the original 64-bit integer was converted as a 32-bit one.
3359 ShAmt = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32),
3360 N2: ShAmt);
3361 // On GCN, use LDEXP directly.
3362 if (Subtarget->isGCN())
3363 return DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT: MVT::f32, N1: FVal, N2: ShAmt);
3364
  // Otherwise, align 'ShAmt' to the exponent part and add it into the
  // exponent field directly to emulate the multiplication by 2^ShAmt. The
  // 8-bit exponent is wide enough to avoid overflowing into the sign bit.
3368 SDValue Exp = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: ShAmt,
3369 N2: DAG.getConstant(Val: 23, DL: SL, VT: MVT::i32));
3370 SDValue IVal =
3371 DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32,
3372 N1: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: FVal), N2: Exp);
3373 if (Signed) {
3374 // Set the sign bit.
3375 Sign = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32,
3376 N1: DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Sign),
3377 N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
3378 IVal = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: IVal, N2: Sign);
3379 }
3380 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: IVal);
3381}
3382
3383SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3384 bool Signed) const {
3385 SDLoc SL(Op);
3386 SDValue Src = Op.getOperand(i: 0);
3387
3388 SDValue Lo, Hi;
3389 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: Src, DAG);
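  // Each 32-bit half converts to f64 exactly (f64 has a 53-bit significand),
  // and scaling CvtHi by 2^32 with ldexp is exact as well, so only the final
  // add rounds.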
3390
3391 SDValue CvtHi = DAG.getNode(Opcode: Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3392 DL: SL, VT: MVT::f64, Operand: Hi);
3393
3394 SDValue CvtLo = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL: SL, VT: MVT::f64, Operand: Lo);
3395
3396 SDValue LdExp = DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT: MVT::f64, N1: CvtHi,
3397 N2: DAG.getConstant(Val: 32, DL: SL, VT: MVT::i32));
3398 // TODO: Should this propagate fast-math-flags?
3399 return DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f64, N1: LdExp, N2: CvtLo);
3400}
3401
3402SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3403 SelectionDAG &DAG) const {
3404 // TODO: Factor out code common with LowerSINT_TO_FP.
3405 EVT DestVT = Op.getValueType();
3406 SDValue Src = Op.getOperand(i: 0);
3407 EVT SrcVT = Src.getValueType();
3408
3409 if (SrcVT == MVT::i16) {
3410 if (DestVT == MVT::f16)
3411 return Op;
3412 SDLoc DL(Op);
3413
3414 // Promote src to i32
3415 SDValue Ext = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: Src);
3416 return DAG.getNode(Opcode: ISD::UINT_TO_FP, DL, VT: DestVT, Operand: Ext);
3417 }
3418
3419 if (DestVT == MVT::bf16) {
3420 SDLoc SL(Op);
3421 SDValue ToF32 = DAG.getNode(Opcode: ISD::UINT_TO_FP, DL: SL, VT: MVT::f32, Operand: Src);
3422 SDValue FPRoundFlag = DAG.getIntPtrConstant(Val: 0, DL: SL, /*isTarget=*/true);
3423 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::bf16, N1: ToF32, N2: FPRoundFlag);
3424 }
3425
3426 if (SrcVT != MVT::i64)
3427 return Op;
3428
3429 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3430 SDLoc DL(Op);
3431
3432 SDValue IntToFp32 = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: MVT::f32, Operand: Src);
3433 SDValue FPRoundFlag =
3434 DAG.getIntPtrConstant(Val: 0, DL: SDLoc(Op), /*isTarget=*/true);
3435 SDValue FPRound =
3436 DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: IntToFp32, N2: FPRoundFlag);
3437
3438 return FPRound;
3439 }
3440
3441 if (DestVT == MVT::f32)
3442 return LowerINT_TO_FP32(Op, DAG, Signed: false);
3443
3444 assert(DestVT == MVT::f64);
3445 return LowerINT_TO_FP64(Op, DAG, Signed: false);
3446}
3447
3448SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3449 SelectionDAG &DAG) const {
3450 EVT DestVT = Op.getValueType();
3451
3452 SDValue Src = Op.getOperand(i: 0);
3453 EVT SrcVT = Src.getValueType();
3454
3455 if (SrcVT == MVT::i16) {
3456 if (DestVT == MVT::f16)
3457 return Op;
3458
3459 SDLoc DL(Op);
3460 // Promote src to i32
3461 SDValue Ext = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: MVT::i32, Operand: Src);
3462 return DAG.getNode(Opcode: ISD::SINT_TO_FP, DL, VT: DestVT, Operand: Ext);
3463 }
3464
3465 if (DestVT == MVT::bf16) {
3466 SDLoc SL(Op);
3467 SDValue ToF32 = DAG.getNode(Opcode: ISD::SINT_TO_FP, DL: SL, VT: MVT::f32, Operand: Src);
3468 SDValue FPRoundFlag = DAG.getIntPtrConstant(Val: 0, DL: SL, /*isTarget=*/true);
3469 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::bf16, N1: ToF32, N2: FPRoundFlag);
3470 }
3471
3472 if (SrcVT != MVT::i64)
3473 return Op;
3474
3475 // TODO: Factor out code common with LowerUINT_TO_FP.
3476
3477 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3478 SDLoc DL(Op);
3479 SDValue Src = Op.getOperand(i: 0);
3480
3481 SDValue IntToFp32 = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: MVT::f32, Operand: Src);
3482 SDValue FPRoundFlag =
3483 DAG.getIntPtrConstant(Val: 0, DL: SDLoc(Op), /*isTarget=*/true);
3484 SDValue FPRound =
3485 DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: IntToFp32, N2: FPRoundFlag);
3486
3487 return FPRound;
3488 }
3489
3490 if (DestVT == MVT::f32)
3491 return LowerINT_TO_FP32(Op, DAG, Signed: true);
3492
3493 assert(DestVT == MVT::f64);
3494 return LowerINT_TO_FP64(Op, DAG, Signed: true);
3495}
3496
3497SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3498 bool Signed) const {
3499 SDLoc SL(Op);
3500
3501 SDValue Src = Op.getOperand(i: 0);
3502 EVT SrcVT = Src.getValueType();
3503
3504 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3505
3506 // The basic idea of converting a floating point number into a pair of 32-bit
3507 // integers is illustrated as follows:
3508 //
3509 // tf := trunc(val);
3510 // hif := floor(tf * 2^-32);
3511 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3512 // hi := fptoi(hif);
3513 // lo := fptoi(lof);
3514 //
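  // Below, Mul corresponds to tf * 2^-32, FloorMul to hif, and the fma
  // computes lof = tf - hif * 2^32 with a single rounding.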
3515 SDValue Trunc = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT: SrcVT, Operand: Src);
3516 SDValue Sign;
3517 if (Signed && SrcVT == MVT::f32) {
    // However, a 32-bit floating-point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we take the absolute value
    // after truncating and flip the result back based on the original
    // signedness.
3523 Sign = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: MVT::i32,
3524 N1: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Trunc),
3525 N2: DAG.getConstant(Val: 31, DL: SL, VT: MVT::i32));
3526 Trunc = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: SrcVT, Operand: Trunc);
3527 }
3528
3529 SDValue K0, K1;
3530 if (SrcVT == MVT::f64) {
3531 K0 = DAG.getConstantFP(
3532 Val: llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), DL: SL,
3533 VT: SrcVT);
3534 K1 = DAG.getConstantFP(
3535 Val: llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), DL: SL,
3536 VT: SrcVT);
3537 } else {
3538 K0 = DAG.getConstantFP(
3539 Val: llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), DL: SL, VT: SrcVT);
3540 K1 = DAG.getConstantFP(
3541 Val: llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), DL: SL, VT: SrcVT);
3542 }
3543 // TODO: Should this propagate fast-math-flags?
3544 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: SrcVT, N1: Trunc, N2: K0);
3545
3546 SDValue FloorMul = DAG.getNode(Opcode: ISD::FFLOOR, DL: SL, VT: SrcVT, Operand: Mul);
3547
3548 SDValue Fma = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: SrcVT, N1: FloorMul, N2: K1, N3: Trunc);
3549
3550 SDValue Hi = DAG.getNode(Opcode: (Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3551 : ISD::FP_TO_UINT,
3552 DL: SL, VT: MVT::i32, Operand: FloorMul);
3553 SDValue Lo = DAG.getNode(Opcode: ISD::FP_TO_UINT, DL: SL, VT: MVT::i32, Operand: Fma);
3554
3555 SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64,
3556 Operand: DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Lo, Hi}));
3557
3558 if (Signed && SrcVT == MVT::f32) {
3559 assert(Sign);
3560 // Flip the result based on the signedness, which is either all 0s or 1s.
3561 Sign = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64,
3562 Operand: DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {Sign, Sign}));
3563 // r := xor(r, sign) - sign;
3564 Result =
3565 DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i64,
3566 N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i64, N1: Result, N2: Sign), N2: Sign);
3567 }
3568
3569 return Result;
3570}
3571
3572SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3573 SDLoc DL(Op);
3574 SDValue N0 = Op.getOperand(i: 0);
3575
3576 // Convert to target node to get known bits
3577 if (N0.getValueType() == MVT::f32)
3578 return DAG.getNode(Opcode: AMDGPUISD::FP_TO_FP16, DL, VT: Op.getValueType(), Operand: N0);
3579
3580 if (getTargetMachine().Options.UnsafeFPMath) {
3581 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3582 return SDValue();
3583 }
3584
3585 return LowerF64ToF16Safe(Src: N0, DL, DAG);
3586}
3587
// Returns the f16 result bits in an i32 node.
3589SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL,
3590 SelectionDAG &DAG) const {
3591 assert(Src.getSimpleValueType() == MVT::f64);
3592
3593 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3594 // TODO: We can generate better code for True16.
3595 const unsigned ExpMask = 0x7ff;
3596 const unsigned ExpBiasf64 = 1023;
3597 const unsigned ExpBiasf16 = 15;
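  // Roughly: E below is the f16-biased exponent (f64 exponent - 1023 + 15)
  // and M holds the top mantissa bits plus a sticky bit for anything shifted
  // out. E < 1 takes the denormal path D, E > 30 overflows to 0x7c00 (inf),
  // and E == 1039 (an all-ones f64 exponent field) selects the inf/NaN
  // encoding I.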
3598 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
3599 SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
3600 SDValue U = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: Src);
3601 SDValue UH = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64, N1: U,
3602 N2: DAG.getConstant(Val: 32, DL, VT: MVT::i64));
3603 UH = DAG.getZExtOrTrunc(Op: UH, DL, VT: MVT::i32);
3604 U = DAG.getZExtOrTrunc(Op: U, DL, VT: MVT::i32);
3605 SDValue E = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3606 N2: DAG.getConstant(Val: 20, DL, VT: MVT::i64));
3607 E = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: E,
3608 N2: DAG.getConstant(Val: ExpMask, DL, VT: MVT::i32));
3609 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3610 // add the f16 bias (15) to get the biased exponent for the f16 format.
3611 E = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: E,
3612 N2: DAG.getConstant(Val: -ExpBiasf64 + ExpBiasf16, DL, VT: MVT::i32));
3613
3614 SDValue M = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3615 N2: DAG.getConstant(Val: 8, DL, VT: MVT::i32));
3616 M = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: M,
3617 N2: DAG.getConstant(Val: 0xffe, DL, VT: MVT::i32));
3618
3619 SDValue MaskedSig = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: UH,
3620 N2: DAG.getConstant(Val: 0x1ff, DL, VT: MVT::i32));
3621 MaskedSig = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: MaskedSig, N2: U);
3622
3623 SDValue Lo40Set = DAG.getSelectCC(DL, LHS: MaskedSig, RHS: Zero, True: Zero, False: One, Cond: ISD::SETEQ);
3624 M = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M, N2: Lo40Set);
3625
3626 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3627 SDValue I = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32,
3628 N1: DAG.getSelectCC(DL, LHS: M, RHS: Zero, True: DAG.getConstant(Val: 0x0200, DL, VT: MVT::i32),
3629 False: Zero, Cond: ISD::SETNE), N2: DAG.getConstant(Val: 0x7c00, DL, VT: MVT::i32));
3630
3631 // N = M | (E << 12);
3632 SDValue N = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M,
3633 N2: DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: E,
3634 N2: DAG.getConstant(Val: 12, DL, VT: MVT::i32)));
3635
3636 // B = clamp(1-E, 0, 13);
3637 SDValue OneSubExp = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i32,
3638 N1: One, N2: E);
3639 SDValue B = DAG.getNode(Opcode: ISD::SMAX, DL, VT: MVT::i32, N1: OneSubExp, N2: Zero);
3640 B = DAG.getNode(Opcode: ISD::SMIN, DL, VT: MVT::i32, N1: B,
3641 N2: DAG.getConstant(Val: 13, DL, VT: MVT::i32));
3642
3643 SDValue SigSetHigh = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: M,
3644 N2: DAG.getConstant(Val: 0x1000, DL, VT: MVT::i32));
3645
3646 SDValue D = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: SigSetHigh, N2: B);
3647 SDValue D0 = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: D, N2: B);
3648 SDValue D1 = DAG.getSelectCC(DL, LHS: D0, RHS: SigSetHigh, True: One, False: Zero, Cond: ISD::SETNE);
3649 D = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: D, N2: D1);
3650
3651 SDValue V = DAG.getSelectCC(DL, LHS: E, RHS: One, True: D, False: N, Cond: ISD::SETLT);
3652 SDValue VLow3 = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: V,
3653 N2: DAG.getConstant(Val: 0x7, DL, VT: MVT::i32));
3654 V = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: V,
3655 N2: DAG.getConstant(Val: 2, DL, VT: MVT::i32));
3656 SDValue V0 = DAG.getSelectCC(DL, LHS: VLow3, RHS: DAG.getConstant(Val: 3, DL, VT: MVT::i32),
3657 True: One, False: Zero, Cond: ISD::SETEQ);
3658 SDValue V1 = DAG.getSelectCC(DL, LHS: VLow3, RHS: DAG.getConstant(Val: 5, DL, VT: MVT::i32),
3659 True: One, False: Zero, Cond: ISD::SETGT);
3660 V1 = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: V0, N2: V1);
3661 V = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: V, N2: V1);
3662
3663 V = DAG.getSelectCC(DL, LHS: E, RHS: DAG.getConstant(Val: 30, DL, VT: MVT::i32),
3664 True: DAG.getConstant(Val: 0x7c00, DL, VT: MVT::i32), False: V, Cond: ISD::SETGT);
3665 V = DAG.getSelectCC(DL, LHS: E, RHS: DAG.getConstant(Val: 1039, DL, VT: MVT::i32),
3666 True: I, False: V, Cond: ISD::SETEQ);
3667
3668 // Extract the sign bit.
3669 SDValue Sign = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: UH,
3670 N2: DAG.getConstant(Val: 16, DL, VT: MVT::i32));
3671 Sign = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: Sign,
3672 N2: DAG.getConstant(Val: 0x8000, DL, VT: MVT::i32));
3673
3674 return DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Sign, N2: V);
3675}
3676
3677SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3678 SelectionDAG &DAG) const {
3679 SDValue Src = Op.getOperand(i: 0);
3680 unsigned OpOpcode = Op.getOpcode();
3681 EVT SrcVT = Src.getValueType();
3682 EVT DestVT = Op.getValueType();
3683
3684 // Will be selected natively
3685 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3686 return Op;
3687
3688 if (SrcVT == MVT::bf16) {
3689 SDLoc DL(Op);
3690 SDValue PromotedSrc = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: Src);
3691 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DestVT, Operand: PromotedSrc);
3692 }
3693
3694 // Promote i16 to i32
3695 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3696 SDLoc DL(Op);
3697
3698 SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, Operand: Src);
3699 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToInt32);
3700 }
3701
3702 if (DestVT != MVT::i64)
3703 return Op;
3704
3705 if (SrcVT == MVT::f16 ||
3706 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3707 SDLoc DL(Op);
3708
3709 SDValue FpToInt32 = DAG.getNode(Opcode: OpOpcode, DL, VT: MVT::i32, Operand: Src);
3710 unsigned Ext =
3711 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3712 return DAG.getNode(Opcode: Ext, DL, VT: MVT::i64, Operand: FpToInt32);
3713 }
3714
3715 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3716 return LowerFP_TO_INT64(Op, DAG, Signed: OpOpcode == ISD::FP_TO_SINT);
3717
3718 return SDValue();
3719}
3720
3721SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3722 SelectionDAG &DAG) const {
3723 EVT ExtraVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
3724 MVT VT = Op.getSimpleValueType();
3725 MVT ScalarVT = VT.getScalarType();
3726
3727 assert(VT.isVector());
3728
3729 SDValue Src = Op.getOperand(i: 0);
3730 SDLoc DL(Op);
3731
3732 // TODO: Don't scalarize on Evergreen?
3733 unsigned NElts = VT.getVectorNumElements();
3734 SmallVector<SDValue, 8> Args;
3735 DAG.ExtractVectorElements(Op: Src, Args, Start: 0, Count: NElts);
3736
3737 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3738 for (unsigned I = 0; I < NElts; ++I)
3739 Args[I] = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: ScalarVT, N1: Args[I], N2: VTOp);
3740
3741 return DAG.getBuildVector(VT, DL, Ops: Args);
3742}
3743
3744//===----------------------------------------------------------------------===//
3745// Custom DAG optimizations
3746//===----------------------------------------------------------------------===//
3747
3748static bool isU24(SDValue Op, SelectionDAG &DAG) {
3749 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3750}
3751
3752static bool isI24(SDValue Op, SelectionDAG &DAG) {
3753 EVT VT = Op.getValueType();
3754 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3755 // as unsigned 24-bit values.
3756 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3757}
3758
3759static SDValue simplifyMul24(SDNode *Node24,
3760 TargetLowering::DAGCombinerInfo &DCI) {
3761 SelectionDAG &DAG = DCI.DAG;
3762 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3763 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3764
3765 SDValue LHS = IsIntrin ? Node24->getOperand(Num: 1) : Node24->getOperand(Num: 0);
3766 SDValue RHS = IsIntrin ? Node24->getOperand(Num: 2) : Node24->getOperand(Num: 1);
3767 unsigned NewOpcode = Node24->getOpcode();
3768 if (IsIntrin) {
3769 unsigned IID = Node24->getConstantOperandVal(Num: 0);
3770 switch (IID) {
3771 case Intrinsic::amdgcn_mul_i24:
3772 NewOpcode = AMDGPUISD::MUL_I24;
3773 break;
3774 case Intrinsic::amdgcn_mul_u24:
3775 NewOpcode = AMDGPUISD::MUL_U24;
3776 break;
3777 case Intrinsic::amdgcn_mulhi_i24:
3778 NewOpcode = AMDGPUISD::MULHI_I24;
3779 break;
3780 case Intrinsic::amdgcn_mulhi_u24:
3781 NewOpcode = AMDGPUISD::MULHI_U24;
3782 break;
3783 default:
3784 llvm_unreachable("Expected 24-bit mul intrinsic");
3785 }
3786 }
3787
3788 APInt Demanded = APInt::getLowBitsSet(numBits: LHS.getValueSizeInBits(), loBitsSet: 24);
3789
3790 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3791 // the operands to have other uses, but will only perform simplifications that
3792 // involve bypassing some nodes for this user.
3793 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(Op: LHS, DemandedBits: Demanded, DAG);
3794 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(Op: RHS, DemandedBits: Demanded, DAG);
3795 if (DemandedLHS || DemandedRHS)
3796 return DAG.getNode(Opcode: NewOpcode, DL: SDLoc(Node24), VTList: Node24->getVTList(),
3797 N1: DemandedLHS ? DemandedLHS : LHS,
3798 N2: DemandedRHS ? DemandedRHS : RHS);
3799
3800 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3801 // operands if this node is the only user.
3802 if (TLI.SimplifyDemandedBits(Op: LHS, DemandedBits: Demanded, DCI))
3803 return SDValue(Node24, 0);
3804 if (TLI.SimplifyDemandedBits(Op: RHS, DemandedBits: Demanded, DCI))
3805 return SDValue(Node24, 0);
3806
3807 return SDValue();
3808}
3809
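// Constant fold a bitfield extract: the result is the Width bits of Src0
// starting at bit Offset, sign- or zero-extended depending on whether IntTy
// is signed. When Width + Offset < 32, shifting left and then right performs
// the extract and the extension together; otherwise a plain right shift by
// Offset suffices.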
3810template <typename IntTy>
3811static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3812 uint32_t Width, const SDLoc &DL) {
3813 if (Width + Offset < 32) {
3814 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3815 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3816 if constexpr (std::is_signed_v<IntTy>) {
3817 return DAG.getSignedConstant(Val: Result, DL, VT: MVT::i32);
3818 } else {
3819 return DAG.getConstant(Result, DL, MVT::i32);
3820 }
3821 }
3822
3823 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3824}
3825
3826static bool hasVolatileUser(SDNode *Val) {
3827 for (SDNode *U : Val->users()) {
3828 if (MemSDNode *M = dyn_cast<MemSDNode>(Val: U)) {
3829 if (M->isVolatile())
3830 return true;
3831 }
3832 }
3833
3834 return false;
3835}
3836
3837bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3838 // i32 vectors are the canonical memory type.
3839 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3840 return false;
3841
3842 if (!VT.isByteSized())
3843 return false;
3844
3845 unsigned Size = VT.getStoreSize();
3846
3847 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3848 return false;
3849
3850 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3851 return false;
3852
3853 return true;
3854}
3855
// Replace a load of an illegal type with a load of a bitcast to a friendlier
// type.
3858SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3859 DAGCombinerInfo &DCI) const {
3860 if (!DCI.isBeforeLegalize())
3861 return SDValue();
3862
3863 LoadSDNode *LN = cast<LoadSDNode>(Val: N);
3864 if (!LN->isSimple() || !ISD::isNormalLoad(N: LN) || hasVolatileUser(Val: LN))
3865 return SDValue();
3866
3867 SDLoc SL(N);
3868 SelectionDAG &DAG = DCI.DAG;
3869 EVT VT = LN->getMemoryVT();
3870
3871 unsigned Size = VT.getStoreSize();
3872 Align Alignment = LN->getAlign();
3873 if (Alignment < Size && isTypeLegal(VT)) {
3874 unsigned IsFast;
3875 unsigned AS = LN->getAddressSpace();
3876
3877 // Expand unaligned loads earlier than legalization. Due to visitation order
3878 // problems during legalization, the emitted instructions to pack and unpack
3879 // the bytes again are not eliminated in the case of an unaligned copy.
3880 if (!allowsMisalignedMemoryAccesses(
3881 VT, AddrSpace: AS, Alignment, Flags: LN->getMemOperand()->getFlags(), &IsFast)) {
3882 if (VT.isVector())
3883 return SplitVectorLoad(Op: SDValue(LN, 0), DAG);
3884
3885 SDValue Ops[2];
3886 std::tie(args&: Ops[0], args&: Ops[1]) = expandUnalignedLoad(LD: LN, DAG);
3887
3888 return DAG.getMergeValues(Ops, dl: SDLoc(N));
3889 }
3890
3891 if (!IsFast)
3892 return SDValue();
3893 }
3894
3895 if (!shouldCombineMemoryType(VT))
3896 return SDValue();
3897
3898 EVT NewVT = getEquivalentMemType(Ctx&: *DAG.getContext(), VT);
3899
3900 SDValue NewLoad
3901 = DAG.getLoad(VT: NewVT, dl: SL, Chain: LN->getChain(),
3902 Ptr: LN->getBasePtr(), MMO: LN->getMemOperand());
3903
3904 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewLoad);
3905 DCI.CombineTo(N, Res0: BC, Res1: NewLoad.getValue(R: 1));
3906 return SDValue(N, 0);
3907}
3908
3909// Replace store of an illegal type with a store of a bitcast to a friendlier
3910// type.
3911SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3912 DAGCombinerInfo &DCI) const {
3913 if (!DCI.isBeforeLegalize())
3914 return SDValue();
3915
3916 StoreSDNode *SN = cast<StoreSDNode>(Val: N);
3917 if (!SN->isSimple() || !ISD::isNormalStore(N: SN))
3918 return SDValue();
3919
3920 EVT VT = SN->getMemoryVT();
3921 unsigned Size = VT.getStoreSize();
3922
3923 SDLoc SL(N);
3924 SelectionDAG &DAG = DCI.DAG;
3925 Align Alignment = SN->getAlign();
3926 if (Alignment < Size && isTypeLegal(VT)) {
3927 unsigned IsFast;
3928 unsigned AS = SN->getAddressSpace();
3929
3930 // Expand unaligned stores earlier than legalization. Due to visitation
3931 // order problems during legalization, the emitted instructions to pack and
3932 // unpack the bytes again are not eliminated in the case of an unaligned
3933 // copy.
3934 if (!allowsMisalignedMemoryAccesses(
3935 VT, AddrSpace: AS, Alignment, Flags: SN->getMemOperand()->getFlags(), &IsFast)) {
3936 if (VT.isVector())
3937 return SplitVectorStore(Op: SDValue(SN, 0), DAG);
3938
3939 return expandUnalignedStore(ST: SN, DAG);
3940 }
3941
3942 if (!IsFast)
3943 return SDValue();
3944 }
3945
3946 if (!shouldCombineMemoryType(VT))
3947 return SDValue();
3948
3949 EVT NewVT = getEquivalentMemType(Ctx&: *DAG.getContext(), VT);
3950 SDValue Val = SN->getValue();
3951
3952 //DCI.AddToWorklist(Val.getNode());
3953
3954 bool OtherUses = !Val.hasOneUse();
3955 SDValue CastVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: Val);
3956 if (OtherUses) {
3957 SDValue CastBack = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: CastVal);
3958 DAG.ReplaceAllUsesOfValueWith(From: Val, To: CastBack);
3959 }
3960
3961 return DAG.getStore(Chain: SN->getChain(), dl: SL, Val: CastVal,
3962 Ptr: SN->getBasePtr(), MMO: SN->getMemOperand());
3963}
3964
3965// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3966// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3967// issues.
3968SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3969 DAGCombinerInfo &DCI) const {
3970 SelectionDAG &DAG = DCI.DAG;
3971 SDValue N0 = N->getOperand(Num: 0);
3972
3973 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3974 // (vt2 (truncate (assertzext vt0:x, vt1)))
3975 if (N0.getOpcode() == ISD::TRUNCATE) {
3976 SDValue N1 = N->getOperand(Num: 1);
3977 EVT ExtVT = cast<VTSDNode>(Val&: N1)->getVT();
3978 SDLoc SL(N);
3979
3980 SDValue Src = N0.getOperand(i: 0);
3981 EVT SrcVT = Src.getValueType();
3982 if (SrcVT.bitsGE(VT: ExtVT)) {
3983 SDValue NewInReg = DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: SrcVT, N1: Src, N2: N1);
3984 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: N->getValueType(ResNo: 0), Operand: NewInReg);
3985 }
3986 }
3987
3988 return SDValue();
3989}
3990
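// Combine chain-less AMDGPU intrinsics: simplify the 24-bit multiply
// intrinsics, fold fract/rsq/rcp-style intrinsics of undef to undef, and
// strip fneg/fabs source operations from amdgcn_frexp_exp.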
3991SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3992 SDNode *N, DAGCombinerInfo &DCI) const {
3993 unsigned IID = N->getConstantOperandVal(Num: 0);
3994 switch (IID) {
3995 case Intrinsic::amdgcn_mul_i24:
3996 case Intrinsic::amdgcn_mul_u24:
3997 case Intrinsic::amdgcn_mulhi_i24:
3998 case Intrinsic::amdgcn_mulhi_u24:
3999 return simplifyMul24(Node24: N, DCI);
4000 case Intrinsic::amdgcn_fract:
4001 case Intrinsic::amdgcn_rsq:
4002 case Intrinsic::amdgcn_rcp_legacy:
4003 case Intrinsic::amdgcn_rsq_legacy:
4004 case Intrinsic::amdgcn_rsq_clamp: {
4005 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
4006 SDValue Src = N->getOperand(Num: 1);
4007 return Src.isUndef() ? Src : SDValue();
4008 }
4009 case Intrinsic::amdgcn_frexp_exp: {
4010 // frexp_exp (fneg x) -> frexp_exp x
4011 // frexp_exp (fabs x) -> frexp_exp x
4012 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4013 SDValue Src = N->getOperand(Num: 1);
4014 SDValue PeekSign = peekFPSignOps(Val: Src);
4015 if (PeekSign == Src)
4016 return SDValue();
4017 return SDValue(DCI.DAG.UpdateNodeOperands(N, Op1: N->getOperand(Num: 0), Op2: PeekSign),
4018 0);
4019 }
4020 default:
4021 return SDValue();
4022 }
4023}
4024
4025 /// Split the 64-bit value \p LHS into two 32-bit components, and apply the
4026 /// binary operation \p Opc to each half with the constants \p ValLo and \p ValHi.
4027SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
4028 DAGCombinerInfo &DCI, const SDLoc &SL,
4029 unsigned Opc, SDValue LHS,
4030 uint32_t ValLo, uint32_t ValHi) const {
4031 SelectionDAG &DAG = DCI.DAG;
4032 SDValue Lo, Hi;
4033 std::tie(args&: Lo, args&: Hi) = split64BitValue(Op: LHS, DAG);
4034
4035 SDValue LoRHS = DAG.getConstant(Val: ValLo, DL: SL, VT: MVT::i32);
4036 SDValue HiRHS = DAG.getConstant(Val: ValHi, DL: SL, VT: MVT::i32);
4037
4038 SDValue LoAnd = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, N1: Lo, N2: LoRHS);
4039 SDValue HiAnd = DAG.getNode(Opcode: Opc, DL: SL, VT: MVT::i32, N1: Hi, N2: HiRHS);
4040
4041 // Re-visit the ands. It's possible we eliminated one of them and it could
4042 // simplify the vector.
4043 DCI.AddToWorklist(N: Lo.getNode());
4044 DCI.AddToWorklist(N: Hi.getNode());
4045
4046 SDValue Vec = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {LoAnd, HiAnd});
4047 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
4048}
4049
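// Combine shl: canonicalize (shl (ext i16:x), 16) to a v2i16 build_vector when
// packed types are legal, narrow shl of an extended value when the known
// leading zeros cover the shift amount, and split a 64-bit shl whose shift
// amount is known to be >= 32 into a 32-bit shift placed into the high half.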
4050SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
4051 DAGCombinerInfo &DCI) const {
4052 EVT VT = N->getValueType(ResNo: 0);
4053 SDValue LHS = N->getOperand(Num: 0);
4054 SDValue RHS = N->getOperand(Num: 1);
4055 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
4056 SDLoc SL(N);
4057 SelectionDAG &DAG = DCI.DAG;
4058
4059 unsigned RHSVal;
4060 if (CRHS) {
4061 RHSVal = CRHS->getZExtValue();
4062 if (!RHSVal)
4063 return LHS;
4064
4065 switch (LHS->getOpcode()) {
4066 default:
4067 break;
4068 case ISD::ZERO_EXTEND:
4069 case ISD::SIGN_EXTEND:
4070 case ISD::ANY_EXTEND: {
4071 SDValue X = LHS->getOperand(Num: 0);
4072
4073 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4074 isOperationLegal(Op: ISD::BUILD_VECTOR, VT: MVT::v2i16)) {
4075 // Prefer build_vector as the canonical form if packed types are legal.
4076 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
4077 SDValue Vec = DAG.getBuildVector(
4078 VT: MVT::v2i16, DL: SL,
4079 Ops: {DAG.getConstant(Val: 0, DL: SL, VT: MVT::i16), LHS->getOperand(Num: 0)});
4080 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Vec);
4081 }
4082
4083 // shl (ext x) => zext (shl x), if shift does not overflow int
4084 if (VT != MVT::i64)
4085 break;
4086 KnownBits Known = DAG.computeKnownBits(Op: X);
4087 unsigned LZ = Known.countMinLeadingZeros();
4088 if (LZ < RHSVal)
4089 break;
4090 EVT XVT = X.getValueType();
4091 SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: XVT, N1: X, N2: SDValue(CRHS, 0));
4092 return DAG.getZExtOrTrunc(Op: Shl, DL: SL, VT);
4093 }
4094 }
4095 }
4096
4097 if (VT.getScalarType() != MVT::i64)
4098 return SDValue();
4099
4100 // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
4101
4102 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4103 // common case, splitting this into a move and a 32-bit shift is faster and
4104 // the same code size.
4105 KnownBits Known = DAG.computeKnownBits(Op: RHS);
4106
4107 EVT ElementType = VT.getScalarType();
4108 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
4109 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(EltVT: TargetScalarType)
4110 : TargetScalarType;
4111
4112 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4113 return SDValue();
4114 SDValue ShiftAmt;
4115
4116 if (CRHS) {
4117 ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
4118 VT: TargetType);
4119 } else {
4120 SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
4121 const SDValue ShiftMask =
4122 DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
4123 // This AND instruction will clamp out of bounds shift values.
4124 // It will also be removed during later instruction selection.
4125 ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
4126 }
4127
4128 SDValue Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: LHS);
4129 SDValue NewShift =
4130 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: TargetType, N1: Lo, N2: ShiftAmt, Flags: N->getFlags());
4131
4132 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: TargetScalarType);
4133 SDValue Vec;
4134
4135 if (VT.isVector()) {
4136 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
4137 unsigned NElts = TargetType.getVectorNumElements();
4138 SmallVector<SDValue, 8> HiOps;
4139 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4140
4141 DAG.ExtractVectorElements(Op: NewShift, Args&: HiOps, Start: 0, Count: NElts);
4142 for (unsigned I = 0; I != NElts; ++I)
4143 HiAndLoOps[2 * I + 1] = HiOps[I];
4144 Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
4145 } else {
4146 EVT ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
4147 Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {Zero, NewShift});
4148 }
4149 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
4150}
4151
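// Combine sra: split a 64-bit sra whose shift amount is known to be >= 32 into
// 32-bit shifts of the high half; the new high half becomes the sign bits of
// the old high half.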
4152SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4153 DAGCombinerInfo &DCI) const {
4154 SDValue RHS = N->getOperand(Num: 1);
4155 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
4156 EVT VT = N->getValueType(ResNo: 0);
4157 SDValue LHS = N->getOperand(Num: 0);
4158 SelectionDAG &DAG = DCI.DAG;
4159 SDLoc SL(N);
4160
4161 if (VT.getScalarType() != MVT::i64)
4162 return SDValue();
4163
4164 // For C >= 32
4165 // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), (sra hi_32(x), 31))
4166
4167 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4168 // common case, splitting this into a move and a 32-bit shift is faster and
4169 // the same code size.
4170 KnownBits Known = DAG.computeKnownBits(Op: RHS);
4171
4172 EVT ElementType = VT.getScalarType();
4173 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
4174 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(EltVT: TargetScalarType)
4175 : TargetScalarType;
4176
4177 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4178 return SDValue();
4179
4180 SDValue ShiftFullAmt =
4181 DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
4182 SDValue ShiftAmt;
4183 if (CRHS) {
4184 unsigned RHSVal = CRHS->getZExtValue();
4185 ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
4186 VT: TargetType);
4187 } else if (Known.getMinValue().getZExtValue() ==
4188 (ElementType.getSizeInBits() - 1)) {
4189 ShiftAmt = ShiftFullAmt;
4190 } else {
4191 SDValue truncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
4192 const SDValue ShiftMask =
4193 DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
4194 // This AND instruction will clamp out of bounds shift values.
4195 // It will also be removed during later instruction selection.
4196 ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: truncShiftAmt, N2: ShiftMask);
4197 }
4198
4199 EVT ConcatType;
4200 SDValue Hi;
4201 SDLoc LHSSL(LHS);
4202 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4203 if (VT.isVector()) {
4204 unsigned NElts = TargetType.getVectorNumElements();
4205 ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
4206 SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
4207 SmallVector<SDValue, 8> HiOps(NElts);
4208 SmallVector<SDValue, 16> HiAndLoOps;
4209
4210 DAG.ExtractVectorElements(Op: SplitLHS, Args&: HiAndLoOps, Start: 0, Count: NElts * 2);
4211 for (unsigned I = 0; I != NElts; ++I) {
4212 HiOps[I] = HiAndLoOps[2 * I + 1];
4213 }
4214 Hi = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: LHSSL, VT: TargetType, Ops: HiOps);
4215 } else {
4216 const SDValue One = DAG.getConstant(Val: 1, DL: LHSSL, VT: TargetScalarType);
4217 ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
4218 SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
4219 Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: LHSSL, VT: TargetType, N1: SplitLHS, N2: One);
4220 }
4221 Hi = DAG.getFreeze(V: Hi);
4222
4223 SDValue HiShift = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: TargetType, N1: Hi, N2: ShiftFullAmt);
4224 SDValue NewShift = DAG.getNode(Opcode: ISD::SRA, DL: SL, VT: TargetType, N1: Hi, N2: ShiftAmt);
4225
4226 SDValue Vec;
4227 if (VT.isVector()) {
4228 unsigned NElts = TargetType.getVectorNumElements();
4229 SmallVector<SDValue, 8> HiOps;
4230 SmallVector<SDValue, 8> LoOps;
4231 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);
4232
4233 DAG.ExtractVectorElements(Op: HiShift, Args&: HiOps, Start: 0, Count: NElts);
4234 DAG.ExtractVectorElements(Op: NewShift, Args&: LoOps, Start: 0, Count: NElts);
4235 for (unsigned I = 0; I != NElts; ++I) {
4236 HiAndLoOps[2 * I + 1] = HiOps[I];
4237 HiAndLoOps[2 * I] = LoOps[I];
4238 }
4239 Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
4240 } else {
4241 Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {NewShift, HiShift});
4242 }
4243 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
4244}
4245
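// Combine srl: distribute a constant srl over an AND with a shifted mask to
// expose BFE patterns, and split a 64-bit srl whose shift amount is known to
// be >= 32 into a 32-bit shift of the high half with a zero high result.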
4246SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4247 DAGCombinerInfo &DCI) const {
4248 SDValue RHS = N->getOperand(Num: 1);
4249 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
4250 EVT VT = N->getValueType(ResNo: 0);
4251 SDValue LHS = N->getOperand(Num: 0);
4252 SelectionDAG &DAG = DCI.DAG;
4253 SDLoc SL(N);
4254 unsigned RHSVal;
4255
4256 if (CRHS) {
4257 RHSVal = CRHS->getZExtValue();
4258
4259 // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
4260 // This improves the ability to match BFE patterns in isel.
4261 if (LHS.getOpcode() == ISD::AND) {
4262 if (auto *Mask = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1))) {
4263 unsigned MaskIdx, MaskLen;
4264 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4265 MaskIdx == RHSVal) {
4266 return DAG.getNode(Opcode: ISD::AND, DL: SL, VT,
4267 N1: DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: LHS.getOperand(i: 0),
4268 N2: N->getOperand(Num: 1)),
4269 N2: DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: LHS.getOperand(i: 1),
4270 N2: N->getOperand(Num: 1)));
4271 }
4272 }
4273 }
4274 }
4275
4276 if (VT.getScalarType() != MVT::i64)
4277 return SDValue();
4278
4279 // for C >= 32
4280 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
4281
4282 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4283 // common case, splitting this into a move and a 32-bit shift is faster and
4284 // the same code size.
4285 KnownBits Known = DAG.computeKnownBits(Op: RHS);
4286
4287 EVT ElementType = VT.getScalarType();
4288 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(Context&: *DAG.getContext());
4289 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(EltVT: TargetScalarType)
4290 : TargetScalarType;
4291
4292 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4293 return SDValue();
4294
4295 SDValue ShiftAmt;
4296 if (CRHS) {
4297 ShiftAmt = DAG.getConstant(Val: RHSVal - TargetScalarType.getSizeInBits(), DL: SL,
4298 VT: TargetType);
4299 } else {
4300 SDValue TruncShiftAmt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: TargetType, Operand: RHS);
4301 const SDValue ShiftMask =
4302 DAG.getConstant(Val: TargetScalarType.getSizeInBits() - 1, DL: SL, VT: TargetType);
4303 // This AND instruction will clamp out of bounds shift values.
4304 // It will also be removed during later instruction selection.
4305 ShiftAmt = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: TargetType, N1: TruncShiftAmt, N2: ShiftMask);
4306 }
4307
4308 const SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT: TargetScalarType);
4309 EVT ConcatType;
4310 SDValue Hi;
4311 SDLoc LHSSL(LHS);
4312 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4313 if (VT.isVector()) {
4314 unsigned NElts = TargetType.getVectorNumElements();
4315 ConcatType = TargetType.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
4316 SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
4317 SmallVector<SDValue, 8> HiOps(NElts);
4318 SmallVector<SDValue, 16> HiAndLoOps;
4319
4320 DAG.ExtractVectorElements(Op: SplitLHS, Args&: HiAndLoOps, /*Start=*/0, Count: NElts * 2);
4321 for (unsigned I = 0; I != NElts; ++I)
4322 HiOps[I] = HiAndLoOps[2 * I + 1];
4323 Hi = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: LHSSL, VT: TargetType, Ops: HiOps);
4324 } else {
4325 const SDValue One = DAG.getConstant(Val: 1, DL: LHSSL, VT: TargetScalarType);
4326 ConcatType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: TargetType, NumElements: 2);
4327 SDValue SplitLHS = DAG.getNode(Opcode: ISD::BITCAST, DL: LHSSL, VT: ConcatType, Operand: LHS);
4328 Hi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: LHSSL, VT: TargetType, N1: SplitLHS, N2: One);
4329 }
4330
4331 SDValue NewShift = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: TargetType, N1: Hi, N2: ShiftAmt);
4332
4333 SDValue Vec;
4334 if (VT.isVector()) {
4335 unsigned NElts = TargetType.getVectorNumElements();
4336 SmallVector<SDValue, 8> LoOps;
4337 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4338
4339 DAG.ExtractVectorElements(Op: NewShift, Args&: LoOps, Start: 0, Count: NElts);
4340 for (unsigned I = 0; I != NElts; ++I)
4341 HiAndLoOps[2 * I] = LoOps[I];
4342 Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: ConcatType, Ops: HiAndLoOps);
4343 } else {
4344 Vec = DAG.getBuildVector(VT: ConcatType, DL: SL, Ops: {NewShift, Zero});
4345 }
4346 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Vec);
4347}
4348
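// Combine truncate: fold truncates of bitcast build_vectors into a direct read
// of one element (including reads of a higher element through a constant srl),
// and shrink 64-bit shifts to 32 bits when only a narrow result is used.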
4349SDValue AMDGPUTargetLowering::performTruncateCombine(
4350 SDNode *N, DAGCombinerInfo &DCI) const {
4351 SDLoc SL(N);
4352 SelectionDAG &DAG = DCI.DAG;
4353 EVT VT = N->getValueType(ResNo: 0);
4354 SDValue Src = N->getOperand(Num: 0);
4355
4356 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4357 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4358 SDValue Vec = Src.getOperand(i: 0);
4359 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4360 SDValue Elt0 = Vec.getOperand(i: 0);
4361 EVT EltVT = Elt0.getValueType();
4362 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4363 if (EltVT.isFloatingPoint()) {
4364 Elt0 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL,
4365 VT: EltVT.changeTypeToInteger(), Operand: Elt0);
4366 }
4367
4368 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Elt0);
4369 }
4370 }
4371 }
4372
4373 // Equivalent of above for accessing the high element of a vector as an
4374 // integer operation.
4375 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4376 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4377 if (auto *K = isConstOrConstSplat(N: Src.getOperand(i: 1))) {
4378 SDValue BV = stripBitcast(Val: Src.getOperand(i: 0));
4379 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4380 EVT SrcEltVT = BV.getOperand(i: 0).getValueType();
4381 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4382 unsigned BitIndex = K->getZExtValue();
4383 unsigned PartIndex = BitIndex / SrcEltSize;
4384
4385 if (PartIndex * SrcEltSize == BitIndex &&
4386 PartIndex < BV.getNumOperands()) {
4387 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4388 SDValue SrcElt =
4389 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: SrcEltVT.changeTypeToInteger(),
4390 Operand: BV.getOperand(i: PartIndex));
4391 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: SrcElt);
4392 }
4393 }
4394 }
4395 }
4396 }
4397
4398 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4399 //
4400 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4401 // i16 (trunc (srl (i32 (trunc x), K)))
4402 if (VT.getScalarSizeInBits() < 32) {
4403 EVT SrcVT = Src.getValueType();
4404 if (SrcVT.getScalarSizeInBits() > 32 &&
4405 (Src.getOpcode() == ISD::SRL ||
4406 Src.getOpcode() == ISD::SRA ||
4407 Src.getOpcode() == ISD::SHL)) {
4408 SDValue Amt = Src.getOperand(i: 1);
4409 KnownBits Known = DAG.computeKnownBits(Op: Amt);
4410
4411 // - For left shifts, do the transform as long as the shift
4412 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4413 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4414 // losing information stored in the high bits when truncating.
4415 const unsigned MaxCstSize =
4416 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4417 if (Known.getMaxValue().ule(RHS: MaxCstSize)) {
4418 EVT MidVT = VT.isVector() ?
4419 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
4420 NumElements: VT.getVectorNumElements()) : MVT::i32;
4421
4422 EVT NewShiftVT = getShiftAmountTy(LHSTy: MidVT, DL: DAG.getDataLayout());
4423 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MidVT,
4424 Operand: Src.getOperand(i: 0));
4425 DCI.AddToWorklist(N: Trunc.getNode());
4426
4427 if (Amt.getValueType() != NewShiftVT) {
4428 Amt = DAG.getZExtOrTrunc(Op: Amt, DL: SL, VT: NewShiftVT);
4429 DCI.AddToWorklist(N: Amt.getNode());
4430 }
4431
4432 SDValue ShrunkShift = DAG.getNode(Opcode: Src.getOpcode(), DL: SL, VT: MidVT,
4433 N1: Trunc, N2: Amt);
4434 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: ShrunkShift);
4435 }
4436 }
4437 }
4438
4439 return SDValue();
4440}
4441
4442// We need to specifically handle i64 mul here to avoid unnecessary conversion
4443// instructions. If we only match on the legalized i64 mul expansion,
4444// SimplifyDemandedBits will be unable to remove them because there will be
4445// multiple uses due to the separate mul + mulh[su].
4446static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4447 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4448 if (Size <= 32) {
4449 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4450 return DAG.getNode(Opcode: MulOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4451 }
4452
4453 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4454 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4455
4456 SDValue MulLo = DAG.getNode(Opcode: MulLoOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4457 SDValue MulHi = DAG.getNode(Opcode: MulHiOpc, DL: SL, VT: MVT::i32, N1: N0, N2: N1);
4458
4459 return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SL, VT: MVT::i64, N1: MulLo, N2: MulHi);
4460}
4461
4462 /// If \p V is an add of a constant 1, returns the other operand. Otherwise
4463 /// returns an empty SDValue().
4464static SDValue getAddOneOp(const SDNode *V) {
4465 if (V->getOpcode() != ISD::ADD)
4466 return SDValue();
4467
4468 return isOneConstant(V: V->getOperand(Num: 1)) ? V->getOperand(Num: 0) : SDValue();
4469}
4470
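// Combine mul: undo the InstCombine canonicalization x * (y + 1) -> x * y + x
// so mad can be matched, and form MUL_I24/MUL_U24 (with MULHI for 64-bit
// results) for divergent multiplies whose operands fit in 24 bits.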
4471SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4472 DAGCombinerInfo &DCI) const {
4473 assert(N->getOpcode() == ISD::MUL);
4474 EVT VT = N->getValueType(ResNo: 0);
4475
4476 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4477 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4478 // unnecessarily). isDivergent() is used as an approximation of whether the
4479 // value is in an SGPR.
4480 if (!N->isDivergent())
4481 return SDValue();
4482
4483 unsigned Size = VT.getSizeInBits();
4484 if (VT.isVector() || Size > 64)
4485 return SDValue();
4486
4487 SelectionDAG &DAG = DCI.DAG;
4488 SDLoc DL(N);
4489
4490 SDValue N0 = N->getOperand(Num: 0);
4491 SDValue N1 = N->getOperand(Num: 1);
4492
4493 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4494 // matching.
4495
4496 // mul x, (add y, 1) -> add (mul x, y), x
4497 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4498 SDValue AddOp = getAddOneOp(V: V.getNode());
4499 if (!AddOp)
4500 return SDValue();
4501
4502 if (V.hasOneUse() || all_of(Range: V->users(), P: [](const SDNode *U) -> bool {
4503 return U->getOpcode() == ISD::MUL;
4504 }))
4505 return AddOp;
4506
4507 return SDValue();
4508 };
4509
4510 // FIXME: The selection pattern is not properly checking for commuted
4511 // operands, so we have to place the mul in the LHS
4512 if (SDValue MulOper = IsFoldableAdd(N0)) {
4513 SDValue MulVal = DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1, N2: MulOper);
4514 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: MulVal, N2: N1);
4515 }
4516
4517 if (SDValue MulOper = IsFoldableAdd(N1)) {
4518 SDValue MulVal = DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1: N0, N2: MulOper);
4519 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: MulVal, N2: N0);
4520 }
4521
4522 // There are i16 integer mul/mad.
4523 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(VT: MVT::i16))
4524 return SDValue();
4525
4526 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4527 // in the source into any_extends if the result of the mul is truncated. Since
4528 // we can assume the high bits are whatever we want, use the underlying value
4529 // to keep the unknown high bits from interfering.
4530 if (N0.getOpcode() == ISD::ANY_EXTEND)
4531 N0 = N0.getOperand(i: 0);
4532
4533 if (N1.getOpcode() == ISD::ANY_EXTEND)
4534 N1 = N1.getOperand(i: 0);
4535
4536 SDValue Mul;
4537
4538 if (Subtarget->hasMulU24() && isU24(Op: N0, DAG) && isU24(Op: N1, DAG)) {
4539 N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4540 N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4541 Mul = getMul24(DAG, SL: DL, N0, N1, Size, Signed: false);
4542 } else if (Subtarget->hasMulI24() && isI24(Op: N0, DAG) && isI24(Op: N1, DAG)) {
4543 N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4544 N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4545 Mul = getMul24(DAG, SL: DL, N0, N1, Size, Signed: true);
4546 } else {
4547 return SDValue();
4548 }
4549
4550 // We need to use sext even for MUL_U24, because MUL_U24 is used
4551 // for signed multiply of 8 and 16-bit types.
4552 return DAG.getSExtOrTrunc(Op: Mul, DL, VT);
4553}
4554
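// Combine [su]mul_lohi of 24-bit operands into separate MUL_*24 and MULHI_*24
// nodes instead of a single extending multiply.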
4555SDValue
4556AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4557 DAGCombinerInfo &DCI) const {
4558 if (N->getValueType(ResNo: 0) != MVT::i32)
4559 return SDValue();
4560
4561 SelectionDAG &DAG = DCI.DAG;
4562 SDLoc DL(N);
4563
4564 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4565 SDValue N0 = N->getOperand(Num: 0);
4566 SDValue N1 = N->getOperand(Num: 1);
4567
4568 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4569 // in the source into any_extends if the result of the mul is truncated. Since
4570 // we can assume the high bits are whatever we want, use the underlying value
4571 // to keep the unknown high bits from interfering.
4572 if (N0.getOpcode() == ISD::ANY_EXTEND)
4573 N0 = N0.getOperand(i: 0);
4574 if (N1.getOpcode() == ISD::ANY_EXTEND)
4575 N1 = N1.getOperand(i: 0);
4576
4577 // Try to use two fast 24-bit multiplies (one for each half of the result)
4578 // instead of one slow extending multiply.
4579 unsigned LoOpcode = 0;
4580 unsigned HiOpcode = 0;
4581 if (Signed) {
4582 if (Subtarget->hasMulI24() && isI24(Op: N0, DAG) && isI24(Op: N1, DAG)) {
4583 N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4584 N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4585 LoOpcode = AMDGPUISD::MUL_I24;
4586 HiOpcode = AMDGPUISD::MULHI_I24;
4587 }
4588 } else {
4589 if (Subtarget->hasMulU24() && isU24(Op: N0, DAG) && isU24(Op: N1, DAG)) {
4590 N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4591 N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4592 LoOpcode = AMDGPUISD::MUL_U24;
4593 HiOpcode = AMDGPUISD::MULHI_U24;
4594 }
4595 }
4596 if (!LoOpcode)
4597 return SDValue();
4598
4599 SDValue Lo = DAG.getNode(Opcode: LoOpcode, DL, VT: MVT::i32, N1: N0, N2: N1);
4600 SDValue Hi = DAG.getNode(Opcode: HiOpcode, DL, VT: MVT::i32, N1: N0, N2: N1);
4601 DCI.CombineTo(N, Res0: Lo, Res1: Hi);
4602 return SDValue(N, 0);
4603}
4604
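// Combine mulhs of 24-bit values into MULHI_I24.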
4605SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4606 DAGCombinerInfo &DCI) const {
4607 EVT VT = N->getValueType(ResNo: 0);
4608
4609 if (!Subtarget->hasMulI24() || VT.isVector())
4610 return SDValue();
4611
4612 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4613 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4614 // unnecessarily). isDivergent() is used as an approximation of whether the
4615 // value is in an SGPR.
4616 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4617 // valu op anyway)
4618 if (Subtarget->hasSMulHi() && !N->isDivergent())
4619 return SDValue();
4620
4621 SelectionDAG &DAG = DCI.DAG;
4622 SDLoc DL(N);
4623
4624 SDValue N0 = N->getOperand(Num: 0);
4625 SDValue N1 = N->getOperand(Num: 1);
4626
4627 if (!isI24(Op: N0, DAG) || !isI24(Op: N1, DAG))
4628 return SDValue();
4629
4630 N0 = DAG.getSExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4631 N1 = DAG.getSExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4632
4633 SDValue Mulhi = DAG.getNode(Opcode: AMDGPUISD::MULHI_I24, DL, VT: MVT::i32, N1: N0, N2: N1);
4634 DCI.AddToWorklist(N: Mulhi.getNode());
4635 return DAG.getSExtOrTrunc(Op: Mulhi, DL, VT);
4636}
4637
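// Combine mulhu of 24-bit values into MULHI_U24.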
4638SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4639 DAGCombinerInfo &DCI) const {
4640 EVT VT = N->getValueType(ResNo: 0);
4641
4642 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4643 return SDValue();
4644
4645 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4646 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4647 // unnecessarily). isDivergent() is used as an approximation of whether the
4648 // value is in an SGPR.
4649 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4650 // valu op anyway)
4651 if (Subtarget->hasSMulHi() && !N->isDivergent())
4652 return SDValue();
4653
4654 SelectionDAG &DAG = DCI.DAG;
4655 SDLoc DL(N);
4656
4657 SDValue N0 = N->getOperand(Num: 0);
4658 SDValue N1 = N->getOperand(Num: 1);
4659
4660 if (!isU24(Op: N0, DAG) || !isU24(Op: N1, DAG))
4661 return SDValue();
4662
4663 N0 = DAG.getZExtOrTrunc(Op: N0, DL, VT: MVT::i32);
4664 N1 = DAG.getZExtOrTrunc(Op: N1, DL, VT: MVT::i32);
4665
4666 SDValue Mulhi = DAG.getNode(Opcode: AMDGPUISD::MULHI_U24, DL, VT: MVT::i32, N1: N0, N2: N1);
4667 DCI.AddToWorklist(N: Mulhi.getNode());
4668 return DAG.getZExtOrTrunc(Op: Mulhi, DL, VT);
4669}
4670
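// Emit Opc (ffbh/ffbl) as a 32-bit operation, zero-extending a 16-bit operand
// and truncating the result back to the original type.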
4671SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4672 SDValue Op,
4673 const SDLoc &DL,
4674 unsigned Opc) const {
4675 EVT VT = Op.getValueType();
4676 EVT LegalVT = getTypeToTransformTo(Context&: *DAG.getContext(), VT);
4677 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4678 LegalVT != MVT::i16))
4679 return SDValue();
4680
4681 if (VT != MVT::i32)
4682 Op = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: Op);
4683
4684 SDValue FFBX = DAG.getNode(Opcode: Opc, DL, VT: MVT::i32, Operand: Op);
4685 if (VT != MVT::i32)
4686 FFBX = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: FFBX);
4687
4688 return FFBX;
4689}
4690
4691// The native instructions return -1 on 0 input. Optimize out a select that
4692// produces -1 on 0.
4693//
4694// TODO: If zero is not undef, we could also do this if the output is compared
4695// against the bitwidth.
4696//
4697// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4698SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4699 SDValue LHS, SDValue RHS,
4700 DAGCombinerInfo &DCI) const {
4701 if (!isNullConstant(V: Cond.getOperand(i: 1)))
4702 return SDValue();
4703
4704 SelectionDAG &DAG = DCI.DAG;
4705 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Val: Cond.getOperand(i: 2))->get();
4706 SDValue CmpLHS = Cond.getOperand(i: 0);
4707
4708 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4709 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4710 if (CCOpcode == ISD::SETEQ &&
4711 (isCtlzOpc(Opc: RHS.getOpcode()) || isCttzOpc(Opc: RHS.getOpcode())) &&
4712 RHS.getOperand(i: 0) == CmpLHS && isAllOnesConstant(V: LHS)) {
4713 unsigned Opc =
4714 isCttzOpc(Opc: RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4715 return getFFBX_U32(DAG, Op: CmpLHS, DL: SL, Opc);
4716 }
4717
4718 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4719 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4720 if (CCOpcode == ISD::SETNE &&
4721 (isCtlzOpc(Opc: LHS.getOpcode()) || isCttzOpc(Opc: LHS.getOpcode())) &&
4722 LHS.getOperand(i: 0) == CmpLHS && isAllOnesConstant(V: RHS)) {
4723 unsigned Opc =
4724 isCttzOpc(Opc: LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4725
4726 return getFFBX_U32(DAG, Op: CmpLHS, DL: SL, Opc);
4727 }
4728
4729 return SDValue();
4730}
4731
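// Given a select whose operands both use the unary operation Op, rewrite
// (select c, (op x), (op y)) as (op (select c, x, y)).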
4732static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4733 unsigned Op,
4734 const SDLoc &SL,
4735 SDValue Cond,
4736 SDValue N1,
4737 SDValue N2) {
4738 SelectionDAG &DAG = DCI.DAG;
4739 EVT VT = N1.getValueType();
4740
4741 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: Cond,
4742 N2: N1.getOperand(i: 0), N3: N2.getOperand(i: 0));
4743 DCI.AddToWorklist(N: NewSelect.getNode());
4744 return DAG.getNode(Opcode: Op, DL: SL, VT, Operand: NewSelect);
4745}
4746
4747// Pull a free FP operation out of a select so it may fold into uses.
4748//
4749// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4750// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4751//
4752// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4753// select c, (fabs x), +k -> fabs (select c, x, k)
4754SDValue
4755AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
4756 SDValue N) const {
4757 SelectionDAG &DAG = DCI.DAG;
4758 SDValue Cond = N.getOperand(i: 0);
4759 SDValue LHS = N.getOperand(i: 1);
4760 SDValue RHS = N.getOperand(i: 2);
4761
4762 EVT VT = N.getValueType();
4763 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4764 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4765 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N: N.getNode()))
4766 return SDValue();
4767
4768 return distributeOpThroughSelect(DCI, Op: LHS.getOpcode(),
4769 SL: SDLoc(N), Cond, N1: LHS, N2: RHS);
4770 }
4771
4772 bool Inv = false;
4773 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4774 std::swap(a&: LHS, b&: RHS);
4775 Inv = true;
4776 }
4777
4778 // TODO: Support vector constants.
4779 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
4780 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4781 !selectSupportsSourceMods(N: N.getNode())) {
4782 SDLoc SL(N);
4783 // If one side is an fneg/fabs and the other is a constant, we can push the
4784 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4785 SDValue NewLHS = LHS.getOperand(i: 0);
4786 SDValue NewRHS = RHS;
4787
4788 // Careful: if the neg can be folded up, don't try to pull it back down.
4789 bool ShouldFoldNeg = true;
4790
4791 if (NewLHS.hasOneUse()) {
4792 unsigned Opc = NewLHS.getOpcode();
4793 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(N: NewLHS.getNode()))
4794 ShouldFoldNeg = false;
4795 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4796 ShouldFoldNeg = false;
4797 }
4798
4799 if (ShouldFoldNeg) {
4800 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4801 return SDValue();
4802
4803 // We're going to be forced to use a source modifier anyway, there's no
4804 // point to pulling the negate out unless we can get a size reduction by
4805 // negating the constant.
4806 //
4807 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4808 // about cheaper constants.
4809 if (NewLHS.getOpcode() == ISD::FABS &&
4810 getConstantNegateCost(C: CRHS) != NegatibleCost::Cheaper)
4811 return SDValue();
4812
4813 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N: N.getNode()))
4814 return SDValue();
4815
4816 if (LHS.getOpcode() == ISD::FNEG)
4817 NewRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
4818
4819 if (Inv)
4820 std::swap(a&: NewLHS, b&: NewRHS);
4821
4822 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT,
4823 N1: Cond, N2: NewLHS, N3: NewRHS);
4824 DCI.AddToWorklist(N: NewSelect.getNode());
4825 return DAG.getNode(Opcode: LHS.getOpcode(), DL: SL, VT, Operand: NewSelect);
4826 }
4827 }
4828
4829 return SDValue();
4830}
4831
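// Combine select: pull free fneg/fabs out of the select, invert the compare to
// move a constant into the false operand, form the legacy f32 min/max nodes,
// and match ffbh/ffbl from ctlz/cttz select patterns.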
4832SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4833 DAGCombinerInfo &DCI) const {
4834 if (SDValue Folded = foldFreeOpFromSelect(DCI, N: SDValue(N, 0)))
4835 return Folded;
4836
4837 SDValue Cond = N->getOperand(Num: 0);
4838 if (Cond.getOpcode() != ISD::SETCC)
4839 return SDValue();
4840
4841 EVT VT = N->getValueType(ResNo: 0);
4842 SDValue LHS = Cond.getOperand(i: 0);
4843 SDValue RHS = Cond.getOperand(i: 1);
4844 SDValue CC = Cond.getOperand(i: 2);
4845
4846 SDValue True = N->getOperand(Num: 1);
4847 SDValue False = N->getOperand(Num: 2);
4848
4849 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4850 SelectionDAG &DAG = DCI.DAG;
4851 if (DAG.isConstantValueOfAnyType(N: True) &&
4852 !DAG.isConstantValueOfAnyType(N: False)) {
4853 // Swap cmp + select pair to move constant to false input.
4854 // This will allow using VOPC cndmasks more often.
4855 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4856
4857 SDLoc SL(N);
4858 ISD::CondCode NewCC =
4859 getSetCCInverse(Operation: cast<CondCodeSDNode>(Val&: CC)->get(), Type: LHS.getValueType());
4860
4861 SDValue NewCond = DAG.getSetCC(DL: SL, VT: Cond.getValueType(), LHS, RHS, Cond: NewCC);
4862 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: NewCond, N2: False, N3: True);
4863 }
4864
4865 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4866 SDValue MinMax
4867 = combineFMinMaxLegacy(DL: SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4868 // Revisit this node so we can catch min3/max3/med3 patterns.
4869 //DCI.AddToWorklist(MinMax.getNode());
4870 return MinMax;
4871 }
4872 }
4873
4874 // There's no reason to not do this if the condition has other uses.
4875 return performCtlz_CttzCombine(SL: SDLoc(N), Cond, LHS: True, RHS: False, DCI);
4876}
4877
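// Return true if the value is the bit pattern of 1/(2*pi) in half, single, or
// double precision.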
4878static bool isInv2Pi(const APFloat &APF) {
4879 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4880 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4881 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4882
4883 return APF.bitwiseIsEqual(RHS: KF16) ||
4884 APF.bitwiseIsEqual(RHS: KF32) ||
4885 APF.bitwiseIsEqual(RHS: KF64);
4886}
4887
4888 // Negating +0.0 and +1/(2*pi) does not produce an inline immediate, so there
4889 // is an additional cost to negate them.
4890TargetLowering::NegatibleCost
4891AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4892 if (C->isZero())
4893 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4894
4895 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(APF: C->getValueAPF()))
4896 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4897
4898 return NegatibleCost::Neutral;
4899}
4900
4901bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
4902 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4903 return getConstantNegateCost(C) == NegatibleCost::Expensive;
4904 return false;
4905}
4906
4907bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
4908 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4909 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
4910 return false;
4911}
4912
4913static unsigned inverseMinMax(unsigned Opc) {
4914 switch (Opc) {
4915 case ISD::FMAXNUM:
4916 return ISD::FMINNUM;
4917 case ISD::FMINNUM:
4918 return ISD::FMAXNUM;
4919 case ISD::FMAXNUM_IEEE:
4920 return ISD::FMINNUM_IEEE;
4921 case ISD::FMINNUM_IEEE:
4922 return ISD::FMAXNUM_IEEE;
4923 case ISD::FMAXIMUM:
4924 return ISD::FMINIMUM;
4925 case ISD::FMINIMUM:
4926 return ISD::FMAXIMUM;
4927 case ISD::FMAXIMUMNUM:
4928 return ISD::FMINIMUMNUM;
4929 case ISD::FMINIMUMNUM:
4930 return ISD::FMAXIMUMNUM;
4931 case AMDGPUISD::FMAX_LEGACY:
4932 return AMDGPUISD::FMIN_LEGACY;
4933 case AMDGPUISD::FMIN_LEGACY:
4934 return AMDGPUISD::FMAX_LEGACY;
4935 default:
4936 llvm_unreachable("invalid min/max opcode");
4937 }
4938}
4939
4940/// \return true if it's profitable to try to push an fneg into its source
4941/// instruction.
4942bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4943 // If the input has multiple uses and we can either fold the negate down, or
4944 // the other uses cannot, give up. This both prevents unprofitable
4945 // transformations and infinite loops: we won't repeatedly try to fold around
4946 // a negate that has no 'good' form.
4947 if (N0.hasOneUse()) {
4948 // This may be able to fold into the source, but at a code size cost. Don't
4949 // fold if the fold into the user is free.
4950 if (allUsesHaveSourceMods(N, CostThreshold: 0))
4951 return false;
4952 } else {
4953 if (fnegFoldsIntoOp(N: N0.getNode()) &&
4954 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N: N0.getNode())))
4955 return false;
4956 }
4957
4958 return true;
4959}
4960
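// Push an fneg into its source operation when profitable, e.g.
// (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y)); similar folds are applied
// for fmul, fma/fmad, min/max, fmed3, conversions, and bitcast sources.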
4961SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
4962 DAGCombinerInfo &DCI) const {
4963 SelectionDAG &DAG = DCI.DAG;
4964 SDValue N0 = N->getOperand(Num: 0);
4965 EVT VT = N->getValueType(ResNo: 0);
4966
4967 unsigned Opc = N0.getOpcode();
4968
4969 if (!shouldFoldFNegIntoSrc(N, N0))
4970 return SDValue();
4971
4972 SDLoc SL(N);
4973 switch (Opc) {
4974 case ISD::FADD: {
4975 if (!mayIgnoreSignedZero(Op: N0))
4976 return SDValue();
4977
4978 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
4979 SDValue LHS = N0.getOperand(i: 0);
4980 SDValue RHS = N0.getOperand(i: 1);
4981
4982 if (LHS.getOpcode() != ISD::FNEG)
4983 LHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS);
4984 else
4985 LHS = LHS.getOperand(i: 0);
4986
4987 if (RHS.getOpcode() != ISD::FNEG)
4988 RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
4989 else
4990 RHS = RHS.getOperand(i: 0);
4991
4992 SDValue Res = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: LHS, N2: RHS, Flags: N0->getFlags());
4993 if (Res.getOpcode() != ISD::FADD)
4994 return SDValue(); // Op got folded away.
4995 if (!N0.hasOneUse())
4996 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
4997 return Res;
4998 }
4999 case ISD::FMUL:
5000 case AMDGPUISD::FMUL_LEGACY: {
5001 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
5002 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
5003 SDValue LHS = N0.getOperand(i: 0);
5004 SDValue RHS = N0.getOperand(i: 1);
5005
5006 if (LHS.getOpcode() == ISD::FNEG)
5007 LHS = LHS.getOperand(i: 0);
5008 else if (RHS.getOpcode() == ISD::FNEG)
5009 RHS = RHS.getOperand(i: 0);
5010 else
5011 RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
5012
5013 SDValue Res = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: LHS, N2: RHS, Flags: N0->getFlags());
5014 if (Res.getOpcode() != Opc)
5015 return SDValue(); // Op got folded away.
5016 if (!N0.hasOneUse())
5017 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
5018 return Res;
5019 }
5020 case ISD::FMA:
5021 case ISD::FMAD: {
5022 // TODO: handle llvm.amdgcn.fma.legacy
5023 if (!mayIgnoreSignedZero(Op: N0))
5024 return SDValue();
5025
5026 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
5027 SDValue LHS = N0.getOperand(i: 0);
5028 SDValue MHS = N0.getOperand(i: 1);
5029 SDValue RHS = N0.getOperand(i: 2);
5030
5031 if (LHS.getOpcode() == ISD::FNEG)
5032 LHS = LHS.getOperand(i: 0);
5033 else if (MHS.getOpcode() == ISD::FNEG)
5034 MHS = MHS.getOperand(i: 0);
5035 else
5036 MHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: MHS);
5037
5038 if (RHS.getOpcode() != ISD::FNEG)
5039 RHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
5040 else
5041 RHS = RHS.getOperand(i: 0);
5042
5043 SDValue Res = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: LHS, N2: MHS, N3: RHS);
5044 if (Res.getOpcode() != Opc)
5045 return SDValue(); // Op got folded away.
5046 if (!N0.hasOneUse())
5047 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
5048 return Res;
5049 }
5050 case ISD::FMAXNUM:
5051 case ISD::FMINNUM:
5052 case ISD::FMAXNUM_IEEE:
5053 case ISD::FMINNUM_IEEE:
5054 case ISD::FMINIMUM:
5055 case ISD::FMAXIMUM:
5056 case ISD::FMINIMUMNUM:
5057 case ISD::FMAXIMUMNUM:
5058 case AMDGPUISD::FMAX_LEGACY:
5059 case AMDGPUISD::FMIN_LEGACY: {
5060 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
5061 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
5062 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
5063 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
5064
5065 SDValue LHS = N0.getOperand(i: 0);
5066 SDValue RHS = N0.getOperand(i: 1);
5067
5068 // 0 doesn't have a negated inline immediate.
5069 // TODO: This constant check should be generalized to other operations.
5070 if (isConstantCostlierToNegate(N: RHS))
5071 return SDValue();
5072
5073 SDValue NegLHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS);
5074 SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
5075 unsigned Opposite = inverseMinMax(Opc);
5076
5077 SDValue Res = DAG.getNode(Opcode: Opposite, DL: SL, VT, N1: NegLHS, N2: NegRHS, Flags: N0->getFlags());
5078 if (Res.getOpcode() != Opposite)
5079 return SDValue(); // Op got folded away.
5080 if (!N0.hasOneUse())
5081 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res));
5082 return Res;
5083 }
5084 case AMDGPUISD::FMED3: {
5085 SDValue Ops[3];
5086 for (unsigned I = 0; I < 3; ++I)
5087 Ops[I] = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: N0->getOperand(Num: I), Flags: N0->getFlags());
5088
5089 SDValue Res = DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT, Ops, Flags: N0->getFlags());
5090 if (Res.getOpcode() != AMDGPUISD::FMED3)
5091 return SDValue(); // Op got folded away.
5092
5093 if (!N0.hasOneUse()) {
5094 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Res);
5095 DAG.ReplaceAllUsesWith(From: N0, To: Neg);
5096
5097 for (SDNode *U : Neg->users())
5098 DCI.AddToWorklist(N: U);
5099 }
5100
5101 return Res;
5102 }
5103 case ISD::FP_EXTEND:
5104 case ISD::FTRUNC:
5105 case ISD::FRINT:
5106 case ISD::FNEARBYINT: // XXX - Should fround be handled?
5107 case ISD::FROUNDEVEN:
5108 case ISD::FSIN:
5109 case ISD::FCANONICALIZE:
5110 case AMDGPUISD::RCP:
5111 case AMDGPUISD::RCP_LEGACY:
5112 case AMDGPUISD::RCP_IFLAG:
5113 case AMDGPUISD::SIN_HW: {
5114 SDValue CvtSrc = N0.getOperand(i: 0);
5115 if (CvtSrc.getOpcode() == ISD::FNEG) {
5116 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
5117 // (fneg (rcp (fneg x))) -> (rcp x)
5118 return DAG.getNode(Opcode: Opc, DL: SL, VT, Operand: CvtSrc.getOperand(i: 0));
5119 }
5120
5121 if (!N0.hasOneUse())
5122 return SDValue();
5123
5124 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
5125 // (fneg (rcp x)) -> (rcp (fneg x))
5126 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: CvtSrc.getValueType(), Operand: CvtSrc);
5127 return DAG.getNode(Opcode: Opc, DL: SL, VT, Operand: Neg, Flags: N0->getFlags());
5128 }
5129 case ISD::FP_ROUND: {
5130 SDValue CvtSrc = N0.getOperand(i: 0);
5131
5132 if (CvtSrc.getOpcode() == ISD::FNEG) {
5133 // (fneg (fp_round (fneg x))) -> (fp_round x)
5134 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT,
5135 N1: CvtSrc.getOperand(i: 0), N2: N0.getOperand(i: 1));
5136 }
5137
5138 if (!N0.hasOneUse())
5139 return SDValue();
5140
5141 // (fneg (fp_round x)) -> (fp_round (fneg x))
5142 SDValue Neg = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: CvtSrc.getValueType(), Operand: CvtSrc);
5143 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT, N1: Neg, N2: N0.getOperand(i: 1));
5144 }
5145 case ISD::FP16_TO_FP: {
5146 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
5147 // f16, but legalization of f16 fneg ends up pulling it out of the source.
5148 // Put the fneg back as a legal source operation that can be matched later.
5149 SDLoc SL(N);
5150
5151 SDValue Src = N0.getOperand(i: 0);
5152 EVT SrcVT = Src.getValueType();
5153
5154 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
5155 SDValue IntFNeg = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: SrcVT, N1: Src,
5156 N2: DAG.getConstant(Val: 0x8000, DL: SL, VT: SrcVT));
5157 return DAG.getNode(Opcode: ISD::FP16_TO_FP, DL: SL, VT: N->getValueType(ResNo: 0), Operand: IntFNeg);
5158 }
5159 case ISD::SELECT: {
5160 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5161 // TODO: Invert conditions of foldFreeOpFromSelect
5162 return SDValue();
5163 }
5164 case ISD::BITCAST: {
5165 SDLoc SL(N);
5166 SDValue BCSrc = N0.getOperand(i: 0);
5167 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5168 SDValue HighBits = BCSrc.getOperand(i: BCSrc.getNumOperands() - 1);
5169 if (HighBits.getValueType().getSizeInBits() != 32 ||
5170 !fnegFoldsIntoOp(N: HighBits.getNode()))
5171 return SDValue();
5172
5173 // f64 fneg only really needs to operate on the high half of the
5174 // register, so try to force it to an f32 operation to help make use of
5175 // source modifiers.
5176 //
5178 // fneg (f64 (bitcast (build_vector x, y))) ->
5179 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5180 // (fneg (bitcast i32:y to f32)))
5181
5182 SDValue CastHi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: HighBits);
5183 SDValue NegHi = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: CastHi);
5184 SDValue CastBack =
5185 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: HighBits.getValueType(), Operand: NegHi);
5186
5187 SmallVector<SDValue, 8> Ops(BCSrc->ops());
5188 Ops.back() = CastBack;
5189 DCI.AddToWorklist(N: NegHi.getNode());
5190 SDValue Build =
5191 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: BCSrc.getValueType(), Ops);
5192 SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Build);
5193
5194 if (!N0.hasOneUse())
5195 DAG.ReplaceAllUsesWith(From: N0, To: DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Result));
5196 return Result;
5197 }
5198
5199 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5200 BCSrc.hasOneUse()) {
5201 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
5202 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
5203
5204 // TODO: Cast back result for multiple uses is beneficial in some cases.
5205
5206 SDValue LHS =
5207 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: BCSrc.getOperand(i: 1));
5208 SDValue RHS =
5209 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: BCSrc.getOperand(i: 2));
5210
5211 SDValue NegLHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: LHS);
5212 SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: RHS);
5213
5214 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f32, N1: BCSrc.getOperand(i: 0), N2: NegLHS,
5215 N3: NegRHS);
5216 }
5217
5218 return SDValue();
5219 }
5220 default:
5221 return SDValue();
5222 }
5223}
5224
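// Fold (fabs (fp16_to_fp x)) to (fp16_to_fp (and x, 0x7fff)) when f16 is
// illegal, applying the fabs as an integer mask on the source bits.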
5225SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5226 DAGCombinerInfo &DCI) const {
5227 SelectionDAG &DAG = DCI.DAG;
5228 SDValue N0 = N->getOperand(Num: 0);
5229
5230 if (!N0.hasOneUse())
5231 return SDValue();
5232
5233 switch (N0.getOpcode()) {
5234 case ISD::FP16_TO_FP: {
5235 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
5236 SDLoc SL(N);
5237 SDValue Src = N0.getOperand(i: 0);
5238 EVT SrcVT = Src.getValueType();
5239
5240 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5241 SDValue IntFAbs = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: SrcVT, N1: Src,
5242 N2: DAG.getConstant(Val: 0x7fff, DL: SL, VT: SrcVT));
5243 return DAG.getNode(Opcode: ISD::FP16_TO_FP, DL: SL, VT: N->getValueType(ResNo: 0), Operand: IntFAbs);
5244 }
5245 default:
5246 return SDValue();
5247 }
5248}
5249
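// Constant fold (rcp c) to the floating-point constant 1.0 / c.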
5250SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5251 DAGCombinerInfo &DCI) const {
5252 const auto *CFP = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: 0));
5253 if (!CFP)
5254 return SDValue();
5255
5256 // XXX - Should this flush denormals?
5257 const APFloat &Val = CFP->getValueAPF();
5258 APFloat One(Val.getSemantics(), "1.0");
5259 return DCI.DAG.getConstantFP(Val: One / Val, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
5260}
5261
5262SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5263 DAGCombinerInfo &DCI) const {
5264 SelectionDAG &DAG = DCI.DAG;
5265 SDLoc DL(N);
5266
5267 switch(N->getOpcode()) {
5268 default:
5269 break;
5270 case ISD::BITCAST: {
5271 EVT DestVT = N->getValueType(ResNo: 0);
5272
5273 // Push casts through vector builds. This helps avoid emitting a large
5274 // number of copies when materializing floating point vector constants.
5275 //
5276 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5277 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5278 if (DestVT.isVector()) {
5279 SDValue Src = N->getOperand(Num: 0);
5280 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5281 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5282 isOperationLegal(Op: ISD::BUILD_VECTOR, VT: DestVT))) {
5283 EVT SrcVT = Src.getValueType();
5284 unsigned NElts = DestVT.getVectorNumElements();
5285
5286 if (SrcVT.getVectorNumElements() == NElts) {
5287 EVT DestEltVT = DestVT.getVectorElementType();
5288
5289 SmallVector<SDValue, 8> CastedElts;
5290 SDLoc SL(N);
5291 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5292 SDValue Elt = Src.getOperand(i: I);
5293 CastedElts.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: DestEltVT, Operand: Elt));
5294 }
5295
5296 return DAG.getBuildVector(VT: DestVT, DL: SL, Ops: CastedElts);
5297 }
5298 }
5299 }
5300
5301 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5302 break;
5303
5304 // Fold bitcasts of constants.
5305 //
5306 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5307 // TODO: Generalize and move to DAGCombiner
5308 SDValue Src = N->getOperand(Num: 0);
5309 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Src)) {
5310 SDLoc SL(N);
5311 uint64_t CVal = C->getZExtValue();
5312 SDValue BV = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32,
5313 N1: DAG.getConstant(Val: Lo_32(Value: CVal), DL: SL, VT: MVT::i32),
5314 N2: DAG.getConstant(Val: Hi_32(Value: CVal), DL: SL, VT: MVT::i32));
5315 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: DestVT, Operand: BV);
5316 }
5317
5318 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Src)) {
5319 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5320 SDLoc SL(N);
5321 uint64_t CVal = Val.getZExtValue();
5322 SDValue Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32,
5323 N1: DAG.getConstant(Val: Lo_32(Value: CVal), DL: SL, VT: MVT::i32),
5324 N2: DAG.getConstant(Val: Hi_32(Value: CVal), DL: SL, VT: MVT::i32));
5325
5326 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: DestVT, Operand: Vec);
5327 }
5328
5329 break;
5330 }
5331 case ISD::SHL:
5332 case ISD::SRA:
5333 case ISD::SRL: {
5334 // Range metadata can be invalidated when loads are converted to legal types
5335 // (e.g. v2i64 -> v4i32).
5336 // Try to convert vector shl/sra/srl before type legalization so that range
5337 // metadata can be utilized.
5338 if (!(N->getValueType(ResNo: 0).isVector() &&
5339 DCI.getDAGCombineLevel() == BeforeLegalizeTypes) &&
5340 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5341 break;
5342 if (N->getOpcode() == ISD::SHL)
5343 return performShlCombine(N, DCI);
5344 if (N->getOpcode() == ISD::SRA)
5345 return performSraCombine(N, DCI);
5346 return performSrlCombine(N, DCI);
5347 }
5348 case ISD::TRUNCATE:
5349 return performTruncateCombine(N, DCI);
5350 case ISD::MUL:
5351 return performMulCombine(N, DCI);
5352 case AMDGPUISD::MUL_U24:
5353 case AMDGPUISD::MUL_I24: {
5354 if (SDValue Simplified = simplifyMul24(Node24: N, DCI))
5355 return Simplified;
5356 break;
5357 }
5358 case AMDGPUISD::MULHI_I24:
5359 case AMDGPUISD::MULHI_U24:
5360 return simplifyMul24(Node24: N, DCI);
5361 case ISD::SMUL_LOHI:
5362 case ISD::UMUL_LOHI:
5363 return performMulLoHiCombine(N, DCI);
5364 case ISD::MULHS:
5365 return performMulhsCombine(N, DCI);
5366 case ISD::MULHU:
5367 return performMulhuCombine(N, DCI);
5368 case ISD::SELECT:
5369 return performSelectCombine(N, DCI);
5370 case ISD::FNEG:
5371 return performFNegCombine(N, DCI);
5372 case ISD::FABS:
5373 return performFAbsCombine(N, DCI);
5374 case AMDGPUISD::BFE_I32:
5375 case AMDGPUISD::BFE_U32: {
5376 assert(!N->getValueType(0).isVector() &&
5377 "Vector handling of BFE not implemented");
5378 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 2));
5379 if (!Width)
5380 break;
5381
5382 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5383 if (WidthVal == 0)
5384 return DAG.getConstant(Val: 0, DL, VT: MVT::i32);
5385
5386 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
5387 if (!Offset)
5388 break;
5389
5390 SDValue BitsFrom = N->getOperand(Num: 0);
5391 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5392
5393 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5394
5395 if (OffsetVal == 0) {
5396 // This is already sign / zero extended, so try to fold away extra BFEs.
5397 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5398
5399 unsigned OpSignBits = DAG.ComputeNumSignBits(Op: BitsFrom);
5400 if (OpSignBits >= SignBits)
5401 return BitsFrom;
5402
5403 EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WidthVal);
5404 if (Signed) {
5405 // This is a sign_extend_inreg. Replace it to take advantage of existing
5406 // DAG Combines. If not eliminated, we will match back to BFE during
5407 // selection.
5408
5409 // TODO: The sext_inreg of extended types ends up here, although we could
5410 // handle them in a single BFE.
5411 return DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: MVT::i32, N1: BitsFrom,
5412 N2: DAG.getValueType(SmallVT));
5413 }
5414
5415 return DAG.getZeroExtendInReg(Op: BitsFrom, DL, VT: SmallVT);
5416 }
5417
5418 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(Val&: BitsFrom)) {
5419 if (Signed) {
5420 return constantFoldBFE<int32_t>(DAG,
5421 Src0: CVal->getSExtValue(),
5422 Offset: OffsetVal,
5423 Width: WidthVal,
5424 DL);
5425 }
5426
5427 return constantFoldBFE<uint32_t>(DAG,
5428 Src0: CVal->getZExtValue(),
5429 Offset: OffsetVal,
5430 Width: WidthVal,
5431 DL);
5432 }
5433
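    // If the field extends to (or past) bit 31, the extract is just a shift
    // right; e.g. offset 24 and width 8 reduce to a plain shift by 24.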
5434 if ((OffsetVal + WidthVal) >= 32 &&
5435 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5436 SDValue ShiftVal = DAG.getConstant(Val: OffsetVal, DL, VT: MVT::i32);
5437 return DAG.getNode(Opcode: Signed ? ISD::SRA : ISD::SRL, DL, VT: MVT::i32,
5438 N1: BitsFrom, N2: ShiftVal);
5439 }
5440
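    // Only bits [OffsetVal, OffsetVal + WidthVal) of the source are observed,
    // so let the generic demanded-bits machinery try to simplify the source,
    // e.g. by shrinking an oversized constant mask feeding the BFE.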
5441 if (BitsFrom.hasOneUse()) {
5442 APInt Demanded = APInt::getBitsSet(numBits: 32,
5443 loBit: OffsetVal,
5444 hiBit: OffsetVal + WidthVal);
5445
5446 KnownBits Known;
5447 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5448 !DCI.isBeforeLegalizeOps());
5449 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5450 if (TLI.ShrinkDemandedConstant(Op: BitsFrom, DemandedBits: Demanded, TLO) ||
5451 TLI.SimplifyDemandedBits(Op: BitsFrom, DemandedBits: Demanded, Known, TLO)) {
5452 DCI.CommitTargetLoweringOpt(TLO);
5453 }
5454 }
5455
5456 break;
5457 }
5458 case ISD::LOAD:
5459 return performLoadCombine(N, DCI);
5460 case ISD::STORE:
5461 return performStoreCombine(N, DCI);
5462 case AMDGPUISD::RCP:
5463 case AMDGPUISD::RCP_IFLAG:
5464 return performRcpCombine(N, DCI);
5465 case ISD::AssertZext:
5466 case ISD::AssertSext:
5467 return performAssertSZExtCombine(N, DCI);
5468 case ISD::INTRINSIC_WO_CHAIN:
5469 return performIntrinsicWOChainCombine(N, DCI);
5470 case AMDGPUISD::FMAD_FTZ: {
5471 SDValue N0 = N->getOperand(Num: 0);
5472 SDValue N1 = N->getOperand(Num: 1);
5473 SDValue N2 = N->getOperand(Num: 2);
5474 EVT VT = N->getValueType(ResNo: 0);
5475
5476    // FMAD_FTZ is an FMAD that flushes denormals to zero.
5477    // We flush the inputs, the intermediate product, and the output.
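    // For example, with f32 operands any input smaller in magnitude than the
    // smallest normal value (~1.17549435e-38) is replaced by a signed zero
    // before the multiply, and the same flush is applied to the intermediate
    // product and to the final sum.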
5478 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(Val&: N0);
5479 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(Val&: N1);
5480 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(Val&: N2);
5481 if (N0CFP && N1CFP && N2CFP) {
5482 const auto FTZ = [](const APFloat &V) {
5483 if (V.isDenormal()) {
5484 APFloat Zero(V.getSemantics(), 0);
5485 return V.isNegative() ? -Zero : Zero;
5486 }
5487 return V;
5488 };
5489
5490 APFloat V0 = FTZ(N0CFP->getValueAPF());
5491 APFloat V1 = FTZ(N1CFP->getValueAPF());
5492 APFloat V2 = FTZ(N2CFP->getValueAPF());
5493 V0.multiply(RHS: V1, RM: APFloat::rmNearestTiesToEven);
5494 V0 = FTZ(V0);
5495 V0.add(RHS: V2, RM: APFloat::rmNearestTiesToEven);
5496 return DAG.getConstantFP(Val: FTZ(V0), DL, VT);
5497 }
5498 break;
5499 }
5500 }
5501 return SDValue();
5502}
5503
5504//===----------------------------------------------------------------------===//
5505// Helper functions
5506//===----------------------------------------------------------------------===//
5507
5508SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5509 const TargetRegisterClass *RC,
5510 Register Reg, EVT VT,
5511 const SDLoc &SL,
5512 bool RawReg) const {
5513 MachineFunction &MF = DAG.getMachineFunction();
5514 MachineRegisterInfo &MRI = MF.getRegInfo();
5515 Register VReg;
5516
5517 if (!MRI.isLiveIn(Reg)) {
5518 VReg = MRI.createVirtualRegister(RegClass: RC);
5519 MRI.addLiveIn(Reg, vreg: VReg);
5520 } else {
5521 VReg = MRI.getLiveInVirtReg(PReg: Reg);
5522 }
5523
5524 if (RawReg)
5525 return DAG.getRegister(Reg: VReg, VT);
5526
5527 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: VReg, VT);
5528}
5529
5530// This may be called multiple times, and nothing prevents creating multiple
5531// objects at the same offset. See if we already defined this object.
5532static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5533 int64_t Offset) {
5534 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5535 if (MFI.getObjectOffset(ObjectIdx: I) == Offset) {
5536 assert(MFI.getObjectSize(I) == Size);
5537 return I;
5538 }
5539 }
5540
5541 return MFI.CreateFixedObject(Size, SPOffset: Offset, IsImmutable: true);
5542}
5543
5544SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5545 EVT VT,
5546 const SDLoc &SL,
5547 int64_t Offset) const {
5548 MachineFunction &MF = DAG.getMachineFunction();
5549 MachineFrameInfo &MFI = MF.getFrameInfo();
5550 int FI = getOrCreateFixedStackObject(MFI, Size: VT.getStoreSize(), Offset);
5551
5552 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5553 SDValue Ptr = DAG.getFrameIndex(FI, VT: MVT::i32);
5554
5555 return DAG.getLoad(VT, dl: SL, Chain: DAG.getEntryNode(), Ptr, PtrInfo: SrcPtrInfo, Alignment: Align(4),
5556 MMOFlags: MachineMemOperand::MODereferenceable |
5557 MachineMemOperand::MOInvariant);
5558}
5559
5560SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5561 const SDLoc &SL,
5562 SDValue Chain,
5563 SDValue ArgVal,
5564 int64_t Offset) const {
5565 MachineFunction &MF = DAG.getMachineFunction();
5566 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5567 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5568
5569 SDValue Ptr = DAG.getConstant(Val: Offset, DL: SL, VT: MVT::i32);
5570 // Stores to the argument stack area are relative to the stack pointer.
5571 SDValue SP =
5572 DAG.getCopyFromReg(Chain, dl: SL, Reg: Info->getStackPtrOffsetReg(), VT: MVT::i32);
5573 Ptr = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: SP, N2: Ptr);
5574 SDValue Store = DAG.getStore(Chain, dl: SL, Val: ArgVal, Ptr, PtrInfo: DstInfo, Alignment: Align(4),
5575 MMOFlags: MachineMemOperand::MODereferenceable);
5576 return Store;
5577}
5578
5579SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5580 const TargetRegisterClass *RC,
5581 EVT VT, const SDLoc &SL,
5582 const ArgDescriptor &Arg) const {
5583 assert(Arg && "Attempting to load missing argument");
5584
5585 SDValue V = Arg.isRegister() ?
5586 CreateLiveInRegister(DAG, RC, Reg: Arg.getRegister(), VT, SL) :
5587 loadStackInputValue(DAG, VT, SL, Offset: Arg.getStackOffset());
5588
5589 if (!Arg.isMasked())
5590 return V;
5591
5592 unsigned Mask = Arg.getMask();
5593 unsigned Shift = llvm::countr_zero<unsigned>(Val: Mask);
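  // For example, a value packed in bits [19:10] has Mask = 0xFFC00, so
  // Shift = 10 and the loaded value becomes (V >> 10) & 0x3FF.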
5594 V = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT, N1: V,
5595 N2: DAG.getShiftAmountConstant(Val: Shift, VT, DL: SL));
5596 return DAG.getNode(Opcode: ISD::AND, DL: SL, VT, N1: V,
5597 N2: DAG.getConstant(Val: Mask >> Shift, DL: SL, VT));
5598}
5599
5600uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5601 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5602 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5603 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5604 uint64_t ArgOffset =
5605 alignTo(Size: ExplicitKernArgSize, A: Alignment) + ExplicitArgOffset;
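  // For example, with 36 bytes of explicit kernel arguments and an 8-byte
  // implicit-argument alignment, the implicit block starts at byte offset 40
  // (plus any target-specific explicit argument offset).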
5606 switch (Param) {
5607 case FIRST_IMPLICIT:
5608 return ArgOffset;
5609 case PRIVATE_BASE:
5610 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5611 case SHARED_BASE:
5612 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5613 case QUEUE_PTR:
5614 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5615 }
5616 llvm_unreachable("unexpected implicit parameter type");
5617}
5618
5619uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5620 const MachineFunction &MF, const ImplicitParameter Param) const {
5621 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5622 return getImplicitParameterOffset(ExplicitKernArgSize: MFI->getExplicitKernArgSize(), Param);
5623}
5624
5625#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5626
5627const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5628 switch ((AMDGPUISD::NodeType)Opcode) {
5629 case AMDGPUISD::FIRST_NUMBER: break;
5630 // AMDIL DAG nodes
5631  NODE_NAME_CASE(BRANCH_COND)
5632
5633 // AMDGPU DAG nodes
5634 NODE_NAME_CASE(IF)
5635 NODE_NAME_CASE(ELSE)
5636 NODE_NAME_CASE(LOOP)
5637 NODE_NAME_CASE(CALL)
5638 NODE_NAME_CASE(TC_RETURN)
5639 NODE_NAME_CASE(TC_RETURN_GFX)
5640 NODE_NAME_CASE(TC_RETURN_CHAIN)
5641 NODE_NAME_CASE(TC_RETURN_CHAIN_DVGPR)
5642 NODE_NAME_CASE(TRAP)
5643 NODE_NAME_CASE(RET_GLUE)
5644 NODE_NAME_CASE(WAVE_ADDRESS)
5645 NODE_NAME_CASE(RETURN_TO_EPILOG)
5646 NODE_NAME_CASE(ENDPGM)
5647 NODE_NAME_CASE(ENDPGM_TRAP)
5648 NODE_NAME_CASE(SIMULATED_TRAP)
5649 NODE_NAME_CASE(DWORDADDR)
5650 NODE_NAME_CASE(FRACT)
5651 NODE_NAME_CASE(SETCC)
5652 NODE_NAME_CASE(DENORM_MODE)
5653 NODE_NAME_CASE(FMA_W_CHAIN)
5654 NODE_NAME_CASE(FMUL_W_CHAIN)
5655 NODE_NAME_CASE(CLAMP)
5656 NODE_NAME_CASE(COS_HW)
5657 NODE_NAME_CASE(SIN_HW)
5658 NODE_NAME_CASE(FMAX_LEGACY)
5659 NODE_NAME_CASE(FMIN_LEGACY)
5660 NODE_NAME_CASE(FMAX3)
5661 NODE_NAME_CASE(SMAX3)
5662 NODE_NAME_CASE(UMAX3)
5663 NODE_NAME_CASE(FMIN3)
5664 NODE_NAME_CASE(SMIN3)
5665 NODE_NAME_CASE(UMIN3)
5666 NODE_NAME_CASE(FMED3)
5667 NODE_NAME_CASE(SMED3)
5668 NODE_NAME_CASE(UMED3)
5669 NODE_NAME_CASE(FMAXIMUM3)
5670 NODE_NAME_CASE(FMINIMUM3)
5671 NODE_NAME_CASE(FDOT2)
5672 NODE_NAME_CASE(URECIP)
5673 NODE_NAME_CASE(DIV_SCALE)
5674 NODE_NAME_CASE(DIV_FMAS)
5675 NODE_NAME_CASE(DIV_FIXUP)
5676 NODE_NAME_CASE(FMAD_FTZ)
5677 NODE_NAME_CASE(RCP)
5678 NODE_NAME_CASE(RSQ)
5679 NODE_NAME_CASE(RCP_LEGACY)
5680 NODE_NAME_CASE(RCP_IFLAG)
5681 NODE_NAME_CASE(LOG)
5682 NODE_NAME_CASE(EXP)
5683 NODE_NAME_CASE(FMUL_LEGACY)
5684 NODE_NAME_CASE(RSQ_CLAMP)
5685 NODE_NAME_CASE(FP_CLASS)
5686 NODE_NAME_CASE(DOT4)
5687 NODE_NAME_CASE(CARRY)
5688 NODE_NAME_CASE(BORROW)
5689 NODE_NAME_CASE(BFE_U32)
5690 NODE_NAME_CASE(BFE_I32)
5691 NODE_NAME_CASE(BFI)
5692 NODE_NAME_CASE(BFM)
5693 NODE_NAME_CASE(FFBH_U32)
5694 NODE_NAME_CASE(FFBH_I32)
5695 NODE_NAME_CASE(FFBL_B32)
5696 NODE_NAME_CASE(MUL_U24)
5697 NODE_NAME_CASE(MUL_I24)
5698 NODE_NAME_CASE(MULHI_U24)
5699 NODE_NAME_CASE(MULHI_I24)
5700 NODE_NAME_CASE(MAD_U24)
5701 NODE_NAME_CASE(MAD_I24)
5702 NODE_NAME_CASE(MAD_I64_I32)
5703 NODE_NAME_CASE(MAD_U64_U32)
5704 NODE_NAME_CASE(PERM)
5705 NODE_NAME_CASE(TEXTURE_FETCH)
5706 NODE_NAME_CASE(R600_EXPORT)
5707 NODE_NAME_CASE(CONST_ADDRESS)
5708 NODE_NAME_CASE(REGISTER_LOAD)
5709 NODE_NAME_CASE(REGISTER_STORE)
5710 NODE_NAME_CASE(CVT_F32_UBYTE0)
5711 NODE_NAME_CASE(CVT_F32_UBYTE1)
5712 NODE_NAME_CASE(CVT_F32_UBYTE2)
5713 NODE_NAME_CASE(CVT_F32_UBYTE3)
5714 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5715 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5716 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5717 NODE_NAME_CASE(CVT_PK_I16_I32)
5718 NODE_NAME_CASE(CVT_PK_U16_U32)
5719 NODE_NAME_CASE(FP_TO_FP16)
5720 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5721 NODE_NAME_CASE(CONST_DATA_PTR)
5722 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5723 NODE_NAME_CASE(LDS)
5724 NODE_NAME_CASE(DUMMY_CHAIN)
5725 NODE_NAME_CASE(LOAD_D16_HI)
5726 NODE_NAME_CASE(LOAD_D16_LO)
5727 NODE_NAME_CASE(LOAD_D16_HI_I8)
5728 NODE_NAME_CASE(LOAD_D16_HI_U8)
5729 NODE_NAME_CASE(LOAD_D16_LO_I8)
5730 NODE_NAME_CASE(LOAD_D16_LO_U8)
5731 NODE_NAME_CASE(STORE_MSKOR)
5732 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5733 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5734 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5735 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5736 NODE_NAME_CASE(DS_ORDERED_COUNT)
5737 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5738 NODE_NAME_CASE(BUFFER_LOAD)
5739 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5740 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5741 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5742 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5743 NODE_NAME_CASE(BUFFER_LOAD_TFE)
5744 NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE)
5745 NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE)
5746 NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE)
5747 NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE)
5748 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5749 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5750 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5751 NODE_NAME_CASE(SBUFFER_LOAD)
5752 NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5753 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5754 NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5755 NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5756 NODE_NAME_CASE(SBUFFER_PREFETCH_DATA)
5757 NODE_NAME_CASE(BUFFER_STORE)
5758 NODE_NAME_CASE(BUFFER_STORE_BYTE)
5759 NODE_NAME_CASE(BUFFER_STORE_SHORT)
5760 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5761 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5762 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5763 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5764 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5765 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5766 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5767 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5768 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5769 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5770 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5771 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5772 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5773 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5774 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5775 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5776 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5777 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5778 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5779 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
5780 }
5781 return nullptr;
5782}
5783
5784SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5785 SelectionDAG &DAG, int Enabled,
5786 int &RefinementSteps,
5787 bool &UseOneConstNR,
5788 bool Reciprocal) const {
5789 EVT VT = Operand.getValueType();
5790
5791 if (VT == MVT::f32) {
5792 RefinementSteps = 0;
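    // No refinement steps are requested; the f32 hardware rsq is treated as
    // accurate enough on its own.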
5793 return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc(Operand), VT, Operand);
5794 }
5795
5796  // TODO: There is also an f64 rsq instruction, but the documentation is less
5797  // clear on its precision.
5798
5799 return SDValue();
5800}
5801
5802SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5803 SelectionDAG &DAG, int Enabled,
5804 int &RefinementSteps) const {
5805 EVT VT = Operand.getValueType();
5806
5807 if (VT == MVT::f32) {
5808 // Reciprocal, < 1 ulp error.
5809 //
5810    // This reciprocal approximation converges to < 0.5 ulp error after one
5811    // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
5812
5813 RefinementSteps = 0;
5814 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SDLoc(Operand), VT, Operand);
5815 }
5816
5817  // TODO: There is also an f64 rcp instruction, but the documentation is less
5818  // clear on its precision.
5819
5820 return SDValue();
5821}
5822
5823static unsigned workitemIntrinsicDim(unsigned ID) {
5824 switch (ID) {
5825 case Intrinsic::amdgcn_workitem_id_x:
5826 return 0;
5827 case Intrinsic::amdgcn_workitem_id_y:
5828 return 1;
5829 case Intrinsic::amdgcn_workitem_id_z:
5830 return 2;
5831 default:
5832 llvm_unreachable("not a workitem intrinsic");
5833 }
5834}
5835
5836void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5837 const SDValue Op, KnownBits &Known,
5838 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5839
5840 Known.resetAll(); // Don't know anything.
5841
5842 unsigned Opc = Op.getOpcode();
5843
5844 switch (Opc) {
5845 default:
5846 break;
5847 case AMDGPUISD::CARRY:
5848 case AMDGPUISD::BORROW: {
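    // CARRY and BORROW produce either 0 or 1, so all but the lowest bit are
    // known to be zero.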
5849 Known.Zero = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 31);
5850 break;
5851 }
5852
5853 case AMDGPUISD::BFE_I32:
5854 case AMDGPUISD::BFE_U32: {
5855 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
5856 if (!CWidth)
5857 return;
5858
5859 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5860
5861 if (Opc == AMDGPUISD::BFE_U32)
5862 Known.Zero = APInt::getHighBitsSet(numBits: 32, hiBitsSet: 32 - Width);
5863
5864 break;
5865 }
5866 case AMDGPUISD::FP_TO_FP16: {
5867 unsigned BitWidth = Known.getBitWidth();
5868
5869 // High bits are zero.
5870 Known.Zero = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - 16);
5871 break;
5872 }
5873 case AMDGPUISD::MUL_U24:
5874 case AMDGPUISD::MUL_I24: {
5875 KnownBits LHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
5876 KnownBits RHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
5877 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5878 RHSKnown.countMinTrailingZeros();
5879 Known.Zero.setLowBits(std::min(a: TrailZ, b: 32u));
5880    // Skip the extra checks if all result bits are already known to be zero.
5881 if (TrailZ >= 32)
5882 break;
5883
5884 // Truncate to 24 bits.
5885 LHSKnown = LHSKnown.trunc(BitWidth: 24);
5886 RHSKnown = RHSKnown.trunc(BitWidth: 24);
5887
5888 if (Opc == AMDGPUISD::MUL_I24) {
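      // A product of operands with at most N and M significant bits needs at
      // most N + M bits, so the remaining high bits of the 32-bit result are
      // sign copies; e.g. two 12-bit operands leave 32 - 24 + 1 = 9 sign bits.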
5889 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5890 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5891 unsigned MaxValBits = LHSValBits + RHSValBits;
5892 if (MaxValBits > 32)
5893 break;
5894 unsigned SignBits = 32 - MaxValBits + 1;
5895 bool LHSNegative = LHSKnown.isNegative();
5896 bool LHSNonNegative = LHSKnown.isNonNegative();
5897 bool LHSPositive = LHSKnown.isStrictlyPositive();
5898 bool RHSNegative = RHSKnown.isNegative();
5899 bool RHSNonNegative = RHSKnown.isNonNegative();
5900 bool RHSPositive = RHSKnown.isStrictlyPositive();
5901
5902 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5903 Known.Zero.setHighBits(SignBits);
5904 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5905 Known.One.setHighBits(SignBits);
5906 } else {
5907 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5908 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5909 unsigned MaxValBits = LHSValBits + RHSValBits;
5910 if (MaxValBits >= 32)
5911 break;
5912 Known.Zero.setBitsFrom(MaxValBits);
5913 }
5914 break;
5915 }
5916 case AMDGPUISD::PERM: {
5917 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
5918 if (!CMask)
5919 return;
5920
5921 KnownBits LHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
5922 KnownBits RHSKnown = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
5923 unsigned Sel = CMask->getZExtValue();
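    // Each selector byte picks one byte of the result: values 0-3 take a byte
    // from operand 1, values 4-6 take a byte from operand 0, 0x0c produces
    // 0x00, and values above 0x0c produce 0xff; other selector values are left
    // unknown here.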
5924
5925 for (unsigned I = 0; I < 32; I += 8) {
5926 unsigned SelBits = Sel & 0xff;
5927 if (SelBits < 4) {
5928 SelBits *= 8;
5929 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5930 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5931 } else if (SelBits < 7) {
5932 SelBits = (SelBits & 3) * 8;
5933 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5934 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5935 } else if (SelBits == 0x0c) {
5936 Known.Zero |= 0xFFull << I;
5937 } else if (SelBits > 0x0c) {
5938 Known.One |= 0xFFull << I;
5939 }
5940 Sel >>= 8;
5941 }
5942 break;
5943 }
5944 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5945 Known.Zero.setHighBits(24);
5946 break;
5947 }
5948 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5949 Known.Zero.setHighBits(16);
5950 break;
5951 }
5952 case AMDGPUISD::LDS: {
5953 auto *GA = cast<GlobalAddressSDNode>(Val: Op.getOperand(i: 0).getNode());
5954 Align Alignment = GA->getGlobal()->getPointerAlignment(DL: DAG.getDataLayout());
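    // The LDS address fits in the low 16 bits and is at least as aligned as
    // the global, so both the high bits and the low alignment bits are known
    // to be zero.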
5955
5956 Known.Zero.setHighBits(16);
5957 Known.Zero.setLowBits(Log2(A: Alignment));
5958 break;
5959 }
5960 case AMDGPUISD::SMIN3:
5961 case AMDGPUISD::SMAX3:
5962 case AMDGPUISD::SMED3:
5963 case AMDGPUISD::UMIN3:
5964 case AMDGPUISD::UMAX3:
5965 case AMDGPUISD::UMED3: {
5966 KnownBits Known2 = DAG.computeKnownBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
5967 if (Known2.isUnknown())
5968 break;
5969
5970 KnownBits Known1 = DAG.computeKnownBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
5971 if (Known1.isUnknown())
5972 break;
5973
5974 KnownBits Known0 = DAG.computeKnownBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
5975 if (Known0.isUnknown())
5976 break;
5977
5978 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5979 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5980 Known.One = Known0.One & Known1.One & Known2.One;
5981 break;
5982 }
5983 case ISD::INTRINSIC_WO_CHAIN: {
5984 unsigned IID = Op.getConstantOperandVal(i: 0);
5985 switch (IID) {
5986 case Intrinsic::amdgcn_workitem_id_x:
5987 case Intrinsic::amdgcn_workitem_id_y:
5988 case Intrinsic::amdgcn_workitem_id_z: {
5989 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5990 Kernel: DAG.getMachineFunction().getFunction(), Dimension: workitemIntrinsicDim(ID: IID));
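      // For example, a maximum workitem ID of 1023 has 22 leading zero bits in
      // a 32-bit value, so the top 22 bits of the result are known zero.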
5991 Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue));
5992 break;
5993 }
5994 default:
5995 break;
5996 }
5997 }
5998 }
5999}
6000
6001unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
6002 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6003 unsigned Depth) const {
6004 switch (Op.getOpcode()) {
6005 case AMDGPUISD::BFE_I32: {
6006 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
6007 if (!Width)
6008 return 1;
6009
6010 unsigned SignBits = 32 - Width->getZExtValue() + 1;
6011 if (!isNullConstant(V: Op.getOperand(i: 1)))
6012 return SignBits;
6013
6014 // TODO: Could probably figure something out with non-0 offsets.
6015 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
6016 return std::max(a: SignBits, b: Op0SignBits);
6017 }
6018
6019 case AMDGPUISD::BFE_U32: {
6020 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
6021 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
6022 }
6023
6024 case AMDGPUISD::CARRY:
6025 case AMDGPUISD::BORROW:
6026 return 31;
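  // Sign-extending byte/short loads leave 32 - 8 + 1 = 25 and 32 - 16 + 1 = 17
  // sign bits respectively; zero-extending loads leave 24 and 16 known-zero
  // high bits, which also count as sign bits.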
6027 case AMDGPUISD::BUFFER_LOAD_BYTE:
6028 return 25;
6029 case AMDGPUISD::BUFFER_LOAD_SHORT:
6030 return 17;
6031 case AMDGPUISD::BUFFER_LOAD_UBYTE:
6032 return 24;
6033 case AMDGPUISD::BUFFER_LOAD_USHORT:
6034 return 16;
6035 case AMDGPUISD::FP_TO_FP16:
6036 return 16;
6037 case AMDGPUISD::SMIN3:
6038 case AMDGPUISD::SMAX3:
6039 case AMDGPUISD::SMED3:
6040 case AMDGPUISD::UMIN3:
6041 case AMDGPUISD::UMAX3:
6042 case AMDGPUISD::UMED3: {
6043 unsigned Tmp2 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
6044 if (Tmp2 == 1)
6045 return 1; // Early out.
6046
6047 unsigned Tmp1 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 1), Depth: Depth + 1);
6048 if (Tmp1 == 1)
6049 return 1; // Early out.
6050
6051 unsigned Tmp0 = DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), Depth: Depth + 1);
6052 if (Tmp0 == 1)
6053 return 1; // Early out.
6054
6055 return std::min(l: {Tmp0, Tmp1, Tmp2});
6056 }
6057 default:
6058 return 1;
6059 }
6060}
6061
6062unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
6063 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
6064 const MachineRegisterInfo &MRI, unsigned Depth) const {
6065 const MachineInstr *MI = MRI.getVRegDef(Reg: R);
6066 if (!MI)
6067 return 1;
6068
6069 // TODO: Check range metadata on MMO.
6070 switch (MI->getOpcode()) {
6071 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
6072 return 25;
6073 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
6074 return 17;
6075 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
6076 return 24;
6077 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
6078 return 16;
6079 case AMDGPU::G_AMDGPU_SMED3:
6080 case AMDGPU::G_AMDGPU_UMED3: {
6081 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
6082 unsigned Tmp2 = Analysis.computeNumSignBits(R: Src2, DemandedElts, Depth: Depth + 1);
6083 if (Tmp2 == 1)
6084 return 1;
6085 unsigned Tmp1 = Analysis.computeNumSignBits(R: Src1, DemandedElts, Depth: Depth + 1);
6086 if (Tmp1 == 1)
6087 return 1;
6088 unsigned Tmp0 = Analysis.computeNumSignBits(R: Src0, DemandedElts, Depth: Depth + 1);
6089 if (Tmp0 == 1)
6090 return 1;
6091 return std::min(l: {Tmp0, Tmp1, Tmp2});
6092 }
6093 default:
6094 return 1;
6095 }
6096}
6097
6098bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
6099 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
6100 unsigned Depth) const {
6101 unsigned Opcode = Op.getOpcode();
6102 switch (Opcode) {
6103 case AMDGPUISD::FMIN_LEGACY:
6104 case AMDGPUISD::FMAX_LEGACY: {
6105 if (SNaN)
6106 return true;
6107
6108    // TODO: It may be enough to check just one of the operands for NaN here,
6109    // but it is not obvious which one.
6110 return false;
6111 }
6112 case AMDGPUISD::FMUL_LEGACY:
6113 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
6114 if (SNaN)
6115 return true;
6116 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1) &&
6117 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1);
6118 }
6119 case AMDGPUISD::FMED3:
6120 case AMDGPUISD::FMIN3:
6121 case AMDGPUISD::FMAX3:
6122 case AMDGPUISD::FMINIMUM3:
6123 case AMDGPUISD::FMAXIMUM3:
6124 case AMDGPUISD::FMAD_FTZ: {
6125 if (SNaN)
6126 return true;
6127 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1) &&
6128 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
6129 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1);
6130 }
6131 case AMDGPUISD::CVT_F32_UBYTE0:
6132 case AMDGPUISD::CVT_F32_UBYTE1:
6133 case AMDGPUISD::CVT_F32_UBYTE2:
6134 case AMDGPUISD::CVT_F32_UBYTE3:
6135 return true;
6136
6137 case AMDGPUISD::RCP:
6138 case AMDGPUISD::RSQ:
6139 case AMDGPUISD::RCP_LEGACY:
6140 case AMDGPUISD::RSQ_CLAMP: {
6141 if (SNaN)
6142 return true;
6143
6144    // TODO: Needs an is-known-positive check.
6145 return false;
6146 }
6147 case ISD::FLDEXP:
6148 case AMDGPUISD::FRACT: {
6149 if (SNaN)
6150 return true;
6151 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1);
6152 }
6153 case AMDGPUISD::DIV_SCALE:
6154 case AMDGPUISD::DIV_FMAS:
6155 case AMDGPUISD::DIV_FIXUP:
6156 // TODO: Refine on operands.
6157 return SNaN;
6158 case AMDGPUISD::SIN_HW:
6159 case AMDGPUISD::COS_HW: {
6160    // TODO: Needs a check for infinity.
6161 return SNaN;
6162 }
6163 case ISD::INTRINSIC_WO_CHAIN: {
6164 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
6165 // TODO: Handle more intrinsics
6166 switch (IntrinsicID) {
6167 case Intrinsic::amdgcn_cubeid:
6168 case Intrinsic::amdgcn_cvt_off_f32_i4:
6169 return true;
6170
6171 case Intrinsic::amdgcn_frexp_mant: {
6172 if (SNaN)
6173 return true;
6174 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1);
6175 }
6176 case Intrinsic::amdgcn_cvt_pkrtz: {
6177 if (SNaN)
6178 return true;
6179 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
6180 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1);
6181 }
6182 case Intrinsic::amdgcn_rcp:
6183 case Intrinsic::amdgcn_rsq:
6184 case Intrinsic::amdgcn_rcp_legacy:
6185 case Intrinsic::amdgcn_rsq_legacy:
6186 case Intrinsic::amdgcn_rsq_clamp: {
6187 if (SNaN)
6188 return true;
6189
6190      // TODO: Needs an is-known-positive check.
6191 return false;
6192 }
6193 case Intrinsic::amdgcn_trig_preop:
6194 case Intrinsic::amdgcn_fdot2:
6195 // TODO: Refine on operand
6196 return SNaN;
6197 case Intrinsic::amdgcn_fma_legacy:
6198 if (SNaN)
6199 return true;
6200 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 1), SNaN, Depth: Depth + 1) &&
6201 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 2), SNaN, Depth: Depth + 1) &&
6202 DAG.isKnownNeverNaN(Op: Op.getOperand(i: 3), SNaN, Depth: Depth + 1);
6203 default:
6204 return false;
6205 }
6206 }
6207 default:
6208 return false;
6209 }
6210}
6211
6212bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
6213 Register N0, Register N1) const {
6214 return MRI.hasOneNonDBGUse(RegNo: N0); // FIXME: handle regbanks
6215}
6216