1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
19#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20#include "SIMachineFunctionInfo.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
23#include "llvm/ADT/FloatingPointMode.h"
24#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/OptimizationRemarkEmitter.h"
26#include "llvm/Analysis/UniformityAnalysis.h"
27#include "llvm/CodeGen/Analysis.h"
28#include "llvm/CodeGen/ByteProvider.h"
29#include "llvm/CodeGen/FunctionLoweringInfo.h"
30#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
31#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
32#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
33#include "llvm/CodeGen/MachineFrameInfo.h"
34#include "llvm/CodeGen/MachineFunction.h"
35#include "llvm/CodeGen/MachineLoopInfo.h"
36#include "llvm/IR/DiagnosticInfo.h"
37#include "llvm/IR/IRBuilder.h"
38#include "llvm/IR/IntrinsicInst.h"
39#include "llvm/IR/IntrinsicsAMDGPU.h"
40#include "llvm/IR/IntrinsicsR600.h"
41#include "llvm/IR/MDBuilder.h"
42#include "llvm/Support/CommandLine.h"
43#include "llvm/Support/KnownBits.h"
44#include "llvm/Support/ModRef.h"
45#include "llvm/Transforms/Utils/LowerAtomic.h"
46#include <optional>
47
48using namespace llvm;
49
50#define DEBUG_TYPE "si-lower"
51
52STATISTIC(NumTailCalls, "Number of tail calls");
53
54static cl::opt<bool>
55 DisableLoopAlignment("amdgpu-disable-loop-alignment",
56 cl::desc("Do not align and prefetch loops"),
57 cl::init(Val: false));
58
59static cl::opt<bool> UseDivergentRegisterIndexing(
60 "amdgpu-use-divergent-register-indexing", cl::Hidden,
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(Val: false));
63
64// TODO: This option should be removed once we switch to always using PTRADD in
65// the SelectionDAG.
66static cl::opt<bool> UseSelectionDAGPTRADD(
67 "amdgpu-use-sdag-ptradd", cl::Hidden,
68 cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
69 "SelectionDAG ISel"),
70 cl::init(Val: false));
71
72static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
73 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
74 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
75}
76
77static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
78 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
79 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
80}
81
82static unsigned findFirstFreeSGPR(CCState &CCInfo) {
83 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
84 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
85 if (!CCInfo.isAllocated(Reg: AMDGPU::SGPR0 + Reg)) {
86 return AMDGPU::SGPR0 + Reg;
87 }
88 }
89 llvm_unreachable("Cannot allocate sgpr");
90}
91
92SITargetLowering::SITargetLowering(const TargetMachine &TM,
93 const GCNSubtarget &STI)
94 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
95 addRegisterClass(VT: MVT::i1, RC: &AMDGPU::VReg_1RegClass);
96 addRegisterClass(VT: MVT::i64, RC: &AMDGPU::SReg_64RegClass);
97
98 addRegisterClass(VT: MVT::i32, RC: &AMDGPU::SReg_32RegClass);
99 addRegisterClass(VT: MVT::f32, RC: &AMDGPU::VGPR_32RegClass);
100
101 addRegisterClass(VT: MVT::v2i32, RC: &AMDGPU::SReg_64RegClass);
102
103 const SIRegisterInfo *TRI = STI.getRegisterInfo();
104 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
105
106 addRegisterClass(VT: MVT::f64, RC: V64RegClass);
107 addRegisterClass(VT: MVT::v2f32, RC: V64RegClass);
108 addRegisterClass(VT: MVT::Untyped, RC: V64RegClass);
109
110 addRegisterClass(VT: MVT::v3i32, RC: &AMDGPU::SGPR_96RegClass);
111 addRegisterClass(VT: MVT::v3f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 96));
112
113 addRegisterClass(VT: MVT::v2i64, RC: &AMDGPU::SGPR_128RegClass);
114 addRegisterClass(VT: MVT::v2f64, RC: &AMDGPU::SGPR_128RegClass);
115
116 addRegisterClass(VT: MVT::v4i32, RC: &AMDGPU::SGPR_128RegClass);
117 addRegisterClass(VT: MVT::v4f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 128));
118
119 addRegisterClass(VT: MVT::v5i32, RC: &AMDGPU::SGPR_160RegClass);
120 addRegisterClass(VT: MVT::v5f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 160));
121
122 addRegisterClass(VT: MVT::v6i32, RC: &AMDGPU::SGPR_192RegClass);
123 addRegisterClass(VT: MVT::v6f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 192));
124
125 addRegisterClass(VT: MVT::v3i64, RC: &AMDGPU::SGPR_192RegClass);
126 addRegisterClass(VT: MVT::v3f64, RC: TRI->getVGPRClassForBitWidth(BitWidth: 192));
127
128 addRegisterClass(VT: MVT::v7i32, RC: &AMDGPU::SGPR_224RegClass);
129 addRegisterClass(VT: MVT::v7f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 224));
130
131 addRegisterClass(VT: MVT::v8i32, RC: &AMDGPU::SGPR_256RegClass);
132 addRegisterClass(VT: MVT::v8f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 256));
133
134 addRegisterClass(VT: MVT::v4i64, RC: &AMDGPU::SGPR_256RegClass);
135 addRegisterClass(VT: MVT::v4f64, RC: TRI->getVGPRClassForBitWidth(BitWidth: 256));
136
137 addRegisterClass(VT: MVT::v9i32, RC: &AMDGPU::SGPR_288RegClass);
138 addRegisterClass(VT: MVT::v9f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 288));
139
140 addRegisterClass(VT: MVT::v10i32, RC: &AMDGPU::SGPR_320RegClass);
141 addRegisterClass(VT: MVT::v10f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 320));
142
143 addRegisterClass(VT: MVT::v11i32, RC: &AMDGPU::SGPR_352RegClass);
144 addRegisterClass(VT: MVT::v11f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 352));
145
146 addRegisterClass(VT: MVT::v12i32, RC: &AMDGPU::SGPR_384RegClass);
147 addRegisterClass(VT: MVT::v12f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 384));
148
149 addRegisterClass(VT: MVT::v16i32, RC: &AMDGPU::SGPR_512RegClass);
150 addRegisterClass(VT: MVT::v16f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 512));
151
152 addRegisterClass(VT: MVT::v8i64, RC: &AMDGPU::SGPR_512RegClass);
153 addRegisterClass(VT: MVT::v8f64, RC: TRI->getVGPRClassForBitWidth(BitWidth: 512));
154
155 addRegisterClass(VT: MVT::v16i64, RC: &AMDGPU::SGPR_1024RegClass);
156 addRegisterClass(VT: MVT::v16f64, RC: TRI->getVGPRClassForBitWidth(BitWidth: 1024));
157
158 if (Subtarget->has16BitInsts()) {
159 if (Subtarget->useRealTrue16Insts()) {
160 addRegisterClass(VT: MVT::i16, RC: &AMDGPU::VGPR_16RegClass);
161 addRegisterClass(VT: MVT::f16, RC: &AMDGPU::VGPR_16RegClass);
162 addRegisterClass(VT: MVT::bf16, RC: &AMDGPU::VGPR_16RegClass);
163 } else {
164 addRegisterClass(VT: MVT::i16, RC: &AMDGPU::SReg_32RegClass);
165 addRegisterClass(VT: MVT::f16, RC: &AMDGPU::SReg_32RegClass);
166 addRegisterClass(VT: MVT::bf16, RC: &AMDGPU::SReg_32RegClass);
167 }
168
169 // Unless there are also VOP3P operations, not operations are really legal.
170 addRegisterClass(VT: MVT::v2i16, RC: &AMDGPU::SReg_32RegClass);
171 addRegisterClass(VT: MVT::v2f16, RC: &AMDGPU::SReg_32RegClass);
172 addRegisterClass(VT: MVT::v2bf16, RC: &AMDGPU::SReg_32RegClass);
173 addRegisterClass(VT: MVT::v4i16, RC: &AMDGPU::SReg_64RegClass);
174 addRegisterClass(VT: MVT::v4f16, RC: &AMDGPU::SReg_64RegClass);
175 addRegisterClass(VT: MVT::v4bf16, RC: &AMDGPU::SReg_64RegClass);
176 addRegisterClass(VT: MVT::v8i16, RC: &AMDGPU::SGPR_128RegClass);
177 addRegisterClass(VT: MVT::v8f16, RC: &AMDGPU::SGPR_128RegClass);
178 addRegisterClass(VT: MVT::v8bf16, RC: &AMDGPU::SGPR_128RegClass);
179 addRegisterClass(VT: MVT::v16i16, RC: &AMDGPU::SGPR_256RegClass);
180 addRegisterClass(VT: MVT::v16f16, RC: &AMDGPU::SGPR_256RegClass);
181 addRegisterClass(VT: MVT::v16bf16, RC: &AMDGPU::SGPR_256RegClass);
182 addRegisterClass(VT: MVT::v32i16, RC: &AMDGPU::SGPR_512RegClass);
183 addRegisterClass(VT: MVT::v32f16, RC: &AMDGPU::SGPR_512RegClass);
184 addRegisterClass(VT: MVT::v32bf16, RC: &AMDGPU::SGPR_512RegClass);
185 }
186
187 addRegisterClass(VT: MVT::v32i32, RC: &AMDGPU::VReg_1024RegClass);
188 addRegisterClass(VT: MVT::v32f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 1024));
189
190 computeRegisterProperties(TRI: Subtarget->getRegisterInfo());
191
192 // The boolean content concept here is too inflexible. Compares only ever
193 // really produce a 1-bit result. Any copy/extend from these will turn into a
194 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
195 // it's what most targets use.
196 setBooleanContents(ZeroOrOneBooleanContent);
197 setBooleanVectorContents(ZeroOrOneBooleanContent);
198
199 // We need to custom lower vector stores from local memory
200 setOperationAction(Ops: ISD::LOAD,
201 VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
202 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
203 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
204 MVT::i1, MVT::v32i32},
205 Action: Custom);
206
207 setOperationAction(Ops: ISD::STORE,
208 VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
209 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
210 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
211 MVT::i1, MVT::v32i32},
212 Action: Custom);
213
214 if (isTypeLegal(VT: MVT::bf16)) {
215 for (unsigned Opc :
216 {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
217 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
218 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
219 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
220 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
221 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
222 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
223 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
224 ISD::SETCC}) {
225 // FIXME: The promoted to type shouldn't need to be explicit
226 setOperationAction(Op: Opc, VT: MVT::bf16, Action: Promote);
227 AddPromotedToType(Opc, OrigVT: MVT::bf16, DestVT: MVT::f32);
228 }
229
230 setOperationAction(Op: ISD::FP_ROUND, VT: MVT::bf16, Action: Expand);
231
232 setOperationAction(Op: ISD::SELECT, VT: MVT::bf16, Action: Promote);
233 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::bf16, DestVT: MVT::i16);
234
235 setOperationAction(Op: ISD::FABS, VT: MVT::bf16, Action: Legal);
236 setOperationAction(Op: ISD::FNEG, VT: MVT::bf16, Action: Legal);
237 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::bf16, Action: Legal);
238
239 // We only need to custom lower because we can't specify an action for bf16
240 // sources.
241 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i32, Action: Custom);
242 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i32, Action: Custom);
243 }
244
245 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i16, Action: Expand);
246 setTruncStoreAction(ValVT: MVT::v3i32, MemVT: MVT::v3i16, Action: Expand);
247 setTruncStoreAction(ValVT: MVT::v4i32, MemVT: MVT::v4i16, Action: Expand);
248 setTruncStoreAction(ValVT: MVT::v8i32, MemVT: MVT::v8i16, Action: Expand);
249 setTruncStoreAction(ValVT: MVT::v16i32, MemVT: MVT::v16i16, Action: Expand);
250 setTruncStoreAction(ValVT: MVT::v32i32, MemVT: MVT::v32i16, Action: Expand);
251 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i8, Action: Expand);
252 setTruncStoreAction(ValVT: MVT::v4i32, MemVT: MVT::v4i8, Action: Expand);
253 setTruncStoreAction(ValVT: MVT::v8i32, MemVT: MVT::v8i8, Action: Expand);
254 setTruncStoreAction(ValVT: MVT::v16i32, MemVT: MVT::v16i8, Action: Expand);
255 setTruncStoreAction(ValVT: MVT::v32i32, MemVT: MVT::v32i8, Action: Expand);
256 setTruncStoreAction(ValVT: MVT::v2i16, MemVT: MVT::v2i8, Action: Expand);
257 setTruncStoreAction(ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Expand);
258 setTruncStoreAction(ValVT: MVT::v8i16, MemVT: MVT::v8i8, Action: Expand);
259 setTruncStoreAction(ValVT: MVT::v16i16, MemVT: MVT::v16i8, Action: Expand);
260 setTruncStoreAction(ValVT: MVT::v32i16, MemVT: MVT::v32i8, Action: Expand);
261
262 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i16, Action: Expand);
263 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i32, Action: Expand);
264 setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i8, Action: Expand);
265 setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i8, Action: Expand);
266 setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i16, Action: Expand);
267 setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i32, Action: Expand);
268 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i32, Action: Expand);
269
270 setOperationAction(Ops: ISD::GlobalAddress, VTs: {MVT::i32, MVT::i64}, Action: Custom);
271
272 setOperationAction(Op: ISD::SELECT, VT: MVT::i1, Action: Promote);
273 setOperationAction(Op: ISD::SELECT, VT: MVT::i64, Action: Custom);
274 setOperationAction(Op: ISD::SELECT, VT: MVT::f64, Action: Promote);
275 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::f64, DestVT: MVT::i64);
276
277 setOperationAction(Ops: ISD::FSQRT, VTs: {MVT::f32, MVT::f64}, Action: Custom);
278
279 setOperationAction(Ops: ISD::SELECT_CC,
280 VTs: {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Action: Expand);
281
282 setOperationAction(Op: ISD::SETCC, VT: MVT::i1, Action: Promote);
283 setOperationAction(Ops: ISD::SETCC, VTs: {MVT::v2i1, MVT::v4i1}, Action: Expand);
284 AddPromotedToType(Opc: ISD::SETCC, OrigVT: MVT::i1, DestVT: MVT::i32);
285
286 setOperationAction(Ops: ISD::TRUNCATE,
287 VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
288 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
289 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
290 Action: Expand);
291 setOperationAction(Ops: ISD::FP_ROUND,
292 VTs: {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
293 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
294 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
295 Action: Expand);
296
297 setOperationAction(Ops: ISD::SIGN_EXTEND_INREG,
298 VTs: {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
299 MVT::v3i16, MVT::v4i16, MVT::Other},
300 Action: Custom);
301
302 setOperationAction(Op: ISD::BRCOND, VT: MVT::Other, Action: Custom);
303 setOperationAction(Ops: ISD::BR_CC,
304 VTs: {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Action: Expand);
305
306 setOperationAction(Ops: {ISD::UADDO, ISD::USUBO}, VT: MVT::i32, Action: Legal);
307
308 setOperationAction(Ops: {ISD::UADDO_CARRY, ISD::USUBO_CARRY}, VT: MVT::i32, Action: Legal);
309
310 setOperationAction(Ops: {ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, VT: MVT::i64,
311 Action: Expand);
312
313#if 0
314 setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal);
315#endif
316
317 // We only support LOAD/STORE and vector manipulation ops for vectors
318 // with > 4 elements.
319 for (MVT VT :
320 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
321 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
322 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
323 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
324 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
325 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
326 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
327 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
328 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
329 switch (Op) {
330 case ISD::LOAD:
331 case ISD::STORE:
332 case ISD::BUILD_VECTOR:
333 case ISD::BITCAST:
334 case ISD::UNDEF:
335 case ISD::EXTRACT_VECTOR_ELT:
336 case ISD::INSERT_VECTOR_ELT:
337 case ISD::SCALAR_TO_VECTOR:
338 case ISD::IS_FPCLASS:
339 break;
340 case ISD::EXTRACT_SUBVECTOR:
341 case ISD::INSERT_SUBVECTOR:
342 case ISD::CONCAT_VECTORS:
343 setOperationAction(Op, VT, Action: Custom);
344 break;
345 default:
346 setOperationAction(Op, VT, Action: Expand);
347 break;
348 }
349 }
350 }
351
352 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::v4f32, Action: Expand);
353
354 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
355 // is expanded to avoid having two separate loops in case the index is a VGPR.
356
357 // Most operations are naturally 32-bit vector operations. We only support
358 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
359 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
360 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
361 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v4i32);
362
363 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
364 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v4i32);
365
366 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
367 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v4i32);
368
369 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
370 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v4i32);
371 }
372
373 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
374 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
375 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v6i32);
376
377 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
378 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v6i32);
379
380 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
381 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v6i32);
382
383 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
384 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v6i32);
385 }
386
387 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
388 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
389 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v8i32);
390
391 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
392 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v8i32);
393
394 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
395 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v8i32);
396
397 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
398 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v8i32);
399 }
400
401 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
402 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
403 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v16i32);
404
405 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
406 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v16i32);
407
408 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
409 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v16i32);
410
411 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
412 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v16i32);
413 }
414
415 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
416 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
417 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v32i32);
418
419 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
420 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v32i32);
421
422 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
423 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v32i32);
424
425 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
426 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v32i32);
427 }
428
429 setOperationAction(Ops: ISD::VECTOR_SHUFFLE,
430 VTs: {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
431 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
432 Action: Custom);
433
434 if (Subtarget->hasPkMovB32()) {
435 // TODO: 16-bit element vectors should be legal with even aligned elements.
436 // TODO: Can be legal with wider source types than the result with
437 // subregister extracts.
438 setOperationAction(Ops: ISD::VECTOR_SHUFFLE, VTs: {MVT::v2i32, MVT::v2f32}, Action: Legal);
439 }
440
441 setOperationAction(Ops: ISD::BUILD_VECTOR, VTs: {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
442 Action: Custom);
443
444 // Avoid stack access for these.
445 // TODO: Generalize to more vector types.
446 setOperationAction(Ops: {ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
447 VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
448 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
449 Action: Custom);
450
451 // Deal with vec3 vector operations when widened to vec4.
452 setOperationAction(Ops: ISD::INSERT_SUBVECTOR,
453 VTs: {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Action: Custom);
454
455 // Deal with vec5/6/7 vector operations when widened to vec8.
456 setOperationAction(Ops: ISD::INSERT_SUBVECTOR,
457 VTs: {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
458 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
459 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
460 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
461 Action: Custom);
462
463 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
464 // and output demarshalling
465 setOperationAction(Ops: ISD::ATOMIC_CMP_SWAP, VTs: {MVT::i32, MVT::i64}, Action: Custom);
466
467 // We can't return success/failure, only the old value,
468 // let LLVM add the comparison
469 setOperationAction(Ops: ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VTs: {MVT::i32, MVT::i64},
470 Action: Expand);
471
472 setOperationAction(Ops: ISD::ADDRSPACECAST, VTs: {MVT::i32, MVT::i64}, Action: Custom);
473
474 setOperationAction(Ops: ISD::BITREVERSE, VTs: {MVT::i32, MVT::i64}, Action: Legal);
475
476 // FIXME: This should be narrowed to i32, but that only happens if i64 is
477 // illegal.
478 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
479 setOperationAction(Ops: ISD::BSWAP, VTs: {MVT::i64, MVT::i32}, Action: Legal);
480
481 // On SI this is s_memtime and s_memrealtime on VI.
482 setOperationAction(Op: ISD::READCYCLECOUNTER, VT: MVT::i64, Action: Legal);
483
484 if (Subtarget->hasSMemRealTime() ||
485 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
486 setOperationAction(Op: ISD::READSTEADYCOUNTER, VT: MVT::i64, Action: Legal);
487 setOperationAction(Ops: {ISD::TRAP, ISD::DEBUGTRAP}, VT: MVT::Other, Action: Custom);
488
489 if (Subtarget->has16BitInsts()) {
490 setOperationAction(Ops: {ISD::FPOW, ISD::FPOWI}, VT: MVT::f16, Action: Promote);
491 setOperationAction(Ops: {ISD::FLOG, ISD::FEXP, ISD::FLOG10}, VT: MVT::f16, Action: Custom);
492 } else {
493 setOperationAction(Op: ISD::FSQRT, VT: MVT::f16, Action: Custom);
494 }
495
496 if (Subtarget->hasMadMacF32Insts())
497 setOperationAction(Op: ISD::FMAD, VT: MVT::f32, Action: Legal);
498
499 if (!Subtarget->hasBFI())
500 // fcopysign can be done in a single instruction with BFI.
501 setOperationAction(Ops: ISD::FCOPYSIGN, VTs: {MVT::f32, MVT::f64}, Action: Expand);
502
503 if (!Subtarget->hasBCNT(Size: 32))
504 setOperationAction(Op: ISD::CTPOP, VT: MVT::i32, Action: Expand);
505
506 if (!Subtarget->hasBCNT(Size: 64))
507 setOperationAction(Op: ISD::CTPOP, VT: MVT::i64, Action: Expand);
508
509 if (Subtarget->hasFFBH())
510 setOperationAction(Ops: {ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT: MVT::i32, Action: Custom);
511
512 if (Subtarget->hasFFBL())
513 setOperationAction(Ops: {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, VT: MVT::i32, Action: Custom);
514
515 // We only really have 32-bit BFE instructions (and 16-bit on VI).
516 //
517 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
518 // effort to match them now. We want this to be false for i64 cases when the
519 // extraction isn't restricted to the upper or lower half. Ideally we would
520 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
521 // span the midpoint are probably relatively rare, so don't worry about them
522 // for now.
523 if (Subtarget->hasBFE())
524 setHasExtractBitsInsn(true);
525
526 // Clamp modifier on add/sub
527 if (Subtarget->hasIntClamp())
528 setOperationAction(Ops: {ISD::UADDSAT, ISD::USUBSAT}, VT: MVT::i32, Action: Legal);
529
530 if (Subtarget->hasAddNoCarry())
531 setOperationAction(Ops: {ISD::SADDSAT, ISD::SSUBSAT}, VTs: {MVT::i16, MVT::i32},
532 Action: Legal);
533
534 setOperationAction(
535 Ops: {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
536 VTs: {MVT::f32, MVT::f64}, Action: Custom);
537
538 // These are really only legal for ieee_mode functions. We should be avoiding
539 // them for functions that don't have ieee_mode enabled, so just say they are
540 // legal.
541 setOperationAction(Ops: {ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
542 VTs: {MVT::f32, MVT::f64}, Action: Legal);
543
544 if (Subtarget->haveRoundOpsF64())
545 setOperationAction(Ops: {ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, VT: MVT::f64,
546 Action: Legal);
547 else
548 setOperationAction(Ops: {ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
549 VT: MVT::f64, Action: Custom);
550
551 setOperationAction(Op: ISD::FFLOOR, VT: MVT::f64, Action: Legal);
552 setOperationAction(Ops: {ISD::FLDEXP, ISD::STRICT_FLDEXP}, VTs: {MVT::f32, MVT::f64},
553 Action: Legal);
554 setOperationAction(Ops: ISD::FFREXP, VTs: {MVT::f32, MVT::f64}, Action: Custom);
555
556 setOperationAction(Ops: {ISD::FSIN, ISD::FCOS, ISD::FDIV}, VT: MVT::f32, Action: Custom);
557 setOperationAction(Op: ISD::FDIV, VT: MVT::f64, Action: Custom);
558
559 setOperationAction(Ops: ISD::BF16_TO_FP, VTs: {MVT::i16, MVT::f32, MVT::f64}, Action: Expand);
560 setOperationAction(Ops: ISD::FP_TO_BF16, VTs: {MVT::i16, MVT::f32, MVT::f64}, Action: Expand);
561
562 // Custom lower these because we can't specify a rule based on an illegal
563 // source bf16.
564 setOperationAction(Ops: {ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, VT: MVT::f32, Action: Custom);
565 setOperationAction(Ops: {ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, VT: MVT::f64, Action: Custom);
566
567 if (Subtarget->has16BitInsts()) {
568 setOperationAction(Ops: {ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
569 ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
570 VT: MVT::i16, Action: Legal);
571
572 AddPromotedToType(Opc: ISD::SIGN_EXTEND, OrigVT: MVT::i16, DestVT: MVT::i32);
573
574 setOperationAction(Ops: {ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC},
575 VT: MVT::i16, Action: Expand);
576
577 setOperationAction(Ops: {ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
578 ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
579 ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
580 ISD::CTPOP},
581 VT: MVT::i16, Action: Promote);
582
583 setOperationAction(Op: ISD::LOAD, VT: MVT::i16, Action: Custom);
584
585 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i16, Action: Expand);
586
587 setOperationAction(Op: ISD::FP16_TO_FP, VT: MVT::i16, Action: Promote);
588 AddPromotedToType(Opc: ISD::FP16_TO_FP, OrigVT: MVT::i16, DestVT: MVT::i32);
589 setOperationAction(Op: ISD::FP_TO_FP16, VT: MVT::i16, Action: Promote);
590 AddPromotedToType(Opc: ISD::FP_TO_FP16, OrigVT: MVT::i16, DestVT: MVT::i32);
591
592 setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT: MVT::i16, Action: Custom);
593 setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i16, Action: Custom);
594 setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i1, Action: Custom);
595
596 setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i32, Action: Custom);
597
598 // F16 - Constant Actions.
599 setOperationAction(Op: ISD::ConstantFP, VT: MVT::f16, Action: Legal);
600 setOperationAction(Op: ISD::ConstantFP, VT: MVT::bf16, Action: Legal);
601
602 // F16 - Load/Store Actions.
603 setOperationAction(Op: ISD::LOAD, VT: MVT::f16, Action: Promote);
604 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f16, DestVT: MVT::i16);
605 setOperationAction(Op: ISD::STORE, VT: MVT::f16, Action: Promote);
606 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f16, DestVT: MVT::i16);
607
608 // BF16 - Load/Store Actions.
609 setOperationAction(Op: ISD::LOAD, VT: MVT::bf16, Action: Promote);
610 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::bf16, DestVT: MVT::i16);
611 setOperationAction(Op: ISD::STORE, VT: MVT::bf16, Action: Promote);
612 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::bf16, DestVT: MVT::i16);
613
614 // F16 - VOP1 Actions.
615 setOperationAction(Ops: {ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
616 ISD::FSIN, ISD::FROUND},
617 VT: MVT::f16, Action: Custom);
618
619 setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT: MVT::f16, Action: Promote);
620 setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT: MVT::bf16, Action: Promote);
621
622 // F16 - VOP2 Actions.
623 setOperationAction(Ops: {ISD::BR_CC, ISD::SELECT_CC}, VTs: {MVT::f16, MVT::bf16},
624 Action: Expand);
625 setOperationAction(Ops: {ISD::FLDEXP, ISD::STRICT_FLDEXP}, VT: MVT::f16, Action: Custom);
626 setOperationAction(Op: ISD::FFREXP, VT: MVT::f16, Action: Custom);
627 setOperationAction(Op: ISD::FDIV, VT: MVT::f16, Action: Custom);
628
629 // F16 - VOP3 Actions.
630 setOperationAction(Op: ISD::FMA, VT: MVT::f16, Action: Legal);
631 if (STI.hasMadF16())
632 setOperationAction(Op: ISD::FMAD, VT: MVT::f16, Action: Legal);
633
634 for (MVT VT :
635 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
636 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
637 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
638 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
639 switch (Op) {
640 case ISD::LOAD:
641 case ISD::STORE:
642 case ISD::BUILD_VECTOR:
643 case ISD::BITCAST:
644 case ISD::UNDEF:
645 case ISD::EXTRACT_VECTOR_ELT:
646 case ISD::INSERT_VECTOR_ELT:
647 case ISD::INSERT_SUBVECTOR:
648 case ISD::SCALAR_TO_VECTOR:
649 case ISD::IS_FPCLASS:
650 break;
651 case ISD::EXTRACT_SUBVECTOR:
652 case ISD::CONCAT_VECTORS:
653 setOperationAction(Op, VT, Action: Custom);
654 break;
655 default:
656 setOperationAction(Op, VT, Action: Expand);
657 break;
658 }
659 }
660 }
661
662 // v_perm_b32 can handle either of these.
663 setOperationAction(Ops: ISD::BSWAP, VTs: {MVT::i16, MVT::v2i16}, Action: Legal);
664 setOperationAction(Op: ISD::BSWAP, VT: MVT::v4i16, Action: Custom);
665
666 // XXX - Do these do anything? Vector constants turn into build_vector.
667 setOperationAction(Ops: ISD::Constant, VTs: {MVT::v2i16, MVT::v2f16}, Action: Legal);
668
669 setOperationAction(Ops: ISD::UNDEF, VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
670 Action: Legal);
671
672 setOperationAction(Op: ISD::STORE, VT: MVT::v2i16, Action: Promote);
673 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2i16, DestVT: MVT::i32);
674 setOperationAction(Op: ISD::STORE, VT: MVT::v2f16, Action: Promote);
675 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f16, DestVT: MVT::i32);
676
677 setOperationAction(Op: ISD::LOAD, VT: MVT::v2i16, Action: Promote);
678 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2i16, DestVT: MVT::i32);
679 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f16, Action: Promote);
680 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f16, DestVT: MVT::i32);
681
682 setOperationAction(Op: ISD::AND, VT: MVT::v2i16, Action: Promote);
683 AddPromotedToType(Opc: ISD::AND, OrigVT: MVT::v2i16, DestVT: MVT::i32);
684 setOperationAction(Op: ISD::OR, VT: MVT::v2i16, Action: Promote);
685 AddPromotedToType(Opc: ISD::OR, OrigVT: MVT::v2i16, DestVT: MVT::i32);
686 setOperationAction(Op: ISD::XOR, VT: MVT::v2i16, Action: Promote);
687 AddPromotedToType(Opc: ISD::XOR, OrigVT: MVT::v2i16, DestVT: MVT::i32);
688
689 setOperationAction(Op: ISD::LOAD, VT: MVT::v4i16, Action: Promote);
690 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
691 setOperationAction(Op: ISD::LOAD, VT: MVT::v4f16, Action: Promote);
692 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
693 setOperationAction(Op: ISD::LOAD, VT: MVT::v4bf16, Action: Promote);
694 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4bf16, DestVT: MVT::v2i32);
695
696 setOperationAction(Op: ISD::STORE, VT: MVT::v4i16, Action: Promote);
697 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
698 setOperationAction(Op: ISD::STORE, VT: MVT::v4f16, Action: Promote);
699 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
700 setOperationAction(Op: ISD::STORE, VT: MVT::v4bf16, Action: Promote);
701 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4bf16, DestVT: MVT::v2i32);
702
703 setOperationAction(Op: ISD::LOAD, VT: MVT::v8i16, Action: Promote);
704 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8i16, DestVT: MVT::v4i32);
705 setOperationAction(Op: ISD::LOAD, VT: MVT::v8f16, Action: Promote);
706 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f16, DestVT: MVT::v4i32);
707 setOperationAction(Op: ISD::LOAD, VT: MVT::v8bf16, Action: Promote);
708 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8bf16, DestVT: MVT::v4i32);
709
710 setOperationAction(Op: ISD::STORE, VT: MVT::v4i16, Action: Promote);
711 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
712 setOperationAction(Op: ISD::STORE, VT: MVT::v4f16, Action: Promote);
713 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
714
715 setOperationAction(Op: ISD::STORE, VT: MVT::v8i16, Action: Promote);
716 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8i16, DestVT: MVT::v4i32);
717 setOperationAction(Op: ISD::STORE, VT: MVT::v8f16, Action: Promote);
718 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f16, DestVT: MVT::v4i32);
719 setOperationAction(Op: ISD::STORE, VT: MVT::v8bf16, Action: Promote);
720 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8bf16, DestVT: MVT::v4i32);
721
722 setOperationAction(Op: ISD::LOAD, VT: MVT::v16i16, Action: Promote);
723 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16i16, DestVT: MVT::v8i32);
724 setOperationAction(Op: ISD::LOAD, VT: MVT::v16f16, Action: Promote);
725 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f16, DestVT: MVT::v8i32);
726 setOperationAction(Op: ISD::LOAD, VT: MVT::v16bf16, Action: Promote);
727 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16bf16, DestVT: MVT::v8i32);
728
729 setOperationAction(Op: ISD::STORE, VT: MVT::v16i16, Action: Promote);
730 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16i16, DestVT: MVT::v8i32);
731 setOperationAction(Op: ISD::STORE, VT: MVT::v16f16, Action: Promote);
732 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f16, DestVT: MVT::v8i32);
733 setOperationAction(Op: ISD::STORE, VT: MVT::v16bf16, Action: Promote);
734 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16bf16, DestVT: MVT::v8i32);
735
736 setOperationAction(Op: ISD::LOAD, VT: MVT::v32i16, Action: Promote);
737 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32i16, DestVT: MVT::v16i32);
738 setOperationAction(Op: ISD::LOAD, VT: MVT::v32f16, Action: Promote);
739 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32f16, DestVT: MVT::v16i32);
740 setOperationAction(Op: ISD::LOAD, VT: MVT::v32bf16, Action: Promote);
741 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32bf16, DestVT: MVT::v16i32);
742
743 setOperationAction(Op: ISD::STORE, VT: MVT::v32i16, Action: Promote);
744 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32i16, DestVT: MVT::v16i32);
745 setOperationAction(Op: ISD::STORE, VT: MVT::v32f16, Action: Promote);
746 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32f16, DestVT: MVT::v16i32);
747 setOperationAction(Op: ISD::STORE, VT: MVT::v32bf16, Action: Promote);
748 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32bf16, DestVT: MVT::v16i32);
749
750 setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
751 VT: MVT::v2i32, Action: Expand);
752 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::v2f32, Action: Expand);
753
754 setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
755 VT: MVT::v4i32, Action: Expand);
756
757 setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
758 VT: MVT::v8i32, Action: Expand);
759
760 setOperationAction(Ops: ISD::BUILD_VECTOR, VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
761 Action: Subtarget->hasVOP3PInsts() ? Legal : Custom);
762
763 setOperationAction(Ops: ISD::FNEG, VTs: {MVT::v2f16, MVT::v2bf16}, Action: Legal);
764 // This isn't really legal, but this avoids the legalizer unrolling it (and
765 // allows matching fneg (fabs x) patterns)
766 setOperationAction(Ops: ISD::FABS, VTs: {MVT::v2f16, MVT::v2bf16}, Action: Legal);
767
768 // Can do this in one BFI plus a constant materialize.
769 setOperationAction(Ops: ISD::FCOPYSIGN,
770 VTs: {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
771 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
772 MVT::v32f16, MVT::v32bf16},
773 Action: Custom);
774
775 setOperationAction(
776 Ops: {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
777 VT: MVT::f16, Action: Custom);
778 setOperationAction(Ops: {ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, VT: MVT::f16, Action: Legal);
779
780 setOperationAction(Ops: {ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
781 ISD::FMAXIMUMNUM},
782 VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
783 Action: Custom);
784
785 setOperationAction(Ops: {ISD::FMINNUM, ISD::FMAXNUM},
786 VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
787 Action: Expand);
788
789 for (MVT Vec16 :
790 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
791 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
792 setOperationAction(
793 Ops: {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
794 VT: Vec16, Action: Custom);
795 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec16, Action: Expand);
796 }
797 }
798
799 if (Subtarget->hasVOP3PInsts()) {
800 setOperationAction(Ops: {ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
801 ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
802 ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
803 VT: MVT::v2i16, Action: Legal);
804
805 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
806 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
807 VT: MVT::v2f16, Action: Legal);
808
809 setOperationAction(Ops: ISD::EXTRACT_VECTOR_ELT,
810 VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Action: Custom);
811
812 setOperationAction(Ops: ISD::VECTOR_SHUFFLE,
813 VTs: {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
814 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
815 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
816 Action: Custom);
817
818 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
819 // Split vector operations.
820 setOperationAction(Ops: {ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
821 ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN,
822 ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
823 ISD::SSUBSAT},
824 VT, Action: Custom);
825
826 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
827 // Split vector operations.
828 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
829 VT, Action: Custom);
830
831 setOperationAction(
832 Ops: {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
833 VTs: {MVT::v2f16, MVT::v4f16}, Action: Custom);
834
835 setOperationAction(Op: ISD::FEXP, VT: MVT::v2f16, Action: Custom);
836 setOperationAction(Ops: ISD::SELECT, VTs: {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
837 Action: Custom);
838
839 if (Subtarget->hasPackedFP32Ops()) {
840 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
841 VT: MVT::v2f32, Action: Legal);
842 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA},
843 VTs: {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
844 Action: Custom);
845 }
846 }
847
848 setOperationAction(Ops: {ISD::FNEG, ISD::FABS}, VT: MVT::v4f16, Action: Custom);
849
850 if (Subtarget->has16BitInsts()) {
851 setOperationAction(Op: ISD::SELECT, VT: MVT::v2i16, Action: Promote);
852 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2i16, DestVT: MVT::i32);
853 setOperationAction(Op: ISD::SELECT, VT: MVT::v2f16, Action: Promote);
854 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2f16, DestVT: MVT::i32);
855 } else {
856 // Legalization hack.
857 setOperationAction(Ops: ISD::SELECT, VTs: {MVT::v2i16, MVT::v2f16}, Action: Custom);
858
859 setOperationAction(Ops: {ISD::FNEG, ISD::FABS}, VT: MVT::v2f16, Action: Custom);
860 }
861
862 setOperationAction(Ops: ISD::SELECT,
863 VTs: {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
864 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
865 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
866 MVT::v32f16, MVT::v32bf16},
867 Action: Custom);
868
869 setOperationAction(Ops: {ISD::SMULO, ISD::UMULO}, VT: MVT::i64, Action: Custom);
870
871 if (Subtarget->hasScalarSMulU64())
872 setOperationAction(Op: ISD::MUL, VT: MVT::i64, Action: Custom);
873
874 if (Subtarget->hasMad64_32())
875 setOperationAction(Ops: {ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT: MVT::i32, Action: Custom);
876
877 if (Subtarget->hasPrefetch() && Subtarget->hasSafeSmemPrefetch())
878 setOperationAction(Op: ISD::PREFETCH, VT: MVT::Other, Action: Custom);
879
880 if (Subtarget->hasIEEEMinMax()) {
881 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM},
882 VTs: {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Action: Legal);
883 } else {
884 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
885 if (Subtarget->hasMinimum3Maximum3F32())
886 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM}, VT: MVT::f32, Action: Legal);
887
888 if (Subtarget->hasMinimum3Maximum3PKF16()) {
889 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM}, VT: MVT::v2f16, Action: Legal);
890
891 // If only the vector form is available, we need to widen to a vector.
892 if (!Subtarget->hasMinimum3Maximum3F16())
893 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM}, VT: MVT::f16, Action: Custom);
894 }
895 }
896
897 if (Subtarget->hasVOP3PInsts()) {
898 // We want to break these into v2f16 pieces, not scalarize.
899 setOperationAction(Ops: {ISD::FMINIMUM, ISD::FMAXIMUM},
900 VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
901 Action: Custom);
902 }
903
904 setOperationAction(Ops: ISD::INTRINSIC_WO_CHAIN,
905 VTs: {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
906 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
907 MVT::i8},
908 Action: Custom);
909
910 setOperationAction(Ops: ISD::INTRINSIC_W_CHAIN,
911 VTs: {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
912 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
913 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
914 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
915 Action: Custom);
916
917 setOperationAction(Ops: ISD::INTRINSIC_VOID,
918 VTs: {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
919 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
920 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
921 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
922 Action: Custom);
923
924 setOperationAction(Op: ISD::STACKSAVE, VT: MVT::Other, Action: Custom);
925 setOperationAction(Op: ISD::GET_ROUNDING, VT: MVT::i32, Action: Custom);
926 setOperationAction(Op: ISD::SET_ROUNDING, VT: MVT::Other, Action: Custom);
927 setOperationAction(Op: ISD::GET_FPENV, VT: MVT::i64, Action: Custom);
928 setOperationAction(Op: ISD::SET_FPENV, VT: MVT::i64, Action: Custom);
929
930 // TODO: Could move this to custom lowering, could benefit from combines on
931 // extract of relevant bits.
932 setOperationAction(Op: ISD::GET_FPMODE, VT: MVT::i32, Action: Legal);
933
934 setOperationAction(Op: ISD::MUL, VT: MVT::i1, Action: Promote);
935
936 if (Subtarget->hasBF16ConversionInsts()) {
937 setOperationAction(Ops: ISD::FP_ROUND, VTs: {MVT::bf16, MVT::v2bf16}, Action: Custom);
938 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v2bf16, Action: Legal);
939 }
940
941 if (Subtarget->hasCvtPkF16F32Inst()) {
942 setOperationAction(Ops: ISD::FP_ROUND,
943 VTs: {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
944 Action: Custom);
945 }
946
947 setTargetDAGCombine({ISD::ADD,
948 ISD::PTRADD,
949 ISD::UADDO_CARRY,
950 ISD::SUB,
951 ISD::USUBO_CARRY,
952 ISD::MUL,
953 ISD::FADD,
954 ISD::FSUB,
955 ISD::FDIV,
956 ISD::FMUL,
957 ISD::FMINNUM,
958 ISD::FMAXNUM,
959 ISD::FMINNUM_IEEE,
960 ISD::FMAXNUM_IEEE,
961 ISD::FMINIMUM,
962 ISD::FMAXIMUM,
963 ISD::FMINIMUMNUM,
964 ISD::FMAXIMUMNUM,
965 ISD::FMA,
966 ISD::SMIN,
967 ISD::SMAX,
968 ISD::UMIN,
969 ISD::UMAX,
970 ISD::SETCC,
971 ISD::SELECT,
972 ISD::SMIN,
973 ISD::SMAX,
974 ISD::UMIN,
975 ISD::UMAX,
976 ISD::AND,
977 ISD::OR,
978 ISD::XOR,
979 ISD::SHL,
980 ISD::SRL,
981 ISD::SRA,
982 ISD::FSHR,
983 ISD::SINT_TO_FP,
984 ISD::UINT_TO_FP,
985 ISD::FCANONICALIZE,
986 ISD::SCALAR_TO_VECTOR,
987 ISD::ZERO_EXTEND,
988 ISD::SIGN_EXTEND_INREG,
989 ISD::EXTRACT_VECTOR_ELT,
990 ISD::INSERT_VECTOR_ELT,
991 ISD::FCOPYSIGN});
992
993 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
994 setTargetDAGCombine(ISD::FP_ROUND);
995
996 // All memory operations. Some folding on the pointer operand is done to help
997 // matching the constant offsets in the addressing modes.
998 setTargetDAGCombine({ISD::LOAD,
999 ISD::STORE,
1000 ISD::ATOMIC_LOAD,
1001 ISD::ATOMIC_STORE,
1002 ISD::ATOMIC_CMP_SWAP,
1003 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1004 ISD::ATOMIC_SWAP,
1005 ISD::ATOMIC_LOAD_ADD,
1006 ISD::ATOMIC_LOAD_SUB,
1007 ISD::ATOMIC_LOAD_AND,
1008 ISD::ATOMIC_LOAD_OR,
1009 ISD::ATOMIC_LOAD_XOR,
1010 ISD::ATOMIC_LOAD_NAND,
1011 ISD::ATOMIC_LOAD_MIN,
1012 ISD::ATOMIC_LOAD_MAX,
1013 ISD::ATOMIC_LOAD_UMIN,
1014 ISD::ATOMIC_LOAD_UMAX,
1015 ISD::ATOMIC_LOAD_FADD,
1016 ISD::ATOMIC_LOAD_FMIN,
1017 ISD::ATOMIC_LOAD_FMAX,
1018 ISD::ATOMIC_LOAD_UINC_WRAP,
1019 ISD::ATOMIC_LOAD_UDEC_WRAP,
1020 ISD::INTRINSIC_VOID,
1021 ISD::INTRINSIC_W_CHAIN});
1022
1023 // FIXME: In other contexts we pretend this is a per-function property.
1024 setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
1025
1026 setSchedulingPreference(Sched::RegPressure);
1027}
1028
1029const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1030
1031ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
1032 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1033 return RCRegs;
1034}
1035
1036//===----------------------------------------------------------------------===//
1037// TargetLowering queries
1038//===----------------------------------------------------------------------===//
1039
1040// v_mad_mix* support a conversion from f16 to f32.
1041//
1042// There is only one special case when denormals are enabled we don't currently,
1043// where this is OK to use.
1044bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1045 EVT DestVT, EVT SrcVT) const {
1046 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1047 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1048 DestVT.getScalarType() == MVT::f32 &&
1049 SrcVT.getScalarType() == MVT::f16 &&
1050 // TODO: This probably only requires no input flushing?
1051 denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
1052}
1053
1054bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
1055 LLT DestTy, LLT SrcTy) const {
1056 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1057 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1058 DestTy.getScalarSizeInBits() == 32 &&
1059 SrcTy.getScalarSizeInBits() == 16 &&
1060 // TODO: This probably only requires no input flushing?
1061 denormalModeIsFlushAllF32(MF: *MI.getMF());
1062}
1063
1064bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
1065 // SI has some legal vector types, but no legal vector operations. Say no
1066 // shuffles are legal in order to prefer scalarizing some vector operations.
1067 return false;
1068}
1069
1070MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1071 CallingConv::ID CC,
1072 EVT VT) const {
1073 if (CC == CallingConv::AMDGPU_KERNEL)
1074 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1075
1076 if (VT.isVector()) {
1077 EVT ScalarVT = VT.getScalarType();
1078 unsigned Size = ScalarVT.getSizeInBits();
1079 if (Size == 16) {
1080 if (Subtarget->has16BitInsts()) {
1081 if (VT.isInteger())
1082 return MVT::v2i16;
1083 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1084 }
1085 return VT.isInteger() ? MVT::i32 : MVT::f32;
1086 }
1087
1088 if (Size < 16)
1089 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1090 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1091 }
1092
1093 if (VT.getSizeInBits() > 32)
1094 return MVT::i32;
1095
1096 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1097}
1098
1099unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1100 CallingConv::ID CC,
1101 EVT VT) const {
1102 if (CC == CallingConv::AMDGPU_KERNEL)
1103 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1104
1105 if (VT.isVector()) {
1106 unsigned NumElts = VT.getVectorNumElements();
1107 EVT ScalarVT = VT.getScalarType();
1108 unsigned Size = ScalarVT.getSizeInBits();
1109
1110 // FIXME: Should probably promote 8-bit vectors to i16.
1111 if (Size == 16 && Subtarget->has16BitInsts())
1112 return (NumElts + 1) / 2;
1113
1114 if (Size <= 32)
1115 return NumElts;
1116
1117 if (Size > 32)
1118 return NumElts * ((Size + 31) / 32);
1119 } else if (VT.getSizeInBits() > 32)
1120 return (VT.getSizeInBits() + 31) / 32;
1121
1122 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1123}
1124
1125unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
1126 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1127 unsigned &NumIntermediates, MVT &RegisterVT) const {
1128 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1129 unsigned NumElts = VT.getVectorNumElements();
1130 EVT ScalarVT = VT.getScalarType();
1131 unsigned Size = ScalarVT.getSizeInBits();
1132 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1133 // support, but unless we can properly handle 3-vectors, it will be still be
1134 // inconsistent.
1135 if (Size == 16 && Subtarget->has16BitInsts()) {
1136 if (ScalarVT == MVT::bf16) {
1137 RegisterVT = MVT::i32;
1138 IntermediateVT = MVT::v2bf16;
1139 } else {
1140 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1141 IntermediateVT = RegisterVT;
1142 }
1143 NumIntermediates = (NumElts + 1) / 2;
1144 return NumIntermediates;
1145 }
1146
1147 if (Size == 32) {
1148 RegisterVT = ScalarVT.getSimpleVT();
1149 IntermediateVT = RegisterVT;
1150 NumIntermediates = NumElts;
1151 return NumIntermediates;
1152 }
1153
1154 if (Size < 16 && Subtarget->has16BitInsts()) {
1155 // FIXME: Should probably form v2i16 pieces
1156 RegisterVT = MVT::i16;
1157 IntermediateVT = ScalarVT;
1158 NumIntermediates = NumElts;
1159 return NumIntermediates;
1160 }
1161
1162 if (Size != 16 && Size <= 32) {
1163 RegisterVT = MVT::i32;
1164 IntermediateVT = ScalarVT;
1165 NumIntermediates = NumElts;
1166 return NumIntermediates;
1167 }
1168
1169 if (Size > 32) {
1170 RegisterVT = MVT::i32;
1171 IntermediateVT = RegisterVT;
1172 NumIntermediates = NumElts * ((Size + 31) / 32);
1173 return NumIntermediates;
1174 }
1175 }
1176
1177 return TargetLowering::getVectorTypeBreakdownForCallingConv(
1178 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1179}
1180
1181static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1182 const DataLayout &DL, Type *Ty,
1183 unsigned MaxNumLanes) {
1184 assert(MaxNumLanes != 0);
1185
1186 LLVMContext &Ctx = Ty->getContext();
1187 if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
1188 unsigned NumElts = std::min(a: MaxNumLanes, b: VT->getNumElements());
1189 return EVT::getVectorVT(Context&: Ctx, VT: TLI.getValueType(DL, Ty: VT->getElementType()),
1190 NumElements: NumElts);
1191 }
1192
1193 return TLI.getValueType(DL, Ty);
1194}
1195
1196// Peek through TFE struct returns to only use the data size.
1197static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1198 const DataLayout &DL, Type *Ty,
1199 unsigned MaxNumLanes) {
1200 auto *ST = dyn_cast<StructType>(Val: Ty);
1201 if (!ST)
1202 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1203
1204 // TFE intrinsics return an aggregate type.
1205 assert(ST->getNumContainedTypes() == 2 &&
1206 ST->getContainedType(1)->isIntegerTy(32));
1207 return memVTFromLoadIntrData(TLI, DL, Ty: ST->getContainedType(i: 0), MaxNumLanes);
1208}
1209
1210/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1211/// in-memory representation. This return value is a custom type because there
1212/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1213/// could cause issues during codegen, these address space 7 pointers will be
1214/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1215/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1216/// for cost modeling, to work. (This also sets us up decently for doing the
1217/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1218MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1219 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1220 return MVT::amdgpuBufferFatPointer;
1221 if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1222 DL.getPointerSizeInBits(AS) == 192)
1223 return MVT::amdgpuBufferStridedPointer;
1224 return AMDGPUTargetLowering::getPointerTy(DL, AS);
1225}
1226/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1227/// v8i32 when padding is added.
1228/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1229/// also v8i32 with padding.
1230MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1231 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1232 DL.getPointerSizeInBits(AS) == 160) ||
1233 (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1234 DL.getPointerSizeInBits(AS) == 192))
1235 return MVT::v8i32;
1236 return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1237}
1238
1239bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
1240 const CallInst &CI,
1241 MachineFunction &MF,
1242 unsigned IntrID) const {
1243 Info.flags = MachineMemOperand::MONone;
1244 if (CI.hasMetadata(KindID: LLVMContext::MD_invariant_load))
1245 Info.flags |= MachineMemOperand::MOInvariant;
1246 if (CI.hasMetadata(KindID: LLVMContext::MD_nontemporal))
1247 Info.flags |= MachineMemOperand::MONonTemporal;
1248 Info.flags |= getTargetMMOFlags(I: CI);
1249
1250 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1251 AMDGPU::lookupRsrcIntrinsic(Intr: IntrID)) {
1252 AttributeSet Attr =
1253 Intrinsic::getFnAttributes(C&: CI.getContext(), id: (Intrinsic::ID)IntrID);
1254 MemoryEffects ME = Attr.getMemoryEffects();
1255 if (ME.doesNotAccessMemory())
1256 return false;
1257
1258 // TODO: Should images get their own address space?
1259 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1260
1261 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1262 if (RsrcIntr->IsImage) {
1263 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1264 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID);
1265 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
1266 Info.align.reset();
1267 }
1268
1269 Value *RsrcArg = CI.getArgOperand(i: RsrcIntr->RsrcArg);
1270 if (auto *RsrcPtrTy = dyn_cast<PointerType>(Val: RsrcArg->getType())) {
1271 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1272 // We conservatively set the memory operand of a buffer intrinsic to the
1273 // base resource pointer, so that we can access alias information about
1274 // those pointers. Cases like "this points at the same value
1275 // but with a different offset" are handled in
1276 // areMemAccessesTriviallyDisjoint.
1277 Info.ptrVal = RsrcArg;
1278 }
1279
1280 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1281 if (!IsSPrefetch) {
1282 auto *Aux = cast<ConstantInt>(Val: CI.getArgOperand(i: CI.arg_size() - 1));
1283 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1284 Info.flags |= MachineMemOperand::MOVolatile;
1285 }
1286
1287 Info.flags |= MachineMemOperand::MODereferenceable;
1288 if (ME.onlyReadsMemory()) {
1289 if (RsrcIntr->IsImage) {
1290 unsigned MaxNumLanes = 4;
1291
1292 if (!BaseOpcode->Gather4) {
1293 // If this isn't a gather, we may have excess loaded elements in the
1294 // IR type. Check the dmask for the real number of elements loaded.
1295 unsigned DMask =
1296 cast<ConstantInt>(Val: CI.getArgOperand(i: 0))->getZExtValue();
1297 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(Value: DMask);
1298 }
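        // For example, a dmask of 0b0101 enables two components, so only two
        // lanes are loaded even if the IR return type has four elements.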
1299
1300 Info.memVT = memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(),
1301 Ty: CI.getType(), MaxNumLanes);
1302 } else {
1303 Info.memVT =
1304 memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(), Ty: CI.getType(),
1305 MaxNumLanes: std::numeric_limits<unsigned>::max());
1306 }
1307
1308 // FIXME: What does alignment mean for an image?
1309 Info.opc = ISD::INTRINSIC_W_CHAIN;
1310 Info.flags |= MachineMemOperand::MOLoad;
1311 } else if (ME.onlyWritesMemory()) {
1312 Info.opc = ISD::INTRINSIC_VOID;
1313
1314 Type *DataTy = CI.getArgOperand(i: 0)->getType();
1315 if (RsrcIntr->IsImage) {
1316 unsigned DMask = cast<ConstantInt>(Val: CI.getArgOperand(i: 1))->getZExtValue();
1317 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(Value: DMask);
1318 Info.memVT = memVTFromLoadIntrData(TLI: *this, DL: MF.getDataLayout(), Ty: DataTy,
1319 MaxNumLanes: DMaskLanes);
1320 } else
1321 Info.memVT = getValueType(DL: MF.getDataLayout(), Ty: DataTy);
1322
1323 Info.flags |= MachineMemOperand::MOStore;
1324 } else {
1325      // Atomic, no-return sampler, or prefetch.
1326 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1327 : ISD::INTRINSIC_W_CHAIN;
1328 Info.flags |=
1329 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
1330
1331 if (!IsSPrefetch)
1332 Info.flags |= MachineMemOperand::MOStore;
1333
1334 switch (IntrID) {
1335 default:
1336 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1337          // Fake memory access type for no-return sampler intrinsics.
1338 Info.memVT = MVT::i32;
1339 } else {
1340 // XXX - Should this be volatile without known ordering?
1341 Info.flags |= MachineMemOperand::MOVolatile;
1342 Info.memVT = MVT::getVT(Ty: CI.getArgOperand(i: 0)->getType());
1343 }
1344 break;
1345 case Intrinsic::amdgcn_raw_buffer_load_lds:
1346 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1347 case Intrinsic::amdgcn_struct_buffer_load_lds:
1348 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1349 unsigned Width = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getZExtValue();
1350 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: Width * 8);
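      // The width operand is a byte count, so e.g. a width of 4 gives an i32
      // memory type; the memory operand tracks the LDS destination pointer.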
1351 Info.ptrVal = CI.getArgOperand(i: 1);
1352 return true;
1353 }
1354 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1355 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1356 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1357 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1358 Info.memVT =
1359 memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(), Ty: CI.getType(),
1360 MaxNumLanes: std::numeric_limits<unsigned>::max());
1361 Info.flags &= ~MachineMemOperand::MOStore;
1362 return true;
1363 }
1364 }
1365 }
1366 return true;
1367 }
1368
1369 switch (IntrID) {
1370 case Intrinsic::amdgcn_ds_ordered_add:
1371 case Intrinsic::amdgcn_ds_ordered_swap: {
1372 Info.opc = ISD::INTRINSIC_W_CHAIN;
1373 Info.memVT = MVT::getVT(Ty: CI.getType());
1374 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1375 Info.align.reset();
1376 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1377
1378 const ConstantInt *Vol = cast<ConstantInt>(Val: CI.getOperand(i_nocapture: 4));
1379 if (!Vol->isZero())
1380 Info.flags |= MachineMemOperand::MOVolatile;
1381
1382 return true;
1383 }
1384 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1385 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1386 Info.opc = ISD::INTRINSIC_W_CHAIN;
1387 Info.memVT = MVT::getVT(Ty: CI.getOperand(i_nocapture: 0)->getType());
1388 Info.ptrVal = nullptr;
1389 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1390 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1391 return true;
1392 }
1393 case Intrinsic::amdgcn_ds_append:
1394 case Intrinsic::amdgcn_ds_consume: {
1395 Info.opc = ISD::INTRINSIC_W_CHAIN;
1396 Info.memVT = MVT::getVT(Ty: CI.getType());
1397 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1398 Info.align.reset();
1399 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1400
1401 const ConstantInt *Vol = cast<ConstantInt>(Val: CI.getOperand(i_nocapture: 1));
1402 if (!Vol->isZero())
1403 Info.flags |= MachineMemOperand::MOVolatile;
1404
1405 return true;
1406 }
1407 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1408 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1409 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1410 ? ISD::INTRINSIC_W_CHAIN
1411 : ISD::INTRINSIC_VOID;
1413 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1414 Info.memVT = MVT::i64;
1415 Info.size = 8;
1416 Info.align.reset();
1417 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1418 return true;
1419 }
1420 case Intrinsic::amdgcn_global_atomic_csub: {
1421 Info.opc = ISD::INTRINSIC_W_CHAIN;
1422 Info.memVT = MVT::getVT(Ty: CI.getType());
1423 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1424 Info.align.reset();
1425 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
1426 MachineMemOperand::MOVolatile;
1427 return true;
1428 }
1429 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1430 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1431 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1432 Info.opc = ISD::INTRINSIC_W_CHAIN;
1433 Info.memVT =
1434 MVT::getVT(Ty: IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1435 ? CI.getType()
1436 : cast<StructType>(Val: CI.getType())
1437 ->getElementType(N: 0)); // XXX: what is correct VT?
1438
1439 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1440 Info.align.reset();
1441 Info.flags |=
1442 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
1443 return true;
1444 }
1445 case Intrinsic::amdgcn_global_atomic_fmin_num:
1446 case Intrinsic::amdgcn_global_atomic_fmax_num:
1447 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1448 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1449 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1450 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1451 Info.opc = ISD::INTRINSIC_W_CHAIN;
1452 Info.memVT = MVT::getVT(Ty: CI.getType());
1453 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1454 Info.align.reset();
1455 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
1456 MachineMemOperand::MODereferenceable |
1457 MachineMemOperand::MOVolatile;
1458 return true;
1459 }
1460 case Intrinsic::amdgcn_ds_load_tr6_b96:
1461 case Intrinsic::amdgcn_ds_load_tr4_b64:
1462 case Intrinsic::amdgcn_ds_load_tr8_b64:
1463 case Intrinsic::amdgcn_ds_load_tr16_b128:
1464 case Intrinsic::amdgcn_global_load_tr6_b96:
1465 case Intrinsic::amdgcn_global_load_tr4_b64:
1466 case Intrinsic::amdgcn_global_load_tr_b64:
1467 case Intrinsic::amdgcn_global_load_tr_b128:
1468 case Intrinsic::amdgcn_ds_read_tr4_b64:
1469 case Intrinsic::amdgcn_ds_read_tr6_b96:
1470 case Intrinsic::amdgcn_ds_read_tr8_b64:
1471 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1472 Info.opc = ISD::INTRINSIC_W_CHAIN;
1473 Info.memVT = MVT::getVT(Ty: CI.getType());
1474 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1475 Info.align.reset();
1476 Info.flags |= MachineMemOperand::MOLoad;
1477 return true;
1478 }
1479 case Intrinsic::amdgcn_ds_gws_init:
1480 case Intrinsic::amdgcn_ds_gws_barrier:
1481 case Intrinsic::amdgcn_ds_gws_sema_v:
1482 case Intrinsic::amdgcn_ds_gws_sema_br:
1483 case Intrinsic::amdgcn_ds_gws_sema_p:
1484 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1485 Info.opc = ISD::INTRINSIC_VOID;
1486
1487 const GCNTargetMachine &TM =
1488 static_cast<const GCNTargetMachine &>(getTargetMachine());
1489
1490 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1491 Info.ptrVal = MFI->getGWSPSV(TM);
1492
1493 // This is an abstract access, but we need to specify a type and size.
1494 Info.memVT = MVT::i32;
1495 Info.size = 4;
1496 Info.align = Align(4);
1497
1498 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1499 Info.flags |= MachineMemOperand::MOLoad;
1500 else
1501 Info.flags |= MachineMemOperand::MOStore;
1502 return true;
1503 }
1504 case Intrinsic::amdgcn_load_to_lds:
1505 case Intrinsic::amdgcn_global_load_lds: {
1506 Info.opc = ISD::INTRINSIC_VOID;
1507 unsigned Width = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getZExtValue();
1508 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: Width * 8);
1509 Info.ptrVal = CI.getArgOperand(i: 1);
1510 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1511 return true;
1512 }
1513 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1514 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1515 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1516 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1517 Info.opc = ISD::INTRINSIC_W_CHAIN;
1518
1519 const GCNTargetMachine &TM =
1520 static_cast<const GCNTargetMachine &>(getTargetMachine());
1521
1522 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1523 Info.ptrVal = MFI->getGWSPSV(TM);
1524
1525 // This is an abstract access, but we need to specify a type and size.
1526 Info.memVT = MVT::i32;
1527 Info.size = 4;
1528 Info.align = Align(4);
1529
1530 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1531 return true;
1532 }
1533 case Intrinsic::amdgcn_s_prefetch_data: {
1534 Info.opc = ISD::INTRINSIC_VOID;
1535 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: 8);
1536 Info.ptrVal = CI.getArgOperand(i: 0);
1537 Info.flags |= MachineMemOperand::MOLoad;
1538 return true;
1539 }
1540 default:
1541 return false;
1542 }
1543}
1544
1545void SITargetLowering::CollectTargetIntrinsicOperands(
1546 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1547 switch (cast<IntrinsicInst>(Val: I).getIntrinsicID()) {
1548 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1549 // The DAG's ValueType loses the addrspaces.
1550 // Add them as 2 extra Constant operands "from" and "to".
1551 unsigned SrcAS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
1552 unsigned DstAS = I.getType()->getPointerAddressSpace();
1553 Ops.push_back(Elt: DAG.getTargetConstant(Val: SrcAS, DL: SDLoc(), VT: MVT::i32));
1554 Ops.push_back(Elt: DAG.getTargetConstant(Val: DstAS, DL: SDLoc(), VT: MVT::i32));
1555 break;
1556 }
1557 default:
1558 break;
1559 }
1560}
1561
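// For the intrinsics handled below, report which operand is the pointer and
// which type is accessed so that address computations feeding these
// intrinsics can be folded into their addressing modes, just as is done for
// ordinary loads and stores.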
1562bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
1563 SmallVectorImpl<Value *> &Ops,
1564 Type *&AccessTy) const {
1565 Value *Ptr = nullptr;
1566 switch (II->getIntrinsicID()) {
1567 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1568 case Intrinsic::amdgcn_ds_append:
1569 case Intrinsic::amdgcn_ds_consume:
1570 case Intrinsic::amdgcn_ds_load_tr8_b64:
1571 case Intrinsic::amdgcn_ds_load_tr16_b128:
1572 case Intrinsic::amdgcn_ds_load_tr4_b64:
1573 case Intrinsic::amdgcn_ds_load_tr6_b96:
1574 case Intrinsic::amdgcn_ds_read_tr4_b64:
1575 case Intrinsic::amdgcn_ds_read_tr6_b96:
1576 case Intrinsic::amdgcn_ds_read_tr8_b64:
1577 case Intrinsic::amdgcn_ds_read_tr16_b64:
1578 case Intrinsic::amdgcn_ds_ordered_add:
1579 case Intrinsic::amdgcn_ds_ordered_swap:
1580 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1581 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1582 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1583 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1584 case Intrinsic::amdgcn_global_atomic_csub:
1585 case Intrinsic::amdgcn_global_atomic_fmax_num:
1586 case Intrinsic::amdgcn_global_atomic_fmin_num:
1587 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1588 case Intrinsic::amdgcn_global_load_tr_b64:
1589 case Intrinsic::amdgcn_global_load_tr_b128:
1590 case Intrinsic::amdgcn_global_load_tr4_b64:
1591 case Intrinsic::amdgcn_global_load_tr6_b96:
1592 Ptr = II->getArgOperand(i: 0);
1593 break;
1594 case Intrinsic::amdgcn_load_to_lds:
1595 case Intrinsic::amdgcn_global_load_lds:
1596 Ptr = II->getArgOperand(i: 1);
1597 break;
1598 default:
1599 return false;
1600 }
1601 AccessTy = II->getType();
1602 Ops.push_back(Elt: Ptr);
1603 return true;
1604}
1605
1606bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1607 unsigned AddrSpace) const {
1608 if (!Subtarget->hasFlatInstOffsets()) {
1609 // Flat instructions do not have offsets, and only have the register
1610 // address.
1611 return AM.BaseOffs == 0 && AM.Scale == 0;
1612 }
1613
1614 decltype(SIInstrFlags::FLAT) FlatVariant =
1615 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ? SIInstrFlags::FlatGlobal
1616 : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch
1617 : SIInstrFlags::FLAT;
1618
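  // For example, "base + 4095" in the global address space is accepted when
  // isLegalFLATOffset allows an offset of 4095 for the FlatGlobal variant,
  // whereas any scaled-index form (AM.Scale != 0) is always rejected here.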
1619 return AM.Scale == 0 &&
1620 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1621 Offset: AM.BaseOffs, AddrSpace, FlatVariant));
1622}
1623
1624bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1625 if (Subtarget->hasFlatGlobalInsts())
1626 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS);
1627
1628 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1629    // Assume that we will use FLAT for all global memory accesses
1630 // on VI.
1631 // FIXME: This assumption is currently wrong. On VI we still use
1632 // MUBUF instructions for the r + i addressing mode. As currently
1633 // implemented, the MUBUF instructions only work on buffer < 4GB.
1634 // It may be possible to support > 4GB buffers with MUBUF instructions,
1635 // by setting the stride value in the resource descriptor which would
1636 // increase the size limit to (stride * 4GB). However, this is risky,
1637 // because it has never been validated.
1638 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS);
1639 }
1640
1641 return isLegalMUBUFAddressingMode(AM);
1642}
1643
1644bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1645 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1646 // additionally can do r + r + i with addr64. 32-bit has more addressing
1647 // mode options. Depending on the resource constant, it can also do
1648 // (i64 r0) + (i32 r1) * (i14 i).
1649 //
1650 // Private arrays end up using a scratch buffer most of the time, so also
1651 // assume those use MUBUF instructions. Scratch loads / stores are currently
1652  // implemented as MUBUF instructions with the offen bit set, so they are
1653  // slightly different from the normal addr64 form.
1654 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1655 if (!TII->isLegalMUBUFImmOffset(Imm: AM.BaseOffs))
1656 return false;
1657
1658 // FIXME: Since we can split immediate into soffset and immediate offset,
1659 // would it make sense to allow any immediate?
1660
1661 switch (AM.Scale) {
1662 case 0: // r + i or just i, depending on HasBaseReg.
1663 return true;
1664 case 1:
1665 return true; // We have r + r or r + i.
1666 case 2:
1667 if (AM.HasBaseReg) {
1668 // Reject 2 * r + r.
1669 return false;
1670 }
1671
1672 // Allow 2 * r as r + r
1673 // Or 2 * r + i is allowed as r + r + i.
1674 return true;
1675 default: // Don't allow n * r
1676 return false;
1677 }
1678}
1679
1680bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1681 const AddrMode &AM, Type *Ty,
1682 unsigned AS,
1683 Instruction *I) const {
1684 // No global is ever allowed as a base.
1685 if (AM.BaseGV)
1686 return false;
1687
1688 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1689 return isLegalGlobalAddressingMode(AM);
1690
1691 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1692 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1693 AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
1694 AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
1695 // If the offset isn't a multiple of 4, it probably isn't going to be
1696 // correctly aligned.
1697 // FIXME: Can we get the real alignment here?
1698 if (AM.BaseOffs % 4 != 0)
1699 return isLegalMUBUFAddressingMode(AM);
1700
1701 if (!Subtarget->hasScalarSubwordLoads()) {
1702 // There are no SMRD extloads, so if we have to do a small type access we
1703 // will use a MUBUF load.
1704 // FIXME?: We also need to do this if unaligned, but we don't know the
1705 // alignment here.
1706 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1707 return isLegalGlobalAddressingMode(AM);
1708 }
1709
1710 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1711 // SMRD instructions have an 8-bit, dword offset on SI.
1712 if (!isUInt<8>(x: AM.BaseOffs / 4))
1713 return false;
1714 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1715 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1716 // in 8-bits, it can use a smaller encoding.
1717 if (!isUInt<32>(x: AM.BaseOffs / 4))
1718 return false;
1719 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1720 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1721 if (!isUInt<20>(x: AM.BaseOffs))
1722 return false;
1723 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1724 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1725 // for S_BUFFER_* instructions).
1726 if (!isInt<21>(x: AM.BaseOffs))
1727 return false;
1728 } else {
1729 // On GFX12, all offsets are signed 24-bit in bytes.
1730 if (!isInt<24>(x: AM.BaseOffs))
1731 return false;
1732 }
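    // As a worked example, a byte offset of 4096 is dword offset 1024, which
    // needs 11 bits: it is rejected on SI by the 8-bit check above but
    // accepted on CI and later generations.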
1733
1734 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1735 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1736 AM.BaseOffs < 0) {
1737 // Scalar (non-buffer) loads can only use a negative offset if
1738 // soffset+offset is non-negative. Since the compiler can only prove that
1739 // in a few special cases, it is safer to claim that negative offsets are
1740 // not supported.
1741 return false;
1742 }
1743
1744 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1745 return true;
1746
1747 if (AM.Scale == 1 && AM.HasBaseReg)
1748 return true;
1749
1750 return false;
1751 }
1752
1753 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1754 return Subtarget->enableFlatScratch()
1755 ? isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS)
1756 : isLegalMUBUFAddressingMode(AM);
1757
1758 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1759 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1760 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1761 // field.
1762 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1763 // an 8-bit dword offset but we don't know the alignment here.
1764 if (!isUInt<16>(x: AM.BaseOffs))
1765 return false;
1766
1767 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1768 return true;
1769
1770 if (AM.Scale == 1 && AM.HasBaseReg)
1771 return true;
1772
1773 return false;
1774 }
1775
1776 if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1777 // For an unknown address space, this usually means that this is for some
1778 // reason being used for pure arithmetic, and not based on some addressing
1779 // computation. We don't have instructions that compute pointers with any
1780 // addressing modes, so treat them as having no offset like flat
1781 // instructions.
1782 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS);
1783 }
1784
1785 // Assume a user alias of global for unknown address spaces.
1786 return isLegalGlobalAddressingMode(AM);
1787}
1788
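// The per-address-space limits checked below: flat and global stores may be
// merged up to 128 bits, LDS/GDS stores up to 64 bits, and private stores up
// to 8 * getMaxPrivateElementSize() bits.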
1789bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1790 const MachineFunction &MF) const {
1791 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1792 return (MemVT.getSizeInBits() <= 4 * 32);
1793 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1794 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1795 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1796 }
1797 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
1798 return (MemVT.getSizeInBits() <= 2 * 32);
1799 return true;
1800}
1801
1802bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1803 unsigned Size, unsigned AddrSpace, Align Alignment,
1804 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1805 if (IsFast)
1806 *IsFast = 0;
1807
1808 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1809 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1810 // Check if alignment requirements for ds_read/write instructions are
1811 // disabled.
1812 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1813 return false;
1814
1815 Align RequiredAlignment(
1816 PowerOf2Ceil(A: divideCeil(Numerator: Size, Denominator: 8))); // Natural alignment.
1817 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1818 Alignment < RequiredAlignment)
1819 return false;
1820
1821    // Either the alignment requirements are "enabled", or there is an
1822    // unaligned-LDS-access-related hardware bug even though the alignment
1823    // requirements are "disabled". In either case, we need to check for
1824    // proper alignment.
1825 //
1826 switch (Size) {
1827 case 64:
1828 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1829 // address is negative, then the instruction is incorrectly treated as
1830 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1831 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1832 // load later in the SILoadStoreOptimizer.
1833 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1834 return false;
1835
1836      // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1837      // can do a 4-byte aligned, 8-byte access in a single operation using
1838      // ds_read2/write2_b32 with adjacent offsets.
1839 RequiredAlignment = Align(4);
1840
1841 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1842 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1843 // ds_write2_b32 depending on the alignment. In either case with either
1844 // alignment there is no faster way of doing this.
1845
1846        // The numbers returned here and below are not additive; they form a
1847        // 'speed rank', only meant to be compared to decide whether one way of
1848        // lowering an operation is faster than another. For that purpose a
1849        // naturally aligned operation gets its bitsize to indicate that "it
1850        // operates with a speed comparable to an N-bit wide load". With full
1851        // alignment ds128 is slower than ds96, for example. If underaligned, it
1852        // is comparable in speed to a single dword access, which would then
1853        // mean 32 < 128 and it is faster to issue a wide load regardless. A
1854        // value of 1 simply means "slow, don't do it": an aligned load is
1855        // faster than a wider load that would no longer be aligned.
1856 if (IsFast)
1857 *IsFast = (Alignment >= RequiredAlignment) ? 64
1858 : (Alignment < Align(4)) ? 32
1859 : 1;
1860 return true;
1861 }
1862
1863 break;
1864 case 96:
1865 if (!Subtarget->hasDS96AndDS128())
1866 return false;
1867
1868      // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1869      // gfx8 and older.
1870
1871 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1872        // Naturally aligned access is fastest. However, also report it as fast
1873        // if the memory is aligned to less than a DWORD. A narrow load or store
1874        // will be just as slow as a single ds_read_b96/ds_write_b96, but there
1875        // will be more of them, so overall we pay less penalty by issuing a
1876        // single instruction.
1877
1878 // See comment on the values above.
1879 if (IsFast)
1880 *IsFast = (Alignment >= RequiredAlignment) ? 96
1881 : (Alignment < Align(4)) ? 32
1882 : 1;
1883 return true;
1884 }
1885
1886 break;
1887 case 128:
1888 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1889 return false;
1890
1891      // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1892      // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1893      // single operation using ds_read2/write2_b64.
1894 RequiredAlignment = Align(8);
1895
1896 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1897        // Naturally aligned access is fastest. However, also report it as fast
1898        // if the memory is aligned to less than a DWORD. A narrow load or store
1899        // will be just as slow as a single ds_read_b128/ds_write_b128, but there
1900        // will be more of them, so overall we pay less penalty by issuing a
1901        // single instruction.
1902
1903 // See comment on the values above.
1904 if (IsFast)
1905 *IsFast = (Alignment >= RequiredAlignment) ? 128
1906 : (Alignment < Align(4)) ? 32
1907 : 1;
1908 return true;
1909 }
1910
1911 break;
1912 default:
1913 if (Size > 32)
1914 return false;
1915
1916 break;
1917 }
1918
1919 // See comment on the values above.
1920    // Note that we have a single-dword or sub-dword access here, so if it is
1921    // underaligned it is the slowest possible access, hence the returned value is 0.
1922 if (IsFast)
1923 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1924
1925 return Alignment >= RequiredAlignment ||
1926 Subtarget->hasUnalignedDSAccessEnabled();
1927 }
1928
1929 // FIXME: We have to be conservative here and assume that flat operations
1930 // will access scratch. If we had access to the IR function, then we
1931 // could determine if any private memory was used in the function.
1932 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1933 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
1934 bool AlignedBy4 = Alignment >= Align(4);
1935 if (IsFast)
1936 *IsFast = AlignedBy4;
1937
1938 return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
1939 }
1940
1941 // So long as they are correct, wide global memory operations perform better
1942  // than multiple smaller memory ops -- even when misaligned.
1943 if (AMDGPU::isExtendedGlobalAddrSpace(AS: AddrSpace)) {
1944 if (IsFast)
1945 *IsFast = Size;
1946
1947 return Alignment >= Align(4) ||
1948 Subtarget->hasUnalignedBufferAccessEnabled();
1949 }
1950
1951 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
1952 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
1953 // out-of-bounds behavior, but in the edge case where an access starts
1954  // out-of-bounds and then enters in-bounds, the entire access would be treated
1955 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
1956 // natural alignment of buffer accesses.
1957 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
1958 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
1959 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
1960 if (!Subtarget->hasRelaxedBufferOOBMode() &&
1961 Alignment < Align(PowerOf2Ceil(A: divideCeil(Numerator: Size, Denominator: 8))))
1962 return false;
1963 }
1964
1965 // Smaller than dword value must be aligned.
1966 if (Size < 32)
1967 return false;
1968
1969 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1970 // byte-address are ignored, thus forcing Dword alignment.
1971 // This applies to private, global, and constant memory.
1972 if (IsFast)
1973 *IsFast = 1;
1974
1975 return Size >= 32 && Alignment >= Align(4);
1976}
1977
1978bool SITargetLowering::allowsMisalignedMemoryAccesses(
1979 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1980 unsigned *IsFast) const {
1981 return allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace,
1982 Alignment, Flags, IsFast);
1983}
1984
1985EVT SITargetLowering::getOptimalMemOpType(
1986 const MemOp &Op, const AttributeList &FuncAttributes) const {
1987 // FIXME: Should account for address space here.
1988
1989 // The default fallback uses the private pointer size as a guess for a type to
1990 // use. Make sure we switch these to 64-bit accesses.
1991
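  // For instance, a 32-byte memcpy whose destination is known to be 4-byte
  // aligned is expanded using v4i32 (dwordx4) operations rather than many
  // narrower accesses.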
1992 if (Op.size() >= 16 &&
1993 Op.isDstAligned(AlignCheck: Align(4))) // XXX: Should only do for global
1994 return MVT::v4i32;
1995
1996 if (Op.size() >= 8 && Op.isDstAligned(AlignCheck: Align(4)))
1997 return MVT::v2i32;
1998
1999 // Use the default.
2000 return MVT::Other;
2001}
2002
2003bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
2004 const MemSDNode *MemNode = cast<MemSDNode>(Val: N);
2005 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2006}
2007
2008bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
2009 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
2010 AS == AMDGPUAS::PRIVATE_ADDRESS;
2011}
2012
2013bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
2014 unsigned DestAS) const {
2015 // Flat -> private/local is a simple truncate.
2016  // Flat -> global is a no-op.
2017 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2018 return true;
2019
2020 const GCNTargetMachine &TM =
2021 static_cast<const GCNTargetMachine &>(getTargetMachine());
2022 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2023}
2024
2025TargetLoweringBase::LegalizeTypeAction
2026SITargetLowering::getPreferredVectorAction(MVT VT) const {
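  // Keep small-element vectors as vectors instead of scalarizing them: for
  // example, v3i16 is widened to v4i16, while a power-of-2 type such as
  // v4i16 is split into two v2i16 halves.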
2027 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2028 VT.getScalarType().bitsLE(VT: MVT::i16))
2029 return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
2030 return TargetLoweringBase::getPreferredVectorAction(VT);
2031}
2032
2033bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
2034 Type *Ty) const {
2035 // FIXME: Could be smarter if called for vector constants.
2036 return true;
2037}
2038
2039bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
2040 unsigned Index) const {
2041 if (!isOperationLegalOrCustom(Op: ISD::EXTRACT_SUBVECTOR, VT: ResVT))
2042 return false;
2043
2044 // TODO: Add more cases that are cheap.
2045 return Index == 0;
2046}
2047
2048bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2049  // TODO: This should be more aggressive, particularly for 16-bit element
2050  // vectors. However, there are some mixed improvements and regressions.
2051 EVT EltTy = VT.getVectorElementType();
2052 return EltTy.getSizeInBits() % 32 == 0;
2053}
2054
2055bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
2056 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2057 switch (Op) {
2058 case ISD::LOAD:
2059 case ISD::STORE:
2060 return true;
2061 default:
2062 return false;
2063 }
2064 }
2065
2066 // SimplifySetCC uses this function to determine whether or not it should
2067 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2068 if (VT == MVT::i1 && Op == ISD::SETCC)
2069 return false;
2070
2071 return TargetLowering::isTypeDesirableForOp(Op, VT);
2072}
2073
2074SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2075 const SDLoc &SL,
2076 SDValue Chain,
2077 uint64_t Offset) const {
2078 const DataLayout &DL = DAG.getDataLayout();
2079 MachineFunction &MF = DAG.getMachineFunction();
2080 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2081 MVT PtrVT = getPointerTy(DL, AS: AMDGPUAS::CONSTANT_ADDRESS);
2082
2083 auto [InputPtrReg, RC, ArgTy] =
2084 Info->getPreloadedValue(Value: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2085
2086 // We may not have the kernarg segment argument if we have no kernel
2087 // arguments.
2088 if (!InputPtrReg)
2089 return DAG.getConstant(Val: Offset, DL: SL, VT: PtrVT);
2090
2091 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
2092 SDValue BasePtr = DAG.getCopyFromReg(
2093 Chain, dl: SL, Reg: MRI.getLiveInVirtReg(PReg: InputPtrReg->getRegister()), VT: PtrVT);
2094
2095 return DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Offset));
2096}
2097
2098SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2099 const SDLoc &SL) const {
2100 uint64_t Offset =
2101 getImplicitParameterOffset(MF: DAG.getMachineFunction(), Param: FIRST_IMPLICIT);
2102 return lowerKernArgParameterPtr(DAG, SL, Chain: DAG.getEntryNode(), Offset);
2103}
2104
2105SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2106 const SDLoc &SL) const {
2107
2108 Function &F = DAG.getMachineFunction().getFunction();
2109 std::optional<uint32_t> KnownSize =
2110 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
2111 if (KnownSize.has_value())
2112 return DAG.getConstant(Val: *KnownSize, DL: SL, VT: MVT::i32);
2113 return SDValue();
2114}
2115
2116SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2117 const SDLoc &SL, SDValue Val,
2118 bool Signed,
2119 const ISD::InputArg *Arg) const {
2120 // First, if it is a widened vector, narrow it.
2121 if (VT.isVector() &&
2122 VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
2123 EVT NarrowedVT =
2124 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(),
2125 NumElements: VT.getVectorNumElements());
2126 Val = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: NarrowedVT, N1: Val,
2127 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
2128 }
2129
2130 // Then convert the vector elements or scalar value.
2131 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(VT: MemVT)) {
2132 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2133 Val = DAG.getNode(Opcode: Opc, DL: SL, VT: MemVT, N1: Val, N2: DAG.getValueType(VT));
2134 }
2135
2136 if (MemVT.isFloatingPoint())
2137 Val = getFPExtOrFPRound(DAG, Op: Val, DL: SL, VT);
2138 else if (Signed)
2139 Val = DAG.getSExtOrTrunc(Op: Val, DL: SL, VT);
2140 else
2141 Val = DAG.getZExtOrTrunc(Op: Val, DL: SL, VT);
2142
2143 return Val;
2144}
2145
2146SDValue SITargetLowering::lowerKernargMemParameter(
2147 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2148 uint64_t Offset, Align Alignment, bool Signed,
2149 const ISD::InputArg *Arg) const {
2150 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2151
2152 // Try to avoid using an extload by loading earlier than the argument address,
2153 // and extracting the relevant bits. The load should hopefully be merged with
2154 // the previous argument.
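  // For example, an i16 argument at byte offset 6 is loaded as the dword at
  // offset 4, shifted right by 16 bits, and truncated to i16.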
2155 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2156 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2157 int64_t AlignDownOffset = alignDown(Value: Offset, Align: 4);
2158 int64_t OffsetDiff = Offset - AlignDownOffset;
2159
2160 EVT IntVT = MemVT.changeTypeToInteger();
2161
2162 // TODO: If we passed in the base kernel offset we could have a better
2163 // alignment than 4, but we don't really need it.
2164 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset: AlignDownOffset);
2165 SDValue Load = DAG.getLoad(VT: MVT::i32, dl: SL, Chain, Ptr, PtrInfo, Alignment: Align(4),
2166 MMOFlags: MachineMemOperand::MODereferenceable |
2167 MachineMemOperand::MOInvariant);
2168
2169 SDValue ShiftAmt = DAG.getConstant(Val: OffsetDiff * 8, DL: SL, VT: MVT::i32);
2170 SDValue Extract = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: Load, N2: ShiftAmt);
2171
2172 SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: IntVT, Operand: Extract);
2173 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MemVT, Operand: ArgVal);
2174 ArgVal = convertArgType(DAG, VT, MemVT, SL, Val: ArgVal, Signed, Arg);
2175
2176 return DAG.getMergeValues(Ops: {ArgVal, Load.getValue(R: 1)}, dl: SL);
2177 }
2178
2179 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2180 SDValue Load = DAG.getLoad(VT: MemVT, dl: SL, Chain, Ptr, PtrInfo, Alignment,
2181 MMOFlags: MachineMemOperand::MODereferenceable |
2182 MachineMemOperand::MOInvariant);
2183
2184 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Val: Load, Signed, Arg);
2185 return DAG.getMergeValues(Ops: {Val, Load.getValue(R: 1)}, dl: SL);
2186}
2187
2188SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2189 CCValAssign &VA, const SDLoc &SL,
2190 SDValue Chain,
2191 const ISD::InputArg &Arg) const {
2192 MachineFunction &MF = DAG.getMachineFunction();
2193 MachineFrameInfo &MFI = MF.getFrameInfo();
2194
2195 if (Arg.Flags.isByVal()) {
2196 unsigned Size = Arg.Flags.getByValSize();
2197 int FrameIdx = MFI.CreateFixedObject(Size, SPOffset: VA.getLocMemOffset(), IsImmutable: false);
2198 return DAG.getFrameIndex(FI: FrameIdx, VT: MVT::i32);
2199 }
2200
2201 unsigned ArgOffset = VA.getLocMemOffset();
2202 unsigned ArgSize = VA.getValVT().getStoreSize();
2203
2204 int FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset, IsImmutable: true);
2205
2206 // Create load nodes to retrieve arguments from the stack.
2207 SDValue FIN = DAG.getFrameIndex(FI, VT: MVT::i32);
2208 SDValue ArgValue;
2209
2210  // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
2211 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2212 MVT MemVT = VA.getValVT();
2213
2214 switch (VA.getLocInfo()) {
2215 default:
2216 break;
2217 case CCValAssign::BCvt:
2218 MemVT = VA.getLocVT();
2219 break;
2220 case CCValAssign::SExt:
2221 ExtType = ISD::SEXTLOAD;
2222 break;
2223 case CCValAssign::ZExt:
2224 ExtType = ISD::ZEXTLOAD;
2225 break;
2226 case CCValAssign::AExt:
2227 ExtType = ISD::EXTLOAD;
2228 break;
2229 }
2230
2231 ArgValue = DAG.getExtLoad(
2232 ExtType, dl: SL, VT: VA.getLocVT(), Chain, Ptr: FIN,
2233 PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI), MemVT);
2234 return ArgValue;
2235}
2236
2237SDValue SITargetLowering::getPreloadedValue(
2238 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2239 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
2240 const ArgDescriptor *Reg = nullptr;
2241 const TargetRegisterClass *RC;
2242 LLT Ty;
2243
2244 CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
2245 const ArgDescriptor WorkGroupIDX =
2246 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP9);
2247 // If GridZ is not programmed in an entry function then the hardware will set
2248 // it to all zeros, so there is no need to mask the GridY value in the low
2249 // order bits.
2250 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2251 Reg: AMDGPU::TTMP7,
2252 Mask: AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2253 const ArgDescriptor WorkGroupIDZ =
2254 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP7, Mask: 0xFFFF0000u);
2255 if (Subtarget->hasArchitectedSGPRs() &&
2256 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
2257 switch (PVID) {
2258 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
2259 Reg = &WorkGroupIDX;
2260 RC = &AMDGPU::SReg_32RegClass;
2261 Ty = LLT::scalar(SizeInBits: 32);
2262 break;
2263 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
2264 Reg = &WorkGroupIDY;
2265 RC = &AMDGPU::SReg_32RegClass;
2266 Ty = LLT::scalar(SizeInBits: 32);
2267 break;
2268 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
2269 Reg = &WorkGroupIDZ;
2270 RC = &AMDGPU::SReg_32RegClass;
2271 Ty = LLT::scalar(SizeInBits: 32);
2272 break;
2273 default:
2274 break;
2275 }
2276 }
2277
2278 if (!Reg)
2279 std::tie(args&: Reg, args&: RC, args&: Ty) = MFI.getPreloadedValue(Value: PVID);
2280 if (!Reg) {
2281 if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
2282 // It's possible for a kernarg intrinsic call to appear in a kernel with
2283 // no allocated segment, in which case we do not add the user sgpr
2284 // argument, so just return null.
2285 return DAG.getConstant(Val: 0, DL: SDLoc(), VT);
2286 }
2287
2288 // It's undefined behavior if a function marked with the amdgpu-no-*
2289 // attributes uses the corresponding intrinsic.
2290 return DAG.getPOISON(VT);
2291 }
2292
2293 return loadInputValue(DAG, RC, VT, SL: SDLoc(DAG.getEntryNode()), Arg: *Reg);
2294}
2295
2296static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
2297 CallingConv::ID CallConv,
2298 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2299 FunctionType *FType,
2300 SIMachineFunctionInfo *Info) {
2301 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2302 const ISD::InputArg *Arg = &Ins[I];
2303
2304 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2305 "vector type argument should have been split");
2306
2307 // First check if it's a PS input addr.
2308 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2309 PSInputNum <= 15) {
2310 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(Index: PSInputNum);
2311
2312 // Inconveniently only the first part of the split is marked as isSplit,
2313 // so skip to the end. We only want to increment PSInputNum once for the
2314 // entire split argument.
2315 if (Arg->Flags.isSplit()) {
2316 while (!Arg->Flags.isSplitEnd()) {
2317 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2318 "unexpected vector split in ps argument type");
2319 if (!SkipArg)
2320 Splits.push_back(Elt: *Arg);
2321 Arg = &Ins[++I];
2322 }
2323 }
2324
2325 if (SkipArg) {
2326 // We can safely skip PS inputs.
2327 Skipped.set(Arg->getOrigArgIndex());
2328 ++PSInputNum;
2329 continue;
2330 }
2331
2332 Info->markPSInputAllocated(Index: PSInputNum);
2333 if (Arg->Used)
2334 Info->markPSInputEnabled(Index: PSInputNum);
2335
2336 ++PSInputNum;
2337 }
2338
2339 Splits.push_back(Elt: *Arg);
2340 }
2341}
2342
2343// Allocate special inputs passed in VGPRs.
2344void SITargetLowering::allocateSpecialEntryInputVGPRs(
2345 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2346 SIMachineFunctionInfo &Info) const {
2347 const LLT S32 = LLT::scalar(SizeInBits: 32);
2348 MachineRegisterInfo &MRI = MF.getRegInfo();
2349
2350 if (Info.hasWorkItemIDX()) {
2351 Register Reg = AMDGPU::VGPR0;
2352 MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);
2353
2354 CCInfo.AllocateReg(Reg);
2355 unsigned Mask =
2356 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2357 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2358 }
2359
2360 if (Info.hasWorkItemIDY()) {
2361 assert(Info.hasWorkItemIDX());
2362 if (Subtarget->hasPackedTID()) {
2363 Info.setWorkItemIDY(
2364 ArgDescriptor::createRegister(Reg: AMDGPU::VGPR0, Mask: 0x3ff << 10));
2365 } else {
2366 unsigned Reg = AMDGPU::VGPR1;
2367 MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);
2368
2369 CCInfo.AllocateReg(Reg);
2370 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2371 }
2372 }
2373
2374 if (Info.hasWorkItemIDZ()) {
2375 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2376 if (Subtarget->hasPackedTID()) {
2377 Info.setWorkItemIDZ(
2378 ArgDescriptor::createRegister(Reg: AMDGPU::VGPR0, Mask: 0x3ff << 20));
2379 } else {
2380 unsigned Reg = AMDGPU::VGPR2;
2381 MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);
2382
2383 CCInfo.AllocateReg(Reg);
2384 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2385 }
2386 }
2387}
2388
2389// Try to allocate a VGPR at the end of the argument list, or if no argument
2390// VGPRs are left, allocate a stack slot instead.
2391// If \p Mask is given, it indicates the bitfield position in the register.
2392// If \p Arg is given, reuse it with the new \p Mask instead of allocating anew.
2393static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2394 ArgDescriptor Arg = ArgDescriptor()) {
2395 if (Arg.isSet())
2396 return ArgDescriptor::createArg(Arg, Mask);
2397
2398 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2399 unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgVGPRs);
2400 if (RegIdx == ArgVGPRs.size()) {
2401 // Spill to stack required.
2402 int64_t Offset = CCInfo.AllocateStack(Size: 4, Alignment: Align(4));
2403
2404 return ArgDescriptor::createStack(Offset, Mask);
2405 }
2406
2407 unsigned Reg = ArgVGPRs[RegIdx];
2408 Reg = CCInfo.AllocateReg(Reg);
2409 assert(Reg != AMDGPU::NoRegister);
2410
2411 MachineFunction &MF = CCInfo.getMachineFunction();
2412 Register LiveInVReg = MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass);
2413 MF.getRegInfo().setType(VReg: LiveInVReg, Ty: LLT::scalar(SizeInBits: 32));
2414 return ArgDescriptor::createRegister(Reg, Mask);
2415}
2416
2417static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2418 const TargetRegisterClass *RC,
2419 unsigned NumArgRegs) {
2420 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2421 unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgSGPRs);
2422 if (RegIdx == ArgSGPRs.size())
2423 report_fatal_error(reason: "ran out of SGPRs for arguments");
2424
2425 unsigned Reg = ArgSGPRs[RegIdx];
2426 Reg = CCInfo.AllocateReg(Reg);
2427 assert(Reg != AMDGPU::NoRegister);
2428
2429 MachineFunction &MF = CCInfo.getMachineFunction();
2430 MF.addLiveIn(PReg: Reg, RC);
2431 return ArgDescriptor::createRegister(Reg);
2432}
2433
2434// If this has a fixed position, we still should allocate the register in the
2435// CCInfo state. Technically we could get away with this for values passed
2436// outside of the normal argument range.
2437static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2438 const TargetRegisterClass *RC,
2439 MCRegister Reg) {
2440 Reg = CCInfo.AllocateReg(Reg);
2441 assert(Reg != AMDGPU::NoRegister);
2442 MachineFunction &MF = CCInfo.getMachineFunction();
2443 MF.addLiveIn(PReg: Reg, RC);
2444}
2445
2446static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2447 if (Arg) {
2448 allocateFixedSGPRInputImpl(CCInfo, RC: &AMDGPU::SGPR_32RegClass,
2449 Reg: Arg.getRegister());
2450 } else
2451 Arg = allocateSGPR32InputImpl(CCInfo, RC: &AMDGPU::SGPR_32RegClass, NumArgRegs: 32);
2452}
2453
2454static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2455 if (Arg) {
2456 allocateFixedSGPRInputImpl(CCInfo, RC: &AMDGPU::SGPR_64RegClass,
2457 Reg: Arg.getRegister());
2458 } else
2459 Arg = allocateSGPR32InputImpl(CCInfo, RC: &AMDGPU::SGPR_64RegClass, NumArgRegs: 16);
2460}
2461
2462/// Allocate implicit function VGPR arguments at the end of allocated user
2463/// arguments.
2464void SITargetLowering::allocateSpecialInputVGPRs(
2465 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2466 SIMachineFunctionInfo &Info) const {
2467 const unsigned Mask = 0x3ff;
2468 ArgDescriptor Arg;
2469
2470 if (Info.hasWorkItemIDX()) {
2471 Arg = allocateVGPR32Input(CCInfo, Mask);
2472 Info.setWorkItemIDX(Arg);
2473 }
2474
2475 if (Info.hasWorkItemIDY()) {
2476 Arg = allocateVGPR32Input(CCInfo, Mask: Mask << 10, Arg);
2477 Info.setWorkItemIDY(Arg);
2478 }
2479
2480 if (Info.hasWorkItemIDZ())
2481 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask: Mask << 20, Arg));
2482}
2483
2484/// Allocate implicit function VGPR arguments in fixed registers.
2485void SITargetLowering::allocateSpecialInputVGPRsFixed(
2486 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2487 SIMachineFunctionInfo &Info) const {
2488 Register Reg = CCInfo.AllocateReg(Reg: AMDGPU::VGPR31);
2489 if (!Reg)
2490 report_fatal_error(reason: "failed to allocate VGPR for implicit arguments");
2491
2492 const unsigned Mask = 0x3ff;
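  // All three work item IDs share this one VGPR: X lives in bits [9:0],
  // Y in bits [19:10], and Z in bits [29:20].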
2493 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2494 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask: Mask << 10));
2495 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask: Mask << 20));
2496}
2497
2498void SITargetLowering::allocateSpecialInputSGPRs(
2499 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2500 SIMachineFunctionInfo &Info) const {
2501 auto &ArgInfo = Info.getArgInfo();
2502 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2503
2504 // TODO: Unify handling with private memory pointers.
2505 if (UserSGPRInfo.hasDispatchPtr())
2506 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchPtr);
2507
2508 if (UserSGPRInfo.hasQueuePtr())
2509 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.QueuePtr);
2510
2511 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2512 // constant offset from the kernarg segment.
2513 if (Info.hasImplicitArgPtr())
2514 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.ImplicitArgPtr);
2515
2516 if (UserSGPRInfo.hasDispatchID())
2517 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchID);
2518
2519 // flat_scratch_init is not applicable for non-kernel functions.
2520
2521 if (Info.hasWorkGroupIDX())
2522 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDX);
2523
2524 if (Info.hasWorkGroupIDY())
2525 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDY);
2526
2527 if (Info.hasWorkGroupIDZ())
2528 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDZ);
2529
2530 if (Info.hasLDSKernelId())
2531 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.LDSKernelId);
2532}
2533
2534// Allocate special inputs passed in user SGPRs.
2535void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2536 MachineFunction &MF,
2537 const SIRegisterInfo &TRI,
2538 SIMachineFunctionInfo &Info) const {
2539 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2540 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2541 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2542 MF.addLiveIn(PReg: ImplicitBufferPtrReg, RC: &AMDGPU::SGPR_64RegClass);
2543 CCInfo.AllocateReg(Reg: ImplicitBufferPtrReg);
2544 }
2545
2546 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2547 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2548 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2549 MF.addLiveIn(PReg: PrivateSegmentBufferReg, RC: &AMDGPU::SGPR_128RegClass);
2550 CCInfo.AllocateReg(Reg: PrivateSegmentBufferReg);
2551 }
2552
2553 if (UserSGPRInfo.hasDispatchPtr()) {
2554 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2555 MF.addLiveIn(PReg: DispatchPtrReg, RC: &AMDGPU::SGPR_64RegClass);
2556 CCInfo.AllocateReg(Reg: DispatchPtrReg);
2557 }
2558
2559 if (UserSGPRInfo.hasQueuePtr()) {
2560 Register QueuePtrReg = Info.addQueuePtr(TRI);
2561 MF.addLiveIn(PReg: QueuePtrReg, RC: &AMDGPU::SGPR_64RegClass);
2562 CCInfo.AllocateReg(Reg: QueuePtrReg);
2563 }
2564
2565 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2566 MachineRegisterInfo &MRI = MF.getRegInfo();
2567 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2568 CCInfo.AllocateReg(Reg: InputPtrReg);
2569
2570 Register VReg = MF.addLiveIn(PReg: InputPtrReg, RC: &AMDGPU::SGPR_64RegClass);
2571 MRI.setType(VReg, Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2572 }
2573
2574 if (UserSGPRInfo.hasDispatchID()) {
2575 Register DispatchIDReg = Info.addDispatchID(TRI);
2576 MF.addLiveIn(PReg: DispatchIDReg, RC: &AMDGPU::SGPR_64RegClass);
2577 CCInfo.AllocateReg(Reg: DispatchIDReg);
2578 }
2579
2580 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2581 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2582 MF.addLiveIn(PReg: FlatScratchInitReg, RC: &AMDGPU::SGPR_64RegClass);
2583 CCInfo.AllocateReg(Reg: FlatScratchInitReg);
2584 }
2585
2586 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2587 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2588 MF.addLiveIn(PReg: PrivateSegmentSizeReg, RC: &AMDGPU::SGPR_32RegClass);
2589 CCInfo.AllocateReg(Reg: PrivateSegmentSizeReg);
2590 }
2591
2592 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2593 // these from the dispatch pointer.
2594}
2595
2596// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2597// sequential, starting from the first argument.
2598void SITargetLowering::allocatePreloadKernArgSGPRs(
2599 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2600 const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
2601 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2602 Function &F = MF.getFunction();
2603 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2604 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2605 bool InPreloadSequence = true;
2606 unsigned InIdx = 0;
2607 bool AlignedForImplictArgs = false;
2608 unsigned ImplicitArgOffset = 0;
2609 for (auto &Arg : F.args()) {
2610 if (!InPreloadSequence || !Arg.hasInRegAttr())
2611 break;
2612
2613 unsigned ArgIdx = Arg.getArgNo();
2614 // Don't preload non-original args or parts not in the current preload
2615 // sequence.
2616 if (InIdx < Ins.size() &&
2617 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2618 break;
2619
2620 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2621 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2622 InIdx++) {
2623 assert(ArgLocs[ArgIdx].isMemLoc());
2624 auto &ArgLoc = ArgLocs[InIdx];
2625 const Align KernelArgBaseAlign = Align(16);
2626 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2627 Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset: ArgOffset);
2628 unsigned NumAllocSGPRs =
2629 alignTo(Value: ArgLoc.getLocVT().getFixedSizeInBits(), Align: 32) / 32;
2630
2631 // Fix alignment for hidden arguments.
2632 if (Arg.hasAttribute(Kind: "amdgpu-hidden-argument")) {
2633 if (!AlignedForImplictArgs) {
2634 ImplicitArgOffset =
2635 alignTo(Size: LastExplicitArgOffset,
2636 A: Subtarget->getAlignmentForImplicitArgPtr()) -
2637 LastExplicitArgOffset;
2638 AlignedForImplictArgs = true;
2639 }
2640 ArgOffset += ImplicitArgOffset;
2641 }
2642
2643 // Arg is preloaded into the previous SGPR.
2644 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2645 assert(InIdx >= 1 && "No previous SGPR");
2646 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2647 Elt: Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2648 continue;
2649 }
2650
2651 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2652 unsigned PaddingSGPRs = alignTo(Value: Padding, Align: 4) / 4;
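      // For example, if the previous explicit argument ended at offset 4 and
      // this argument starts at offset 12, two padding SGPRs are skipped
      // before this argument's SGPRs are assigned.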
2653 // Check for free user SGPRs for preloading.
2654 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2655 InPreloadSequence = false;
2656 break;
2657 }
2658
2659 // Preload this argument.
2660 const TargetRegisterClass *RC =
2661 TRI.getSGPRClassForBitWidth(BitWidth: NumAllocSGPRs * 32);
2662 SmallVectorImpl<MCRegister> *PreloadRegs =
2663 Info.addPreloadedKernArg(TRI, RC, AllocSizeDWord: NumAllocSGPRs, KernArgIdx: InIdx, PaddingSGPRs);
2664
2665 if (PreloadRegs->size() > 1)
2666 RC = &AMDGPU::SGPR_32RegClass;
2667 for (auto &Reg : *PreloadRegs) {
2668 assert(Reg);
2669 MF.addLiveIn(PReg: Reg, RC);
2670 CCInfo.AllocateReg(Reg);
2671 }
2672
2673 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2674 }
2675 }
2676}
2677
2678void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
2679 const SIRegisterInfo &TRI,
2680 SIMachineFunctionInfo &Info) const {
2681 // Always allocate this last since it is a synthetic preload.
2682 if (Info.hasLDSKernelId()) {
2683 Register Reg = Info.addLDSKernelId();
2684 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
2685 CCInfo.AllocateReg(Reg);
2686 }
2687}
2688
2689// Allocate special input registers that are initialized per-wave.
2690void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
2691 SIMachineFunctionInfo &Info,
2692 CallingConv::ID CallConv,
2693 bool IsShader) const {
2694 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2695 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2696 // Note: user SGPRs are handled by the front-end for graphics shaders
2697 // Pad up the used user SGPRs with dead inputs.
2698
2699 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2700 // before enabling architected SGPRs for workgroup IDs.
2701 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2702
2703 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2704 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2705 // rely on it to reach 16 since if we end up having no stack usage, it will
2706 // not really be added.
2707 unsigned NumRequiredSystemSGPRs =
2708 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
2709 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
2710 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2711 Register Reg = Info.addReservedUserSGPR();
2712 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
2713 CCInfo.AllocateReg(Reg);
2714 }
2715 }
2716
2717 if (!HasArchitectedSGPRs) {
2718 if (Info.hasWorkGroupIDX()) {
2719 Register Reg = Info.addWorkGroupIDX();
2720 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
2721 CCInfo.AllocateReg(Reg);
2722 }
2723
2724 if (Info.hasWorkGroupIDY()) {
2725 Register Reg = Info.addWorkGroupIDY();
2726 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
2727 CCInfo.AllocateReg(Reg);
2728 }
2729
2730 if (Info.hasWorkGroupIDZ()) {
2731 Register Reg = Info.addWorkGroupIDZ();
2732 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
2733 CCInfo.AllocateReg(Reg);
2734 }
2735 }
2736
2737 if (Info.hasWorkGroupInfo()) {
2738 Register Reg = Info.addWorkGroupInfo();
2739 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
2740 CCInfo.AllocateReg(Reg);
2741 }
2742
2743 if (Info.hasPrivateSegmentWaveByteOffset()) {
2744 // Scratch wave offset passed in system SGPR.
2745 unsigned PrivateSegmentWaveByteOffsetReg;
2746
2747 if (IsShader) {
2748 PrivateSegmentWaveByteOffsetReg =
2749 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2750
2751 // This is true if the scratch wave byte offset doesn't have a fixed
2752 // location.
2753 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2754 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2755 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2756 }
2757 } else
2758 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2759
2760 MF.addLiveIn(PReg: PrivateSegmentWaveByteOffsetReg, RC: &AMDGPU::SGPR_32RegClass);
2761 CCInfo.AllocateReg(Reg: PrivateSegmentWaveByteOffsetReg);
2762 }
2763
2764 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2765 Info.getNumPreloadedSGPRs() >= 16);
2766}
2767
2768static void reservePrivateMemoryRegs(const TargetMachine &TM,
2769 MachineFunction &MF,
2770 const SIRegisterInfo &TRI,
2771 SIMachineFunctionInfo &Info) {
2772 // Now that we've figured out where the scratch register inputs are, see if
2773 // we should reserve the arguments and use them directly.
2774 MachineFrameInfo &MFI = MF.getFrameInfo();
2775 bool HasStackObjects = MFI.hasStackObjects();
2776 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2777
2778 // Record that we know we have non-spill stack objects so we don't need to
2779 // check all stack objects later.
2780 if (HasStackObjects)
2781 Info.setHasNonSpillStackObjects(true);
2782
2783 // Everything live out of a block is spilled with fast regalloc, so it's
2784 // almost certain that spilling will be required.
2785 if (TM.getOptLevel() == CodeGenOptLevel::None)
2786 HasStackObjects = true;
2787
2788 // For now, assume stack access is needed in any callee function, so we need
2789 // the scratch registers to pass in.
2790 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2791
2792 if (!ST.enableFlatScratch()) {
2793 if (RequiresStackAccess && ST.isAmdHsaOrMesa(F: MF.getFunction())) {
2794 // If we have stack objects, we unquestionably need the private buffer
2795 // resource. For the Code Object V2 ABI, this will be the first 4 user
2796 // SGPR inputs. We can reserve those and use them directly.
2797
2798 Register PrivateSegmentBufferReg =
2799 Info.getPreloadedReg(Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
2800 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2801 } else {
2802 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2803 // We tentatively reserve the last available registers (skipping those
2804 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2805 // we'll replace these with the ones immediately after those which were
2806 // really allocated. In the prologue, copies will be inserted from the
2807 // argument to these reserved registers.
2808
2809 // Without HSA, relocations are used for the scratch pointer and the
2810 // buffer resource setup is always inserted in the prologue. Scratch wave
2811 // offset is still in an input SGPR.
2812 Info.setScratchRSrcReg(ReservedBufferReg);
2813 }
2814 }
2815
2816 MachineRegisterInfo &MRI = MF.getRegInfo();
2817
2818 // For entry functions we have to set up the stack pointer if we use it,
2819 // whereas non-entry functions get this "for free". This means there is no
2820 // intrinsic advantage to using S32 over S34 in cases where we do not have
2821 // calls but do need a frame pointer (i.e. if we are requested to have one
2822 // because frame pointer elimination is disabled). To keep things simple we
2823 // only ever use S32 as the call ABI stack pointer, and so using it does not
2824 // imply we need a separate frame pointer.
2825 //
2826 // Try to use s32 as the SP, but move it if it would interfere with input
2827 // arguments. This won't work with calls though.
2828 //
2829 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2830 // registers.
2831 if (!MRI.isLiveIn(Reg: AMDGPU::SGPR32)) {
2832 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2833 } else {
2834 assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
2835
2836 if (MFI.hasCalls())
2837 report_fatal_error(reason: "call in graphics shader with too many input SGPRs");
2838
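    // Fall back to the first SGPR that is not already a live-in input argument.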
2839 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2840 if (!MRI.isLiveIn(Reg)) {
2841 Info.setStackPtrOffsetReg(Reg);
2842 break;
2843 }
2844 }
2845
2846 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2847 report_fatal_error(reason: "failed to find register for SP");
2848 }
2849
2850 // hasFP should be accurate for entry functions even before the frame is
2851 // finalized, because it does not rely on the known stack size, only
2852 // properties like whether variable sized objects are present.
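  // For entry functions, s33 is used as the frame offset register whenever a
  // frame pointer is required.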
2853 if (ST.getFrameLowering()->hasFP(MF)) {
2854 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2855 }
2856}
2857
2858bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
2859 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2860 return !Info->isEntryFunction();
2861}
2862
2863void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {}
2864
2865void SITargetLowering::insertCopiesSplitCSR(
2866 MachineBasicBlock *Entry,
2867 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2868 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2869
2870 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(MF: Entry->getParent());
2871 if (!IStart)
2872 return;
2873
2874 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2875 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2876 MachineBasicBlock::iterator MBBI = Entry->begin();
2877 for (const MCPhysReg *I = IStart; *I; ++I) {
2878 const TargetRegisterClass *RC = nullptr;
2879 if (AMDGPU::SReg_64RegClass.contains(Reg: *I))
2880 RC = &AMDGPU::SGPR_64RegClass;
2881 else if (AMDGPU::SReg_32RegClass.contains(Reg: *I))
2882 RC = &AMDGPU::SGPR_32RegClass;
2883 else
2884 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2885
2886 Register NewVR = MRI->createVirtualRegister(RegClass: RC);
2887 // Create copy from CSR to a virtual register.
2888 Entry->addLiveIn(PhysReg: *I);
2889 BuildMI(BB&: *Entry, I: MBBI, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NewVR)
2890 .addReg(RegNo: *I);
2891
2892 // Insert the copy-back instructions right before the terminator.
2893 for (auto *Exit : Exits)
2894 BuildMI(BB&: *Exit, I: Exit->getFirstTerminator(), MIMD: DebugLoc(),
2895 MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: *I)
2896 .addReg(RegNo: NewVR);
2897 }
2898}
2899
2900SDValue SITargetLowering::LowerFormalArguments(
2901 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2902 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2903 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2904 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2905
2906 MachineFunction &MF = DAG.getMachineFunction();
2907 const Function &Fn = MF.getFunction();
2908 FunctionType *FType = MF.getFunction().getFunctionType();
2909 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2910 bool IsError = false;
2911
2912 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CC: CallConv)) {
2913 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
2914 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
2915 IsError = true;
2916 }
2917
2918 SmallVector<ISD::InputArg, 16> Splits;
2919 SmallVector<CCValAssign, 16> ArgLocs;
2920 BitVector Skipped(Ins.size());
2921 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2922 *DAG.getContext());
2923
2924 bool IsGraphics = AMDGPU::isGraphics(CC: CallConv);
2925 bool IsKernel = AMDGPU::isKernel(CC: CallConv);
2926 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC: CallConv);
2927
2928 if (IsGraphics) {
2929 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2930 assert(!UserSGPRInfo.hasDispatchPtr() &&
2931 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2932 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2933 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2934 (void)UserSGPRInfo;
2935 if (!Subtarget->enableFlatScratch())
2936 assert(!UserSGPRInfo.hasFlatScratchInit());
2937 if ((CallConv != CallingConv::AMDGPU_CS &&
2938 CallConv != CallingConv::AMDGPU_Gfx) ||
2939 !Subtarget->hasArchitectedSGPRs())
2940 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2941 !Info->hasWorkGroupIDZ());
2942 }
2943
2944 if (CallConv == CallingConv::AMDGPU_PS) {
2945 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2946
2947 // At least one interpolation mode must be enabled or else the GPU will
2948 // hang.
2949 //
2950 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2951 // set PSInputAddr, the user wants to enable some bits after the compilation
2952 // based on run-time states. Since we can't know what the final PSInputEna
2953 // will look like, we shouldn't do anything here, and the user should take
2954 // responsibility for the correct programming.
2955 //
2956 // Otherwise, the following restrictions apply:
2957 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2958 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2959 // enabled too.
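    // If these restrictions are not already satisfied, force-enable the first
    // input (PERSP_SAMPLE) and reserve VGPR0/VGPR1 for its I/J values.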
2960 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2961 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(Index: 11))) {
2962 CCInfo.AllocateReg(Reg: AMDGPU::VGPR0);
2963 CCInfo.AllocateReg(Reg: AMDGPU::VGPR1);
2964 Info->markPSInputAllocated(Index: 0);
2965 Info->markPSInputEnabled(Index: 0);
2966 }
2967 if (Subtarget->isAmdPalOS()) {
2968 // For isAmdPalOS, the user does not enable some bits after compilation
2969 // based on run-time states; the register values being generated here are
2970 // the final ones set in hardware. Therefore we need to apply the
2971 // workaround to PSInputAddr and PSInputEnable together. (The case where
2972 // a bit is set in PSInputAddr but not PSInputEnable is where the
2973 // frontend set up an input arg for a particular interpolation mode, but
2974 // nothing uses that input arg. Really we should have an earlier pass
2975 // that removes such an arg.)
2976 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2977 if ((PsInputBits & 0x7F) == 0 ||
2978 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2979 Info->markPSInputEnabled(Index: llvm::countr_zero(Val: Info->getPSInputAddr()));
2980 }
2981 } else if (IsKernel) {
2982 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2983 } else {
2984 Splits.append(in_start: Ins.begin(), in_end: Ins.end());
2985 }
2986
2987 if (IsKernel)
2988 analyzeFormalArgumentsCompute(State&: CCInfo, Ins);
2989
2990 if (IsEntryFunc) {
2991 allocateSpecialEntryInputVGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
2992 allocateHSAUserSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
2993 if (IsKernel && Subtarget->hasKernargPreload())
2994 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, TRI: *TRI, Info&: *Info);
2995
2996 allocateLDSKernelId(CCInfo, MF, TRI: *TRI, Info&: *Info);
2997 } else if (!IsGraphics) {
2998 // For the fixed ABI, pass workitem IDs in the last argument register.
2999 allocateSpecialInputVGPRsFixed(CCInfo, MF, TRI: *TRI, Info&: *Info);
3000
3001 // FIXME: Sink this into allocateSpecialInputSGPRs
3002 if (!Subtarget->enableFlatScratch())
3003 CCInfo.AllocateReg(Reg: Info->getScratchRSrcReg());
3004
3005 allocateSpecialInputSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
3006 }
3007
3008 if (!IsKernel) {
3009 CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg: isVarArg);
3010 CCInfo.AnalyzeFormalArguments(Ins: Splits, Fn: AssignFn);
3011 }
3012
3013 SmallVector<SDValue, 16> Chains;
3014
3015 // FIXME: This is the minimum kernel argument alignment. We should improve
3016 // this to the maximum alignment of the arguments.
3017 //
3018 // FIXME: Alignment of explicit arguments is totally broken with a non-0
3019 // explicit kernarg offset.
3020 const Align KernelArgBaseAlign = Align(16);
3021
3022 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
3023 const ISD::InputArg &Arg = Ins[i];
3024 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3025 InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
3026 continue;
3027 }
3028
3029 CCValAssign &VA = ArgLocs[ArgIdx++];
3030 MVT VT = VA.getLocVT();
3031
3032 if (IsEntryFunc && VA.isMemLoc()) {
3033 VT = Ins[i].VT;
3034 EVT MemVT = VA.getLocVT();
3035
3036 const uint64_t Offset = VA.getLocMemOffset();
3037 Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset);
3038
3039 if (Arg.Flags.isByRef()) {
3040 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain, Offset);
3041
3042 const GCNTargetMachine &TM =
3043 static_cast<const GCNTargetMachine &>(getTargetMachine());
3044 if (!TM.isNoopAddrSpaceCast(SrcAS: AMDGPUAS::CONSTANT_ADDRESS,
3045 DestAS: Arg.Flags.getPointerAddrSpace())) {
3046 Ptr = DAG.getAddrSpaceCast(dl: DL, VT, Ptr, SrcAS: AMDGPUAS::CONSTANT_ADDRESS,
3047 DestAS: Arg.Flags.getPointerAddrSpace());
3048 }
3049
3050 InVals.push_back(Elt: Ptr);
3051 continue;
3052 }
3053
3054 SDValue NewArg;
3055 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(Val: i)) {
3056 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3057 // In this case the argument is packed into the previous preload SGPR.
3058 int64_t AlignDownOffset = alignDown(Value: Offset, Align: 4);
3059 int64_t OffsetDiff = Offset - AlignDownOffset;
3060 EVT IntVT = MemVT.changeTypeToInteger();
3061
3062 const SIMachineFunctionInfo *Info =
3063 MF.getInfo<SIMachineFunctionInfo>();
3064 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
3065 Register Reg =
3066 Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs[0];
3067
3068 assert(Reg);
3069 Register VReg = MRI.getLiveInVirtReg(PReg: Reg);
3070 SDValue Copy = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i32);
3071
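          // Shift the packed dword right by the argument's byte offset within
          // it, then truncate down to the argument's in-memory integer type.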
3072 SDValue ShiftAmt = DAG.getConstant(Val: OffsetDiff * 8, DL, VT: MVT::i32);
3073 SDValue Extract = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: Copy, N2: ShiftAmt);
3074
3075 SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: Extract);
3076 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MemVT, Operand: ArgVal);
3077 NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: ArgVal,
3078 Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
3079
3080 NewArg = DAG.getMergeValues(Ops: {NewArg, Copy.getValue(R: 1)}, dl: DL);
3081 } else {
3082 const SIMachineFunctionInfo *Info =
3083 MF.getInfo<SIMachineFunctionInfo>();
3084 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
3085 const SmallVectorImpl<MCRegister> &PreloadRegs =
3086 Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs;
3087
3088 SDValue Copy;
3089 if (PreloadRegs.size() == 1) {
3090 Register VReg = MRI.getLiveInVirtReg(PReg: PreloadRegs[0]);
3091 const TargetRegisterClass *RC = MRI.getRegClass(Reg: VReg);
3092 NewArg = DAG.getCopyFromReg(
3093 Chain, dl: DL, Reg: VReg,
3094 VT: EVT::getIntegerVT(Context&: *DAG.getContext(),
3095 BitWidth: TRI->getRegSizeInBits(RC: *RC)));
3096
3097 } else {
3098 // If the kernarg alignment does not match the alignment of the SGPR
3099 // tuple RC that can accommodate this argument, it will be built up
3100 // via copies from the individual SGPRs that the argument was
3101 // preloaded to.
3102 SmallVector<SDValue, 4> Elts;
3103 for (auto Reg : PreloadRegs) {
3104 Register VReg = MRI.getLiveInVirtReg(PReg: Reg);
3105 Copy = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i32);
3106 Elts.push_back(Elt: Copy);
3107 }
3108 NewArg =
3109 DAG.getBuildVector(VT: EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
3110 NumElements: PreloadRegs.size()),
3111 DL, Ops: Elts);
3112 }
3113
3114 // If the argument was preloaded to multiple consecutive 32-bit
3115 // registers because of misalignment between addressable SGPR tuples
3116 // and the argument size, the kernarg segment alignment restrictions
3117 // still let us assume that NewArg's size is the same as MemVT, so we
3118 // can just do a bitcast. If MemVT is less than 32 bits we add a
3119 // truncate first, since we cannot preload to less than a single SGPR
3120 // and the MemVT may be smaller.
3121 EVT MemVTInt =
3122 EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
3123 if (MemVT.bitsLT(VT: NewArg.getSimpleValueType()))
3124 NewArg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVTInt, Operand: NewArg);
3125
3126 NewArg = DAG.getBitcast(VT: MemVT, V: NewArg);
3127 NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: NewArg,
3128 Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
3129 NewArg = DAG.getMergeValues(Ops: {NewArg, Chain}, dl: DL);
3130 }
3131 } else {
3132 // Hidden arguments that are in the kernel signature must be preloaded
3133 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3134 // the argument list and is not preloaded.
3135 if (Arg.isOrigArg()) {
3136 Argument *OrigArg = Fn.getArg(i: Arg.getOrigArgIndex());
3137 if (OrigArg->hasAttribute(Kind: "amdgpu-hidden-argument")) {
3138 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
3139 *OrigArg->getParent(),
3140 "hidden argument in kernel signature was not preloaded",
3141 DL.getDebugLoc()));
3142 }
3143 }
3144
3145 NewArg =
3146 lowerKernargMemParameter(DAG, VT, MemVT, SL: DL, Chain, Offset,
3147 Alignment, Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
3148 }
3149 Chains.push_back(Elt: NewArg.getValue(R: 1));
3150
3151 auto *ParamTy =
3152 dyn_cast<PointerType>(Val: FType->getParamType(i: Ins[i].getOrigArgIndex()));
3153 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3154 ParamTy &&
3155 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3156 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3157 // On SI, local pointers are just offsets into LDS, so they always
3158 // fit in 16 bits. On CI and newer they could potentially be
3159 // real pointers, so we can't guarantee their size.
3160 NewArg = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: NewArg.getValueType(), N1: NewArg,
3161 N2: DAG.getValueType(MVT::i16));
3162 }
3163
3164 InVals.push_back(Elt: NewArg);
3165 continue;
3166 }
3167 if (!IsEntryFunc && VA.isMemLoc()) {
3168 SDValue Val = lowerStackParameter(DAG, VA, SL: DL, Chain, Arg);
3169 InVals.push_back(Elt: Val);
3170 if (!Arg.Flags.isByVal())
3171 Chains.push_back(Elt: Val.getValue(R: 1));
3172 continue;
3173 }
3174
3175 assert(VA.isRegLoc() && "Parameter must be in a register!");
3176
3177 Register Reg = VA.getLocReg();
3178 const TargetRegisterClass *RC = nullptr;
3179 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3180 RC = &AMDGPU::VGPR_32RegClass;
3181 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3182 RC = &AMDGPU::SGPR_32RegClass;
3183 else
3184 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3185 EVT ValVT = VA.getValVT();
3186
3187 Reg = MF.addLiveIn(PReg: Reg, RC);
3188 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT);
3189
3190 if (Arg.Flags.isSRet()) {
3191 // The return object should be reasonably addressable.
3192
3193 // FIXME: This helps when the return is a real sret. If it is an
3194 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3195 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3196 unsigned NumBits =
3197 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3198 Val = DAG.getNode(
3199 Opcode: ISD::AssertZext, DL, VT, N1: Val,
3200 N2: DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: NumBits)));
3201 }
3202
3203 // If this is an 8 or 16-bit value, it is really passed promoted
3204 // to 32 bits. Insert an assert[sz]ext to capture this, then
3205 // truncate to the right size.
3206 switch (VA.getLocInfo()) {
3207 case CCValAssign::Full:
3208 break;
3209 case CCValAssign::BCvt:
3210 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ValVT, Operand: Val);
3211 break;
3212 case CCValAssign::SExt:
3213 Val = DAG.getNode(Opcode: ISD::AssertSext, DL, VT, N1: Val, N2: DAG.getValueType(ValVT));
3214 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ValVT, Operand: Val);
3215 break;
3216 case CCValAssign::ZExt:
3217 Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT, N1: Val, N2: DAG.getValueType(ValVT));
3218 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ValVT, Operand: Val);
3219 break;
3220 case CCValAssign::AExt:
3221 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ValVT, Operand: Val);
3222 break;
3223 default:
3224 llvm_unreachable("Unknown loc info!");
3225 }
3226
3227 InVals.push_back(Elt: Val);
3228 }
3229
3230 // Start adding system SGPRs.
3231 if (IsEntryFunc)
3232 allocateSystemSGPRs(CCInfo, MF, Info&: *Info, CallConv, IsShader: IsGraphics);
3233
3234 // DAG.getPass() returns nullptr when using new pass manager.
3235 // TODO: Use DAG.getMFAM() to access analysis result.
3236 if (DAG.getPass()) {
3237 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3238 ArgUsageInfo.setFuncArgInfo(F: Fn, ArgInfo: Info->getArgInfo());
3239 }
3240
3241 unsigned StackArgSize = CCInfo.getStackSize();
3242 Info->setBytesInStackArgArea(StackArgSize);
3243
3244 return Chains.empty() ? Chain
3245 : DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: Chains);
3246}
3247
3248// TODO: If return values can't fit in registers, we should return as many as
3249// possible in registers before passing on stack.
3250bool SITargetLowering::CanLowerReturn(
3251 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3252 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3253 const Type *RetTy) const {
3254 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3255 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3256 // for shaders. Vector types should be explicitly handled by CC.
3257 if (AMDGPU::isEntryFunctionCC(CC: CallConv))
3258 return true;
3259
3260 SmallVector<CCValAssign, 16> RVLocs;
3261 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3262 if (!CCInfo.CheckReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg)))
3263 return false;
3264
3265 // We must use the stack if the return would require unavailable registers.
3266 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3267 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3268 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3269 if (CCInfo.isAllocated(Reg: AMDGPU::VGPR_32RegClass.getRegister(i)))
3270 return false;
3271
3272 return true;
3273}
3274
3275SDValue
3276SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3277 bool isVarArg,
3278 const SmallVectorImpl<ISD::OutputArg> &Outs,
3279 const SmallVectorImpl<SDValue> &OutVals,
3280 const SDLoc &DL, SelectionDAG &DAG) const {
3281 MachineFunction &MF = DAG.getMachineFunction();
3282 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3283 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3284
3285 if (AMDGPU::isKernel(CC: CallConv)) {
3286 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3287 OutVals, DL, DAG);
3288 }
3289
3290 bool IsShader = AMDGPU::isShader(CC: CallConv);
3291
3292 Info->setIfReturnsVoid(Outs.empty());
3293 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3294
3295 // CCValAssign - represents the assignment of the return value to a location.
3296 SmallVector<CCValAssign, 48> RVLocs;
3297
3298 // CCState - Info about the registers and stack slots.
3299 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3300 *DAG.getContext());
3301
3302 // Analyze outgoing return values.
3303 CCInfo.AnalyzeReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg: isVarArg));
3304
3305 SDValue Glue;
3306 SmallVector<SDValue, 48> RetOps;
3307 RetOps.push_back(Elt: Chain); // Operand #0 = Chain (updated below)
3308
3309 SDValue ReadFirstLane =
3310 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
3311 // Copy the result values into the output registers.
3312 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3313 ++I, ++RealRVLocIdx) {
3314 CCValAssign &VA = RVLocs[I];
3315 assert(VA.isRegLoc() && "Can only return in registers!");
3316 // TODO: Partially return in registers if return values don't fit.
3317 SDValue Arg = OutVals[RealRVLocIdx];
3318
3319 // Copied from other backends.
3320 switch (VA.getLocInfo()) {
3321 case CCValAssign::Full:
3322 break;
3323 case CCValAssign::BCvt:
3324 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
3325 break;
3326 case CCValAssign::SExt:
3327 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3328 break;
3329 case CCValAssign::ZExt:
3330 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3331 break;
3332 case CCValAssign::AExt:
3333 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3334 break;
3335 default:
3336 llvm_unreachable("Unknown loc info!");
3337 }
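    // Values returned in SGPRs must be uniform, so read the first lane in case
    // the value was actually produced in a VGPR.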
3338 if (TRI->isSGPRPhysReg(Reg: VA.getLocReg()))
3339 Arg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Arg.getValueType(),
3340 N1: ReadFirstLane, N2: Arg);
3341 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: VA.getLocReg(), N: Arg, Glue);
3342 Glue = Chain.getValue(R: 1);
3343 RetOps.push_back(Elt: DAG.getRegister(Reg: VA.getLocReg(), VT: VA.getLocVT()));
3344 }
3345
3346 // FIXME: Does sret work properly?
3347 if (!Info->isEntryFunction()) {
3348 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3349 const MCPhysReg *I =
3350 TRI->getCalleeSavedRegsViaCopy(MF: &DAG.getMachineFunction());
3351 if (I) {
3352 for (; *I; ++I) {
3353 if (AMDGPU::SReg_64RegClass.contains(Reg: *I))
3354 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i64));
3355 else if (AMDGPU::SReg_32RegClass.contains(Reg: *I))
3356 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i32));
3357 else
3358 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3359 }
3360 }
3361 }
3362
3363 // Update chain and glue.
3364 RetOps[0] = Chain;
3365 if (Glue.getNode())
3366 RetOps.push_back(Elt: Glue);
3367
3368 unsigned Opc = AMDGPUISD::ENDPGM;
3369 if (!IsWaveEnd)
3370 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE;
3371 return DAG.getNode(Opcode: Opc, DL, VT: MVT::Other, Ops: RetOps);
3372}
3373
3374SDValue SITargetLowering::LowerCallResult(
3375 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3376 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3377 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3378 SDValue ThisVal) const {
3379 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv, IsVarArg);
3380
3381 // Assign locations to each value returned by this call.
3382 SmallVector<CCValAssign, 16> RVLocs;
3383 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3384 *DAG.getContext());
3385 CCInfo.AnalyzeCallResult(Ins, Fn: RetCC);
3386
3387 // Copy all of the result registers out of their specified physreg.
3388 for (CCValAssign VA : RVLocs) {
3389 SDValue Val;
3390
3391 if (VA.isRegLoc()) {
3392 Val =
3393 DAG.getCopyFromReg(Chain, dl: DL, Reg: VA.getLocReg(), VT: VA.getLocVT(), Glue: InGlue);
3394 Chain = Val.getValue(R: 1);
3395 InGlue = Val.getValue(R: 2);
3396 } else if (VA.isMemLoc()) {
3397 report_fatal_error(reason: "TODO: return values in memory");
3398 } else
3399 llvm_unreachable("unknown argument location type");
3400
3401 switch (VA.getLocInfo()) {
3402 case CCValAssign::Full:
3403 break;
3404 case CCValAssign::BCvt:
3405 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: Val);
3406 break;
3407 case CCValAssign::ZExt:
3408 Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: VA.getLocVT(), N1: Val,
3409 N2: DAG.getValueType(VA.getValVT()));
3410 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3411 break;
3412 case CCValAssign::SExt:
3413 Val = DAG.getNode(Opcode: ISD::AssertSext, DL, VT: VA.getLocVT(), N1: Val,
3414 N2: DAG.getValueType(VA.getValVT()));
3415 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3416 break;
3417 case CCValAssign::AExt:
3418 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3419 break;
3420 default:
3421 llvm_unreachable("Unknown loc info!");
3422 }
3423
3424 InVals.push_back(Elt: Val);
3425 }
3426
3427 return Chain;
3428}
3429
3430 // Add code to pass the special inputs required by the features in use,
3431 // separate from the explicit user arguments present in the IR.
3432void SITargetLowering::passSpecialInputs(
3433 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3434 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3435 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3436 // If we don't have a call site, this was a call inserted by
3437 // legalization. These can never use special inputs.
3438 if (!CLI.CB)
3439 return;
3440
3441 SelectionDAG &DAG = CLI.DAG;
3442 const SDLoc &DL = CLI.DL;
3443 const Function &F = DAG.getMachineFunction().getFunction();
3444
3445 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3446 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3447
3448 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3449 &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
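  // Default to the fixed-ABI argument layout; it is refined below when the
  // callee's actual argument usage information is available.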
3450 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3451 // DAG.getPass() returns nullptr when using new pass manager.
3452 // TODO: Use DAG.getMFAM() to access analysis result.
3453 if (DAG.getPass()) {
3454 auto &ArgUsageInfo =
3455 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3456 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(F: *CalleeFunc);
3457 }
3458 }
3459
3460 // TODO: Unify with private memory register handling. This is complicated by
3461 // the fact that at least in kernels, the input argument is not necessarily
3462 // in the same location as the input.
3463 // clang-format off
3464 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3465 StringLiteral> ImplicitAttrs[] = {
3466 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3467 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr"},
3468 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3469 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3470 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3471 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
3472 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
3473 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id"},
3474 };
3475 // clang-format on
3476
3477 for (auto [InputID, Attr] : ImplicitAttrs) {
3478 // If the callee does not use the attribute value, skip copying the value.
3479 if (CLI.CB->hasFnAttr(Kind: Attr))
3480 continue;
3481
3482 const auto [OutgoingArg, ArgRC, ArgTy] =
3483 CalleeArgInfo->getPreloadedValue(Value: InputID);
3484 if (!OutgoingArg)
3485 continue;
3486
3487 const auto [IncomingArg, IncomingArgRC, Ty] =
3488 CallerArgInfo.getPreloadedValue(Value: InputID);
3489 assert(IncomingArgRC == ArgRC);
3490
3491 // All special arguments are ints for now.
3492 EVT ArgVT = TRI->getSpillSize(RC: *ArgRC) == 8 ? MVT::i64 : MVT::i32;
3493 SDValue InputReg;
3494
3495 if (IncomingArg) {
3496 InputReg = loadInputValue(DAG, RC: ArgRC, VT: ArgVT, SL: DL, Arg: *IncomingArg);
3497 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3498 // The implicit arg ptr is special because it doesn't have a corresponding
3499 // input for kernels, and is computed from the kernarg segment pointer.
3500 InputReg = getImplicitArgPtr(DAG, SL: DL);
3501 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3502 std::optional<uint32_t> Id =
3503 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3504 if (Id.has_value()) {
3505 InputReg = DAG.getConstant(Val: *Id, DL, VT: ArgVT);
3506 } else {
3507 InputReg = DAG.getPOISON(VT: ArgVT);
3508 }
3509 } else {
3510 // We may have proven the input wasn't needed, although the ABI still
3511 // requires it. We just need to allocate the register appropriately.
3512 InputReg = DAG.getPOISON(VT: ArgVT);
3513 }
3514
3515 if (OutgoingArg->isRegister()) {
3516 RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
3517 if (!CCInfo.AllocateReg(Reg: OutgoingArg->getRegister()))
3518 report_fatal_error(reason: "failed to allocate implicit input argument");
3519 } else {
3520 unsigned SpecialArgOffset =
3521 CCInfo.AllocateStack(Size: ArgVT.getStoreSize(), Alignment: Align(4));
3522 SDValue ArgStore =
3523 storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg, Offset: SpecialArgOffset);
3524 MemOpChains.push_back(Elt: ArgStore);
3525 }
3526 }
3527
3528 // Pack workitem IDs into a single register, or pass them as-is if already
3529 // packed.
3530
3531 auto [OutgoingArg, ArgRC, Ty] =
3532 CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3533 if (!OutgoingArg)
3534 std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
3535 CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3536 if (!OutgoingArg)
3537 std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
3538 CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3539 if (!OutgoingArg)
3540 return;
3541
3542 const ArgDescriptor *IncomingArgX = std::get<0>(
3543 t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X));
3544 const ArgDescriptor *IncomingArgY = std::get<0>(
3545 t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
3546 const ArgDescriptor *IncomingArgZ = std::get<0>(
3547 t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3548
3549 SDValue InputReg;
3550 SDLoc SL;
3551
3552 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-x");
3553 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-y");
3554 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-z");
3555
3556 // If the incoming IDs are not packed, we need to pack them.
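  // The packed layout places X in bits [9:0], Y in bits [19:10] and Z in
  // bits [29:20] of a single 32-bit value.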
3557 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3558 NeedWorkItemIDX) {
3559 if (Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 0) != 0) {
3560 InputReg = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgX);
3561 } else {
3562 InputReg = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
3563 }
3564 }
3565
3566 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3567 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 1) != 0) {
3568 SDValue Y = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgY);
3569 Y = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Y,
3570 N2: DAG.getShiftAmountConstant(Val: 10, VT: MVT::i32, DL: SL));
3571 InputReg = InputReg.getNode()
3572 ? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: InputReg, N2: Y)
3573 : Y;
3574 }
3575
3576 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3577 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 2) != 0) {
3578 SDValue Z = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgZ);
3579 Z = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Z,
3580 N2: DAG.getShiftAmountConstant(Val: 20, VT: MVT::i32, DL: SL));
3581 InputReg = InputReg.getNode()
3582 ? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: InputReg, N2: Z)
3583 : Z;
3584 }
3585
3586 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3587 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3588 // We're in a situation where the outgoing function requires the workitem
3589 // ID, but the calling function does not have it (e.g. a graphics function
3590 // calling a C calling convention function). This is illegal, but we need
3591 // to produce something.
3592 InputReg = DAG.getPOISON(VT: MVT::i32);
3593 } else {
3594 // Workitem IDs are already packed, so any of the present incoming
3595 // arguments will carry all required fields.
3596 ArgDescriptor IncomingArg =
3597 ArgDescriptor::createArg(Arg: IncomingArgX ? *IncomingArgX
3598 : IncomingArgY ? *IncomingArgY
3599 : *IncomingArgZ,
3600 Mask: ~0u);
3601 InputReg = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: IncomingArg);
3602 }
3603 }
3604
3605 if (OutgoingArg->isRegister()) {
3606 if (InputReg)
3607 RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
3608
3609 CCInfo.AllocateReg(Reg: OutgoingArg->getRegister());
3610 } else {
3611 unsigned SpecialArgOffset = CCInfo.AllocateStack(Size: 4, Alignment: Align(4));
3612 if (InputReg) {
3613 SDValue ArgStore =
3614 storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg, Offset: SpecialArgOffset);
3615 MemOpChains.push_back(Elt: ArgStore);
3616 }
3617 }
3618}
3619
3620bool SITargetLowering::isEligibleForTailCallOptimization(
3621 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3622 const SmallVectorImpl<ISD::OutputArg> &Outs,
3623 const SmallVectorImpl<SDValue> &OutVals,
3624 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3625 if (AMDGPU::isChainCC(CC: CalleeCC))
3626 return true;
3627
3628 if (!AMDGPU::mayTailCallThisCC(CC: CalleeCC))
3629 return false;
3630
3631 // For a divergent call target, we need to do a waterfall loop over the
3632 // possible callees which precludes us from using a simple jump.
3633 if (Callee->isDivergent())
3634 return false;
3635
3636 MachineFunction &MF = DAG.getMachineFunction();
3637 const Function &CallerF = MF.getFunction();
3638 CallingConv::ID CallerCC = CallerF.getCallingConv();
3639 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3640 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3641
3642 // Kernels aren't callable, and don't have a live-in return address, so it
3643 // doesn't make sense to do a tail call with entry functions.
3644 if (!CallerPreserved)
3645 return false;
3646
3647 bool CCMatch = CallerCC == CalleeCC;
3648
3649 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3650 if (AMDGPU::canGuaranteeTCO(CC: CalleeCC) && CCMatch)
3651 return true;
3652 return false;
3653 }
3654
3655 // TODO: Can we handle var args?
3656 if (IsVarArg)
3657 return false;
3658
3659 for (const Argument &Arg : CallerF.args()) {
3660 if (Arg.hasByValAttr())
3661 return false;
3662 }
3663
3664 LLVMContext &Ctx = *DAG.getContext();
3665
3666 // Check that the call results are passed in the same way.
3667 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C&: Ctx, Ins,
3668 CalleeFn: CCAssignFnForCall(CC: CalleeCC, IsVarArg),
3669 CallerFn: CCAssignFnForCall(CC: CallerCC, IsVarArg)))
3670 return false;
3671
3672 // The callee has to preserve all registers the caller needs to preserve.
3673 if (!CCMatch) {
3674 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3675 if (!TRI->regmaskSubsetEqual(mask0: CallerPreserved, mask1: CalleePreserved))
3676 return false;
3677 }
3678
3679 // Nothing more to check if the callee is taking no arguments.
3680 if (Outs.empty())
3681 return true;
3682
3683 SmallVector<CCValAssign, 16> ArgLocs;
3684 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3685
3686 // FIXME: We are not allocating special input registers, so we will be
3687 // deciding based on incorrect register assignments.
3688 CCInfo.AnalyzeCallOperands(Outs, Fn: CCAssignFnForCall(CC: CalleeCC, IsVarArg));
3689
3690 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3691 // If the stack arguments for this call do not fit into our own save area
3692 // then the call cannot be made a tail call.
3693 // TODO: Is this really necessary?
3694 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3695 return false;
3696
3697 for (const auto &[CCVA, ArgVal] : zip_equal(t&: ArgLocs, u: OutVals)) {
3698 // FIXME: What about inreg arguments that end up passed in memory?
3699 if (!CCVA.isRegLoc())
3700 continue;
3701
3702 // If we are passing an argument in an SGPR, and the value is divergent,
3703 // this call requires a waterfall loop.
3704 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(Reg: CCVA.getLocReg())) {
3705 LLVM_DEBUG(
3706 dbgs() << "Cannot tail call due to divergent outgoing argument in "
3707 << printReg(CCVA.getLocReg(), TRI) << '\n');
3708 return false;
3709 }
3710 }
3711
3712 const MachineRegisterInfo &MRI = MF.getRegInfo();
3713 return parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals);
3714}
3715
3716bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3717 if (!CI->isTailCall())
3718 return false;
3719
3720 const Function *ParentFn = CI->getParent()->getParent();
3721 if (AMDGPU::isEntryFunctionCC(CC: ParentFn->getCallingConv()))
3722 return false;
3723 return true;
3724}
3725
3726namespace {
3727// Chain calls have special arguments that we need to handle. These are
3728// tagging along at the end of the arguments list(s), after the SGPR and VGPR
3729// arguments (index 0 and 1 respectively).
3730enum ChainCallArgIdx {
3731 Exec = 2,
3732 Flags,
3733 NumVGPRs,
3734 FallbackExec,
3735 FallbackCallee
3736};
3737} // anonymous namespace
3738
3739// The wave scratch offset register is used as the global base pointer.
3740SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3741 SmallVectorImpl<SDValue> &InVals) const {
3742 CallingConv::ID CallConv = CLI.CallConv;
3743 bool IsChainCallConv = AMDGPU::isChainCC(CC: CallConv);
3744
3745 SelectionDAG &DAG = CLI.DAG;
3746
3747 const SDLoc &DL = CLI.DL;
3748 SDValue Chain = CLI.Chain;
3749 SDValue Callee = CLI.Callee;
3750
3751 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
3752 bool UsesDynamicVGPRs = false;
3753 if (IsChainCallConv) {
3754 // The last arguments should be the value that we need to put in EXEC,
3755 // followed by the flags and any other arguments with special meanings.
3756 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
3757 // we don't treat them like the "real" arguments.
3758 auto RequestedExecIt =
3759 llvm::find_if(Range&: CLI.Outs, P: [](const ISD::OutputArg &Arg) {
3760 return Arg.OrigArgIndex == 2;
3761 });
3762 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
3763
3764 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
3765 CLI.OutVals.erase(CS: CLI.OutVals.begin() + SpecialArgsBeginIdx,
3766 CE: CLI.OutVals.end());
3767 CLI.Outs.erase(CS: RequestedExecIt, CE: CLI.Outs.end());
3768
3769 assert(CLI.Outs.back().OrigArgIndex < 2 &&
3770 "Haven't popped all the special args");
3771
3772 TargetLowering::ArgListEntry RequestedExecArg =
3773 CLI.Args[ChainCallArgIdx::Exec];
3774 if (!RequestedExecArg.Ty->isIntegerTy(Bitwidth: Subtarget->getWavefrontSize()))
3775 return lowerUnhandledCall(CLI, InVals, Reason: "Invalid value for EXEC");
3776
3777 // Convert constants into TargetConstants, so they become immediate operands
3778 // instead of being selected into S_MOV.
3779 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
3780 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Val&: Arg.Node)) {
3781 ChainCallSpecialArgs.push_back(Elt: DAG.getTargetConstant(
3782 Val: ArgNode->getAPIntValue(), DL, VT: ArgNode->getValueType(ResNo: 0)));
3783 } else
3784 ChainCallSpecialArgs.push_back(Elt: Arg.Node);
3785 };
3786
3787 PushNodeOrTargetConstant(RequestedExecArg);
3788
3789 // Process any other special arguments depending on the value of the flags.
3790 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
3791
3792 const APInt &FlagsValue = cast<ConstantSDNode>(Val&: Flags.Node)->getAPIntValue();
3793 if (FlagsValue.isZero()) {
3794 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
3795 return lowerUnhandledCall(CLI, InVals,
3796 Reason: "no additional args allowed if flags == 0");
3797 } else if (FlagsValue.isOneBitSet(BitNo: 0)) {
3798 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
3799 return lowerUnhandledCall(CLI, InVals, Reason: "expected 3 additional args");
3800 }
3801
3802 if (!Subtarget->isWave32()) {
3803 return lowerUnhandledCall(
3804 CLI, InVals, Reason: "dynamic VGPR mode is only supported for wave32");
3805 }
3806
3807 UsesDynamicVGPRs = true;
3808 std::for_each(first: CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
3809 last: CLI.Args.end(), f: PushNodeOrTargetConstant);
3810 }
3811 }
3812
3813 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3814 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3815 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3816 bool &IsTailCall = CLI.IsTailCall;
3817 bool IsVarArg = CLI.IsVarArg;
3818 bool IsSibCall = false;
3819 MachineFunction &MF = DAG.getMachineFunction();
3820
3821 if (Callee.isUndef() || isNullConstant(V: Callee)) {
3822 if (!CLI.IsTailCall) {
3823 for (ISD::InputArg &Arg : CLI.Ins)
3824 InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
3825 }
3826
3827 return Chain;
3828 }
3829
3830 if (IsVarArg) {
3831 return lowerUnhandledCall(CLI, InVals,
3832 Reason: "unsupported call to variadic function ");
3833 }
3834
3835 if (!CLI.CB)
3836 return lowerUnhandledCall(CLI, InVals, Reason: "unsupported libcall legalization");
3837
3838 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3839 return lowerUnhandledCall(CLI, InVals,
3840 Reason: "unsupported required tail call to function ");
3841 }
3842
3843 if (IsTailCall) {
3844 IsTailCall = isEligibleForTailCallOptimization(Callee, CalleeCC: CallConv, IsVarArg,
3845 Outs, OutVals, Ins, DAG);
3846 if (!IsTailCall &&
3847 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3848 report_fatal_error(reason: "failed to perform tail call elimination on a call "
3849 "site marked musttail or on llvm.amdgcn.cs.chain");
3850 }
3851
3852 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3853
3854 // A sibling call is one where we're under the usual C ABI and not planning
3855 // to change that, but can still do a tail call.
3856 if (!TailCallOpt && IsTailCall)
3857 IsSibCall = true;
3858
3859 if (IsTailCall)
3860 ++NumTailCalls;
3861 }
3862
3863 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3864 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3865 SmallVector<SDValue, 8> MemOpChains;
3866
3867 // Analyze operands of the call, assigning locations to each operand.
3868 SmallVector<CCValAssign, 16> ArgLocs;
3869 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3870 CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg);
3871
3872 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CC: CallConv)) {
3873 // With a fixed ABI, allocate fixed registers before user arguments.
3874 passSpecialInputs(CLI, CCInfo, Info: *Info, RegsToPass, MemOpChains, Chain);
3875 }
3876
3877 CCInfo.AnalyzeCallOperands(Outs, Fn: AssignFn);
3878
3879 // Get a count of how many bytes are to be pushed on the stack.
3880 unsigned NumBytes = CCInfo.getStackSize();
3881
3882 if (IsSibCall) {
3883 // Since we're not changing the ABI to make this a tail call, the memory
3884 // operands are already available in the caller's incoming argument space.
3885 NumBytes = 0;
3886 }
3887
3888 // FPDiff is the byte offset of the call's argument area from the callee's.
3889 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3890 // by this amount for a tail call. In a sibling call it must be 0 because the
3891 // caller will deallocate the entire stack and the callee still expects its
3892 // arguments to begin at SP+0. Completely unused for non-tail calls.
3893 int32_t FPDiff = 0;
3894 MachineFrameInfo &MFI = MF.getFrameInfo();
3895 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3896
3897 // Adjust the stack pointer for the new arguments...
3898 // These operations are automatically eliminated by the prolog/epilog pass
3899 if (!IsSibCall)
3900 Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL);
3901
3902 if (!IsSibCall || IsChainCallConv) {
3903 if (!Subtarget->enableFlatScratch()) {
3904 SmallVector<SDValue, 4> CopyFromChains;
3905
3906 // In the HSA case, this should be an identity copy.
3907 SDValue ScratchRSrcReg =
3908 DAG.getCopyFromReg(Chain, dl: DL, Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);
3909 RegsToPass.emplace_back(Args: IsChainCallConv
3910 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3911 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3912 Args&: ScratchRSrcReg);
3913 CopyFromChains.push_back(Elt: ScratchRSrcReg.getValue(R: 1));
3914 Chain = DAG.getTokenFactor(DL, Vals&: CopyFromChains);
3915 }
3916 }
3917
3918 const unsigned NumSpecialInputs = RegsToPass.size();
3919
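  // Outgoing stack argument addresses are 32-bit offsets in the private
  // (scratch) address space.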
3920 MVT PtrVT = MVT::i32;
3921
3922 // Walk the register/memloc assignments, inserting copies/loads.
3923 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3924 CCValAssign &VA = ArgLocs[i];
3925 SDValue Arg = OutVals[i];
3926
3927 // Promote the value if needed.
3928 switch (VA.getLocInfo()) {
3929 case CCValAssign::Full:
3930 break;
3931 case CCValAssign::BCvt:
3932 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
3933 break;
3934 case CCValAssign::ZExt:
3935 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3936 break;
3937 case CCValAssign::SExt:
3938 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3939 break;
3940 case CCValAssign::AExt:
3941 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3942 break;
3943 case CCValAssign::FPExt:
3944 Arg = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3945 break;
3946 default:
3947 llvm_unreachable("Unknown loc info!");
3948 }
3949
3950 if (VA.isRegLoc()) {
3951 RegsToPass.push_back(Elt: std::pair(VA.getLocReg(), Arg));
3952 } else {
3953 assert(VA.isMemLoc());
3954
3955 SDValue DstAddr;
3956 MachinePointerInfo DstInfo;
3957
3958 unsigned LocMemOffset = VA.getLocMemOffset();
3959 int32_t Offset = LocMemOffset;
3960
3961 SDValue PtrOff = DAG.getConstant(Val: Offset, DL, VT: PtrVT);
3962 MaybeAlign Alignment;
3963
3964 if (IsTailCall) {
3965 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3966 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
3967 : VA.getValVT().getStoreSize();
3968
3969 // FIXME: We could do better than the minimum required byval alignment.
3970 Alignment =
3971 Flags.isByVal()
3972 ? Flags.getNonZeroByValAlign()
3973 : commonAlignment(A: Subtarget->getStackAlignment(), Offset);
3974
3975 Offset = Offset + FPDiff;
3976 int FI = MFI.CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true);
3977
3978 DstAddr = DAG.getFrameIndex(FI, VT: PtrVT);
3979 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3980
3981 // Make sure any stack arguments overlapping with where we're storing
3982 // are loaded before this eventual operation. Otherwise they'll be
3983 // clobbered.
3984
3985 // FIXME: Why is this really necessary? This seems to just result in a
3986 // lot of code to copy stack values and write them back to the same
3987 // locations, which are supposed to be immutable?
3988 Chain = addTokenForArgument(Chain, DAG, MFI, ClobberedFI: FI);
3989 } else {
3990 // Stores to the argument stack area are relative to the stack pointer.
3991 SDValue SP = DAG.getCopyFromReg(Chain, dl: DL, Reg: Info->getStackPtrOffsetReg(),
3992 VT: MVT::i32);
3993 DstAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SP, N2: PtrOff);
3994 DstInfo = MachinePointerInfo::getStack(MF, Offset: LocMemOffset);
3995 Alignment =
3996 commonAlignment(A: Subtarget->getStackAlignment(), Offset: LocMemOffset);
3997 }
3998
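      // Byval aggregates are copied into the outgoing argument area with an
      // always-inline memcpy; other arguments are stored directly.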
3999 if (Outs[i].Flags.isByVal()) {
4000 SDValue SizeNode =
4001 DAG.getConstant(Val: Outs[i].Flags.getByValSize(), DL, VT: MVT::i32);
4002 SDValue Cpy =
4003 DAG.getMemcpy(Chain, dl: DL, Dst: DstAddr, Src: Arg, Size: SizeNode,
4004 Alignment: Outs[i].Flags.getNonZeroByValAlign(),
4005 /*isVol = */ false, /*AlwaysInline = */ true,
4006 /*CI=*/nullptr, OverrideTailCall: std::nullopt, DstPtrInfo: DstInfo,
4007 SrcPtrInfo: MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
4008
4009 MemOpChains.push_back(Elt: Cpy);
4010 } else {
4011 SDValue Store =
4012 DAG.getStore(Chain, dl: DL, Val: Arg, Ptr: DstAddr, PtrInfo: DstInfo, Alignment);
4013 MemOpChains.push_back(Elt: Store);
4014 }
4015 }
4016 }
4017
4018 if (!MemOpChains.empty())
4019 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOpChains);
4020
4021 SDValue ReadFirstLaneID =
4022 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
4023
4024 SDValue TokenGlue;
4025 if (CLI.ConvergenceControlToken) {
4026 TokenGlue = DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL, VT: MVT::Glue,
4027 Operand: CLI.ConvergenceControlToken);
4028 }
4029
4030 // Build a sequence of copy-to-reg nodes chained together with token chain
4031 // and flag operands which copy the outgoing args into the appropriate regs.
4032 SDValue InGlue;
4033
4034 unsigned ArgIdx = 0;
4035 for (auto [Reg, Val] : RegsToPass) {
4036 if (ArgIdx++ >= NumSpecialInputs &&
4037 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4038 // For chain calls, the inreg arguments are required to be
4039 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4040 // they are uniform.
4041 //
4042 // For other calls, if an inreg argument is known to be uniform,
4043 // speculatively insert a readfirstlane in case it is in a VGPR.
4044 //
4045 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4046 // value, so let that continue to produce invalid code.
4047
4048 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4049 if (TokenGlue)
4050 ReadfirstlaneArgs.push_back(Elt: TokenGlue);
4051 Val = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Val.getValueType(),
4052 Ops: ReadfirstlaneArgs);
4053 }
4054
4055 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg, N: Val, Glue: InGlue);
4056 InGlue = Chain.getValue(R: 1);
4057 }
4058
4059 // We don't usually want to end the call-sequence here because we would tidy
4060 // the frame up *after* the call. However, in the ABI-changing tail-call case
4061 // we've carefully laid out the parameters so that when sp is reset they'll be
4062 // in the correct location.
4063 if (IsTailCall && !IsSibCall) {
4064 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: 0, Glue: InGlue, DL);
4065 InGlue = Chain.getValue(R: 1);
4066 }
4067
4068 std::vector<SDValue> Ops({Chain});
4069
4070 // Add a redundant copy of the callee global which will not be legalized, as
4071 // we need direct access to the callee later.
4072 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
4073 const GlobalValue *GV = GSD->getGlobal();
4074 Ops.push_back(x: Callee);
4075 Ops.push_back(x: DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i64));
4076 } else {
4077 if (IsTailCall) {
4078 // isEligibleForTailCallOptimization considered whether the call target is
4079 // divergent, but we may still end up with a uniform value in a VGPR.
4080 // Insert a readfirstlane just in case.
4081 SDValue ReadFirstLaneID =
4082 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
4083
4084 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4085 if (TokenGlue)
4086 ReadfirstlaneArgs.push_back(Elt: TokenGlue); // Wire up convergence token.
4087 Callee = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Callee.getValueType(),
4088 Ops: ReadfirstlaneArgs);
4089 }
4090
4091 Ops.push_back(x: Callee);
4092 Ops.push_back(x: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i64));
4093 }
4094
4095 if (IsTailCall) {
4096 // Each tail call may have to adjust the stack by a different amount, so
4097 // this information must travel along with the operation for eventual
4098 // consumption by emitEpilogue.
4099 Ops.push_back(x: DAG.getTargetConstant(Val: FPDiff, DL, VT: MVT::i32));
4100 }
4101
4102 if (IsChainCallConv)
4103 llvm::append_range(C&: Ops, R&: ChainCallSpecialArgs);
4104
4105 // Add argument registers to the end of the list so that they are known live
4106 // into the call.
4107 for (auto &[Reg, Val] : RegsToPass)
4108 Ops.push_back(x: DAG.getRegister(Reg, VT: Val.getValueType()));
4109
4110 // Add a register mask operand representing the call-preserved registers.
4111 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4112 assert(Mask && "Missing call preserved mask for calling convention");
4113 Ops.push_back(x: DAG.getRegisterMask(RegMask: Mask));
4114
4115 if (SDValue Token = CLI.ConvergenceControlToken) {
4116 SmallVector<SDValue, 2> GlueOps;
4117 GlueOps.push_back(Elt: Token);
4118 if (InGlue)
4119 GlueOps.push_back(Elt: InGlue);
4120
4121 InGlue = SDValue(DAG.getMachineNode(Opcode: TargetOpcode::CONVERGENCECTRL_GLUE, dl: DL,
4122 VT: MVT::Glue, Ops: GlueOps),
4123 0);
4124 }
4125
4126 if (InGlue)
4127 Ops.push_back(x: InGlue);
4128
4129 // If we're doing a tail call, use a TC_RETURN here rather than an
4130 // actual call instruction.
4131 if (IsTailCall) {
4132 MFI.setHasTailCall();
4133 unsigned OPC = AMDGPUISD::TC_RETURN;
4134 switch (CallConv) {
4135 case CallingConv::AMDGPU_Gfx:
4136 OPC = AMDGPUISD::TC_RETURN_GFX;
4137 break;
4138 case CallingConv::AMDGPU_CS_Chain:
4139 case CallingConv::AMDGPU_CS_ChainPreserve:
4140 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4141 : AMDGPUISD::TC_RETURN_CHAIN;
4142 break;
4143 }
4144
4145 return DAG.getNode(Opcode: OPC, DL, VT: MVT::Other, Ops);
4146 }
4147
4148 // Returns a chain and a flag for retval copy to use.
4149 SDValue Call = DAG.getNode(Opcode: AMDGPUISD::CALL, DL, ResultTys: {MVT::Other, MVT::Glue}, Ops);
4150 Chain = Call.getValue(R: 0);
4151 InGlue = Call.getValue(R: 1);
4152
4153 uint64_t CalleePopBytes = NumBytes;
4154 Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: CalleePopBytes, Glue: InGlue, DL);
4155 if (!Ins.empty())
4156 InGlue = Chain.getValue(R: 1);
4157
4158 // Handle result values, copying them out of physregs into vregs that we
4159 // return.
4160 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4161 InVals, /*IsThisReturn=*/false, ThisVal: SDValue());
4162}
4163
4164// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4165// except for:
4166 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4167 // 2. Scaled size, where scaled-size = wave-reduction(alloca-size) * wave-size
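// Illustrative example (not taken from the code below): on a wave64 target, an
// alloca of at most 16 bytes per lane becomes wave-reduce-umax(16) << 6 = 1024
// bytes of stack growth, so every lane's swizzled slice of the allocation fits.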
4168SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
4169 SelectionDAG &DAG) const {
4170 const MachineFunction &MF = DAG.getMachineFunction();
4171 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4172
4173 SDLoc dl(Op);
4174 EVT VT = Op.getValueType();
4175 SDValue Chain = Op.getOperand(i: 0);
4176 Register SPReg = Info->getStackPtrOffsetReg();
4177
4178 // Chain the dynamic stack allocation so that it doesn't modify the stack
4179 // pointer when other instructions are using the stack.
4180 Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL: dl);
4181
4182 SDValue Size = Op.getOperand(i: 1);
4183 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, Reg: SPReg, VT);
4184 Align Alignment = cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getAlignValue();
4185
4186 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4187 assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
4188 "Stack grows upwards for AMDGPU");
4189
4190 Chain = BaseAddr.getValue(R: 1);
4191 Align StackAlign = TFL->getStackAlign();
4192 if (Alignment > StackAlign) {
4193 uint64_t ScaledAlignment = (uint64_t)Alignment.value()
4194 << Subtarget->getWavefrontSizeLog2();
4195 uint64_t StackAlignMask = ScaledAlignment - 1;
4196 SDValue TmpAddr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr,
4197 N2: DAG.getConstant(Val: StackAlignMask, DL: dl, VT));
4198 BaseAddr = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: TmpAddr,
4199 N2: DAG.getSignedConstant(Val: -ScaledAlignment, DL: dl, VT));
4200 }
4201
4202 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4203 SDValue NewSP;
4204 if (isa<ConstantSDNode>(Val: Size)) {
4205 // For a constant-sized alloca, scale the alloca size by the wave size.
4206 SDValue ScaledSize = DAG.getNode(
4207 Opcode: ISD::SHL, DL: dl, VT, N1: Size,
4208 N2: DAG.getConstant(Val: Subtarget->getWavefrontSizeLog2(), DL: dl, VT: MVT::i32));
4209 NewSP = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr, N2: ScaledSize); // Value
4210 } else {
4211 // For a dynamically sized alloca, perform a wave-wide reduction to get the
4212 // max of the (divergent) alloca size, then scale it by the wave size.
4213 SDValue WaveReduction =
4214 DAG.getTargetConstant(Val: Intrinsic::amdgcn_wave_reduce_umax, DL: dl, VT: MVT::i32);
4215 Size = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: MVT::i32, N1: WaveReduction,
4216 N2: Size, N3: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32));
4217 SDValue ScaledSize = DAG.getNode(
4218 Opcode: ISD::SHL, DL: dl, VT, N1: Size,
4219 N2: DAG.getConstant(Val: Subtarget->getWavefrontSizeLog2(), DL: dl, VT: MVT::i32));
4220 NewSP =
4221 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr, N2: ScaledSize); // Value in vgpr.
4222 SDValue ReadFirstLaneID =
4223 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: dl, VT: MVT::i32);
4224 NewSP = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: MVT::i32, N1: ReadFirstLaneID,
4225 N2: NewSP);
4226 }
4227
4228 Chain = DAG.getCopyToReg(Chain, dl, Reg: SPReg, N: NewSP); // Output chain
4229 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: SDValue(), DL: dl);
4230
4231 return DAG.getMergeValues(Ops: {BaseAddr, CallSeqEnd}, dl);
4232}
4233
4234SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4235 if (Op.getValueType() != MVT::i32)
4236 return Op; // Defer to cannot select error.
4237
4238 Register SP = getStackPointerRegisterToSaveRestore();
4239 SDLoc SL(Op);
4240
4241 SDValue CopyFromSP = DAG.getCopyFromReg(Chain: Op->getOperand(Num: 0), dl: SL, Reg: SP, VT: MVT::i32);
4242
4243 // Convert from wave uniform to swizzled vector address. This should protect
4244 // against any edge cases where the stacksave result isn't directly used with
4245 // stackrestore.
4246 SDValue VectorAddress =
4247 DAG.getNode(Opcode: AMDGPUISD::WAVE_ADDRESS, DL: SL, VT: MVT::i32, Operand: CopyFromSP);
4248 return DAG.getMergeValues(Ops: {VectorAddress, CopyFromSP.getValue(R: 1)}, dl: SL);
4249}
4250
4251SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4252 SelectionDAG &DAG) const {
4253 SDLoc SL(Op);
4254 assert(Op.getValueType() == MVT::i32);
4255
4256 uint32_t BothRoundHwReg =
4257 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 4);
4258 SDValue GetRoundBothImm = DAG.getTargetConstant(Val: BothRoundHwReg, DL: SL, VT: MVT::i32);
4259
4260 SDValue IntrinID =
4261 DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_getreg, DL: SL, VT: MVT::i32);
4262 SDValue GetReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList: Op->getVTList(),
4263 N1: Op.getOperand(i: 0), N2: IntrinID, N3: GetRoundBothImm);
4264
4265 // There are two rounding modes, one for f32 and one for f64/f16. We only
4266 // report in the standard value range if both are the same.
4267 //
4268 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4269 // ties away from zero is not supported, and the other values are rotated by
4270 // 1.
4271 //
4272 // If the two rounding modes are not the same, report a target defined value.
4273
4274 // Mode register rounding mode fields:
4275 //
4276 // [1:0] Single-precision round mode.
4277 // [3:2] Double/Half-precision round mode.
4278 //
4279 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4280 //
4281 // Hardware Spec
4282 // Toward-0 3 0
4283 // Nearest Even 0 1
4284 // +Inf 1 2
4285 // -Inf 2 3
4286 // NearestAway0 N/A 4
4287 //
4288 // We have to handle all 16 values of the 4-bit field, so we create a 64-bit
4289 // table we can index by the raw hardware mode.
4290 //
4291 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
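// Worked example (illustrative; the table contents are defined elsewhere in
// the backend): for MODE.fp_round == 0b0000, i.e. both fields nearest-even,
// the lookup is (FltRoundConversionTable >> (0 << 2)) & 0xf, the low nibble,
// which per the table above is the standard value 1 (round to nearest). Any
// extracted entry >= 4 is an extended value and gets the +4 offset applied
// below.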
4292
4293 SDValue BitTable =
4294 DAG.getConstant(Val: AMDGPU::FltRoundConversionTable, DL: SL, VT: MVT::i64);
4295
4296 SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
4297 SDValue RoundModeTimesNumBits =
4298 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: GetReg, N2: Two);
4299
4300 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4301 // knew only one mode was demanded.
4302 SDValue TableValue =
4303 DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i64, N1: BitTable, N2: RoundModeTimesNumBits);
4304 SDValue TruncTable = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: TableValue);
4305
4306 SDValue EntryMask = DAG.getConstant(Val: 0xf, DL: SL, VT: MVT::i32);
4307 SDValue TableEntry =
4308 DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: TruncTable, N2: EntryMask);
4309
4310 // There's a gap between the 4-bit encoded table entries and the actual enum
4311 // values, so offset the result if it's an extended value.
4312 SDValue Four = DAG.getConstant(Val: 4, DL: SL, VT: MVT::i32);
4313 SDValue IsStandardValue =
4314 DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: TableEntry, RHS: Four, Cond: ISD::SETULT);
4315 SDValue EnumOffset = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: TableEntry, N2: Four);
4316 SDValue Result = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i32, N1: IsStandardValue,
4317 N2: TableEntry, N3: EnumOffset);
4318
4319 return DAG.getMergeValues(Ops: {Result, GetReg.getValue(R: 1)}, dl: SL);
4320}
4321
4322SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4323 SelectionDAG &DAG) const {
4324 SDLoc SL(Op);
4325
4326 SDValue NewMode = Op.getOperand(i: 1);
4327 assert(NewMode.getValueType() == MVT::i32);
4328
4329 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4330 // hardware MODE.fp_round values.
4331 if (auto *ConstMode = dyn_cast<ConstantSDNode>(Val&: NewMode)) {
4332 uint32_t ClampedVal = std::min(
4333 a: static_cast<uint32_t>(ConstMode->getZExtValue()),
4334 b: static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
4335 NewMode = DAG.getConstant(
4336 Val: AMDGPU::decodeFltRoundToHWConversionTable(FltRounds: ClampedVal), DL: SL, VT: MVT::i32);
4337 } else {
4338 // If we know the input can only be one of the supported standard modes in
4339 // the range 0-3, we can use a simplified mapping to hardware values.
4340 KnownBits KB = DAG.computeKnownBits(Op: NewMode);
4341 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4342 // The supported standard values are 0-3. The extended values start at 8. We
4343 // need to offset by 4 if the value is in the extended range.
4344
4345 if (UseReducedTable) {
4346 // Only the low 16 bits of the table (the four standard entries) are
4346 // needed, so a 32-bit constant suffices.
4347 SDValue BitTable = DAG.getConstant(
4348 Val: AMDGPU::FltRoundToHWConversionTable & 0xffff, DL: SL, VT: MVT::i32);
4349
4350 SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
4351 SDValue RoundModeTimesNumBits =
4352 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: NewMode, N2: Two);
4353
4354 NewMode =
4355 DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: BitTable, N2: RoundModeTimesNumBits);
4356
4357 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4358 // the table extracted bits into inline immediates.
4359 } else {
4360 // table_index = umin(value, value - 4)
4361 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
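// For example (illustrative): a standard input of 2 (+inf) gives
// umin(2, 2 - 4) = umin(2, 0xfffffffe) = 2, while the first extended input, 8,
// gives umin(8, 4) = 4, so the extended entries follow the four standard ones
// in the table.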
4362 SDValue BitTable =
4363 DAG.getConstant(Val: AMDGPU::FltRoundToHWConversionTable, DL: SL, VT: MVT::i64);
4364
4365 SDValue Four = DAG.getConstant(Val: 4, DL: SL, VT: MVT::i32);
4366 SDValue OffsetEnum = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: NewMode, N2: Four);
4367 SDValue IndexVal =
4368 DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewMode, N2: OffsetEnum);
4369
4370 SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
4371 SDValue RoundModeTimesNumBits =
4372 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: IndexVal, N2: Two);
4373
4374 SDValue TableValue =
4375 DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i64, N1: BitTable, N2: RoundModeTimesNumBits);
4376 SDValue TruncTable = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: TableValue);
4377
4378 // No need to mask out the high bits since the setreg will ignore them
4379 // anyway.
4380 NewMode = TruncTable;
4381 }
4382
4383 // Insert a readfirstlane in case the value is a VGPR. We could do this
4384 // earlier and keep more operations scalar, but that interferes with
4385 // combining the source.
4386 SDValue ReadFirstLaneID =
4387 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: SL, VT: MVT::i32);
4388 NewMode = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
4389 N1: ReadFirstLaneID, N2: NewMode);
4390 }
4391
4392 // N.B. The setreg will be later folded into s_round_mode on supported
4393 // targets.
4394 SDValue IntrinID =
4395 DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_setreg, DL: SL, VT: MVT::i32);
4396 uint32_t BothRoundHwReg =
4397 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 4);
4398 SDValue RoundBothImm = DAG.getTargetConstant(Val: BothRoundHwReg, DL: SL, VT: MVT::i32);
4399
4400 SDValue SetReg =
4401 DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VTList: Op->getVTList(), N1: Op.getOperand(i: 0),
4402 N2: IntrinID, N3: RoundBothImm, N4: NewMode);
4403
4404 return SetReg;
4405}
4406
4407SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4408 if (Op->isDivergent())
4409 return SDValue();
4410
4411 switch (cast<MemSDNode>(Val&: Op)->getAddressSpace()) {
4412 case AMDGPUAS::FLAT_ADDRESS:
4413 case AMDGPUAS::GLOBAL_ADDRESS:
4414 case AMDGPUAS::CONSTANT_ADDRESS:
4415 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4416 break;
4417 default:
4418 return SDValue();
4419 }
4420
4421 return Op;
4422}
4423
4424 // Work around DAG legality rules that are based only on the result type.
4425SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4426 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4427 SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0);
4428 EVT SrcVT = Src.getValueType();
4429
4430 if (SrcVT.getScalarType() != MVT::bf16)
4431 return Op;
4432
4433 SDLoc SL(Op);
4434 SDValue BitCast =
4435 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: SrcVT.changeTypeToInteger(), Operand: Src);
4436
4437 EVT DstVT = Op.getValueType();
4438 if (IsStrict)
4439 llvm_unreachable("Need STRICT_BF16_TO_FP");
4440
4441 return DAG.getNode(Opcode: ISD::BF16_TO_FP, DL: SL, VT: DstVT, Operand: BitCast);
4442}
4443
4444SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4445 SDLoc SL(Op);
4446 if (Op.getValueType() != MVT::i64)
4447 return Op;
4448
4449 uint32_t ModeHwReg =
4450 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);
4451 SDValue ModeHwRegImm = DAG.getTargetConstant(Val: ModeHwReg, DL: SL, VT: MVT::i32);
4452 uint32_t TrapHwReg =
4453 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
4454 SDValue TrapHwRegImm = DAG.getTargetConstant(Val: TrapHwReg, DL: SL, VT: MVT::i32);
4455
4456 SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other);
4457 SDValue IntrinID =
4458 DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_getreg, DL: SL, VT: MVT::i32);
4459 SDValue GetModeReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList,
4460 N1: Op.getOperand(i: 0), N2: IntrinID, N3: ModeHwRegImm);
4461 SDValue GetTrapReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList,
4462 N1: Op.getOperand(i: 0), N2: IntrinID, N3: TrapHwRegImm);
4463 SDValue TokenReg =
4464 DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: GetModeReg.getValue(R: 1),
4465 N2: GetTrapReg.getValue(R: 1));
4466
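// Element 0 of the v2i32 (the MODE read) becomes the low 32 bits of the i64
// result after the bitcast, and the TRAPSTS read the high 32 bits, matching
// the extraction order in lowerSET_FPENV below.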
4467 SDValue CvtPtr =
4468 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: GetModeReg, N2: GetTrapReg);
4469 SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);
4470
4471 return DAG.getMergeValues(Ops: {Result, TokenReg}, dl: SL);
4472}
4473
4474SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4475 SDLoc SL(Op);
4476 if (Op.getOperand(i: 1).getValueType() != MVT::i64)
4477 return Op;
4478
4479 SDValue Input = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1));
4480 SDValue NewModeReg = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Input,
4481 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
4482 SDValue NewTrapReg = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Input,
4483 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
4484
4485 SDValue ReadFirstLaneID =
4486 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: SL, VT: MVT::i32);
4487 NewModeReg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
4488 N1: ReadFirstLaneID, N2: NewModeReg);
4489 NewTrapReg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
4490 N1: ReadFirstLaneID, N2: NewTrapReg);
4491
4492 unsigned ModeHwReg =
4493 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);
4494 SDValue ModeHwRegImm = DAG.getTargetConstant(Val: ModeHwReg, DL: SL, VT: MVT::i32);
4495 unsigned TrapHwReg =
4496 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
4497 SDValue TrapHwRegImm = DAG.getTargetConstant(Val: TrapHwReg, DL: SL, VT: MVT::i32);
4498
4499 SDValue IntrinID =
4500 DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_setreg, DL: SL, VT: MVT::i32);
4501 SDValue SetModeReg =
4502 DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VT: MVT::Other, N1: Op.getOperand(i: 0),
4503 N2: IntrinID, N3: ModeHwRegImm, N4: NewModeReg);
4504 SDValue SetTrapReg =
4505 DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VT: MVT::Other, N1: Op.getOperand(i: 0),
4506 N2: IntrinID, N3: TrapHwRegImm, N4: NewTrapReg);
4507 return DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: SetTrapReg, N2: SetModeReg);
4508}
4509
4510Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4511 const MachineFunction &MF) const {
4512 const Function &Fn = MF.getFunction();
4513
4514 Register Reg = StringSwitch<Register>(RegName)
4515 .Case(S: "m0", Value: AMDGPU::M0)
4516 .Case(S: "exec", Value: AMDGPU::EXEC)
4517 .Case(S: "exec_lo", Value: AMDGPU::EXEC_LO)
4518 .Case(S: "exec_hi", Value: AMDGPU::EXEC_HI)
4519 .Case(S: "flat_scratch", Value: AMDGPU::FLAT_SCR)
4520 .Case(S: "flat_scratch_lo", Value: AMDGPU::FLAT_SCR_LO)
4521 .Case(S: "flat_scratch_hi", Value: AMDGPU::FLAT_SCR_HI)
4522 .Default(Value: Register());
4523 if (!Reg)
4524 return Reg;
4525
4526 if (!Subtarget->hasFlatScrRegister() &&
4527 Subtarget->getRegisterInfo()->regsOverlap(RegA: Reg, RegB: AMDGPU::FLAT_SCR)) {
4528 Fn.getContext().emitError(ErrorStr: Twine("invalid register \"" + StringRef(RegName) +
4529 "\" for subtarget."));
4530 }
4531
4532 switch (Reg) {
4533 case AMDGPU::M0:
4534 case AMDGPU::EXEC_LO:
4535 case AMDGPU::EXEC_HI:
4536 case AMDGPU::FLAT_SCR_LO:
4537 case AMDGPU::FLAT_SCR_HI:
4538 if (VT.getSizeInBits() == 32)
4539 return Reg;
4540 break;
4541 case AMDGPU::EXEC:
4542 case AMDGPU::FLAT_SCR:
4543 if (VT.getSizeInBits() == 64)
4544 return Reg;
4545 break;
4546 default:
4547 llvm_unreachable("missing register type checking");
4548 }
4549
4550 report_fatal_error(
4551 reason: Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4552}
4553
4554// If kill is not the last instruction, split the block so kill is always a
4555// proper terminator.
4556MachineBasicBlock *
4557SITargetLowering::splitKillBlock(MachineInstr &MI,
4558 MachineBasicBlock *BB) const {
4559 MachineBasicBlock *SplitBB = BB->splitAt(SplitInst&: MI, /*UpdateLiveIns=*/true);
4560 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4561 MI.setDesc(TII->getKillTerminatorFromPseudo(Opcode: MI.getOpcode()));
4562 return SplitBB;
4563}
4564
4565 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4566// \p MI will be the only instruction in the loop body block. Otherwise, it will
4567// be the first instruction in the remainder block.
4568//
4569/// \returns { LoopBody, Remainder }
4570static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4571splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
4572 MachineFunction *MF = MBB.getParent();
4573 MachineBasicBlock::iterator I(&MI);
4574
4575 // To insert the loop we need to split the block. Move everything after this
4576 // point to a new block, and insert a new empty block between the two.
4577 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4578 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4579 MachineFunction::iterator MBBI(MBB);
4580 ++MBBI;
4581
4582 MF->insert(MBBI, MBB: LoopBB);
4583 MF->insert(MBBI, MBB: RemainderBB);
4584
4585 LoopBB->addSuccessor(Succ: LoopBB);
4586 LoopBB->addSuccessor(Succ: RemainderBB);
4587
4588 // Move the rest of the block into a new block.
4589 RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
4590
4591 if (InstInLoop) {
4592 auto Next = std::next(x: I);
4593
4594 // Move instruction to loop body.
4595 LoopBB->splice(Where: LoopBB->begin(), Other: &MBB, From: I, To: Next);
4596
4597 // Move the rest of the block.
4598 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: Next, To: MBB.end());
4599 } else {
4600 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: I, To: MBB.end());
4601 }
4602
4603 MBB.addSuccessor(Succ: LoopBB);
4604
4605 return std::pair(LoopBB, RemainderBB);
4606}
4607
4608/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4609void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
4610 MachineBasicBlock *MBB = MI.getParent();
4611 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4612 auto I = MI.getIterator();
4613 auto E = std::next(x: I);
4614
4615 // clang-format off
4616 BuildMI(BB&: *MBB, I: E, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_WAITCNT))
4617 .addImm(Val: 0);
4618 // clang-format on
4619
4620 MIBundleBuilder Bundler(*MBB, I, E);
4621 finalizeBundle(MBB&: *MBB, FirstMI: Bundler.begin());
4622}
4623
4624MachineBasicBlock *
4625SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
4626 MachineBasicBlock *BB) const {
4627 const DebugLoc &DL = MI.getDebugLoc();
4628
4629 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4630
4631 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4632
4633 // Apparently kill flags are only valid if the def is in the same block?
4634 if (MachineOperand *Src = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::data0))
4635 Src->setIsKill(false);
4636
4637 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB&: *BB, InstInLoop: true);
4638
4639 MachineBasicBlock::iterator I = LoopBB->end();
4640
4641 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4642 Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: AMDGPU::Hwreg::OFFSET_MEM_VIOL, Values: 1);
4643
4644 // Clear TRAP_STS.MEM_VIOL
4645 BuildMI(BB&: *LoopBB, I: LoopBB->begin(), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_IMM32_B32))
4646 .addImm(Val: 0)
4647 .addImm(Val: EncodedReg);
4648
4649 bundleInstWithWaitcnt(MI);
4650
4651 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
4652
4653 // Load and check TRAP_STS.MEM_VIOL
4654 BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: Reg)
4655 .addImm(Val: EncodedReg);
4656
4657 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4658 BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
4659 .addReg(RegNo: Reg, flags: RegState::Kill)
4660 .addImm(Val: 0);
4661 // clang-format off
4662 BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
4663 .addMBB(MBB: LoopBB);
4664 // clang-format on
4665
4666 return RemainderBB;
4667}
4668
4669// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4670// wavefront. If the value is uniform and just happens to be in a VGPR, this
4671// will only do one iteration. In the worst case, this will loop 64 times.
4672//
4673// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
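// Roughly, the emitted structure is (a sketch for orientation, not the exact
// MIR; the indexed access itself is inserted by the caller at the returned
// iterator):
//
//   saved = exec
//   loop:
//     idx   = v_readfirstlane_b32(IdxReg)
//     match = v_cmp_eq_u32(idx, IdxReg)       // lanes sharing this index value
//     exec &= match                           // s_and_saveexec
//     <indexed access with M0 / SGPR index = idx + Offset>
//     exec = exec_at_iteration_entry & ~match // s_xor retires serviced lanes
//     s_cbranch_execnz loop
//
// The caller restores the saved exec mask after the loop.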
4674static MachineBasicBlock::iterator
4675emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
4676 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4677 const DebugLoc &DL, const MachineOperand &Idx,
4678 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4679 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4680 Register &SGPRIdxReg) {
4681
4682 MachineFunction *MF = OrigBB.getParent();
4683 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4684 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4685 MachineBasicBlock::iterator I = LoopBB.begin();
4686
4687 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4688 Register PhiExec = MRI.createVirtualRegister(RegClass: BoolRC);
4689 Register NewExec = MRI.createVirtualRegister(RegClass: BoolRC);
4690 Register CurrentIdxReg =
4691 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
4692 Register CondReg = MRI.createVirtualRegister(RegClass: BoolRC);
4693
4694 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiReg)
4695 .addReg(RegNo: InitReg)
4696 .addMBB(MBB: &OrigBB)
4697 .addReg(RegNo: ResultReg)
4698 .addMBB(MBB: &LoopBB);
4699
4700 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiExec)
4701 .addReg(RegNo: InitSaveExecReg)
4702 .addMBB(MBB: &OrigBB)
4703 .addReg(RegNo: NewExec)
4704 .addMBB(MBB: &LoopBB);
4705
4706 // Read the next variant <- also loop target.
4707 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurrentIdxReg)
4708 .addReg(RegNo: Idx.getReg(), flags: getUndefRegState(B: Idx.isUndef()));
4709
4710 // Compare the just read M0 value to all possible Idx values.
4711 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CMP_EQ_U32_e64), DestReg: CondReg)
4712 .addReg(RegNo: CurrentIdxReg)
4713 .addReg(RegNo: Idx.getReg(), flags: 0, SubReg: Idx.getSubReg());
4714
4715 // Update EXEC, save the original EXEC value to VCC.
4716 BuildMI(BB&: LoopBB, I, MIMD: DL,
4717 MCID: TII->get(Opcode: ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4718 : AMDGPU::S_AND_SAVEEXEC_B64),
4719 DestReg: NewExec)
4720 .addReg(RegNo: CondReg, flags: RegState::Kill);
4721
4722 MRI.setSimpleHint(VReg: NewExec, PrefReg: CondReg);
4723
4724 if (UseGPRIdxMode) {
4725 if (Offset == 0) {
4726 SGPRIdxReg = CurrentIdxReg;
4727 } else {
4728 SGPRIdxReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
4729 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: SGPRIdxReg)
4730 .addReg(RegNo: CurrentIdxReg, flags: RegState::Kill)
4731 .addImm(Val: Offset);
4732 }
4733 } else {
4734 // Move index from VCC into M0
4735 if (Offset == 0) {
4736 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
4737 .addReg(RegNo: CurrentIdxReg, flags: RegState::Kill);
4738 } else {
4739 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: AMDGPU::M0)
4740 .addReg(RegNo: CurrentIdxReg, flags: RegState::Kill)
4741 .addImm(Val: Offset);
4742 }
4743 }
4744
4745 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4746 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4747 MachineInstr *InsertPt =
4748 BuildMI(BB&: LoopBB, I, MIMD: DL,
4749 MCID: TII->get(Opcode: ST.isWave32() ? AMDGPU::S_XOR_B32_term
4750 : AMDGPU::S_XOR_B64_term),
4751 DestReg: Exec)
4752 .addReg(RegNo: Exec)
4753 .addReg(RegNo: NewExec);
4754
4755 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4756 // s_cbranch_scc0?
4757
4758 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4759 // clang-format off
4760 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
4761 .addMBB(MBB: &LoopBB);
4762 // clang-format on
4763
4764 return InsertPt->getIterator();
4765}
4766
4767 // This has slightly sub-optimal regalloc when the source vector is killed by
4768 // the read. The register allocator does not understand that the kill is
4769 // per-workitem, so the source is kept alive for the whole loop and we end up
4770 // not re-using a subregister from it, using one more VGPR than necessary. This
4771 // VGPR was saved when this was expanded after register allocation.
4772static MachineBasicBlock::iterator
4773loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
4774 unsigned InitResultReg, unsigned PhiReg, int Offset,
4775 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4776 MachineFunction *MF = MBB.getParent();
4777 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4778 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4779 MachineRegisterInfo &MRI = MF->getRegInfo();
4780 const DebugLoc &DL = MI.getDebugLoc();
4781 MachineBasicBlock::iterator I(&MI);
4782
4783 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
4784 Register DstReg = MI.getOperand(i: 0).getReg();
4785 Register SaveExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
4786 Register TmpExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
4787 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4788 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4789
4790 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: TmpExec);
4791
4792 // Save the EXEC mask
4793 // clang-format off
4794 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: MovExecOpc), DestReg: SaveExec)
4795 .addReg(RegNo: Exec);
4796 // clang-format on
4797
4798 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, InstInLoop: false);
4799
4800 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
4801
4802 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, OrigBB&: MBB, LoopBB&: *LoopBB, DL, Idx: *Idx,
4803 InitReg: InitResultReg, ResultReg: DstReg, PhiReg, InitSaveExecReg: TmpExec,
4804 Offset, UseGPRIdxMode, SGPRIdxReg);
4805
4806 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
4807 MachineFunction::iterator MBBI(LoopBB);
4808 ++MBBI;
4809 MF->insert(MBBI, MBB: LandingPad);
4810 LoopBB->removeSuccessor(Succ: RemainderBB);
4811 LandingPad->addSuccessor(Succ: RemainderBB);
4812 LoopBB->addSuccessor(Succ: LandingPad);
4813 MachineBasicBlock::iterator First = LandingPad->begin();
4814 // clang-format off
4815 BuildMI(BB&: *LandingPad, I: First, MIMD: DL, MCID: TII->get(Opcode: MovExecOpc), DestReg: Exec)
4816 .addReg(RegNo: SaveExec);
4817 // clang-format on
4818
4819 return InsPt;
4820}
4821
4822// Returns subreg index, offset
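// For example, with a 128-bit (4 x 32-bit) super-class an Offset of 2 yields
// {sub2, 0}, while an out-of-range Offset such as 5 is returned unchanged as
// {sub0, 5} so we never name a nonexistent subregister.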
4823static std::pair<unsigned, int>
4824computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
4825 const TargetRegisterClass *SuperRC, unsigned VecReg,
4826 int Offset) {
4827 int NumElts = TRI.getRegSizeInBits(RC: *SuperRC) / 32;
4828
4829 // Skip out of bounds offsets, or else we would end up using an undefined
4830 // register.
4831 if (Offset >= NumElts || Offset < 0)
4832 return std::pair(AMDGPU::sub0, Offset);
4833
4834 return std::pair(SIRegisterInfo::getSubRegFromChannel(Channel: Offset), 0);
4835}
4836
4837static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
4838 MachineRegisterInfo &MRI, MachineInstr &MI,
4839 int Offset) {
4840 MachineBasicBlock *MBB = MI.getParent();
4841 const DebugLoc &DL = MI.getDebugLoc();
4842 MachineBasicBlock::iterator I(&MI);
4843
4844 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
4845
4846 assert(Idx->getReg() != AMDGPU::NoRegister);
4847
4848 if (Offset == 0) {
4849 // clang-format off
4850 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
4851 .add(MO: *Idx);
4852 // clang-format on
4853 } else {
4854 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: AMDGPU::M0)
4855 .add(MO: *Idx)
4856 .addImm(Val: Offset);
4857 }
4858}
4859
4860static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
4861 MachineRegisterInfo &MRI, MachineInstr &MI,
4862 int Offset) {
4863 MachineBasicBlock *MBB = MI.getParent();
4864 const DebugLoc &DL = MI.getDebugLoc();
4865 MachineBasicBlock::iterator I(&MI);
4866
4867 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
4868
4869 if (Offset == 0)
4870 return Idx->getReg();
4871
4872 Register Tmp = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
4873 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: Tmp)
4874 .add(MO: *Idx)
4875 .addImm(Val: Offset);
4876 return Tmp;
4877}
4878
4879static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
4880 MachineBasicBlock &MBB,
4881 const GCNSubtarget &ST) {
4882 const SIInstrInfo *TII = ST.getInstrInfo();
4883 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4884 MachineFunction *MF = MBB.getParent();
4885 MachineRegisterInfo &MRI = MF->getRegInfo();
4886
4887 Register Dst = MI.getOperand(i: 0).getReg();
4888 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
4889 Register SrcReg = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src)->getReg();
4890 int Offset = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm();
4891
4892 const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcReg);
4893 const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg());
4894
4895 unsigned SubReg;
4896 std::tie(args&: SubReg, args&: Offset) =
4897 computeIndirectRegAndOffset(TRI, SuperRC: VecRC, VecReg: SrcReg, Offset);
4898
4899 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4900
4901 // Check for a SGPR index.
4902 if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) {
4903 MachineBasicBlock::iterator I(&MI);
4904 const DebugLoc &DL = MI.getDebugLoc();
4905
4906 if (UseGPRIdxMode) {
4907 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4908 // to avoid interfering with other uses, so probably requires a new
4909 // optimization pass.
4910 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
4911
4912 const MCInstrDesc &GPRIDXDesc =
4913 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: true);
4914 BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
4915 .addReg(RegNo: SrcReg)
4916 .addReg(RegNo: Idx)
4917 .addImm(Val: SubReg);
4918 } else {
4919 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
4920
4921 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: Dst)
4922 .addReg(RegNo: SrcReg, flags: 0, SubReg)
4923 .addReg(RegNo: SrcReg, flags: RegState::Implicit);
4924 }
4925
4926 MI.eraseFromParent();
4927
4928 return &MBB;
4929 }
4930
4931 // Control flow needs to be inserted if indexing with a VGPR.
4932 const DebugLoc &DL = MI.getDebugLoc();
4933 MachineBasicBlock::iterator I(&MI);
4934
4935 Register PhiReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
4936 Register InitReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
4937
4938 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: InitReg);
4939
4940 Register SGPRIdxReg;
4941 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: InitReg, PhiReg, Offset,
4942 UseGPRIdxMode, SGPRIdxReg);
4943
4944 MachineBasicBlock *LoopBB = InsPt->getParent();
4945
4946 if (UseGPRIdxMode) {
4947 const MCInstrDesc &GPRIDXDesc =
4948 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: true);
4949
4950 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
4951 .addReg(RegNo: SrcReg)
4952 .addReg(RegNo: SGPRIdxReg)
4953 .addImm(Val: SubReg);
4954 } else {
4955 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: Dst)
4956 .addReg(RegNo: SrcReg, flags: 0, SubReg)
4957 .addReg(RegNo: SrcReg, flags: RegState::Implicit);
4958 }
4959
4960 MI.eraseFromParent();
4961
4962 return LoopBB;
4963}
4964
4965static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
4966 MachineBasicBlock &MBB,
4967 const GCNSubtarget &ST) {
4968 const SIInstrInfo *TII = ST.getInstrInfo();
4969 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4970 MachineFunction *MF = MBB.getParent();
4971 MachineRegisterInfo &MRI = MF->getRegInfo();
4972
4973 Register Dst = MI.getOperand(i: 0).getReg();
4974 const MachineOperand *SrcVec = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src);
4975 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
4976 const MachineOperand *Val = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::val);
4977 int Offset = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm();
4978 const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcVec->getReg());
4979 const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg());
4980
4981 // This can be an immediate, but will be folded later.
4982 assert(Val->getReg());
4983
4984 unsigned SubReg;
4985 std::tie(args&: SubReg, args&: Offset) =
4986 computeIndirectRegAndOffset(TRI, SuperRC: VecRC, VecReg: SrcVec->getReg(), Offset);
4987 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4988
4989 if (Idx->getReg() == AMDGPU::NoRegister) {
4990 MachineBasicBlock::iterator I(&MI);
4991 const DebugLoc &DL = MI.getDebugLoc();
4992
4993 assert(Offset == 0);
4994
4995 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: Dst)
4996 .add(MO: *SrcVec)
4997 .add(MO: *Val)
4998 .addImm(Val: SubReg);
4999
5000 MI.eraseFromParent();
5001 return &MBB;
5002 }
5003
5004 // Check for a SGPR index.
5005 if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) {
5006 MachineBasicBlock::iterator I(&MI);
5007 const DebugLoc &DL = MI.getDebugLoc();
5008
5009 if (UseGPRIdxMode) {
5010 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5011
5012 const MCInstrDesc &GPRIDXDesc =
5013 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false);
5014 BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
5015 .addReg(RegNo: SrcVec->getReg())
5016 .add(MO: *Val)
5017 .addReg(RegNo: Idx)
5018 .addImm(Val: SubReg);
5019 } else {
5020 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5021
5022 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5023 VecSize: TRI.getRegSizeInBits(RC: *VecRC), EltSize: 32, IsSGPR: false);
5024 BuildMI(BB&: MBB, I, MIMD: DL, MCID: MovRelDesc, DestReg: Dst)
5025 .addReg(RegNo: SrcVec->getReg())
5026 .add(MO: *Val)
5027 .addImm(Val: SubReg);
5028 }
5029 MI.eraseFromParent();
5030 return &MBB;
5031 }
5032
5033 // Control flow needs to be inserted if indexing with a VGPR.
5034 if (Val->isReg())
5035 MRI.clearKillFlags(Reg: Val->getReg());
5036
5037 const DebugLoc &DL = MI.getDebugLoc();
5038
5039 Register PhiReg = MRI.createVirtualRegister(RegClass: VecRC);
5040
5041 Register SGPRIdxReg;
5042 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: SrcVec->getReg(), PhiReg, Offset,
5043 UseGPRIdxMode, SGPRIdxReg);
5044 MachineBasicBlock *LoopBB = InsPt->getParent();
5045
5046 if (UseGPRIdxMode) {
5047 const MCInstrDesc &GPRIDXDesc =
5048 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false);
5049
5050 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
5051 .addReg(RegNo: PhiReg)
5052 .add(MO: *Val)
5053 .addReg(RegNo: SGPRIdxReg)
5054 .addImm(Val: SubReg);
5055 } else {
5056 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5057 VecSize: TRI.getRegSizeInBits(RC: *VecRC), EltSize: 32, IsSGPR: false);
5058 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: MovRelDesc, DestReg: Dst)
5059 .addReg(RegNo: PhiReg)
5060 .add(MO: *Val)
5061 .addImm(Val: SubReg);
5062 }
5063
5064 MI.eraseFromParent();
5065 return LoopBB;
5066}
5067
5068static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
5069 switch (Opc) {
5070 case AMDGPU::S_MIN_U32:
5071 return std::numeric_limits<uint32_t>::max();
5072 case AMDGPU::S_MIN_I32:
5073 return std::numeric_limits<int32_t>::max();
5074 case AMDGPU::S_MAX_U32:
5075 return std::numeric_limits<uint32_t>::min();
5076 case AMDGPU::S_MAX_I32:
5077 return std::numeric_limits<int32_t>::min();
5078 case AMDGPU::S_ADD_I32:
5079 case AMDGPU::S_SUB_I32:
5080 case AMDGPU::S_OR_B32:
5081 case AMDGPU::S_XOR_B32:
5082 return std::numeric_limits<uint32_t>::min();
5083 case AMDGPU::S_AND_B32:
5084 return std::numeric_limits<uint32_t>::max();
5085 default:
5086 llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
5087 }
5088}
5089
5090static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5091 MachineBasicBlock &BB,
5092 const GCNSubtarget &ST,
5093 unsigned Opc) {
5094 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
5095 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5096 const DebugLoc &DL = MI.getDebugLoc();
5097 const SIInstrInfo *TII = ST.getInstrInfo();
5098
5099 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5100 Register SrcReg = MI.getOperand(i: 1).getReg();
5101 bool isSGPR = TRI->isSGPRClass(RC: MRI.getRegClass(Reg: SrcReg));
5102 Register DstReg = MI.getOperand(i: 0).getReg();
5103 MachineBasicBlock *RetBB = nullptr;
5104 if (isSGPR) {
5105 switch (Opc) {
5106 case AMDGPU::S_MIN_U32:
5107 case AMDGPU::S_MIN_I32:
5108 case AMDGPU::S_MAX_U32:
5109 case AMDGPU::S_MAX_I32:
5110 case AMDGPU::S_AND_B32:
5111 case AMDGPU::S_OR_B32: {
5112 // Idempotent operations.
5113 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstReg).addReg(RegNo: SrcReg);
5114 RetBB = &BB;
5115 break;
5116 }
5117 case AMDGPU::S_XOR_B32:
5118 case AMDGPU::S_ADD_I32:
5119 case AMDGPU::S_SUB_I32: {
5120 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5121 const TargetRegisterClass *DstRegClass = MRI.getRegClass(Reg: DstReg);
5122 Register ExecMask = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5123 Register ActiveLanes = MRI.createVirtualRegister(RegClass: DstRegClass);
5124
5125 bool IsWave32 = ST.isWave32();
5126 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5127 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5128 unsigned CountReg =
5129 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5130
5131 auto Exec =
5132 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: ExecMask).addReg(RegNo: ExecReg);
5133
5134 auto NewAccumulator = BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: CountReg), DestReg: ActiveLanes)
5135 .addReg(RegNo: Exec->getOperand(i: 0).getReg());
5136
5137 switch (Opc) {
5138 case AMDGPU::S_XOR_B32: {
5139 // Performing an XOR operation on a uniform value
5140 // depends on the parity of the number of active lanes.
5141 // For even parity the result will be 0; for odd
5142 // parity the result will be the same as the input value.
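// For example, with a uniform input x and 5 active lanes the wave-wide XOR
// is (5 & 1) * x = x, while with 6 active lanes it is 0; that is exactly
// what the S_AND_B32 / S_MUL_I32 pair below computes.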
5143 Register ParityRegister = MRI.createVirtualRegister(RegClass: DstRegClass);
5144
5145 auto ParityReg =
5146 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_AND_B32), DestReg: ParityRegister)
5147 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg())
5148 .addImm(Val: 1);
5149 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
5150 .addReg(RegNo: SrcReg)
5151 .addReg(RegNo: ParityReg->getOperand(i: 0).getReg());
5152 break;
5153 }
5154 case AMDGPU::S_SUB_I32: {
5155 Register NegatedVal = MRI.createVirtualRegister(RegClass: DstRegClass);
5156
5157 // Take the negation of the source operand.
5158 auto InvertedValReg =
5159 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: NegatedVal)
5160 .addImm(Val: -1)
5161 .addReg(RegNo: SrcReg);
5162 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
5163 .addReg(RegNo: InvertedValReg->getOperand(i: 0).getReg())
5164 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg());
5165 break;
5166 }
5167 case AMDGPU::S_ADD_I32: {
5168 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
5169 .addReg(RegNo: SrcReg)
5170 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg());
5171 break;
5172 }
5173 }
5174 RetBB = &BB;
5175 }
5176 }
5177 } else {
5178 // TODO: Implement the DPP strategy and switch based on the immediate strategy
5179 // operand. For now, for all the cases (default, Iterative and DPP), we use
5180 // the iterative approach by default.
5181
5182 // To reduce the VGPR using the iterative approach, we need to iterate over
5183 // all the active lanes. Lowering consists of a ComputeLoop, which iterates
5184 // over only the active lanes. We use a copy of the EXEC register as the
5185 // induction variable; each iteration clears one bit with bitset0 so that we
5186 // get the next active lane in the next iteration.
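// As a rough sketch (not the exact MIR the code below builds):
//
//   acc  = identity(Opc); bits = exec
//   ComputeLoop:
//     lane = s_ff1(bits)                  // lowest remaining active lane
//     val  = v_readlane_b32(Src, lane)
//     Dst  = Opc(acc, val); acc = Dst     // running reduction, fed by PHIs
//     bits = s_bitset0(bits, lane)
//     s_cmp_lg bits, 0
//     s_cbranch_scc1 ComputeLoop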
5187 MachineBasicBlock::iterator I = BB.end();
5188 Register SrcReg = MI.getOperand(i: 1).getReg();
5189
5190 // Create control flow for the loop by splitting MI's basic block into a
5191 // loop body (ComputeLoop) and a remainder block (ComputeEnd).
5192 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, MBB&: BB, InstInLoop: true);
5193
5194 // Create virtual registers required for lowering.
5195 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5196 const TargetRegisterClass *DstRegClass = MRI.getRegClass(Reg: DstReg);
5197 Register LoopIterator = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5198 Register InitalValReg = MRI.createVirtualRegister(RegClass: DstRegClass);
5199
5200 Register AccumulatorReg = MRI.createVirtualRegister(RegClass: DstRegClass);
5201 Register ActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5202 Register NewActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5203
5204 Register FF1Reg = MRI.createVirtualRegister(RegClass: DstRegClass);
5205 Register LaneValueReg =
5206 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5207
5208 bool IsWave32 = ST.isWave32();
5209 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5210 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5211
5212 // Create the initial values of the induction variable (from EXEC) and the
5213 // accumulator, and insert a branch to the newly created ComputeLoop block.
5214 uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
5215 auto TmpSReg =
5216 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: LoopIterator).addReg(RegNo: ExecReg);
5217 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: InitalValReg)
5218 .addImm(Val: InitalValue);
5219 // clang-format off
5220 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_BRANCH))
5221 .addMBB(MBB: ComputeLoop);
5222 // clang-format on
5223
5224 // Start constructing ComputeLoop
5225 I = ComputeLoop->end();
5226 auto Accumulator =
5227 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::PHI), DestReg: AccumulatorReg)
5228 .addReg(RegNo: InitalValReg)
5229 .addMBB(MBB: &BB);
5230 auto ActiveBits =
5231 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::PHI), DestReg: ActiveBitsReg)
5232 .addReg(RegNo: TmpSReg->getOperand(i: 0).getReg())
5233 .addMBB(MBB: &BB);
5234
5235 // Perform the computations
5236 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5237 auto FF1 = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: SFFOpc), DestReg: FF1Reg)
5238 .addReg(RegNo: ActiveBits->getOperand(i: 0).getReg());
5239 auto LaneValue = BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
5240 MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32), DestReg: LaneValueReg)
5241 .addReg(RegNo: SrcReg)
5242 .addReg(RegNo: FF1->getOperand(i: 0).getReg());
5243 auto NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstReg)
5244 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg())
5245 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg());
5246
5247 // Manipulate the iterator to get the next active lane
5248 unsigned BITSETOpc =
5249 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5250 auto NewActiveBits =
5251 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: BITSETOpc), DestReg: NewActiveBitsReg)
5252 .addReg(RegNo: FF1->getOperand(i: 0).getReg())
5253 .addReg(RegNo: ActiveBits->getOperand(i: 0).getReg());
5254
5255 // Add phi nodes
5256 Accumulator.addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg())
5257 .addMBB(MBB: ComputeLoop);
5258 ActiveBits.addReg(RegNo: NewActiveBits->getOperand(i: 0).getReg())
5259 .addMBB(MBB: ComputeLoop);
5260
5261 // Create the conditional loop-back branch.
5262 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5263 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: CMPOpc))
5264 .addReg(RegNo: NewActiveBits->getOperand(i: 0).getReg())
5265 .addImm(Val: 0);
5266 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
5267 .addMBB(MBB: ComputeLoop);
5268
5269 RetBB = ComputeEnd;
5270 }
5271 MI.eraseFromParent();
5272 return RetBB;
5273}
5274
5275MachineBasicBlock *
5276SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
5277 MachineBasicBlock *BB) const {
5278
5279 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5280 MachineFunction *MF = BB->getParent();
5281 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
5282
5283 switch (MI.getOpcode()) {
5284 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5285 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MIN_U32);
5286 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5287 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MIN_I32);
5288 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5289 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MAX_U32);
5290 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5291 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MAX_I32);
5292 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5293 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_ADD_I32);
5294 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5295 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_SUB_I32);
5296 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5297 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_AND_B32);
5298 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5299 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_OR_B32);
5300 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5301 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_XOR_B32);
5302 case AMDGPU::S_UADDO_PSEUDO:
5303 case AMDGPU::S_USUBO_PSEUDO: {
5304 const DebugLoc &DL = MI.getDebugLoc();
5305 MachineOperand &Dest0 = MI.getOperand(i: 0);
5306 MachineOperand &Dest1 = MI.getOperand(i: 1);
5307 MachineOperand &Src0 = MI.getOperand(i: 2);
5308 MachineOperand &Src1 = MI.getOperand(i: 3);
5309
5310 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5311 ? AMDGPU::S_ADD_I32
5312 : AMDGPU::S_SUB_I32;
5313 // clang-format off
5314 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest0.getReg())
5315 .add(MO: Src0)
5316 .add(MO: Src1);
5317 // clang-format on
5318
5319 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B64), DestReg: Dest1.getReg())
5320 .addImm(Val: 1)
5321 .addImm(Val: 0);
5322
5323 MI.eraseFromParent();
5324 return BB;
5325 }
5326 case AMDGPU::S_ADD_U64_PSEUDO:
5327 case AMDGPU::S_SUB_U64_PSEUDO: {
5328 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5329 // For GFX12, we emit s_add_u64 and s_sub_u64.
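// Concretely, the pre-GFX12 path below splits both operands with
// buildExtractSubRegOrImm and emits s_add_u32/s_sub_u32 on the low halves
// followed by s_addc_u32/s_subb_u32 on the high halves, threading the carry
// through SCC before recombining with REG_SEQUENCE.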
5330 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5331 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5332 const DebugLoc &DL = MI.getDebugLoc();
5333 MachineOperand &Dest = MI.getOperand(i: 0);
5334 MachineOperand &Src0 = MI.getOperand(i: 1);
5335 MachineOperand &Src1 = MI.getOperand(i: 2);
5336 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5337 if (Subtarget->hasScalarAddSub64()) {
5338 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5339 // clang-format off
5340 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest.getReg())
5341 .add(MO: Src0)
5342 .add(MO: Src1);
5343 // clang-format on
5344 } else {
5345 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5346 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5347
5348 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5349 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5350
5351 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5352 MI, MRI, SuperReg: Src0, SuperRC: BoolRC, SubIdx: AMDGPU::sub0, SubRC: &AMDGPU::SReg_32RegClass);
5353 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5354 MI, MRI, SuperReg: Src0, SuperRC: BoolRC, SubIdx: AMDGPU::sub1, SubRC: &AMDGPU::SReg_32RegClass);
5355
5356 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5357 MI, MRI, SuperReg: Src1, SuperRC: BoolRC, SubIdx: AMDGPU::sub0, SubRC: &AMDGPU::SReg_32RegClass);
5358 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5359 MI, MRI, SuperReg: Src1, SuperRC: BoolRC, SubIdx: AMDGPU::sub1, SubRC: &AMDGPU::SReg_32RegClass);
5360
5361 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5362 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5363 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: LoOpc), DestReg: DestSub0)
5364 .add(MO: Src0Sub0)
5365 .add(MO: Src1Sub0);
5366 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: HiOpc), DestReg: DestSub1)
5367 .add(MO: Src0Sub1)
5368 .add(MO: Src1Sub1);
5369 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: Dest.getReg())
5370 .addReg(RegNo: DestSub0)
5371 .addImm(Val: AMDGPU::sub0)
5372 .addReg(RegNo: DestSub1)
5373 .addImm(Val: AMDGPU::sub1);
5374 }
5375 MI.eraseFromParent();
5376 return BB;
5377 }
5378 case AMDGPU::V_ADD_U64_PSEUDO:
5379 case AMDGPU::V_SUB_U64_PSEUDO: {
5380 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5381 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5382 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5383 const DebugLoc &DL = MI.getDebugLoc();
5384
5385 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5386
5387 MachineOperand &Dest = MI.getOperand(i: 0);
5388 MachineOperand &Src0 = MI.getOperand(i: 1);
5389 MachineOperand &Src1 = MI.getOperand(i: 2);
5390
5391 if (IsAdd && ST.hasLshlAddU64Inst()) {
5392 auto Add = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_LSHL_ADD_U64_e64),
5393 DestReg: Dest.getReg())
5394 .add(MO: Src0)
5395 .addImm(Val: 0)
5396 .add(MO: Src1);
5397 TII->legalizeOperands(MI&: *Add);
5398 MI.eraseFromParent();
5399 return BB;
5400 }
5401
5402 const auto *CarryRC = TRI->getWaveMaskRegClass();
5403
5404 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5405 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5406
5407 Register CarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
5408 Register DeadCarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
5409
5410 const TargetRegisterClass *Src0RC = Src0.isReg()
5411 ? MRI.getRegClass(Reg: Src0.getReg())
5412 : &AMDGPU::VReg_64RegClass;
5413 const TargetRegisterClass *Src1RC = Src1.isReg()
5414 ? MRI.getRegClass(Reg: Src1.getReg())
5415 : &AMDGPU::VReg_64RegClass;
5416
5417 const TargetRegisterClass *Src0SubRC =
5418 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5419 const TargetRegisterClass *Src1SubRC =
5420 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5421
5422 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5423 MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
5424 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5425 MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
5426
5427 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5428 MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
5429 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5430 MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
5431
5432 unsigned LoOpc =
5433 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5434 MachineInstr *LoHalf = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: LoOpc), DestReg: DestSub0)
5435 .addReg(RegNo: CarryReg, flags: RegState::Define)
5436 .add(MO: SrcReg0Sub0)
5437 .add(MO: SrcReg1Sub0)
5438 .addImm(Val: 0); // clamp bit
5439
5440 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5441 MachineInstr *HiHalf =
5442 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: HiOpc), DestReg: DestSub1)
5443 .addReg(RegNo: DeadCarryReg, flags: RegState::Define | RegState::Dead)
5444 .add(MO: SrcReg0Sub1)
5445 .add(MO: SrcReg1Sub1)
5446 .addReg(RegNo: CarryReg, flags: RegState::Kill)
5447 .addImm(Val: 0); // clamp bit
5448
5449 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: Dest.getReg())
5450 .addReg(RegNo: DestSub0)
5451 .addImm(Val: AMDGPU::sub0)
5452 .addReg(RegNo: DestSub1)
5453 .addImm(Val: AMDGPU::sub1);
5454 TII->legalizeOperands(MI&: *LoHalf);
5455 TII->legalizeOperands(MI&: *HiHalf);
5456 MI.eraseFromParent();
5457 return BB;
5458 }
5459 case AMDGPU::S_ADD_CO_PSEUDO:
5460 case AMDGPU::S_SUB_CO_PSEUDO: {
5461 // This pseudo can only be selected from a uniform add/subcarry node, so all
5462 // of its VGPR operands are assumed to be splat vectors; any such operand is
5463 // read back to an SGPR with V_READFIRSTLANE below.
5464 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5465 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5466 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5467 MachineBasicBlock::iterator MII = MI;
5468 const DebugLoc &DL = MI.getDebugLoc();
5469 MachineOperand &Dest = MI.getOperand(i: 0);
5470 MachineOperand &CarryDest = MI.getOperand(i: 1);
5471 MachineOperand &Src0 = MI.getOperand(i: 2);
5472 MachineOperand &Src1 = MI.getOperand(i: 3);
5473 MachineOperand &Src2 = MI.getOperand(i: 4);
5474 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5475 ? AMDGPU::S_ADDC_U32
5476 : AMDGPU::S_SUBB_U32;
5477 if (Src0.isReg() && TRI->isVectorRegister(MRI, Reg: Src0.getReg())) {
5478 Register RegOp0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5479 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp0)
5480 .addReg(RegNo: Src0.getReg());
5481 Src0.setReg(RegOp0);
5482 }
5483 if (Src1.isReg() && TRI->isVectorRegister(MRI, Reg: Src1.getReg())) {
5484 Register RegOp1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5485 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp1)
5486 .addReg(RegNo: Src1.getReg());
5487 Src1.setReg(RegOp1);
5488 }
5489 Register RegOp2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5490 if (TRI->isVectorRegister(MRI, Reg: Src2.getReg())) {
5491 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp2)
5492 .addReg(RegNo: Src2.getReg());
5493 Src2.setReg(RegOp2);
5494 }
5495
5496 const TargetRegisterClass *Src2RC = MRI.getRegClass(Reg: Src2.getReg());
5497 unsigned WaveSize = TRI->getRegSizeInBits(RC: *Src2RC);
5498 assert(WaveSize == 64 || WaveSize == 32);
5499
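// Materialize the carry-in (Src2) into SCC by comparing it against zero so the
// S_ADDC_U32/S_SUBB_U32 built below can consume it.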
5500 if (WaveSize == 64) {
5501 if (ST.hasScalarCompareEq64()) {
5502 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U64))
5503 .addReg(RegNo: Src2.getReg())
5504 .addImm(Val: 0);
5505 } else {
5506 const TargetRegisterClass *SubRC =
5507 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5508 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5509 MI: MII, MRI, SuperReg: Src2, SuperRC: Src2RC, SubIdx: AMDGPU::sub0, SubRC);
5510 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5511 MI: MII, MRI, SuperReg: Src2, SuperRC: Src2RC, SubIdx: AMDGPU::sub1, SubRC);
5512 Register Src2_32 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5513
5514 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_OR_B32), DestReg: Src2_32)
5515 .add(MO: Src2Sub0)
5516 .add(MO: Src2Sub1);
5517
5518 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
5519 .addReg(RegNo: Src2_32, flags: RegState::Kill)
5520 .addImm(Val: 0);
5521 }
5522 } else {
5523 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
5524 .addReg(RegNo: Src2.getReg())
5525 .addImm(Val: 0);
5526 }
5527
5528 // clang-format off
5529 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest.getReg())
5530 .add(MO: Src0)
5531 .add(MO: Src1);
5532 // clang-format on
5533
5534 unsigned SelOpc =
5535 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5536
5537 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: SelOpc), DestReg: CarryDest.getReg())
5538 .addImm(Val: -1)
5539 .addImm(Val: 0);
5540
5541 MI.eraseFromParent();
5542 return BB;
5543 }
5544 case AMDGPU::SI_INIT_M0: {
5545 MachineOperand &M0Init = MI.getOperand(i: 0);
5546 BuildMI(BB&: *BB, I: MI.getIterator(), MIMD: MI.getDebugLoc(),
5547 MCID: TII->get(Opcode: M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
5548 DestReg: AMDGPU::M0)
5549 .add(MO: M0Init);
5550 MI.eraseFromParent();
5551 return BB;
5552 }
5553 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
5554 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
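// S_CMP_EQ_U32 0, 0 unconditionally sets SCC to 1.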
5555 BuildMI(BB&: *BB, I: MI.getIterator(), MIMD: MI.getDebugLoc(),
5556 MCID: TII->get(Opcode: AMDGPU::S_CMP_EQ_U32))
5557 .addImm(Val: 0)
5558 .addImm(Val: 0);
5559 return BB;
5560 }
5561 case AMDGPU::GET_GROUPSTATICSIZE: {
5562 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5563 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5564 DebugLoc DL = MI.getDebugLoc();
5565 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32))
5566 .add(MO: MI.getOperand(i: 0))
5567 .addImm(Val: MFI->getLDSSize());
5568 MI.eraseFromParent();
5569 return BB;
5570 }
5571 case AMDGPU::GET_SHADERCYCLESHILO: {
5572 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
5573 MachineRegisterInfo &MRI = MF->getRegInfo();
5574 const DebugLoc &DL = MI.getDebugLoc();
5575 // The algorithm is:
5576 //
5577 // hi1 = getreg(SHADER_CYCLES_HI)
5578 // lo1 = getreg(SHADER_CYCLES_LO)
5579 // hi2 = getreg(SHADER_CYCLES_HI)
5580 //
5581 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5582 // Otherwise there was overflow and the result is hi2:0. In both cases the
5583 // result should represent the actual time at some point during the sequence
5584 // of three getregs.
5585 using namespace AMDGPU::Hwreg;
5586 Register RegHi1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5587 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegHi1)
5588 .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES_HI, Values: 0, Values: 32));
5589 Register RegLo1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5590 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegLo1)
5591 .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES, Values: 0, Values: 32));
5592 Register RegHi2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5593 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegHi2)
5594 .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES_HI, Values: 0, Values: 32));
5595 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_EQ_U32))
5596 .addReg(RegNo: RegHi1)
5597 .addReg(RegNo: RegHi2);
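// SCC = (hi1 == hi2): select lo1 if the high half did not change between the
// two reads, otherwise 0.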
5598 Register RegLo = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5599 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: RegLo)
5600 .addReg(RegNo: RegLo1)
5601 .addImm(Val: 0);
5602 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE))
5603 .add(MO: MI.getOperand(i: 0))
5604 .addReg(RegNo: RegLo)
5605 .addImm(Val: AMDGPU::sub0)
5606 .addReg(RegNo: RegHi2)
5607 .addImm(Val: AMDGPU::sub1);
5608 MI.eraseFromParent();
5609 return BB;
5610 }
5611 case AMDGPU::SI_INDIRECT_SRC_V1:
5612 case AMDGPU::SI_INDIRECT_SRC_V2:
5613 case AMDGPU::SI_INDIRECT_SRC_V4:
5614 case AMDGPU::SI_INDIRECT_SRC_V8:
5615 case AMDGPU::SI_INDIRECT_SRC_V9:
5616 case AMDGPU::SI_INDIRECT_SRC_V10:
5617 case AMDGPU::SI_INDIRECT_SRC_V11:
5618 case AMDGPU::SI_INDIRECT_SRC_V12:
5619 case AMDGPU::SI_INDIRECT_SRC_V16:
5620 case AMDGPU::SI_INDIRECT_SRC_V32:
5621 return emitIndirectSrc(MI, MBB&: *BB, ST: *getSubtarget());
5622 case AMDGPU::SI_INDIRECT_DST_V1:
5623 case AMDGPU::SI_INDIRECT_DST_V2:
5624 case AMDGPU::SI_INDIRECT_DST_V4:
5625 case AMDGPU::SI_INDIRECT_DST_V8:
5626 case AMDGPU::SI_INDIRECT_DST_V9:
5627 case AMDGPU::SI_INDIRECT_DST_V10:
5628 case AMDGPU::SI_INDIRECT_DST_V11:
5629 case AMDGPU::SI_INDIRECT_DST_V12:
5630 case AMDGPU::SI_INDIRECT_DST_V16:
5631 case AMDGPU::SI_INDIRECT_DST_V32:
5632 return emitIndirectDst(MI, MBB&: *BB, ST: *getSubtarget());
5633 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5634 case AMDGPU::SI_KILL_I1_PSEUDO:
5635 return splitKillBlock(MI, BB);
5636 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5637 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5638 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5639 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5640
5641 Register Dst = MI.getOperand(i: 0).getReg();
5642 const MachineOperand &Src0 = MI.getOperand(i: 1);
5643 const MachineOperand &Src1 = MI.getOperand(i: 2);
5644 const DebugLoc &DL = MI.getDebugLoc();
5645 Register SrcCond = MI.getOperand(i: 3).getReg();
5646
5647 Register DstLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5648 Register DstHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5649 const auto *CondRC = TRI->getWaveMaskRegClass();
5650 Register SrcCondCopy = MRI.createVirtualRegister(RegClass: CondRC);
5651
5652 const TargetRegisterClass *Src0RC = Src0.isReg()
5653 ? MRI.getRegClass(Reg: Src0.getReg())
5654 : &AMDGPU::VReg_64RegClass;
5655 const TargetRegisterClass *Src1RC = Src1.isReg()
5656 ? MRI.getRegClass(Reg: Src1.getReg())
5657 : &AMDGPU::VReg_64RegClass;
5658
5659 const TargetRegisterClass *Src0SubRC =
5660 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5661 const TargetRegisterClass *Src1SubRC =
5662 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5663
5664 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5665 MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
5666 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5667 MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
5668
5669 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5670 MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
5671 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5672 MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
5673
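// Select each 32-bit half with the same condition mask, then recombine the
// halves into the 64-bit result.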
5674 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: SrcCondCopy).addReg(RegNo: SrcCond);
5675 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstLo)
5676 .addImm(Val: 0)
5677 .add(MO: Src0Sub0)
5678 .addImm(Val: 0)
5679 .add(MO: Src1Sub0)
5680 .addReg(RegNo: SrcCondCopy);
5681 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstHi)
5682 .addImm(Val: 0)
5683 .add(MO: Src0Sub1)
5684 .addImm(Val: 0)
5685 .add(MO: Src1Sub1)
5686 .addReg(RegNo: SrcCondCopy);
5687
5688 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
5689 .addReg(RegNo: DstLo)
5690 .addImm(Val: AMDGPU::sub0)
5691 .addReg(RegNo: DstHi)
5692 .addImm(Val: AMDGPU::sub1);
5693 MI.eraseFromParent();
5694 return BB;
5695 }
5696 case AMDGPU::SI_BR_UNDEF: {
5697 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5698 const DebugLoc &DL = MI.getDebugLoc();
5699 MachineInstr *Br = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
5700 .add(MO: MI.getOperand(i: 0));
5701 Br->getOperand(i: 1).setIsUndef(); // read undef SCC
5702 MI.eraseFromParent();
5703 return BB;
5704 }
5705 case AMDGPU::ADJCALLSTACKUP:
5706 case AMDGPU::ADJCALLSTACKDOWN: {
5707 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5708 MachineInstrBuilder MIB(*MF, &MI);
5709 MIB.addReg(RegNo: Info->getStackPtrOffsetReg(), flags: RegState::ImplicitDefine)
5710 .addReg(RegNo: Info->getStackPtrOffsetReg(), flags: RegState::Implicit);
5711 return BB;
5712 }
5713 case AMDGPU::SI_CALL_ISEL: {
5714 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5715 const DebugLoc &DL = MI.getDebugLoc();
5716
5717 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(MF: *MF);
5718
5719 MachineInstrBuilder MIB;
5720 MIB = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_CALL), DestReg: ReturnAddrReg);
5721
5722 for (const MachineOperand &MO : MI.operands())
5723 MIB.add(MO);
5724
5725 MIB.cloneMemRefs(OtherMI: MI);
5726 MI.eraseFromParent();
5727 return BB;
5728 }
5729 case AMDGPU::V_ADD_CO_U32_e32:
5730 case AMDGPU::V_SUB_CO_U32_e32:
5731 case AMDGPU::V_SUBREV_CO_U32_e32: {
5732 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5733 const DebugLoc &DL = MI.getDebugLoc();
5734 unsigned Opc = MI.getOpcode();
5735
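// If the e32 form has no MC equivalent on this subtarget, switch to the e64
// (VOP3) encoding, which takes an explicit carry-out definition and a clamp
// operand.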
5736 bool NeedClampOperand = false;
5737 if (TII->pseudoToMCOpcode(Opcode: Opc) == -1) {
5738 Opc = AMDGPU::getVOPe64(Opcode: Opc);
5739 NeedClampOperand = true;
5740 }
5741
5742 auto I = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: MI.getOperand(i: 0).getReg());
5743 if (TII->isVOP3(MI: *I)) {
5744 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5745 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5746 I.addReg(RegNo: TRI->getVCC(), flags: RegState::Define);
5747 }
5748 I.add(MO: MI.getOperand(i: 1)).add(MO: MI.getOperand(i: 2));
5749 if (NeedClampOperand)
5750 I.addImm(Val: 0); // clamp bit for e64 encoding
5751
5752 TII->legalizeOperands(MI&: *I);
5753
5754 MI.eraseFromParent();
5755 return BB;
5756 }
5757 case AMDGPU::V_ADDC_U32_e32:
5758 case AMDGPU::V_SUBB_U32_e32:
5759 case AMDGPU::V_SUBBREV_U32_e32:
5760 // These instructions have an implicit use of vcc which counts towards the
5761 // constant bus limit.
5762 TII->legalizeOperands(MI);
5763 return BB;
5764 case AMDGPU::DS_GWS_INIT:
5765 case AMDGPU::DS_GWS_SEMA_BR:
5766 case AMDGPU::DS_GWS_BARRIER:
5767 TII->enforceOperandRCAlignment(MI, OpName: AMDGPU::OpName::data0);
5768 [[fallthrough]];
5769 case AMDGPU::DS_GWS_SEMA_V:
5770 case AMDGPU::DS_GWS_SEMA_P:
5771 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5772 // An s_waitcnt 0 is required to be the instruction immediately following.
5773 if (getSubtarget()->hasGWSAutoReplay()) {
5774 bundleInstWithWaitcnt(MI);
5775 return BB;
5776 }
5777
5778 return emitGWSMemViolTestLoop(MI, BB);
5779 case AMDGPU::S_SETREG_B32: {
5780 // Try to optimize cases that only set the denormal mode or rounding mode.
5781 //
5782 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5783 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5784 // instead.
5785 //
5786 // FIXME: This could be predicated on the immediate, but tablegen doesn't
5787 // allow you to have a no-side-effect instruction in the output of a
5788 // side-effecting pattern.
5789 auto [ID, Offset, Width] =
5790 AMDGPU::Hwreg::HwregEncoding::decode(Encoded: MI.getOperand(i: 1).getImm());
5791 if (ID != AMDGPU::Hwreg::ID_MODE)
5792 return BB;
5793
5794 const unsigned WidthMask = maskTrailingOnes<unsigned>(N: Width);
5795 const unsigned SetMask = WidthMask << Offset;
5796
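// Within the MODE register the FP round bits occupy [3:0] and the FP denorm
// bits occupy [7:4], which is why the immediate is masked with 0xf and shifted
// by 4 when the dedicated instructions are emitted below.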
5797 if (getSubtarget()->hasDenormModeInst()) {
5798 unsigned SetDenormOp = 0;
5799 unsigned SetRoundOp = 0;
5800
5801 // The dedicated instructions can only set the whole denorm or round mode
5802 // at once, not a subset of bits in either.
5803 if (SetMask ==
5804 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5805 // If this fully sets both the round and denorm mode, emit the two
5806 // dedicated instructions for these.
5807 SetRoundOp = AMDGPU::S_ROUND_MODE;
5808 SetDenormOp = AMDGPU::S_DENORM_MODE;
5809 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5810 SetRoundOp = AMDGPU::S_ROUND_MODE;
5811 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5812 SetDenormOp = AMDGPU::S_DENORM_MODE;
5813 }
5814
5815 if (SetRoundOp || SetDenormOp) {
5816 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5817 MachineInstr *Def = MRI.getVRegDef(Reg: MI.getOperand(i: 0).getReg());
5818 if (Def && Def->isMoveImmediate() && Def->getOperand(i: 1).isImm()) {
5819 unsigned ImmVal = Def->getOperand(i: 1).getImm();
5820 if (SetRoundOp) {
5821 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: SetRoundOp))
5822 .addImm(Val: ImmVal & 0xf);
5823
5824 // If we also have the denorm mode, get just the denorm mode bits.
5825 ImmVal >>= 4;
5826 }
5827
5828 if (SetDenormOp) {
5829 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: SetDenormOp))
5830 .addImm(Val: ImmVal & 0xf);
5831 }
5832
5833 MI.eraseFromParent();
5834 return BB;
5835 }
5836 }
5837 }
5838
5839 // If only FP bits are touched, use the no-side-effects pseudo.
5840 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5841 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5842 MI.setDesc(TII->get(Opcode: AMDGPU::S_SETREG_B32_mode));
5843
5844 return BB;
5845 }
5846 case AMDGPU::S_INVERSE_BALLOT_U32:
5847 case AMDGPU::S_INVERSE_BALLOT_U64:
5848 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5849 // necessary. After that they are equivalent to a COPY.
5850 MI.setDesc(TII->get(Opcode: AMDGPU::COPY));
5851 return BB;
5852 case AMDGPU::ENDPGM_TRAP: {
5853 const DebugLoc &DL = MI.getDebugLoc();
5854 if (BB->succ_empty() && std::next(x: MI.getIterator()) == BB->end()) {
5855 MI.setDesc(TII->get(Opcode: AMDGPU::S_ENDPGM));
5856 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0));
5857 return BB;
5858 }
5859
5860 // We need a block split to make the real endpgm a terminator. We also don't
5861 // want to break phis in successor blocks, so we can't just delete to the
5862 // end of the block.
5863
5864 MachineBasicBlock *SplitBB = BB->splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
5865 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
5866 MF->push_back(MBB: TrapBB);
5867 // clang-format off
5868 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ENDPGM))
5869 .addImm(Val: 0);
5870 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
5871 .addMBB(MBB: TrapBB);
5872 // clang-format on
5873
5874 BB->addSuccessor(Succ: TrapBB);
5875 MI.eraseFromParent();
5876 return SplitBB;
5877 }
5878 case AMDGPU::SIMULATED_TRAP: {
5879 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5880 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5881 MachineBasicBlock *SplitBB =
5882 TII->insertSimulatedTrap(MRI, MBB&: *BB, MI, DL: MI.getDebugLoc());
5883 MI.eraseFromParent();
5884 return SplitBB;
5885 }
5886 default:
5887 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5888 if (!MI.mayStore())
5889 AddMemOpInit(MI);
5890 return BB;
5891 }
5892 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, MBB: BB);
5893 }
5894}
5895
5896bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
5897 // This currently forces unfolding various combinations of fsub into fma with
5898 // free fneg'd operands. As long as we have fast FMA (controlled by
5899 // isFMAFasterThanFMulAndFAdd), we should perform these.
5900
5901 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5902 // most of these combines appear to be cycle neutral but save on instruction
5903 // count / code size.
5904 return true;
5905}
5906
5907bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
5908
5909EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
5910 EVT VT) const {
5911 if (!VT.isVector()) {
5912 return MVT::i1;
5913 }
5914 return EVT::getVectorVT(Context&: Ctx, VT: MVT::i1, NumElements: VT.getVectorNumElements());
5915}
5916
5917MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
5918 // TODO: Should i16 be used always if legal? For now it would force VALU
5919 // shifts.
5920 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5921}
5922
5923LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
5924 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5925 ? Ty.changeElementSize(NewEltSize: 16)
5926 : Ty.changeElementSize(NewEltSize: 32);
5927}
5928
5929 // Answering this is somewhat tricky and depends on the specific device, since
5930 // different devices have different rates for fma and for f64 operations.
5931//
5932// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5933// regardless of which device (although the number of cycles differs between
5934// devices), so it is always profitable for f64.
5935//
5936// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5937// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5938// which we can always do even without fused FP ops since it returns the same
5939// result as the separate operations and since it is always full
5940// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5941// however does not support denormals, so we do report fma as faster if we have
5942// a fast fma device and require denormals.
5943//
5944bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5945 EVT VT) const {
5946 VT = VT.getScalarType();
5947
5948 switch (VT.getSimpleVT().SimpleTy) {
5949 case MVT::f32: {
5950 // If mad is not available this depends only on if f32 fma is full rate.
5951 if (!Subtarget->hasMadMacF32Insts())
5952 return Subtarget->hasFastFMAF32();
5953
5954 // Otherwise f32 mad is always full rate and returns the same result as
5955 // the separate operations, so it should be preferred over fma.
5956 // However, it does not support denormals.
5957 if (!denormalModeIsFlushAllF32(MF))
5958 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5959
5960 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5961 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5962 }
5963 case MVT::f64:
5964 return true;
5965 case MVT::f16:
5966 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5967 default:
5968 break;
5969 }
5970
5971 return false;
5972}
5973
5974bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5975 LLT Ty) const {
5976 switch (Ty.getScalarSizeInBits()) {
5977 case 16:
5978 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f16);
5979 case 32:
5980 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f32);
5981 case 64:
5982 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f64);
5983 default:
5984 break;
5985 }
5986
5987 return false;
5988}
5989
5990bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
5991 if (!Ty.isScalar())
5992 return false;
5993
5994 if (Ty.getScalarSizeInBits() == 16)
5995 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(MF: *MI.getMF());
5996 if (Ty.getScalarSizeInBits() == 32)
5997 return Subtarget->hasMadMacF32Insts() &&
5998 denormalModeIsFlushAllF32(MF: *MI.getMF());
5999
6000 return false;
6001}
6002
6003bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
6004 const SDNode *N) const {
6005 // TODO: Check future ftz flag
6006 // v_mad_f32/v_mac_f32 do not support denormals.
6007 EVT VT = N->getValueType(ResNo: 0);
6008 if (VT == MVT::f32)
6009 return Subtarget->hasMadMacF32Insts() &&
6010 denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
6011 if (VT == MVT::f16) {
6012 return Subtarget->hasMadF16() &&
6013 denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction());
6014 }
6015
6016 return false;
6017}
6018
6019//===----------------------------------------------------------------------===//
6020// Custom DAG Lowering Operations
6021//===----------------------------------------------------------------------===//
6022
6023// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6024// wider vector type is legal.
6025SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
6026 SelectionDAG &DAG) const {
6027 unsigned Opc = Op.getOpcode();
6028 EVT VT = Op.getValueType();
6029 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6030 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6031 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6032 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6033
6034 auto [Lo, Hi] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
6035
6036 SDLoc SL(Op);
6037 SDValue OpLo = DAG.getNode(Opcode: Opc, DL: SL, VT: Lo.getValueType(), Operand: Lo, Flags: Op->getFlags());
6038 SDValue OpHi = DAG.getNode(Opcode: Opc, DL: SL, VT: Hi.getValueType(), Operand: Hi, Flags: Op->getFlags());
6039
6040 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
6041}
6042
6043// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6044// wider vector type is legal.
6045SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
6046 SelectionDAG &DAG) const {
6047 unsigned Opc = Op.getOpcode();
6048 EVT VT = Op.getValueType();
6049 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6050 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6051 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6052 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6053 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6054 VT == MVT::v32bf16);
6055
6056 auto [Lo0, Hi0] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
6057 auto [Lo1, Hi1] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 1);
6058
6059 SDLoc SL(Op);
6060
6061 SDValue OpLo =
6062 DAG.getNode(Opcode: Opc, DL: SL, VT: Lo0.getValueType(), N1: Lo0, N2: Lo1, Flags: Op->getFlags());
6063 SDValue OpHi =
6064 DAG.getNode(Opcode: Opc, DL: SL, VT: Hi0.getValueType(), N1: Hi0, N2: Hi1, Flags: Op->getFlags());
6065
6066 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
6067}
6068
6069SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6070 SelectionDAG &DAG) const {
6071 unsigned Opc = Op.getOpcode();
6072 EVT VT = Op.getValueType();
6073 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6074 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6075 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6076 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6077 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6078 VT == MVT::v32bf16);
6079
6080 SDValue Op0 = Op.getOperand(i: 0);
6081 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6082 ? DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0)
6083 : std::pair(Op0, Op0);
6084
6085 auto [Lo1, Hi1] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 1);
6086 auto [Lo2, Hi2] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 2);
6087
6088 SDLoc SL(Op);
6089 auto ResVT = DAG.GetSplitDestVTs(VT);
6090
6091 SDValue OpLo =
6092 DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.first, N1: Lo0, N2: Lo1, N3: Lo2, Flags: Op->getFlags());
6093 SDValue OpHi =
6094 DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.second, N1: Hi0, N2: Hi1, N3: Hi2, Flags: Op->getFlags());
6095
6096 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
6097}
6098
6099SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6100 switch (Op.getOpcode()) {
6101 default:
6102 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6103 case ISD::BRCOND:
6104 return LowerBRCOND(Op, DAG);
6105 case ISD::RETURNADDR:
6106 return LowerRETURNADDR(Op, DAG);
6107 case ISD::LOAD: {
6108 SDValue Result = LowerLOAD(Op, DAG);
6109 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6110 "Load should return a value and a chain");
6111 return Result;
6112 }
6113 case ISD::FSQRT: {
6114 EVT VT = Op.getValueType();
6115 if (VT == MVT::f32)
6116 return lowerFSQRTF32(Op, DAG);
6117 if (VT == MVT::f64)
6118 return lowerFSQRTF64(Op, DAG);
6119 return SDValue();
6120 }
6121 case ISD::FSIN:
6122 case ISD::FCOS:
6123 return LowerTrig(Op, DAG);
6124 case ISD::SELECT:
6125 return LowerSELECT(Op, DAG);
6126 case ISD::FDIV:
6127 return LowerFDIV(Op, DAG);
6128 case ISD::FFREXP:
6129 return LowerFFREXP(Op, DAG);
6130 case ISD::ATOMIC_CMP_SWAP:
6131 return LowerATOMIC_CMP_SWAP(Op, DAG);
6132 case ISD::STORE:
6133 return LowerSTORE(Op, DAG);
6134 case ISD::GlobalAddress: {
6135 MachineFunction &MF = DAG.getMachineFunction();
6136 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6137 return LowerGlobalAddress(MFI, Op, DAG);
6138 }
6139 case ISD::INTRINSIC_WO_CHAIN:
6140 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6141 case ISD::INTRINSIC_W_CHAIN:
6142 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6143 case ISD::INTRINSIC_VOID:
6144 return LowerINTRINSIC_VOID(Op, DAG);
6145 case ISD::ADDRSPACECAST:
6146 return lowerADDRSPACECAST(Op, DAG);
6147 case ISD::INSERT_SUBVECTOR:
6148 return lowerINSERT_SUBVECTOR(Op, DAG);
6149 case ISD::INSERT_VECTOR_ELT:
6150 return lowerINSERT_VECTOR_ELT(Op, DAG);
6151 case ISD::EXTRACT_VECTOR_ELT:
6152 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6153 case ISD::VECTOR_SHUFFLE:
6154 return lowerVECTOR_SHUFFLE(Op, DAG);
6155 case ISD::SCALAR_TO_VECTOR:
6156 return lowerSCALAR_TO_VECTOR(Op, DAG);
6157 case ISD::BUILD_VECTOR:
6158 return lowerBUILD_VECTOR(Op, DAG);
6159 case ISD::FP_ROUND:
6160 case ISD::STRICT_FP_ROUND:
6161 return lowerFP_ROUND(Op, DAG);
6162 case ISD::TRAP:
6163 return lowerTRAP(Op, DAG);
6164 case ISD::DEBUGTRAP:
6165 return lowerDEBUGTRAP(Op, DAG);
6166 case ISD::ABS:
6167 case ISD::FABS:
6168 case ISD::FNEG:
6169 case ISD::FCANONICALIZE:
6170 case ISD::BSWAP:
6171 return splitUnaryVectorOp(Op, DAG);
6172 case ISD::FMINNUM:
6173 case ISD::FMAXNUM:
6174 return lowerFMINNUM_FMAXNUM(Op, DAG);
6175 case ISD::FMINIMUMNUM:
6176 case ISD::FMAXIMUMNUM:
6177 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6178 case ISD::FMINIMUM:
6179 case ISD::FMAXIMUM:
6180 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6181 case ISD::FLDEXP:
6182 case ISD::STRICT_FLDEXP:
6183 return lowerFLDEXP(Op, DAG);
6184 case ISD::FMA:
6185 return splitTernaryVectorOp(Op, DAG);
6186 case ISD::FP_TO_SINT:
6187 case ISD::FP_TO_UINT:
6188 return LowerFP_TO_INT(Op, DAG);
6189 case ISD::SHL:
6190 case ISD::SRA:
6191 case ISD::SRL:
6192 case ISD::ADD:
6193 case ISD::SUB:
6194 case ISD::SMIN:
6195 case ISD::SMAX:
6196 case ISD::UMIN:
6197 case ISD::UMAX:
6198 case ISD::FADD:
6199 case ISD::FMUL:
6200 case ISD::FMINNUM_IEEE:
6201 case ISD::FMAXNUM_IEEE:
6202 case ISD::UADDSAT:
6203 case ISD::USUBSAT:
6204 case ISD::SADDSAT:
6205 case ISD::SSUBSAT:
6206 return splitBinaryVectorOp(Op, DAG);
6207 case ISD::FCOPYSIGN:
6208 return lowerFCOPYSIGN(Op, DAG);
6209 case ISD::MUL:
6210 return lowerMUL(Op, DAG);
6211 case ISD::SMULO:
6212 case ISD::UMULO:
6213 return lowerXMULO(Op, DAG);
6214 case ISD::SMUL_LOHI:
6215 case ISD::UMUL_LOHI:
6216 return lowerXMUL_LOHI(Op, DAG);
6217 case ISD::DYNAMIC_STACKALLOC:
6218 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6219 case ISD::STACKSAVE:
6220 return LowerSTACKSAVE(Op, DAG);
6221 case ISD::GET_ROUNDING:
6222 return lowerGET_ROUNDING(Op, DAG);
6223 case ISD::SET_ROUNDING:
6224 return lowerSET_ROUNDING(Op, DAG);
6225 case ISD::PREFETCH:
6226 return lowerPREFETCH(Op, DAG);
6227 case ISD::FP_EXTEND:
6228 case ISD::STRICT_FP_EXTEND:
6229 return lowerFP_EXTEND(Op, DAG);
6230 case ISD::GET_FPENV:
6231 return lowerGET_FPENV(Op, DAG);
6232 case ISD::SET_FPENV:
6233 return lowerSET_FPENV(Op, DAG);
6234 }
6235 return SDValue();
6236}
6237
6238 // Used for D16: Casts the result of an instruction into the right vector and
6239 // packs the values if the load returned unpacked values.
6240static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
6241 const SDLoc &DL, SelectionDAG &DAG,
6242 bool Unpacked) {
6243 if (!LoadVT.isVector())
6244 return Result;
6245
6246 // Cast back to the original packed type or to a larger type that is a
6247 // multiple of 32 bits for D16. Widening the return type is required for
6248 // legalization.
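// For example, an illegal v3f16 result is widened to v4f16 here; on unpacked
// subtargets the raw i32 elements are truncated to i16, padded, and bitcast
// back below.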
6249 EVT FittingLoadVT = LoadVT;
6250 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6251 FittingLoadVT =
6252 EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(),
6253 NumElements: LoadVT.getVectorNumElements() + 1);
6254 }
6255
6256 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6257 // Truncate to v2i16/v4i16.
6258 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6259
6260 // Work around the legalizer not scalarizing truncate after vector op
6261 // legalization by not creating an intermediate vector trunc.
6262 SmallVector<SDValue, 4> Elts;
6263 DAG.ExtractVectorElements(Op: Result, Args&: Elts);
6264 for (SDValue &Elt : Elts)
6265 Elt = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Elt);
6266
6267 // Pad illegal v1i16/v3f16 to v4i16
6268 if ((LoadVT.getVectorNumElements() % 2) == 1)
6269 Elts.push_back(Elt: DAG.getPOISON(VT: MVT::i16));
6270
6271 Result = DAG.getBuildVector(VT: IntLoadVT, DL, Ops: Elts);
6272
6273 // Bitcast to original type (v2f16/v4f16).
6274 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result);
6275 }
6276
6277 // Cast back to the original packed type.
6278 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result);
6279}
6280
6281SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6282 SelectionDAG &DAG,
6283 ArrayRef<SDValue> Ops,
6284 bool IsIntrinsic) const {
6285 SDLoc DL(M);
6286
6287 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6288 EVT LoadVT = M->getValueType(ResNo: 0);
6289
6290 EVT EquivLoadVT = LoadVT;
6291 if (LoadVT.isVector()) {
6292 if (Unpacked) {
6293 EquivLoadVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
6294 NumElements: LoadVT.getVectorNumElements());
6295 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6296 // Widen v3f16 to legal type
6297 EquivLoadVT =
6298 EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(),
6299 NumElements: LoadVT.getVectorNumElements() + 1);
6300 }
6301 }
6302
6303 // Change from v4f16/v2f16 to EquivLoadVT.
6304 SDVTList VTList = DAG.getVTList(VT1: EquivLoadVT, VT2: MVT::Other);
6305
6306 SDValue Load = DAG.getMemIntrinsicNode(
6307 Opcode: IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, dl: DL, VTList, Ops,
6308 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
6309
6310 SDValue Adjusted = adjustLoadValueTypeImpl(Result: Load, LoadVT, DL, DAG, Unpacked);
6311
6312 return DAG.getMergeValues(Ops: {Adjusted, Load.getValue(R: 1)}, dl: DL);
6313}
6314
6315SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6316 SelectionDAG &DAG,
6317 ArrayRef<SDValue> Ops) const {
6318 SDLoc DL(M);
6319 EVT LoadVT = M->getValueType(ResNo: 0);
6320 EVT EltType = LoadVT.getScalarType();
6321 EVT IntVT = LoadVT.changeTypeToInteger();
6322
6323 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6324
6325 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6326 bool IsTFE = M->getNumValues() == 3;
6327
6328 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6329 : AMDGPUISD::BUFFER_LOAD_FORMAT)
6330 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6331 : AMDGPUISD::BUFFER_LOAD;
6332
6333 if (IsD16) {
6334 return adjustLoadValueType(Opcode: AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6335 }
6336
6337 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6338 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6339 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, MMO: M->getMemOperand(),
6340 IsTFE);
6341
6342 if (isTypeLegal(VT: LoadVT)) {
6343 return getMemIntrinsicNode(Opcode: Opc, DL, VTList: M->getVTList(), Ops, MemVT: IntVT,
6344 MMO: M->getMemOperand(), DAG);
6345 }
6346
6347 EVT CastVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: LoadVT);
6348 SDVTList VTList = DAG.getVTList(VT1: CastVT, VT2: MVT::Other);
6349 SDValue MemNode = getMemIntrinsicNode(Opcode: Opc, DL, VTList, Ops, MemVT: CastVT,
6350 MMO: M->getMemOperand(), DAG);
6351 return DAG.getMergeValues(
6352 Ops: {DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: MemNode), MemNode.getValue(R: 1)},
6353 dl: DL);
6354}
6355
6356static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
6357 SelectionDAG &DAG) {
6358 EVT VT = N->getValueType(ResNo: 0);
6359 unsigned CondCode = N->getConstantOperandVal(Num: 3);
6360 if (!ICmpInst::isIntPredicate(P: static_cast<ICmpInst::Predicate>(CondCode)))
6361 return DAG.getPOISON(VT);
6362
6363 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6364
6365 SDValue LHS = N->getOperand(Num: 1);
6366 SDValue RHS = N->getOperand(Num: 2);
6367
6368 SDLoc DL(N);
6369
6370 EVT CmpVT = LHS.getValueType();
6371 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(VT: MVT::i16)) {
6372 unsigned PromoteOp =
6373 ICmpInst::isSigned(predicate: IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6374 LHS = DAG.getNode(Opcode: PromoteOp, DL, VT: MVT::i32, Operand: LHS);
6375 RHS = DAG.getNode(Opcode: PromoteOp, DL, VT: MVT::i32, Operand: RHS);
6376 }
6377
6378 ISD::CondCode CCOpcode = getICmpCondCode(Pred: IcInput);
6379
6380 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6381 EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize);
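// The compare result is a lane mask, i.e. i32 for wave32 and i64 for wave64.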
6382
6383 SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL, VT: CCVT, N1: LHS, N2: RHS,
6384 N3: DAG.getCondCode(Cond: CCOpcode));
6385 if (VT.bitsEq(VT: CCVT))
6386 return SetCC;
6387 return DAG.getZExtOrTrunc(Op: SetCC, DL, VT);
6388}
6389
6390static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
6391 SelectionDAG &DAG) {
6392 EVT VT = N->getValueType(ResNo: 0);
6393
6394 unsigned CondCode = N->getConstantOperandVal(Num: 3);
6395 if (!FCmpInst::isFPPredicate(P: static_cast<FCmpInst::Predicate>(CondCode)))
6396 return DAG.getPOISON(VT);
6397
6398 SDValue Src0 = N->getOperand(Num: 1);
6399 SDValue Src1 = N->getOperand(Num: 2);
6400 EVT CmpVT = Src0.getValueType();
6401 SDLoc SL(N);
6402
6403 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(VT: CmpVT)) {
6404 Src0 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src0);
6405 Src1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src1);
6406 }
6407
6408 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6409 ISD::CondCode CCOpcode = getFCmpCondCode(Pred: IcInput);
6410 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6411 EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize);
6412 SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT: CCVT, N1: Src0, N2: Src1,
6413 N3: DAG.getCondCode(Cond: CCOpcode));
6414 if (VT.bitsEq(VT: CCVT))
6415 return SetCC;
6416 return DAG.getZExtOrTrunc(Op: SetCC, DL: SL, VT);
6417}
6418
6419static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
6420 SelectionDAG &DAG) {
6421 EVT VT = N->getValueType(ResNo: 0);
6422 SDValue Src = N->getOperand(Num: 1);
6423 SDLoc SL(N);
6424
6425 if (Src.getOpcode() == ISD::SETCC) {
6426 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6427 return DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT, N1: Src.getOperand(i: 0),
6428 N2: Src.getOperand(i: 1), N3: Src.getOperand(i: 2));
6429 }
6430 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Val&: Src)) {
6431 // (ballot 0) -> 0
6432 if (Arg->isZero())
6433 return DAG.getConstant(Val: 0, DL: SL, VT);
6434
6435 // (ballot 1) -> EXEC/EXEC_LO
6436 if (Arg->isOne()) {
6437 Register Exec;
6438 if (VT.getScalarSizeInBits() == 32)
6439 Exec = AMDGPU::EXEC_LO;
6440 else if (VT.getScalarSizeInBits() == 64)
6441 Exec = AMDGPU::EXEC;
6442 else
6443 return SDValue();
6444
6445 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: Exec, VT);
6446 }
6447 }
6448
6449 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6450 // ISD::SETNE)
6451 return DAG.getNode(
6452 Opcode: AMDGPUISD::SETCC, DL: SL, VT, N1: DAG.getZExtOrTrunc(Op: Src, DL: SL, VT: MVT::i32),
6453 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), N3: DAG.getCondCode(Cond: ISD::SETNE));
6454}
6455
6456static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
6457 SelectionDAG &DAG) {
6458 EVT VT = N->getValueType(ResNo: 0);
6459 unsigned ValSize = VT.getSizeInBits();
6460 unsigned IID = N->getConstantOperandVal(Num: 0);
6461 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6462 IID == Intrinsic::amdgcn_permlanex16;
6463 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6464 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6465 SDLoc SL(N);
6466 MVT IntVT = MVT::getIntegerVT(BitWidth: ValSize);
6467 const GCNSubtarget *ST = TLI.getSubtarget();
6468 unsigned SplitSize = 32;
6469 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
6470 ST->hasDPALU_DPP() &&
6471 AMDGPU::isLegalDPALU_DPPControl(DC: N->getConstantOperandVal(Num: 3)))
6472 SplitSize = 64;
6473
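// Lane ops are legal on 32-bit values (or 64-bit values with DPALU DPP);
// smaller types are extended to i32 and wider types are split into SplitSize
// pieces below.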
6474 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6475 SDValue Src2, MVT ValT) -> SDValue {
6476 SmallVector<SDValue, 8> Operands;
6477 switch (IID) {
6478 case Intrinsic::amdgcn_permlane16:
6479 case Intrinsic::amdgcn_permlanex16:
6480 case Intrinsic::amdgcn_update_dpp:
6481 Operands.push_back(Elt: N->getOperand(Num: 6));
6482 Operands.push_back(Elt: N->getOperand(Num: 5));
6483 Operands.push_back(Elt: N->getOperand(Num: 4));
6484 [[fallthrough]];
6485 case Intrinsic::amdgcn_writelane:
6486 Operands.push_back(Elt: Src2);
6487 [[fallthrough]];
6488 case Intrinsic::amdgcn_readlane:
6489 case Intrinsic::amdgcn_set_inactive:
6490 case Intrinsic::amdgcn_set_inactive_chain_arg:
6491 case Intrinsic::amdgcn_mov_dpp8:
6492 Operands.push_back(Elt: Src1);
6493 [[fallthrough]];
6494 case Intrinsic::amdgcn_readfirstlane:
6495 case Intrinsic::amdgcn_permlane64:
6496 Operands.push_back(Elt: Src0);
6497 break;
6498 default:
6499 llvm_unreachable("unhandled lane op");
6500 }
6501
6502 Operands.push_back(Elt: DAG.getTargetConstant(Val: IID, DL: SL, VT: MVT::i32));
6503 std::reverse(first: Operands.begin(), last: Operands.end());
6504
6505 if (SDNode *GL = N->getGluedNode()) {
6506 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6507 GL = GL->getOperand(Num: 0).getNode();
6508 Operands.push_back(Elt: DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL: SL, VT: MVT::Glue,
6509 Operand: SDValue(GL, 0)));
6510 }
6511
6512 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: ValT, Ops: Operands);
6513 };
6514
6515 SDValue Src0 = N->getOperand(Num: 1);
6516 SDValue Src1, Src2;
6517 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6518 IID == Intrinsic::amdgcn_mov_dpp8 ||
6519 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6520 Src1 = N->getOperand(Num: 2);
6521 if (IID == Intrinsic::amdgcn_writelane ||
6522 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
6523 Src2 = N->getOperand(Num: 3);
6524 }
6525
6526 if (ValSize == SplitSize) {
6527 // Already legal
6528 return SDValue();
6529 }
6530
6531 if (ValSize < 32) {
6532 bool IsFloat = VT.isFloatingPoint();
6533 Src0 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src0) : Src0,
6534 DL: SL, VT: MVT::i32);
6535
6536 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6537 Src1 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src1) : Src1,
6538 DL: SL, VT: MVT::i32);
6539 }
6540
6541 if (IID == Intrinsic::amdgcn_writelane) {
6542 Src2 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src2) : Src2,
6543 DL: SL, VT: MVT::i32);
6544 }
6545
6546 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6547 SDValue Trunc = DAG.getAnyExtOrTrunc(Op: LaneOp, DL: SL, VT: IntVT);
6548 return IsFloat ? DAG.getBitcast(VT, V: Trunc) : Trunc;
6549 }
6550
6551 if (ValSize % SplitSize != 0)
6552 return SDValue();
6553
6554 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6555 EVT VT = N->getValueType(ResNo: 0);
6556 unsigned NE = VT.getVectorNumElements();
6557 EVT EltVT = VT.getVectorElementType();
6558 SmallVector<SDValue, 8> Scalars;
6559 unsigned NumOperands = N->getNumOperands();
6560 SmallVector<SDValue, 4> Operands(NumOperands);
6561 SDNode *GL = N->getGluedNode();
6562
6563 // only handle convergencectrl_glue
6564 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6565
6566 for (unsigned i = 0; i != NE; ++i) {
6567 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6568 ++j) {
6569 SDValue Operand = N->getOperand(Num: j);
6570 EVT OperandVT = Operand.getValueType();
6571 if (OperandVT.isVector()) {
6572 // A vector operand; extract a single element.
6573 EVT OperandEltVT = OperandVT.getVectorElementType();
6574 Operands[j] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: OperandEltVT,
6575 N1: Operand, N2: DAG.getVectorIdxConstant(Val: i, DL: SL));
6576 } else {
6577 // A scalar operand; just use it as is.
6578 Operands[j] = Operand;
6579 }
6580 }
6581
6582 if (GL)
6583 Operands[NumOperands - 1] =
6584 DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL: SL, VT: MVT::Glue,
6585 Operand: SDValue(GL->getOperand(Num: 0).getNode(), 0));
6586
6587 Scalars.push_back(Elt: DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: EltVT, Ops: Operands));
6588 }
6589
6590 EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NE);
6591 return DAG.getBuildVector(VT: VecVT, DL: SL, Ops: Scalars);
6592 };
6593
6594 if (VT.isVector()) {
6595 switch (MVT::SimpleValueType EltTy =
6596 VT.getVectorElementType().getSimpleVT().SimpleTy) {
6597 case MVT::i32:
6598 case MVT::f32:
6599 if (SplitSize == 32) {
6600 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6601 return unrollLaneOp(LaneOp.getNode());
6602 }
6603 [[fallthrough]];
6604 case MVT::i16:
6605 case MVT::f16:
6606 case MVT::bf16: {
6607 unsigned SubVecNumElt =
6608 SplitSize / VT.getVectorElementType().getSizeInBits();
6609 MVT SubVecVT = MVT::getVectorVT(VT: EltTy, NumElements: SubVecNumElt);
6610 SmallVector<SDValue, 4> Pieces;
6611 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6612 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
6613 Src0SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src0,
6614 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
6615
6616 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
6617 IsPermLane16)
6618 Src1SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src1,
6619 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
6620
6621 if (IID == Intrinsic::amdgcn_writelane)
6622 Src2SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src2,
6623 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
6624
6625 Pieces.push_back(
6626 Elt: IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
6627 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6628 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6629 EltIdx += SubVecNumElt;
6630 }
6631 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT, Ops: Pieces);
6632 }
6633 default:
6634 // Handle all other cases by bitcasting to i32 vectors
6635 break;
6636 }
6637 }
6638
6639 MVT VecVT =
6640 MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: SplitSize), NumElements: ValSize / SplitSize);
6641 Src0 = DAG.getBitcast(VT: VecVT, V: Src0);
6642
6643 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6644 Src1 = DAG.getBitcast(VT: VecVT, V: Src1);
6645
6646 if (IID == Intrinsic::amdgcn_writelane)
6647 Src2 = DAG.getBitcast(VT: VecVT, V: Src2);
6648
6649 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6650 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6651 return DAG.getBitcast(VT, V: UnrolledLaneOp);
6652}
6653
6654void SITargetLowering::ReplaceNodeResults(SDNode *N,
6655 SmallVectorImpl<SDValue> &Results,
6656 SelectionDAG &DAG) const {
6657 switch (N->getOpcode()) {
6658 case ISD::INSERT_VECTOR_ELT: {
6659 if (SDValue Res = lowerINSERT_VECTOR_ELT(Op: SDValue(N, 0), DAG))
6660 Results.push_back(Elt: Res);
6661 return;
6662 }
6663 case ISD::EXTRACT_VECTOR_ELT: {
6664 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(Op: SDValue(N, 0), DAG))
6665 Results.push_back(Elt: Res);
6666 return;
6667 }
6668 case ISD::INTRINSIC_WO_CHAIN: {
6669 unsigned IID = N->getConstantOperandVal(Num: 0);
6670 switch (IID) {
6671 case Intrinsic::amdgcn_make_buffer_rsrc:
6672 Results.push_back(Elt: lowerPointerAsRsrcIntrin(Op: N, DAG));
6673 return;
6674 case Intrinsic::amdgcn_cvt_pkrtz: {
6675 SDValue Src0 = N->getOperand(Num: 1);
6676 SDValue Src1 = N->getOperand(Num: 2);
6677 SDLoc SL(N);
6678 SDValue Cvt =
6679 DAG.getNode(Opcode: AMDGPUISD::CVT_PKRTZ_F16_F32, DL: SL, VT: MVT::i32, N1: Src0, N2: Src1);
6680 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Cvt));
6681 return;
6682 }
6683 case Intrinsic::amdgcn_cvt_pknorm_i16:
6684 case Intrinsic::amdgcn_cvt_pknorm_u16:
6685 case Intrinsic::amdgcn_cvt_pk_i16:
6686 case Intrinsic::amdgcn_cvt_pk_u16: {
6687 SDValue Src0 = N->getOperand(Num: 1);
6688 SDValue Src1 = N->getOperand(Num: 2);
6689 SDLoc SL(N);
6690 unsigned Opcode;
6691
6692 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6693 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
6694 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6695 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
6696 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6697 Opcode = AMDGPUISD::CVT_PK_I16_I32;
6698 else
6699 Opcode = AMDGPUISD::CVT_PK_U16_U32;
6700
6701 EVT VT = N->getValueType(ResNo: 0);
6702 if (isTypeLegal(VT))
6703 Results.push_back(Elt: DAG.getNode(Opcode, DL: SL, VT, N1: Src0, N2: Src1));
6704 else {
6705 SDValue Cvt = DAG.getNode(Opcode, DL: SL, VT: MVT::i32, N1: Src0, N2: Src1);
6706 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: Cvt));
6707 }
6708 return;
6709 }
6710 case Intrinsic::amdgcn_s_buffer_load: {
6711 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6712 // s_buffer_load_u8 for both signed and unsigned load instructions. Next, the
6713 // DAG combiner tries to merge the s_buffer_load_u8 with a sext instruction
6714 // (performSignExtendInRegCombine()) and replaces s_buffer_load_u8 with
6715 // s_buffer_load_i8.
6716 if (!Subtarget->hasScalarSubwordLoads())
6717 return;
6718 SDValue Op = SDValue(N, 0);
6719 SDValue Rsrc = Op.getOperand(i: 1);
6720 SDValue Offset = Op.getOperand(i: 2);
6721 SDValue CachePolicy = Op.getOperand(i: 3);
6722 EVT VT = Op.getValueType();
6723 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6724 SDLoc DL(Op);
6725 MachineFunction &MF = DAG.getMachineFunction();
6726 const DataLayout &DataLayout = DAG.getDataLayout();
6727 Align Alignment =
6728 DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext()));
6729 MachineMemOperand *MMO = MF.getMachineMemOperand(
6730 PtrInfo: MachinePointerInfo(),
6731 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6732 MachineMemOperand::MOInvariant,
6733 Size: VT.getStoreSize(), BaseAlignment: Alignment);
6734 SDValue LoadVal;
6735 if (!Offset->isDivergent()) {
6736 SDValue Ops[] = {Rsrc, // source register
6737 Offset, CachePolicy};
6738 SDValue BufferLoad =
6739 DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD_UBYTE, dl: DL,
6740 VTList: DAG.getVTList(VT: MVT::i32), Ops, MemVT: VT, MMO);
6741 LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
6742 } else {
6743 SDValue Ops[] = {
6744 DAG.getEntryNode(), // Chain
6745 Rsrc, // rsrc
6746 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
6747 {}, // voffset
6748 {}, // soffset
6749 {}, // offset
6750 CachePolicy, // cachepolicy
6751 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
6752 };
6753 setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], Alignment: Align(4));
6754 LoadVal = handleByteShortBufferLoads(DAG, LoadVT: VT, DL, Ops, MMO);
6755 }
6756 Results.push_back(Elt: LoadVal);
6757 return;
6758 }
6759 case Intrinsic::amdgcn_dead: {
6760 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
6761 Results.push_back(Elt: DAG.getPOISON(VT: N->getValueType(ResNo: I)));
6762 return;
6763 }
6764 }
6765 break;
6766 }
6767 case ISD::INTRINSIC_W_CHAIN: {
6768 if (SDValue Res = LowerINTRINSIC_W_CHAIN(Op: SDValue(N, 0), DAG)) {
6769 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6770 // FIXME: Hacky
6771 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6772 Results.push_back(Elt: Res.getOperand(i: I));
6773 }
6774 } else {
6775 Results.push_back(Elt: Res);
6776 Results.push_back(Elt: Res.getValue(R: 1));
6777 }
6778 return;
6779 }
6780
6781 break;
6782 }
6783 case ISD::SELECT: {
6784 SDLoc SL(N);
6785 EVT VT = N->getValueType(ResNo: 0);
6786 EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT);
6787 SDValue LHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: 1));
6788 SDValue RHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: 2));
6789
6790 EVT SelectVT = NewVT;
6791 if (NewVT.bitsLT(VT: MVT::i32)) {
6792 LHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: LHS);
6793 RHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: RHS);
6794 SelectVT = MVT::i32;
6795 }
6796
6797 SDValue NewSelect =
6798 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: SelectVT, N1: N->getOperand(Num: 0), N2: LHS, N3: RHS);
6799
6800 if (NewVT != SelectVT)
6801 NewSelect = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: NewVT, Operand: NewSelect);
6802 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewSelect));
6803 return;
6804 }
6805 case ISD::FNEG: {
6806 if (N->getValueType(ResNo: 0) != MVT::v2f16)
6807 break;
6808
6809 SDLoc SL(N);
6810 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: N->getOperand(Num: 0));
6811
6812 SDValue Op = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: BC,
6813 N2: DAG.getConstant(Val: 0x80008000, DL: SL, VT: MVT::i32));
6814 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Op));
6815 return;
6816 }
6817 case ISD::FABS: {
6818 if (N->getValueType(ResNo: 0) != MVT::v2f16)
6819 break;
6820
6821 SDLoc SL(N);
6822 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: N->getOperand(Num: 0));
6823
6824 SDValue Op = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: BC,
6825 N2: DAG.getConstant(Val: 0x7fff7fff, DL: SL, VT: MVT::i32));
6826 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Op));
6827 return;
6828 }
6829 case ISD::FSQRT: {
6830 if (N->getValueType(ResNo: 0) != MVT::f16)
6831 break;
6832 Results.push_back(Elt: lowerFSQRTF16(Op: SDValue(N, 0), DAG));
6833 break;
6834 }
6835 default:
6836 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
6837 break;
6838 }
6839}
6840
6841/// Helper function for LowerBRCOND
6842static SDNode *findUser(SDValue Value, unsigned Opcode) {
6843
6844 for (SDUse &U : Value->uses()) {
6845 if (U.get() != Value)
6846 continue;
6847
6848 if (U.getUser()->getOpcode() == Opcode)
6849 return U.getUser();
6850 }
6851 return nullptr;
6852}
6853
6854unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6855 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6856 switch (Intr->getConstantOperandVal(Num: 1)) {
6857 case Intrinsic::amdgcn_if:
6858 return AMDGPUISD::IF;
6859 case Intrinsic::amdgcn_else:
6860 return AMDGPUISD::ELSE;
6861 case Intrinsic::amdgcn_loop:
6862 return AMDGPUISD::LOOP;
6863 case Intrinsic::amdgcn_end_cf:
6864 llvm_unreachable("should not occur");
6865 default:
6866 return 0;
6867 }
6868 }
6869
6870 // break, if_break, else_break are all only used as inputs to loop, not
6871 // directly as branch conditions.
6872 return 0;
6873}
6874
6875bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
6876 const Triple &TT = getTargetMachine().getTargetTriple();
6877 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
6878 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
6879 AMDGPU::shouldEmitConstantsToTextSection(TT);
6880}
6881
6882bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
6883 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6884 return false;
6885
6886 // FIXME: Either avoid relying on address space here or change the default
6887 // address space for functions to avoid the explicit check.
6888 return (GV->getValueType()->isFunctionTy() ||
6889 !isNonGlobalAddrSpace(AS: GV->getAddressSpace())) &&
6890 !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
6891}
6892
6893bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
6894 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6895}
6896
6897bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
6898 if (!GV->hasExternalLinkage())
6899 return true;
6900
6901 const auto OS = getTargetMachine().getTargetTriple().getOS();
6902 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6903}
6904
6905/// This transforms the control flow intrinsics to get the branch destination as
6906 /// the last parameter; it also switches the branch target with BR if the need arises.
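/// For example, (brcond (setcc (llvm.amdgcn.if ...), 1, ne), %bb) becomes an
/// AMDGPUISD::IF node with %bb appended as its last operand, and uses of the
/// original intrinsic's results are rewired to the new node.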
6907SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
6908 SDLoc DL(BRCOND);
6909
6910 SDNode *Intr = BRCOND.getOperand(i: 1).getNode();
6911 SDValue Target = BRCOND.getOperand(i: 2);
6912 SDNode *BR = nullptr;
6913 SDNode *SetCC = nullptr;
6914
6915 if (Intr->getOpcode() == ISD::SETCC) {
6916     // As long as we negate the condition, everything is fine.
6917 SetCC = Intr;
6918 Intr = SetCC->getOperand(Num: 0).getNode();
6919
6920 } else {
6921 // Get the target from BR if we don't negate the condition
6922 BR = findUser(Value: BRCOND, Opcode: ISD::BR);
6923 assert(BR && "brcond missing unconditional branch user");
6924 Target = BR->getOperand(Num: 1);
6925 }
6926
6927 unsigned CFNode = isCFIntrinsic(Intr);
6928 if (CFNode == 0) {
6929 // This is a uniform branch so we don't need to legalize.
6930 return BRCOND;
6931 }
6932
6933 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6934 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6935
6936 assert(!SetCC ||
6937 (SetCC->getConstantOperandVal(1) == 1 &&
6938 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6939 ISD::SETNE));
6940
6941 // operands of the new intrinsic call
6942 SmallVector<SDValue, 4> Ops;
6943 if (HaveChain)
6944 Ops.push_back(Elt: BRCOND.getOperand(i: 0));
6945
6946 Ops.append(in_start: Intr->op_begin() + (HaveChain ? 2 : 1), in_end: Intr->op_end());
6947 Ops.push_back(Elt: Target);
6948
6949 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6950
6951 // build the new intrinsic call
6952 SDNode *Result = DAG.getNode(Opcode: CFNode, DL, VTList: DAG.getVTList(VTs: Res), Ops).getNode();
6953
6954 if (!HaveChain) {
6955 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(i: 0)};
6956
6957 Result = DAG.getMergeValues(Ops, dl: DL).getNode();
6958 }
6959
6960 if (BR) {
6961 // Give the branch instruction our target
6962 SDValue Ops[] = {BR->getOperand(Num: 0), BRCOND.getOperand(i: 2)};
6963 SDValue NewBR = DAG.getNode(Opcode: ISD::BR, DL, VTList: BR->getVTList(), Ops);
6964 DAG.ReplaceAllUsesWith(From: BR, To: NewBR.getNode());
6965 }
6966
6967 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6968
6969 // Copy the intrinsic results to registers
6970 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6971 SDNode *CopyToReg = findUser(Value: SDValue(Intr, i), Opcode: ISD::CopyToReg);
6972 if (!CopyToReg)
6973 continue;
6974
6975 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: CopyToReg->getOperand(Num: 1),
6976 N: SDValue(Result, i - 1), Glue: SDValue());
6977
6978 DAG.ReplaceAllUsesWith(From: SDValue(CopyToReg, 0), To: CopyToReg->getOperand(Num: 0));
6979 }
6980
6981 // Remove the old intrinsic from the chain
6982 DAG.ReplaceAllUsesOfValueWith(From: SDValue(Intr, Intr->getNumValues() - 1),
6983 To: Intr->getOperand(Num: 0));
6984
6985 return Chain;
6986}
6987
6988SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
6989 MVT VT = Op.getSimpleValueType();
6990 SDLoc DL(Op);
6991   // Check the depth: only depth 0 (the current function) is supported.
6992 if (Op.getConstantOperandVal(i: 0) != 0)
6993 return DAG.getConstant(Val: 0, DL, VT);
6994
6995 MachineFunction &MF = DAG.getMachineFunction();
6996 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6997 // Check for kernel and shader functions
6998 if (Info->isEntryFunction())
6999 return DAG.getConstant(Val: 0, DL, VT);
7000
7001 MachineFrameInfo &MFI = MF.getFrameInfo();
7002 // There is a call to @llvm.returnaddress in this function
7003 MFI.setReturnAddressIsTaken(true);
7004
7005 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7006 // Get the return address reg and mark it as an implicit live-in
7007 Register Reg = MF.addLiveIn(PReg: TRI->getReturnAddressReg(MF),
7008 RC: getRegClassFor(VT, isDivergent: Op.getNode()->isDivergent()));
7009
7010 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg, VT);
7011}
7012
7013SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7014 const SDLoc &DL, EVT VT) const {
7015 return Op.getValueType().bitsLE(VT)
7016 ? DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT, Operand: Op)
7017 : DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Op,
7018 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
7019}
7020
7021SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7022 SelectionDAG &DAG) const {
7023 EVT DstVT = Op.getValueType();
7024 unsigned NumElts = DstVT.getVectorNumElements();
7025 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7026
7027 auto [Lo, Hi] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
7028
7029 SDLoc DL(Op);
7030 unsigned Opc = Op.getOpcode();
7031 SDValue Flags = Op.getOperand(i: 1);
7032 EVT HalfDstVT =
7033 EVT::getVectorVT(Context&: *DAG.getContext(), VT: DstVT.getScalarType(), NumElements: NumElts / 2);
7034 SDValue OpLo = DAG.getNode(Opcode: Opc, DL, VT: HalfDstVT, N1: Lo, N2: Flags);
7035 SDValue OpHi = DAG.getNode(Opcode: Opc, DL, VT: HalfDstVT, N1: Hi, N2: Flags);
7036
7037 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: DstVT, N1: OpLo, N2: OpHi);
7038}
7039
7040SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7041 SDValue Src = Op.getOperand(i: 0);
7042 EVT SrcVT = Src.getValueType();
7043 EVT DstVT = Op.getValueType();
7044
7045 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7046 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7047 if (SrcVT.getScalarType() != MVT::f32)
7048 return SDValue();
7049 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7050 }
7051
7052 if (SrcVT.getScalarType() != MVT::f64)
7053 return Op;
7054
7055 SDLoc DL(Op);
7056 if (DstVT == MVT::f16) {
7057 // TODO: Handle strictfp
7058 if (Op.getOpcode() != ISD::FP_ROUND)
7059 return Op;
7060
7061 if (!Subtarget->has16BitInsts()) {
7062 SDValue FpToFp16 = DAG.getNode(Opcode: ISD::FP_TO_FP16, DL, VT: MVT::i32, Operand: Src);
7063 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToFp16);
7064 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f16, Operand: Trunc);
7065 }
7066 if (getTargetMachine().Options.UnsafeFPMath) {
7067 SDValue Flags = Op.getOperand(i: 1);
7068 SDValue Src32 = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f32, N1: Src, N2: Flags);
7069 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: Src32, N2: Flags);
7070 }
7071 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7072 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToFp16);
7073 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f16, Operand: Trunc);
7074 }
7075
7076 assert(DstVT.getScalarType() == MVT::bf16 &&
7077 "custom lower FP_ROUND for f16 or bf16");
7078 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7079
7080 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7081 // hardware f32 -> bf16 instruction.
7082 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(EltVT: MVT::f32) :
7083 MVT::f32;
7084 SDValue Rod = expandRoundInexactToOdd(ResultVT: F32VT, Op: Src, DL, DAG);
7085 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: DstVT, N1: Rod,
7086 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
7087}
7088
7089SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7090 SelectionDAG &DAG) const {
7091 EVT VT = Op.getValueType();
7092 const MachineFunction &MF = DAG.getMachineFunction();
7093 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7094 bool IsIEEEMode = Info->getMode().IEEE;
7095
7096 // FIXME: Assert during selection that this is only selected for
7097 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7098   // mode functions, but this happens to be OK since it's only done in cases
7099   // where it is known that there are no sNaNs.
7100 if (IsIEEEMode)
7101 return expandFMINNUM_FMAXNUM(N: Op.getNode(), DAG);
7102
7103 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7104 VT == MVT::v16bf16)
7105 return splitBinaryVectorOp(Op, DAG);
7106 return Op;
7107}
7108
7109SDValue
7110SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7111 SelectionDAG &DAG) const {
7112 EVT VT = Op.getValueType();
7113 const MachineFunction &MF = DAG.getMachineFunction();
7114 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7115 bool IsIEEEMode = Info->getMode().IEEE;
7116
7117 if (IsIEEEMode)
7118 return expandFMINIMUMNUM_FMAXIMUMNUM(N: Op.getNode(), DAG);
7119
7120 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7121 VT == MVT::v16bf16)
7122 return splitBinaryVectorOp(Op, DAG);
7123 return Op;
7124}
7125
7126SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7127 SelectionDAG &DAG) const {
7128 EVT VT = Op.getValueType();
7129 if (VT.isVector())
7130 return splitBinaryVectorOp(Op, DAG);
7131
7132 assert(!Subtarget->hasIEEEMinMax() && !Subtarget->hasMinimum3Maximum3F16() &&
7133 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7134 "should not need to widen f16 minimum/maximum to v2f16");
7135
7136 // Widen f16 operation to v2f16
7137
7138 // fminimum f16:x, f16:y ->
7139 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7140 // (v2f16 (scalar_to_vector y))), 0
7141 SDLoc SL(Op);
7142 SDValue WideSrc0 =
7143 DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: SL, VT: MVT::v2f16, Operand: Op.getOperand(i: 0));
7144 SDValue WideSrc1 =
7145 DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: SL, VT: MVT::v2f16, Operand: Op.getOperand(i: 1));
7146
7147 SDValue Widened =
7148 DAG.getNode(Opcode: Op.getOpcode(), DL: SL, VT: MVT::v2f16, N1: WideSrc0, N2: WideSrc1);
7149
7150 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::f16, N1: Widened,
7151 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
7152}
7153
7154SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7155 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7156 EVT VT = Op.getValueType();
7157 assert(VT == MVT::f16);
7158
7159 SDValue Exp = Op.getOperand(i: IsStrict ? 2 : 1);
7160 EVT ExpVT = Exp.getValueType();
7161 if (ExpVT == MVT::i16)
7162 return Op;
7163
7164 SDLoc DL(Op);
7165
7166 // Correct the exponent type for f16 to i16.
7167 // Clamp the range of the exponent to the instruction's range.
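// For example, (fldexp f16:x, i32:exp) becomes
// (fldexp x, (trunc (smin (smax exp, -32768), 32767))).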
7168
7169 // TODO: This should be a generic narrowing legalization, and can easily be
7170   // done for GlobalISel.
7171
7172 SDValue MinExp = DAG.getSignedConstant(Val: minIntN(N: 16), DL, VT: ExpVT);
7173 SDValue ClampMin = DAG.getNode(Opcode: ISD::SMAX, DL, VT: ExpVT, N1: Exp, N2: MinExp);
7174
7175 SDValue MaxExp = DAG.getSignedConstant(Val: maxIntN(N: 16), DL, VT: ExpVT);
7176 SDValue Clamp = DAG.getNode(Opcode: ISD::SMIN, DL, VT: ExpVT, N1: ClampMin, N2: MaxExp);
7177
7178 SDValue TruncExp = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Clamp);
7179
7180 if (IsStrict) {
7181 return DAG.getNode(Opcode: ISD::STRICT_FLDEXP, DL, ResultTys: {VT, MVT::Other},
7182 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1), TruncExp});
7183 }
7184
7185 return DAG.getNode(Opcode: ISD::FLDEXP, DL, VT, N1: Op.getOperand(i: 0), N2: TruncExp);
7186}
7187
7188static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
7189 switch (Op->getOpcode()) {
7190 case ISD::SRA:
7191 case ISD::SMIN:
7192 case ISD::SMAX:
7193 return ISD::SIGN_EXTEND;
7194 case ISD::SRL:
7195 case ISD::UMIN:
7196 case ISD::UMAX:
7197 return ISD::ZERO_EXTEND;
7198 case ISD::ADD:
7199 case ISD::SUB:
7200 case ISD::AND:
7201 case ISD::OR:
7202 case ISD::XOR:
7203 case ISD::SHL:
7204 case ISD::SELECT:
7205 case ISD::MUL:
7206     // The operation result won't be influenced by garbage high bits.
7207 // TODO: are all of those cases correct, and are there more?
7208 return ISD::ANY_EXTEND;
7209 case ISD::SETCC: {
7210 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
7211 return ISD::isSignedIntSetCC(Code: CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7212 }
7213 default:
7214 llvm_unreachable("unexpected opcode!");
7215 }
7216}
7217
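// Promote a uniform operation on a type narrower than i32 to i32, e.g.
// (i16 shl x, y) -> (trunc (i32 shl (any_extend x), (zero_extend y))).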
7218SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7219 DAGCombinerInfo &DCI) const {
7220 const unsigned Opc = Op.getOpcode();
7221 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7222 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7223 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7224 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7225 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7226
7227 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7228 : Op->getOperand(Num: 0).getValueType();
7229 auto ExtTy = OpTy.changeElementType(EltVT: MVT::i32);
7230
7231 if (DCI.isBeforeLegalizeOps() ||
7232 isNarrowingProfitable(N: Op.getNode(), SrcVT: ExtTy, DestVT: OpTy))
7233 return SDValue();
7234
7235 auto &DAG = DCI.DAG;
7236
7237 SDLoc DL(Op);
7238 SDValue LHS;
7239 SDValue RHS;
7240 if (Opc == ISD::SELECT) {
7241 LHS = Op->getOperand(Num: 1);
7242 RHS = Op->getOperand(Num: 2);
7243 } else {
7244 LHS = Op->getOperand(Num: 0);
7245 RHS = Op->getOperand(Num: 1);
7246 }
7247
7248 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7249 LHS = DAG.getNode(Opcode: ExtOp, DL, VT: ExtTy, Operand: {LHS});
7250
7251 // Special case: for shifts, the RHS always needs a zext.
7252 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7253 RHS = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: ExtTy, Operand: {RHS});
7254 else
7255 RHS = DAG.getNode(Opcode: ExtOp, DL, VT: ExtTy, Operand: {RHS});
7256
7257   // setcc always returns i1 or a vector of i1, so there is no need to truncate after.
7258 if (Opc == ISD::SETCC) {
7259 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
7260 return DAG.getSetCC(DL, VT: Op.getValueType(), LHS, RHS, Cond: CC);
7261 }
7262
7263 // For other ops, we extend the operation's return type as well so we need to
7264 // truncate back to the original type.
7265 SDValue NewVal;
7266 if (Opc == ISD::SELECT)
7267 NewVal = DAG.getNode(Opcode: ISD::SELECT, DL, VT: ExtTy, Ops: {Op->getOperand(Num: 0), LHS, RHS});
7268 else
7269 NewVal = DAG.getNode(Opcode: Opc, DL, VT: ExtTy, Ops: {LHS, RHS});
7270
7271 return DAG.getZExtOrTrunc(Op: NewVal, DL, VT: OpTy);
7272}
7273
7274SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7275 SDValue Mag = Op.getOperand(i: 0);
7276 EVT MagVT = Mag.getValueType();
7277
7278 if (MagVT.getVectorNumElements() > 2)
7279 return splitBinaryVectorOp(Op, DAG);
7280
7281 SDValue Sign = Op.getOperand(i: 1);
7282 EVT SignVT = Sign.getValueType();
7283
7284 if (MagVT == SignVT)
7285 return Op;
7286
7287 // fcopysign v2f16:mag, v2f32:sign ->
7288 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
7289
7290 SDLoc SL(Op);
7291 SDValue SignAsInt32 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Sign);
7292 SDValue SignAsInt16 = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::v2i16, Operand: SignAsInt32);
7293
7294 SDValue SignAsHalf16 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MagVT, Operand: SignAsInt16);
7295
7296 return DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT: MagVT, N1: Mag, N2: SignAsHalf16);
7297}
7298
7299// Custom lowering for vector multiplications and s_mul_u64.
7300SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
7301 EVT VT = Op.getValueType();
7302
7303 // Split vector operands.
7304 if (VT.isVector())
7305 return splitBinaryVectorOp(Op, DAG);
7306
7307 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
7308
7309 // There are four ways to lower s_mul_u64:
7310 //
7311 // 1. If all the operands are uniform, then we lower it as it is.
7312 //
7313   // 2. If the operands are divergent, then we have to split s_mul_u64 into 32-bit
7314   // multiplications because there is no vector equivalent of s_mul_u64.
7315 //
7316 // 3. If the cost model decides that it is more efficient to use vector
7317   // registers, then we have to split s_mul_u64 into 32-bit multiplications.
7318 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
7319 //
7320 // 4. If the cost model decides to use vector registers and both of the
7321   // operands are zero-extended/sign-extended from 32 bits, then we split the
7322   // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
7323 // possible to check if the operands are zero-extended or sign-extended in
7324 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
7325 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
7326 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
7327 // If the cost model decides that we have to use vector registers, then
7328   // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
7329   // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
7330 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
7331 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
7332 // SIInstrInfo.cpp .
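  //
  // For example, if computeKnownBits proves that both operands have at least 32
  // leading zero bits, s_mul_u64 is replaced with S_MUL_U64_U32_PSEUDO below;
  // if both operands have at least 33 sign bits, it is replaced with
  // S_MUL_I64_I32_PSEUDO.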
7333
7334 if (Op->isDivergent())
7335 return SDValue();
7336
7337 SDValue Op0 = Op.getOperand(i: 0);
7338 SDValue Op1 = Op.getOperand(i: 1);
7339   // If all the operands are zero-extended from 32 bits, then we replace
7340   // s_mul_u64 with s_mul_u64_u32_pseudo. If all the operands are sign-extended
7341   // from 32 bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7342 KnownBits Op0KnownBits = DAG.computeKnownBits(Op: Op0);
7343 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7344 KnownBits Op1KnownBits = DAG.computeKnownBits(Op: Op1);
7345 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7346 SDLoc SL(Op);
7347 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7348 return SDValue(
7349 DAG.getMachineNode(Opcode: AMDGPU::S_MUL_U64_U32_PSEUDO, dl: SL, VT, Op1: Op0, Op2: Op1), 0);
7350 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op0);
7351 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op: Op1);
7352 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7353 return SDValue(
7354 DAG.getMachineNode(Opcode: AMDGPU::S_MUL_I64_I32_PSEUDO, dl: SL, VT, Op1: Op0, Op2: Op1), 0);
7355 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7356 return Op;
7357}
7358
7359SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7360 EVT VT = Op.getValueType();
7361 SDLoc SL(Op);
7362 SDValue LHS = Op.getOperand(i: 0);
7363 SDValue RHS = Op.getOperand(i: 1);
7364 bool isSigned = Op.getOpcode() == ISD::SMULO;
7365
7366 if (ConstantSDNode *RHSC = isConstOrConstSplat(N: RHS)) {
7367 const APInt &C = RHSC->getAPIntValue();
7368 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
7369 if (C.isPowerOf2()) {
7370       // smulo(x, signed_min) is the same as umulo(x, signed_min).
7371 bool UseArithShift = isSigned && !C.isMinSignedValue();
7372 SDValue ShiftAmt = DAG.getConstant(Val: C.logBase2(), DL: SL, VT: MVT::i32);
7373 SDValue Result = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: LHS, N2: ShiftAmt);
7374 SDValue Overflow =
7375 DAG.getSetCC(DL: SL, VT: MVT::i1,
7376 LHS: DAG.getNode(Opcode: UseArithShift ? ISD::SRA : ISD::SRL, DL: SL, VT,
7377 N1: Result, N2: ShiftAmt),
7378 RHS: LHS, Cond: ISD::SETNE);
7379 return DAG.getMergeValues(Ops: {Result, Overflow}, dl: SL);
7380 }
7381 }
7382
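// General case: the multiply overflows iff the high half of the full product
// differs from the sign-extension of the low half (for smulo) or from zero
// (for umulo).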
7383 SDValue Result = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT, N1: LHS, N2: RHS);
7384 SDValue Top =
7385 DAG.getNode(Opcode: isSigned ? ISD::MULHS : ISD::MULHU, DL: SL, VT, N1: LHS, N2: RHS);
7386
7387 SDValue Sign = isSigned
7388 ? DAG.getNode(Opcode: ISD::SRA, DL: SL, VT, N1: Result,
7389 N2: DAG.getConstant(Val: VT.getScalarSizeInBits() - 1,
7390 DL: SL, VT: MVT::i32))
7391 : DAG.getConstant(Val: 0, DL: SL, VT);
7392 SDValue Overflow = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Top, RHS: Sign, Cond: ISD::SETNE);
7393
7394 return DAG.getMergeValues(Ops: {Result, Overflow}, dl: SL);
7395}
7396
7397SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
7398 if (Op->isDivergent()) {
7399 // Select to V_MAD_[IU]64_[IU]32.
7400 return Op;
7401 }
7402 if (Subtarget->hasSMulHi()) {
7403 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
7404 return SDValue();
7405 }
7406 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
7407 // calculate the high part, so we might as well do the whole thing with
7408 // V_MAD_[IU]64_[IU]32.
7409 return Op;
7410}
7411
7412SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
7413 if (!Subtarget->isTrapHandlerEnabled() ||
7414 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7415 return lowerTrapEndpgm(Op, DAG);
7416
7417 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
7418 : lowerTrapHsaQueuePtr(Op, DAG);
7419}
7420
7421SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
7422 SDLoc SL(Op);
7423 SDValue Chain = Op.getOperand(i: 0);
7424 return DAG.getNode(Opcode: AMDGPUISD::ENDPGM_TRAP, DL: SL, VT: MVT::Other, Operand: Chain);
7425}
7426
7427SDValue
7428SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
7429 const SDLoc &DL, Align Alignment,
7430 ImplicitParameter Param) const {
7431 MachineFunction &MF = DAG.getMachineFunction();
7432 uint64_t Offset = getImplicitParameterOffset(MF, Param);
7433 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain: DAG.getEntryNode(), Offset);
7434 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7435 return DAG.getLoad(VT, dl: DL, Chain: DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
7436 MMOFlags: MachineMemOperand::MODereferenceable |
7437 MachineMemOperand::MOInvariant);
7438}
7439
7440SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
7441 SelectionDAG &DAG) const {
7442 SDLoc SL(Op);
7443 SDValue Chain = Op.getOperand(i: 0);
7444
7445 SDValue QueuePtr;
7446 // For code object version 5, QueuePtr is passed through implicit kernarg.
7447 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7448 if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) {
7449 QueuePtr =
7450 loadImplicitKernelArgument(DAG, VT: MVT::i64, DL: SL, Alignment: Align(8), Param: QUEUE_PTR);
7451 } else {
7452 MachineFunction &MF = DAG.getMachineFunction();
7453 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7454 Register UserSGPR = Info->getQueuePtrUserSGPR();
7455
7456 if (UserSGPR == AMDGPU::NoRegister) {
7457 // We probably are in a function incorrectly marked with
7458 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
7459 // trap, so just use a null pointer.
7460 QueuePtr = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i64);
7461 } else {
7462 QueuePtr = CreateLiveInRegister(DAG, RC: &AMDGPU::SReg_64RegClass, Reg: UserSGPR,
7463 VT: MVT::i64);
7464 }
7465 }
7466
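// The HSA trap handler receives the queue pointer in SGPR0:1, so copy it there
// and pass it along with the trap ID.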
7467 SDValue SGPR01 = DAG.getRegister(Reg: AMDGPU::SGPR0_SGPR1, VT: MVT::i64);
7468 SDValue ToReg = DAG.getCopyToReg(Chain, dl: SL, Reg: SGPR01, N: QueuePtr, Glue: SDValue());
7469
7470 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7471 SDValue Ops[] = {ToReg, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16), SGPR01,
7472 ToReg.getValue(R: 1)};
7473 return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
7474}
7475
7476SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
7477 SDLoc SL(Op);
7478 SDValue Chain = Op.getOperand(i: 0);
7479
7480 // We need to simulate the 's_trap 2' instruction on targets that run in
7481 // PRIV=1 (where it is treated as a nop).
7482 if (Subtarget->hasPrivEnabledTrap2NopBug())
7483 return DAG.getNode(Opcode: AMDGPUISD::SIMULATED_TRAP, DL: SL, VT: MVT::Other, Operand: Chain);
7484
7485 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
7486 SDValue Ops[] = {Chain, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16)};
7487 return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
7488}
7489
7490SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
7491 SDLoc SL(Op);
7492 SDValue Chain = Op.getOperand(i: 0);
7493 MachineFunction &MF = DAG.getMachineFunction();
7494
7495 if (!Subtarget->isTrapHandlerEnabled() ||
7496 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7497 LLVMContext &Ctx = MF.getFunction().getContext();
7498 Ctx.diagnose(DI: DiagnosticInfoUnsupported(MF.getFunction(),
7499 "debugtrap handler not supported",
7500 Op.getDebugLoc(), DS_Warning));
7501 return Chain;
7502 }
7503
7504 uint64_t TrapID =
7505 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
7506 SDValue Ops[] = {Chain, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16)};
7507 return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
7508}
7509
7510SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7511 SelectionDAG &DAG) const {
7512 if (Subtarget->hasApertureRegs()) {
7513 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
7514 ? AMDGPU::SRC_SHARED_BASE
7515 : AMDGPU::SRC_PRIVATE_BASE;
7516 // Note: this feature (register) is broken. When used as a 32-bit operand,
7517 // it returns a wrong value (all zeroes?). The real value is in the upper 32
7518 // bits.
7519 //
7520 // To work around the issue, directly emit a 64 bit mov from this register
7521 // then extract the high bits. Note that this shouldn't even result in a
7522 // shift being emitted and simply become a pair of registers (e.g.):
7523 // s_mov_b64 s[6:7], src_shared_base
7524 // v_mov_b32_e32 v1, s7
7525 //
7526 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
7527 // coalescing would kick in and it would think it's okay to use the "HI"
7528 // subregister directly (instead of extracting the HI 32 bits) which is an
7529 // artificial (unusable) register.
7530 // Register TableGen definitions would need an overhaul to get rid of the
7531 // artificial "HI" aperture registers and prevent this kind of issue from
7532 // happening.
7533 SDNode *Mov = DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B64, dl: DL, VT: MVT::i64,
7534 Op1: DAG.getRegister(Reg: ApertureRegNo, VT: MVT::i64));
7535 return DAG.getNode(
7536 Opcode: ISD::TRUNCATE, DL, VT: MVT::i32,
7537 Operand: DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64,
7538 Ops: {SDValue(Mov, 0), DAG.getConstant(Val: 32, DL, VT: MVT::i64)}));
7539 }
7540
7541 // For code object version 5, private_base and shared_base are passed through
7542 // implicit kernargs.
7543 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7544 if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) {
7545 ImplicitParameter Param =
7546 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
7547 return loadImplicitKernelArgument(DAG, VT: MVT::i32, DL, Alignment: Align(4), Param);
7548 }
7549
7550 MachineFunction &MF = DAG.getMachineFunction();
7551 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7552 Register UserSGPR = Info->getQueuePtrUserSGPR();
7553 if (UserSGPR == AMDGPU::NoRegister) {
7554 // We probably are in a function incorrectly marked with
7555 // amdgpu-no-queue-ptr. This is undefined.
7556 return DAG.getPOISON(VT: MVT::i32);
7557 }
7558
7559 SDValue QueuePtr =
7560 CreateLiveInRegister(DAG, RC: &AMDGPU::SReg_64RegClass, Reg: UserSGPR, VT: MVT::i64);
7561
7562 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7563 // private_segment_aperture_base_hi.
7564 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7565
7566 SDValue Ptr =
7567 DAG.getObjectPtrOffset(SL: DL, Ptr: QueuePtr, Offset: TypeSize::getFixed(ExactSize: StructOffset));
7568
7569 // TODO: Use custom target PseudoSourceValue.
7570   // TODO: We should use the value from the IR intrinsic call, but it might not
7571   // be available, and it is not clear how to get it here.
7572 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7573 return DAG.getLoad(VT: MVT::i32, dl: DL, Chain: QueuePtr.getValue(R: 1), Ptr, PtrInfo,
7574 Alignment: commonAlignment(A: Align(64), Offset: StructOffset),
7575 MMOFlags: MachineMemOperand::MODereferenceable |
7576 MachineMemOperand::MOInvariant);
7577}
7578
7579/// Return true if the value is a known valid address, such that a null check is
7580/// not necessary.
7581static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
7582 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7583 if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
7584 return true;
7585
7586 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7587 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7588
7589 // TODO: Search through arithmetic, handle arguments and loads
7590 // marked nonnull.
7591 return false;
7592}
7593
7594SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7595 SelectionDAG &DAG) const {
7596 SDLoc SL(Op);
7597
7598 const AMDGPUTargetMachine &TM =
7599 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7600
7601 unsigned DestAS, SrcAS;
7602 SDValue Src;
7603 bool IsNonNull = false;
7604 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Val&: Op)) {
7605 SrcAS = ASC->getSrcAddressSpace();
7606 Src = ASC->getOperand(Num: 0);
7607 DestAS = ASC->getDestAddressSpace();
7608 } else {
7609 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7610 Op.getConstantOperandVal(0) ==
7611 Intrinsic::amdgcn_addrspacecast_nonnull);
7612 Src = Op->getOperand(Num: 1);
7613 SrcAS = Op->getConstantOperandVal(Num: 2);
7614 DestAS = Op->getConstantOperandVal(Num: 3);
7615 IsNonNull = true;
7616 }
7617
7618 SDValue FlatNullPtr = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i64);
7619
7620 // flat -> local/private
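// A flat pointer is cast to local/private by truncating it to 32 bits. Unless
// the source is known to be non-null, compare it against the flat null value
// and select the segment null value for null inputs.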
7621 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7622 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7623 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7624 SDValue Ptr = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Src);
7625
7626 if (IsNonNull || isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS))
7627 return Ptr;
7628
7629 unsigned NullVal = TM.getNullPointerValue(AddrSpace: DestAS);
7630 SDValue SegmentNullPtr = DAG.getConstant(Val: NullVal, DL: SL, VT: MVT::i32);
7631 SDValue NonNull = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Src, RHS: FlatNullPtr, Cond: ISD::SETNE);
7632
7633 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i32, N1: NonNull, N2: Ptr,
7634 N3: SegmentNullPtr);
7635 }
7636 }
7637
7638 // local/private -> flat
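// A 32-bit segment pointer is widened to a flat pointer by pairing it with the
// aperture high half from getSegmentAperture(); a segment-null input maps to
// the flat null value.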
7639 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7640 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7641 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7642
7643 SDValue Aperture = getSegmentAperture(AS: SrcAS, DL: SL, DAG);
7644 SDValue CvtPtr =
7645 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: Aperture);
7646 CvtPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);
7647
7648 if (IsNonNull || isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS))
7649 return CvtPtr;
7650
7651 unsigned NullVal = TM.getNullPointerValue(AddrSpace: SrcAS);
7652 SDValue SegmentNullPtr = DAG.getConstant(Val: NullVal, DL: SL, VT: MVT::i32);
7653
7654 SDValue NonNull =
7655 DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Src, RHS: SegmentNullPtr, Cond: ISD::SETNE);
7656
7657 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: NonNull, N2: CvtPtr,
7658 N3: FlatNullPtr);
7659 }
7660 }
7661
7662 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7663 Op.getValueType() == MVT::i64) {
7664 const SIMachineFunctionInfo *Info =
7665 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
7666 SDValue Hi = DAG.getConstant(Val: Info->get32BitAddressHighBits(), DL: SL, VT: MVT::i32);
7667 SDValue Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: Hi);
7668 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
7669 }
7670
7671 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7672 Src.getValueType() == MVT::i64)
7673 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Src);
7674
7675 // global <-> flat are no-ops and never emitted.
7676
7677 // Invalid casts are poison.
7678 return DAG.getPOISON(VT: Op->getValueType(ResNo: 0));
7679}
7680
7681// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7682// the small vector and inserting them into the big vector. That is better than
7683// the default expansion of doing it via a stack slot. Even though the use of
7684// the stack slot would be optimized away afterwards, the stack slot itself
7685// remains.
7686SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7687 SelectionDAG &DAG) const {
7688 SDValue Vec = Op.getOperand(i: 0);
7689 SDValue Ins = Op.getOperand(i: 1);
7690 SDValue Idx = Op.getOperand(i: 2);
7691 EVT VecVT = Vec.getValueType();
7692 EVT InsVT = Ins.getValueType();
7693 EVT EltVT = VecVT.getVectorElementType();
7694 unsigned InsNumElts = InsVT.getVectorNumElements();
7695 unsigned IdxVal = Idx->getAsZExtVal();
7696 SDLoc SL(Op);
7697
7698 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7699 // Insert 32-bit registers at a time.
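// e.g. inserting a v2i16 subvector into v4i16 at index 2 becomes a single i32
// element insert into the v2i32 bitcast of the vector at index 1.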
7700 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7701
7702 unsigned VecNumElts = VecVT.getVectorNumElements();
7703 EVT NewVecVT =
7704 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: VecNumElts / 2);
7705 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7706 : EVT::getVectorVT(Context&: *DAG.getContext(),
7707 VT: MVT::i32, NumElements: InsNumElts / 2);
7708
7709 Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVecVT, Operand: Vec);
7710 Ins = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewInsVT, Operand: Ins);
7711
7712 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7713 SDValue Elt;
7714 if (InsNumElts == 2) {
7715 Elt = Ins;
7716 } else {
7717 Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Ins,
7718 N2: DAG.getConstant(Val: I, DL: SL, VT: MVT::i32));
7719 }
7720 Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: NewVecVT, N1: Vec, N2: Elt,
7721 N3: DAG.getConstant(Val: IdxVal / 2 + I, DL: SL, VT: MVT::i32));
7722 }
7723
7724 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Vec);
7725 }
7726
7727 for (unsigned I = 0; I != InsNumElts; ++I) {
7728 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Ins,
7729 N2: DAG.getConstant(Val: I, DL: SL, VT: MVT::i32));
7730 Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: VecVT, N1: Vec, N2: Elt,
7731 N3: DAG.getConstant(Val: IdxVal + I, DL: SL, VT: MVT::i32));
7732 }
7733 return Vec;
7734}
7735
7736SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7737 SelectionDAG &DAG) const {
7738 SDValue Vec = Op.getOperand(i: 0);
7739 SDValue InsVal = Op.getOperand(i: 1);
7740 SDValue Idx = Op.getOperand(i: 2);
7741 EVT VecVT = Vec.getValueType();
7742 EVT EltVT = VecVT.getVectorElementType();
7743 unsigned VecSize = VecVT.getSizeInBits();
7744 unsigned EltSize = EltVT.getSizeInBits();
7745 SDLoc SL(Op);
7746
7747 // Specially handle the case of v4i16 with static indexing.
7748 unsigned NumElts = VecVT.getVectorNumElements();
7749 auto *KIdx = dyn_cast<ConstantSDNode>(Val&: Idx);
7750 if (NumElts == 4 && EltSize == 16 && KIdx) {
7751 SDValue BCVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Vec);
7752
7753 SDValue LoHalf = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: BCVec,
7754 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
7755 SDValue HiHalf = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: BCVec,
7756 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
7757
7758 SDValue LoVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: LoHalf);
7759 SDValue HiVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: HiHalf);
7760
7761 unsigned Idx = KIdx->getZExtValue();
7762 bool InsertLo = Idx < 2;
7763 SDValue InsHalf = DAG.getNode(
7764 Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: MVT::v2i16, N1: InsertLo ? LoVec : HiVec,
7765 N2: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: InsVal),
7766 N3: DAG.getConstant(Val: InsertLo ? Idx : (Idx - 2), DL: SL, VT: MVT::i32));
7767
7768 InsHalf = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: InsHalf);
7769
7770 SDValue Concat =
7771 InsertLo ? DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {InsHalf, HiHalf})
7772 : DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {LoHalf, InsHalf});
7773
7774 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Concat);
7775 }
7776
7777   // Static indexing does not lower to stack access, and hence there is no need
7778   // for special custom lowering to avoid it.
7779 if (isa<ConstantSDNode>(Val: Idx))
7780 return SDValue();
7781
7782 // Avoid stack access for dynamic indexing by custom lowering to
7783 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
7784
7785 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7786
7787 MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize);
7788
7789 // Convert vector index to bit-index and get the required bit mask.
7790 assert(isPowerOf2_32(EltSize));
7791 const auto EltMask = maskTrailingOnes<uint64_t>(N: EltSize);
7792 SDValue ScaleFactor = DAG.getConstant(Val: Log2_32(Value: EltSize), DL: SL, VT: MVT::i32);
7793 SDValue ScaledIdx = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Idx, N2: ScaleFactor);
7794 SDValue BFM = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: IntVT,
7795 N1: DAG.getConstant(Val: EltMask, DL: SL, VT: IntVT), N2: ScaledIdx);
7796
7797 // 1. Create a congruent vector with the target value in each element.
7798 SDValue ExtVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT,
7799 Operand: DAG.getSplatBuildVector(VT: VecVT, DL: SL, Op: InsVal));
7800
7801 // 2. Mask off all other indices except the required index within (1).
7802 SDValue LHS = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT, N1: BFM, N2: ExtVal);
7803
7804 // 3. Mask off the required index within the target vector.
7805 SDValue BCVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec);
7806 SDValue RHS =
7807 DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT, N1: DAG.getNOT(DL: SL, Val: BFM, VT: IntVT), N2: BCVec);
7808
7809 // 4. Get (2) and (3) ORed into the target vector.
7810 SDValue BFI =
7811 DAG.getNode(Opcode: ISD::OR, DL: SL, VT: IntVT, N1: LHS, N2: RHS, Flags: SDNodeFlags::Disjoint);
7812
7813 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: BFI);
7814}
7815
7816SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7817 SelectionDAG &DAG) const {
7818 SDLoc SL(Op);
7819
7820 EVT ResultVT = Op.getValueType();
7821 SDValue Vec = Op.getOperand(i: 0);
7822 SDValue Idx = Op.getOperand(i: 1);
7823 EVT VecVT = Vec.getValueType();
7824 unsigned VecSize = VecVT.getSizeInBits();
7825 EVT EltVT = VecVT.getVectorElementType();
7826
7827 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7828
7829 // Make sure we do any optimizations that will make it easier to fold
7830 // source modifiers before obscuring it with bit operations.
7831
7832 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7833 if (SDValue Combined = performExtractVectorEltCombine(N: Op.getNode(), DCI))
7834 return Combined;
7835
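// For 128/256/512-bit vectors, split the source in half, select the half that
// contains the requested element based on the index, and extract from that
// half with the index masked into range.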
7836 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7837 SDValue Lo, Hi;
7838 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VT: VecVT);
7839
7840 if (VecSize == 128) {
7841 SDValue V2 = DAG.getBitcast(VT: MVT::v2i64, V: Vec);
7842 Lo = DAG.getBitcast(VT: LoVT,
7843 V: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
7844 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32)));
7845 Hi = DAG.getBitcast(VT: HiVT,
7846 V: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
7847 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32)));
7848 } else if (VecSize == 256) {
7849 SDValue V2 = DAG.getBitcast(VT: MVT::v4i64, V: Vec);
7850 SDValue Parts[4];
7851 for (unsigned P = 0; P < 4; ++P) {
7852 Parts[P] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
7853 N2: DAG.getConstant(Val: P, DL: SL, VT: MVT::i32));
7854 }
7855
7856 Lo = DAG.getBitcast(VT: LoVT, V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i64,
7857 N1: Parts[0], N2: Parts[1]));
7858 Hi = DAG.getBitcast(VT: HiVT, V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i64,
7859 N1: Parts[2], N2: Parts[3]));
7860 } else {
7861 assert(VecSize == 512);
7862
7863 SDValue V2 = DAG.getBitcast(VT: MVT::v8i64, V: Vec);
7864 SDValue Parts[8];
7865 for (unsigned P = 0; P < 8; ++P) {
7866 Parts[P] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
7867 N2: DAG.getConstant(Val: P, DL: SL, VT: MVT::i32));
7868 }
7869
7870 Lo = DAG.getBitcast(VT: LoVT,
7871 V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v4i64,
7872 N1: Parts[0], N2: Parts[1], N3: Parts[2], N4: Parts[3]));
7873 Hi = DAG.getBitcast(VT: HiVT,
7874 V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v4i64,
7875 N1: Parts[4], N2: Parts[5], N3: Parts[6], N4: Parts[7]));
7876 }
7877
7878 EVT IdxVT = Idx.getValueType();
7879 unsigned NElem = VecVT.getVectorNumElements();
7880 assert(isPowerOf2_32(NElem));
7881 SDValue IdxMask = DAG.getConstant(Val: NElem / 2 - 1, DL: SL, VT: IdxVT);
7882 SDValue NewIdx = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IdxVT, N1: Idx, N2: IdxMask);
7883 SDValue Half = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IdxMask, True: Hi, False: Lo, Cond: ISD::SETUGT);
7884 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Half, N2: NewIdx);
7885 }
7886
7887 assert(VecSize <= 64);
7888
7889 MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize);
7890
7891 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7892 SDValue VecBC = peekThroughBitcasts(V: Vec);
7893 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7894 SDValue Src = VecBC.getOperand(i: 0);
7895 Src = DAG.getBitcast(VT: Src.getValueType().changeTypeToInteger(), V: Src);
7896 Vec = DAG.getAnyExtOrTrunc(Op: Src, DL: SL, VT: IntVT);
7897 }
7898
7899 unsigned EltSize = EltVT.getSizeInBits();
7900 assert(isPowerOf2_32(EltSize));
7901
7902 SDValue ScaleFactor = DAG.getConstant(Val: Log2_32(Value: EltSize), DL: SL, VT: MVT::i32);
7903
7904 // Convert vector index to bit-index (* EltSize)
7905 SDValue ScaledIdx = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Idx, N2: ScaleFactor);
7906
7907 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec);
7908 SDValue Elt = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: IntVT, N1: BC, N2: ScaledIdx);
7909
7910 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7911 SDValue Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i16, Operand: Elt);
7912 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: ResultVT, Operand: Result);
7913 }
7914
7915 return DAG.getAnyExtOrTrunc(Op: Elt, DL: SL, VT: ResultVT);
7916}
7917
7918static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7919 assert(Elt % 2 == 0);
7920 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7921}
7922
7923static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
7924 assert(Elt % 2 == 0);
7925 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
7926 !(Mask[Elt + 1] & 1);
7927}
7928
7929SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7930 SelectionDAG &DAG) const {
7931 SDLoc SL(Op);
7932 EVT ResultVT = Op.getValueType();
7933 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val&: Op);
7934 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
7935 const int NewSrcNumElts = 2;
7936 MVT PackVT = MVT::getVectorVT(VT: EltVT, NumElements: NewSrcNumElts);
7937 int SrcNumElts = Op.getOperand(i: 0).getValueType().getVectorNumElements();
7938
7939   // Break up the shuffle into register-sized pieces.
7940 //
7941 // We're trying to form sub-shuffles that the register allocation pipeline
7942 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
7943 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
7944 // pair of copies into a consecutive register copy, so use the ordinary
7945 // extract_vector_elt lowering unless we can use the shuffle.
7946 //
7947   // TODO: This is a bit of a hack, and we should probably always use
7948   // extract_subvector for the largest possible subvector we can (or at least
7949   // use it for PackVT aligned pieces). However, we have worse support for
7950   // combines on them, and we don't directly treat extract_subvector /
7951   // insert_subvector as legal. The DAG scheduler also ends up doing a worse
7952   // job with the extract_subvectors.
7953 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
7954
7955 // vector_shuffle <0,1,6,7> lhs, rhs
7956 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7957 //
7958 // vector_shuffle <6,7,2,3> lhs, rhs
7959 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7960 //
7961 // vector_shuffle <6,7,0,1> lhs, rhs
7962 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7963
7964 // Avoid scalarizing when both halves are reading from consecutive elements.
7965
7966 // If we're treating 2 element shuffles as legal, also create odd-to-even
7967 // shuffles of neighboring pairs.
7968 //
7969 // vector_shuffle <3,2,7,6> lhs, rhs
7970 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
7971 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
7972
7973 SmallVector<SDValue, 16> Pieces;
7974 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7975 if (ShouldUseConsecutiveExtract &&
7976 elementPairIsContiguous(Mask: SVN->getMask(), Elt: I)) {
7977 const int Idx = SVN->getMaskElt(Idx: I);
7978 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7979 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7980 SDValue SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT,
7981 N1: SVN->getOperand(Num: VecIdx),
7982 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
7983 Pieces.push_back(Elt: SubVec);
7984 } else if (elementPairIsOddToEven(Mask: SVN->getMask(), Elt: I) &&
7985 isOperationLegal(Op: ISD::VECTOR_SHUFFLE, VT: PackVT)) {
7986 int Idx0 = SVN->getMaskElt(Idx: I);
7987 int Idx1 = SVN->getMaskElt(Idx: I + 1);
7988
7989 SDValue SrcOp0 = SVN->getOperand(Num: 0);
7990 SDValue SrcOp1 = SrcOp0;
7991 if (Idx0 >= SrcNumElts) {
7992 SrcOp0 = SVN->getOperand(Num: 1);
7993 Idx0 -= SrcNumElts;
7994 }
7995
7996 if (Idx1 >= SrcNumElts) {
7997 SrcOp1 = SVN->getOperand(Num: 1);
7998 Idx1 -= SrcNumElts;
7999 }
8000
8001 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8002 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8003
8004 // Extract nearest even aligned piece.
8005 SDValue SubVec0 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT, N1: SrcOp0,
8006 N2: DAG.getConstant(Val: AlignedIdx0, DL: SL, VT: MVT::i32));
8007 SDValue SubVec1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT, N1: SrcOp1,
8008 N2: DAG.getConstant(Val: AlignedIdx1, DL: SL, VT: MVT::i32));
8009
8010 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8011 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8012
8013 SDValue Result0 = SubVec0;
8014 SDValue Result1 = SubVec0;
8015
8016 if (SubVec0 != SubVec1) {
8017 NewMaskIdx1 += NewSrcNumElts;
8018 Result1 = SubVec1;
8019 } else {
8020 Result1 = DAG.getPOISON(VT: PackVT);
8021 }
8022
8023 SDValue Shuf = DAG.getVectorShuffle(VT: PackVT, dl: SL, N1: Result0, N2: Result1,
8024 Mask: {NewMaskIdx0, NewMaskIdx1});
8025 Pieces.push_back(Elt: Shuf);
8026 } else {
8027 const int Idx0 = SVN->getMaskElt(Idx: I);
8028 const int Idx1 = SVN->getMaskElt(Idx: I + 1);
8029 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8030 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8031 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8032 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8033
8034 SDValue Vec0 = SVN->getOperand(Num: VecIdx0);
8035 SDValue Elt0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec0,
8036 N2: DAG.getSignedConstant(Val: EltIdx0, DL: SL, VT: MVT::i32));
8037
8038 SDValue Vec1 = SVN->getOperand(Num: VecIdx1);
8039 SDValue Elt1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec1,
8040 N2: DAG.getSignedConstant(Val: EltIdx1, DL: SL, VT: MVT::i32));
8041 Pieces.push_back(Elt: DAG.getBuildVector(VT: PackVT, DL: SL, Ops: {Elt0, Elt1}));
8042 }
8043 }
8044
8045 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT: ResultVT, Ops: Pieces);
8046}
8047
8048SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8049 SelectionDAG &DAG) const {
8050 SDValue SVal = Op.getOperand(i: 0);
8051 EVT ResultVT = Op.getValueType();
8052 EVT SValVT = SVal.getValueType();
8053 SDValue UndefVal = DAG.getPOISON(VT: SValVT);
8054 SDLoc SL(Op);
8055
8056 SmallVector<SDValue, 8> VElts;
8057 VElts.push_back(Elt: SVal);
8058 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8059 VElts.push_back(Elt: UndefVal);
8060
8061 return DAG.getBuildVector(VT: ResultVT, DL: SL, Ops: VElts);
8062}
8063
8064SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8065 SelectionDAG &DAG) const {
8066 SDLoc SL(Op);
8067 EVT VT = Op.getValueType();
8068
8069 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8070 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8071
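// Pack the two 16-bit elements into an i32 as (lo | (hi << 16)) and bitcast
// the result to the packed vector type.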
8072 SDValue Lo = Op.getOperand(i: 0);
8073 SDValue Hi = Op.getOperand(i: 1);
8074
8075 // Avoid adding defined bits with the zero_extend.
8076 if (Hi.isUndef()) {
8077 Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Lo);
8078 SDValue ExtLo = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Lo);
8079 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ExtLo);
8080 }
8081
8082 Hi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Hi);
8083 Hi = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Hi);
8084
8085 SDValue ShlHi = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Hi,
8086 N2: DAG.getConstant(Val: 16, DL: SL, VT: MVT::i32));
8087 if (Lo.isUndef())
8088 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ShlHi);
8089
8090 Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Lo);
8091 Lo = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Lo);
8092
8093 SDValue Or =
8094 DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Lo, N2: ShlHi, Flags: SDNodeFlags::Disjoint);
8095 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Or);
8096 }
8097
8098 // Split into 2-element chunks.
8099 const unsigned NumParts = VT.getVectorNumElements() / 2;
8100 EVT PartVT = MVT::getVectorVT(VT: VT.getVectorElementType().getSimpleVT(), NumElements: 2);
8101 MVT PartIntVT = MVT::getIntegerVT(BitWidth: PartVT.getSizeInBits());
8102
8103 SmallVector<SDValue> Casts;
8104 for (unsigned P = 0; P < NumParts; ++P) {
8105 SDValue Vec = DAG.getBuildVector(
8106 VT: PartVT, DL: SL, Ops: {Op.getOperand(i: P * 2), Op.getOperand(i: P * 2 + 1)});
8107 Casts.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: PartIntVT, Operand: Vec));
8108 }
8109
8110 SDValue Blend =
8111 DAG.getBuildVector(VT: MVT::getVectorVT(VT: PartIntVT, NumElements: NumParts), DL: SL, Ops: Casts);
8112 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Blend);
8113}
8114
8115bool SITargetLowering::isOffsetFoldingLegal(
8116 const GlobalAddressSDNode *GA) const {
8117 // OSes that use ELF REL relocations (instead of RELA) can only store a
8118 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8119 // which can create arbitrary 64-bit addends. (This is only a problem for
8120 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8121 // the high 32 bits of the addend.)
8122 //
8123 // This should be kept in sync with how HasRelocationAddend is initialized in
8124 // the constructor of ELFAMDGPUAsmBackend.
8125 if (!Subtarget->isAmdHsaOS())
8126 return false;
8127
8128 // We can fold offsets for anything that doesn't require a GOT relocation.
8129 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8130 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8131 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8132 !shouldEmitGOTReloc(GV: GA->getGlobal());
8133}
8134
8135static SDValue
8136buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8137 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8138 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8139 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8140 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8141 // lowered to the following code sequence:
8142 //
8143 // For constant address space:
8144 // s_getpc_b64 s[0:1]
8145 // s_add_u32 s0, s0, $symbol
8146 // s_addc_u32 s1, s1, 0
8147 //
8148 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8149 // a fixup or relocation is emitted to replace $symbol with a literal
8150 // constant, which is a pc-relative offset from the encoding of the $symbol
8151 // operand to the global variable.
8152 //
8153 // For global address space:
8154 // s_getpc_b64 s[0:1]
8155 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8156 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8157 //
8158 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8159 // fixups or relocations are emitted to replace $symbol@*@lo and
8160 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8161 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8162 // operand to the global variable.
8163 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: Offset, TargetFlags: GAFlags);
8164 SDValue PtrHi;
8165 if (GAFlags == SIInstrInfo::MO_NONE)
8166 PtrHi = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32);
8167 else
8168 PtrHi = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: Offset, TargetFlags: GAFlags + 1);
8169 return DAG.getNode(Opcode: AMDGPUISD::PC_ADD_REL_OFFSET, DL, VT: PtrVT, N1: PtrLo, N2: PtrHi);
8170}
8171
8172SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8173 SDValue Op,
8174 SelectionDAG &DAG) const {
8175 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Val&: Op);
8176 SDLoc DL(GSD);
8177 EVT PtrVT = Op.getValueType();
8178
8179 const GlobalValue *GV = GSD->getGlobal();
8180 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8181 shouldUseLDSConstAddress(GV)) ||
8182 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
8183 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
8184 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8185 GV->hasExternalLinkage()) {
8186 Type *Ty = GV->getValueType();
8187       // HIP uses an unsized array `extern __shared__ T s[]` (or a similar
8188       // zero-sized type in other languages) to declare dynamic shared
8189       // memory whose size is not known at compile time. Such arrays are
8190       // allocated by the runtime and placed directly after the statically
8191       // allocated ones; they all share the same offset.
8192 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8193 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8194 // Adjust alignment for that dynamic shared memory array.
8195 Function &F = DAG.getMachineFunction().getFunction();
8196 MFI->setDynLDSAlign(F, GV: *cast<GlobalVariable>(Val: GV));
8197 MFI->setUsesDynamicLDS(true);
8198 return SDValue(
8199 DAG.getMachineNode(Opcode: AMDGPU::GET_GROUPSTATICSIZE, dl: DL, VT: PtrVT), 0);
8200 }
8201 }
8202 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
8203 }
8204
8205 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8206 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: GSD->getOffset(),
8207 TargetFlags: SIInstrInfo::MO_ABS32_LO);
8208 return DAG.getNode(Opcode: AMDGPUISD::LDS, DL, VT: MVT::i32, Operand: GA);
8209 }
8210
8211 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
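// On PAL and Mesa, globals are reached through absolute 32-bit lo/hi
// relocations, materialized below into a 64-bit pair of s_mov_b32 results.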
8212 SDValue AddrLo = DAG.getTargetGlobalAddress(
8213 GV, DL, VT: MVT::i32, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS32_LO);
8214 AddrLo = {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: AddrLo), 0};
8215
8216 SDValue AddrHi = DAG.getTargetGlobalAddress(
8217 GV, DL, VT: MVT::i32, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS32_HI);
8218 AddrHi = {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: AddrHi), 0};
8219
8220 return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL, VT: MVT::i64, N1: AddrLo, N2: AddrHi);
8221 }
8222
8223 if (shouldEmitFixup(GV))
8224 return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT);
8225
8226 if (shouldEmitPCReloc(GV))
8227 return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT,
8228 GAFlags: SIInstrInfo::MO_REL32);
8229
8230 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, Offset: 0, PtrVT,
8231 GAFlags: SIInstrInfo::MO_GOTPCREL32);
8232 PointerType *PtrTy =
8233 PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::CONSTANT_ADDRESS);
8234 const DataLayout &DataLayout = DAG.getDataLayout();
8235 Align Alignment = DataLayout.getABITypeAlign(Ty: PtrTy);
8236 MachinePointerInfo PtrInfo =
8237 MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction());
8238
8239 return DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: GOTAddr, PtrInfo, Alignment,
8240 MMOFlags: MachineMemOperand::MODereferenceable |
8241 MachineMemOperand::MOInvariant);
8242}
8243
8244SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
8245 const SDLoc &DL, SDValue V) const {
8246 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
8247 // the destination register.
8248 //
8249 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
8250 // so we will end up with redundant moves to m0.
8251 //
8252 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
8253
8254 // A Null SDValue creates a glue result.
8255 SDNode *M0 = DAG.getMachineNode(Opcode: AMDGPU::SI_INIT_M0, dl: DL, VT1: MVT::Other, VT2: MVT::Glue,
8256 Op1: V, Op2: Chain);
8257 return SDValue(M0, 0);
8258}
8259
8260SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
8261 MVT VT,
8262 unsigned Offset) const {
8263 SDLoc SL(Op);
8264 SDValue Param = lowerKernargMemParameter(
8265 DAG, VT: MVT::i32, MemVT: MVT::i32, SL, Chain: DAG.getEntryNode(), Offset, Alignment: Align(4), Signed: false);
8266 // The local size values will have the hi 16-bits as zero.
8267 return DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: MVT::i32, N1: Param,
8268 N2: DAG.getValueType(VT));
8269}
8270
8271static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8272 EVT VT) {
8273 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
8274 DAG.getMachineFunction().getFunction(),
8275 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
8276 return DAG.getPOISON(VT);
8277}
8278
8279static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8280 EVT VT) {
8281 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
8282 DAG.getMachineFunction().getFunction(),
8283 "intrinsic not supported on subtarget", DL.getDebugLoc()));
8284 return DAG.getPOISON(VT);
8285}
8286
8287static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
8288 ArrayRef<SDValue> Elts) {
8289 assert(!Elts.empty());
8290 MVT Type;
8291 unsigned NumElts = Elts.size();
8292
8293 if (NumElts <= 12) {
8294 Type = MVT::getVectorVT(VT: MVT::f32, NumElements: NumElts);
8295 } else {
8296 assert(Elts.size() <= 16);
8297 Type = MVT::v16f32;
8298 NumElts = 16;
8299 }
8300
8301 SmallVector<SDValue, 16> VecElts(NumElts);
8302 for (unsigned i = 0; i < Elts.size(); ++i) {
8303 SDValue Elt = Elts[i];
8304 if (Elt.getValueType() != MVT::f32)
8305 Elt = DAG.getBitcast(VT: MVT::f32, V: Elt);
8306 VecElts[i] = Elt;
8307 }
8308 for (unsigned i = Elts.size(); i < NumElts; ++i)
8309 VecElts[i] = DAG.getPOISON(VT: MVT::f32);
8310
8311 if (NumElts == 1)
8312 return VecElts[0];
8313 return DAG.getBuildVector(VT: Type, DL, Ops: VecElts);
8314}
8315
8316static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
8317 SDValue Src, int ExtraElts) {
8318 EVT SrcVT = Src.getValueType();
8319
8320 SmallVector<SDValue, 8> Elts;
8321
8322 if (SrcVT.isVector())
8323 DAG.ExtractVectorElements(Op: Src, Args&: Elts);
8324 else
8325 Elts.push_back(Elt: Src);
8326
8327 SDValue Undef = DAG.getPOISON(VT: SrcVT.getScalarType());
8328 while (ExtraElts--)
8329 Elts.push_back(Elt: Undef);
8330
8331 return DAG.getBuildVector(VT: CastVT, DL, Ops: Elts);
8332}
8333
8334 // Re-construct the required return value for an image load intrinsic.
8335 // This is more complicated due to the optional use of TexFailCtrl, which means
8336 // the required return type is an aggregate.
8337static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
8338 ArrayRef<EVT> ResultTypes, bool IsTexFail,
8339 bool Unpacked, bool IsD16, int DMaskPop,
8340 int NumVDataDwords, bool IsAtomicPacked16Bit,
8341 const SDLoc &DL) {
8342 // Determine the required return type. This is the same regardless of the
8343 // IsTexFail flag.
8344 EVT ReqRetVT = ResultTypes[0];
8345 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
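// With packed D16 (or packed 16-bit atomics), two 16-bit elements share a
// dword, so round the element count up to a dword count.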
8346 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
8347 ? (ReqRetNumElts + 1) / 2
8348 : ReqRetNumElts;
8349
8350 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
8351
8352 MVT DataDwordVT =
8353 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(VT: MVT::i32, NumElements: NumDataDwords);
8354
8355 MVT MaskPopVT =
8356 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(VT: MVT::i32, NumElements: MaskPopDwords);
8357
8358 SDValue Data(Result, 0);
8359 SDValue TexFail;
8360
8361 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
8362 SDValue ZeroIdx = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
8363 if (MaskPopVT.isVector()) {
8364 Data = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MaskPopVT,
8365 N1: SDValue(Result, 0), N2: ZeroIdx);
8366 } else {
8367 Data = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MaskPopVT,
8368 N1: SDValue(Result, 0), N2: ZeroIdx);
8369 }
8370 }
8371
8372 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
8373 Data = padEltsToUndef(DAG, DL, CastVT: DataDwordVT, Src: Data,
8374 ExtraElts: NumDataDwords - MaskPopDwords);
8375
8376 if (IsD16)
8377 Data = adjustLoadValueTypeImpl(Result: Data, LoadVT: ReqRetVT, DL, DAG, Unpacked);
8378
8379 EVT LegalReqRetVT = ReqRetVT;
8380 if (!ReqRetVT.isVector()) {
8381 if (!Data.getValueType().isInteger())
8382 Data = DAG.getNode(Opcode: ISD::BITCAST, DL,
8383 VT: Data.getValueType().changeTypeToInteger(), Operand: Data);
8384 Data = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ReqRetVT.changeTypeToInteger(), Operand: Data);
8385 } else {
8386 // We need to widen the return vector to a legal type
8387 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
8388 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
8389 LegalReqRetVT =
8390 EVT::getVectorVT(Context&: *DAG.getContext(), VT: ReqRetVT.getVectorElementType(),
8391 NumElements: ReqRetVT.getVectorNumElements() + 1);
8392 }
8393 }
8394 Data = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LegalReqRetVT, Operand: Data);
8395
8396 if (IsTexFail) {
8397 TexFail =
8398 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: SDValue(Result, 0),
8399 N2: DAG.getConstant(Val: MaskPopDwords, DL, VT: MVT::i32));
8400
8401 return DAG.getMergeValues(Ops: {Data, TexFail, SDValue(Result, 1)}, dl: DL);
8402 }
8403
8404 if (Result->getNumValues() == 1)
8405 return Data;
8406
8407 return DAG.getMergeValues(Ops: {Data, SDValue(Result, 1)}, dl: DL);
8408}
8409
8410static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
8411 SDValue *LWE, bool &IsTexFail) {
8412 auto *TexFailCtrlConst = cast<ConstantSDNode>(Val: TexFailCtrl.getNode());
8413
8414 uint64_t Value = TexFailCtrlConst->getZExtValue();
8415 if (Value) {
8416 IsTexFail = true;
8417 }
8418
8419 SDLoc DL(TexFailCtrlConst);
8420 *TFE = DAG.getTargetConstant(Val: (Value & 0x1) ? 1 : 0, DL, VT: MVT::i32);
8421 Value &= ~(uint64_t)0x1;
8422 *LWE = DAG.getTargetConstant(Val: (Value & 0x2) ? 1 : 0, DL, VT: MVT::i32);
8423 Value &= ~(uint64_t)0x2;
8424
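// Any bits left after stripping TFE and LWE are invalid; the caller gives up
// on custom lowering in that case.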
8425 return Value == 0;
8426}
8427
8428static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
8429 MVT PackVectorVT,
8430 SmallVectorImpl<SDValue> &PackedAddrs,
8431 unsigned DimIdx, unsigned EndIdx,
8432 unsigned NumGradients) {
8433 SDLoc DL(Op);
8434 for (unsigned I = DimIdx; I < EndIdx; I++) {
8435 SDValue Addr = Op.getOperand(i: I);
8436
8437 // Gradients are packed with undef for each coordinate.
8438 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
8439 // 1D: undef,dx/dh; undef,dx/dv
8440 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
8441 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
8442 if (((I + 1) >= EndIdx) ||
8443 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
8444 I == DimIdx + NumGradients - 1))) {
8445 if (Addr.getValueType() != MVT::i16)
8446 Addr = DAG.getBitcast(VT: MVT::i16, V: Addr);
8447 Addr = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Addr);
8448 } else {
8449 Addr = DAG.getBuildVector(VT: PackVectorVT, DL, Ops: {Addr, Op.getOperand(i: I + 1)});
8450 I++;
8451 }
8452 Addr = DAG.getBitcast(VT: MVT::f32, V: Addr);
8453 PackedAddrs.push_back(Elt: Addr);
8454 }
8455}
8456
8457SDValue SITargetLowering::lowerImage(SDValue Op,
8458 const AMDGPU::ImageDimIntrinsicInfo *Intr,
8459 SelectionDAG &DAG, bool WithChain) const {
8460 SDLoc DL(Op);
8461 MachineFunction &MF = DAG.getMachineFunction();
8462 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
8463 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
8464 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
8465 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(DimEnum: Intr->Dim);
8466 unsigned IntrOpcode = Intr->BaseOpcode;
8467 bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI: *Subtarget);
8468 bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: *Subtarget);
8469 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
8470
8471 SmallVector<EVT, 3> ResultTypes(Op->values());
8472 SmallVector<EVT, 3> OrigResultTypes(Op->values());
8473 bool IsD16 = false;
8474 bool IsG16 = false;
8475 bool IsA16 = false;
8476 SDValue VData;
8477 int NumVDataDwords = 0;
8478 bool AdjustRetType = false;
8479 bool IsAtomicPacked16Bit = false;
8480
8481 // Offset of intrinsic arguments
8482 const unsigned ArgOffset = WithChain ? 2 : 1;
8483
8484 unsigned DMask;
8485 unsigned DMaskLanes = 0;
8486
8487 if (BaseOpcode->Atomic) {
8488 VData = Op.getOperand(i: 2);
8489
8490 IsAtomicPacked16Bit =
8491 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
8492 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
8493
8494 bool Is64Bit = VData.getValueSizeInBits() == 64;
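// AtomicX2 operations (e.g. cmpswap) carry two data values packed into
// adjacent dwords, so the dmask covers two 32-bit lanes (four for 64-bit).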
8495 if (BaseOpcode->AtomicX2) {
8496 SDValue VData2 = Op.getOperand(i: 3);
8497 VData = DAG.getBuildVector(VT: Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
8498 Ops: {VData, VData2});
8499 if (Is64Bit)
8500 VData = DAG.getBitcast(VT: MVT::v4i32, V: VData);
8501
8502 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
8503 DMask = Is64Bit ? 0xf : 0x3;
8504 NumVDataDwords = Is64Bit ? 4 : 2;
8505 } else {
8506 DMask = Is64Bit ? 0x3 : 0x1;
8507 NumVDataDwords = Is64Bit ? 2 : 1;
8508 }
8509 } else {
8510 DMask = Op->getConstantOperandVal(Num: ArgOffset + Intr->DMaskIndex);
8511 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(Value: DMask);
8512
8513 if (BaseOpcode->Store) {
8514 VData = Op.getOperand(i: 2);
8515
8516 MVT StoreVT = VData.getSimpleValueType();
8517 if (StoreVT.getScalarType() == MVT::f16) {
8518 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8519 return Op; // D16 is unsupported for this instruction
8520
8521 IsD16 = true;
8522 VData = handleD16VData(VData, DAG, ImageStore: true);
8523 }
8524
8525 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
8526 } else if (!BaseOpcode->NoReturn) {
8527 // Work out the number of dwords based on the dmask popcount, the underlying
8528 // type, and whether packing is supported.
8529 MVT LoadVT = ResultTypes[0].getSimpleVT();
8530 if (LoadVT.getScalarType() == MVT::f16) {
8531 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
8532 return Op; // D16 is unsupported for this instruction
8533
8534 IsD16 = true;
8535 }
8536
8537 // Confirm that the return type is large enough for the dmask specified
8538 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
8539 (!LoadVT.isVector() && DMaskLanes > 1))
8540 return Op;
8541
8542 // The SQ block of gfx8 and gfx9 does not estimate register use correctly
8543 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
8544 // instructions.
8545 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
8546 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
8547 NumVDataDwords = (DMaskLanes + 1) / 2;
8548 else
8549 NumVDataDwords = DMaskLanes;
8550
8551 AdjustRetType = true;
8552 }
8553 }
8554
8555 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
8556 SmallVector<SDValue, 4> VAddrs;
8557
8558 // Check for 16-bit addresses or derivatives and pack them if so.
8559 MVT VAddrVT =
8560 Op.getOperand(i: ArgOffset + Intr->GradientStart).getSimpleValueType();
8561 MVT VAddrScalarVT = VAddrVT.getScalarType();
8562 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8563 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8564
8565 VAddrVT = Op.getOperand(i: ArgOffset + Intr->CoordStart).getSimpleValueType();
8566 VAddrScalarVT = VAddrVT.getScalarType();
8567 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8568 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8569
8570 // Push back extra arguments.
8571 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8572 if (IsA16 && (Op.getOperand(i: ArgOffset + I).getValueType() == MVT::f16)) {
8573 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8574 // Special handling of the bias when A16 is on. The bias is of type half
8575 // but occupies a full 32-bit dword.
8576 SDValue Bias = DAG.getBuildVector(
8577 VT: MVT::v2f16, DL,
8578 Ops: {Op.getOperand(i: ArgOffset + I), DAG.getPOISON(VT: MVT::f16)});
8579 VAddrs.push_back(Elt: Bias);
8580 } else {
8581 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8582 "Bias needs to be converted to 16 bit in A16 mode");
8583 VAddrs.push_back(Elt: Op.getOperand(i: ArgOffset + I));
8584 }
8585 }
8586
8587 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8588 // 16-bit gradients are supported, but they are tied to the A16 control,
8589 // so both gradients and addresses must be 16 bit.
8590 LLVM_DEBUG(
8591 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8592 "require 16 bit args for both gradients and addresses");
8593 return Op;
8594 }
8595
8596 if (IsA16) {
8597 if (!ST->hasA16()) {
8598 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8599 "support 16 bit addresses\n");
8600 return Op;
8601 }
8602 }
8603
8604 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
8605 // is set then we have to compress/pack the operands (either addresses,
8606 // gradients, or both).
8607 // In the case where A16 and gradients are tied (no G16 support), we have
8608 // already verified that both IsA16 and IsG16 are true.
8609 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8610 // Activate g16
8611 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8612 AMDGPU::getMIMGG16MappingInfo(G: Intr->BaseOpcode);
8613 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8614 }
8615
8616 // Add gradients (packed or unpacked)
8617 if (IsG16) {
8618 // Pack the gradients
8619 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8620 packImage16bitOpsToDwords(DAG, Op, PackVectorVT: GradPackVectorVT, PackedAddrs&: VAddrs,
8621 DimIdx: ArgOffset + Intr->GradientStart,
8622 EndIdx: ArgOffset + Intr->CoordStart, NumGradients: Intr->NumGradients);
8623 } else {
8624 for (unsigned I = ArgOffset + Intr->GradientStart;
8625 I < ArgOffset + Intr->CoordStart; I++)
8626 VAddrs.push_back(Elt: Op.getOperand(i: I));
8627 }
8628
8629 // Add addresses (packed or unpacked)
8630 if (IsA16) {
8631 packImage16bitOpsToDwords(DAG, Op, PackVectorVT: AddrPackVectorVT, PackedAddrs&: VAddrs,
8632 DimIdx: ArgOffset + Intr->CoordStart, EndIdx: VAddrEnd,
8633 NumGradients: 0 /* No gradients */);
8634 } else {
8635 // Add uncompressed address
8636 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8637 VAddrs.push_back(Elt: Op.getOperand(i: I));
8638 }
8639
8640 // If the register allocator cannot place the address registers contiguously
8641 // without introducing moves, then using the non-sequential address encoding
8642 // is always preferable, since it saves VALU instructions and is usually a
8643 // wash in terms of code size or even better.
8644 //
8645 // However, we currently have no way of hinting to the register allocator that
8646 // MIMG addresses should be placed contiguously when it is possible to do so,
8647 // so force non-NSA for the common 2-address case as a heuristic.
8648 //
8649 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8650 // allocation when possible.
8651 //
8652 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8653 // set of the remaining addresses.
8654 const unsigned NSAMaxSize = ST->getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
8655 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8656 const bool UseNSA = ST->hasNSAEncoding() &&
8657 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8658 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8659 const bool UsePartialNSA =
8660 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8661
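// With partial NSA, the first NSAMaxSize - 1 addresses are passed
// individually and the remaining ones are packed into a single contiguous
// register tuple built with getBuildDwordsVector.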
8662 SDValue VAddr;
8663 if (UsePartialNSA) {
8664 VAddr = getBuildDwordsVector(DAG, DL,
8665 Elts: ArrayRef(VAddrs).drop_front(N: NSAMaxSize - 1));
8666 } else if (!UseNSA) {
8667 VAddr = getBuildDwordsVector(DAG, DL, Elts: VAddrs);
8668 }
8669
8670 SDValue True = DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1);
8671 SDValue False = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1);
8672 SDValue Unorm;
8673 if (!BaseOpcode->Sampler) {
8674 Unorm = True;
8675 } else {
8676 uint64_t UnormConst =
8677 Op.getConstantOperandVal(i: ArgOffset + Intr->UnormIndex);
8678
8679 Unorm = UnormConst ? True : False;
8680 }
8681
8682 SDValue TFE;
8683 SDValue LWE;
8684 SDValue TexFail = Op.getOperand(i: ArgOffset + Intr->TexFailCtrlIndex);
8685 bool IsTexFail = false;
8686 if (!parseTexFail(TexFailCtrl: TexFail, DAG, TFE: &TFE, LWE: &LWE, IsTexFail))
8687 return Op;
8688
8689 if (IsTexFail) {
8690 if (!DMaskLanes) {
8691 // Expecting to get an error flag since TFC is on and dmask is 0.
8692 // Force dmask to be at least 1, otherwise the instruction will fail.
8693 DMask = 0x1;
8694 DMaskLanes = 1;
8695 NumVDataDwords = 1;
8696 }
8697 NumVDataDwords += 1;
8698 AdjustRetType = true;
8699 }
8700
8701 // Check whether something earlier tagged the return type as needing
8702 // adjustment; this happens if the instruction is a load or sets TexFailCtrl flags.
8703 if (AdjustRetType) {
8704 // NumVDataDwords reflects the true number of dwords required in the return
8705 // type
8706 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8707 // This is a no-op load, so it can be eliminated.
8708 SDValue Undef = DAG.getPOISON(VT: Op.getValueType());
8709 if (isa<MemSDNode>(Val: Op))
8710 return DAG.getMergeValues(Ops: {Undef, Op.getOperand(i: 0)}, dl: DL);
8711 return Undef;
8712 }
8713
8714 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(Context&: *DAG.getContext(),
8715 VT: MVT::i32, NumElements: NumVDataDwords)
8716 : MVT::i32;
8717
8718 ResultTypes[0] = NewVT;
8719 if (ResultTypes.size() == 3) {
8720 // The original result was an aggregate type used for the TexFailCtrl
8721 // result. The actual instruction returns a vector type, which has now
8722 // been created. Remove the aggregate result.
8723 ResultTypes.erase(CI: &ResultTypes[1]);
8724 }
8725 }
8726
8727 unsigned CPol = Op.getConstantOperandVal(i: ArgOffset + Intr->CachePolicyIndex);
8728 if (BaseOpcode->Atomic)
8729 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8730 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8731 AMDGPU::CPol::VOLATILE))
8732 return Op;
8733
8734 SmallVector<SDValue, 26> Ops;
8735 if (BaseOpcode->Store || BaseOpcode->Atomic)
8736 Ops.push_back(Elt: VData); // vdata
8737 if (UsePartialNSA) {
8738 append_range(C&: Ops, R: ArrayRef(VAddrs).take_front(N: NSAMaxSize - 1));
8739 Ops.push_back(Elt: VAddr);
8740 } else if (UseNSA)
8741 append_range(C&: Ops, R&: VAddrs);
8742 else
8743 Ops.push_back(Elt: VAddr);
8744 SDValue Rsrc = Op.getOperand(i: ArgOffset + Intr->RsrcIndex);
8745 EVT RsrcVT = Rsrc.getValueType();
8746 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
8747 return Op;
8748 Ops.push_back(Elt: Rsrc);
8749 if (BaseOpcode->Sampler) {
8750 SDValue Samp = Op.getOperand(i: ArgOffset + Intr->SampIndex);
8751 if (Samp.getValueType() != MVT::v4i32)
8752 return Op;
8753 Ops.push_back(Elt: Samp);
8754 }
8755 Ops.push_back(Elt: DAG.getTargetConstant(Val: DMask, DL, VT: MVT::i32));
8756 if (IsGFX10Plus)
8757 Ops.push_back(Elt: DAG.getTargetConstant(Val: DimInfo->Encoding, DL, VT: MVT::i32));
8758 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8759 Ops.push_back(Elt: Unorm);
8760 Ops.push_back(Elt: DAG.getTargetConstant(Val: CPol, DL, VT: MVT::i32));
8761 Ops.push_back(Elt: IsA16 && // r128, a16 for gfx9
8762 ST->hasFeature(Feature: AMDGPU::FeatureR128A16)
8763 ? True
8764 : False);
8765 if (IsGFX10Plus)
8766 Ops.push_back(Elt: IsA16 ? True : False);
8767
8768 if (!Subtarget->hasGFX90AInsts())
8769 Ops.push_back(Elt: TFE); // tfe
8770 else if (TFE->getAsZExtVal()) {
8771 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
8772 DAG.getMachineFunction().getFunction(),
8773 "TFE is not supported on this GPU", DL.getDebugLoc()));
8774 }
8775
8776 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8777 Ops.push_back(Elt: LWE); // lwe
8778 if (!IsGFX10Plus)
8779 Ops.push_back(Elt: DimInfo->DA ? True : False);
8780 if (BaseOpcode->HasD16)
8781 Ops.push_back(Elt: IsD16 ? True : False);
8782 if (isa<MemSDNode>(Val: Op))
8783 Ops.push_back(Elt: Op.getOperand(i: 0)); // chain
8784
8785 int NumVAddrDwords =
8786 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8787 int Opcode = -1;
8788
8789 if (IsGFX12Plus) {
8790 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx12,
8791 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
8792 } else if (IsGFX11Plus) {
8793 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
8794 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx11NSA
8795 : AMDGPU::MIMGEncGfx11Default,
8796 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
8797 } else if (IsGFX10Plus) {
8798 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
8799 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx10NSA
8800 : AMDGPU::MIMGEncGfx10Default,
8801 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
8802 } else {
8803 if (Subtarget->hasGFX90AInsts()) {
8804 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx90a,
8805 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
8806 if (Opcode == -1) {
8807 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
8808 DAG.getMachineFunction().getFunction(),
8809 "requested image instruction is not supported on this GPU",
8810 DL.getDebugLoc()));
8811
8812 unsigned Idx = 0;
8813 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
8814 for (EVT VT : OrigResultTypes) {
8815 if (VT == MVT::Other)
8816 RetValues[Idx++] = Op.getOperand(i: 0); // Chain
8817 else
8818 RetValues[Idx++] = DAG.getPOISON(VT);
8819 }
8820
8821 return DAG.getMergeValues(Ops: RetValues, dl: DL);
8822 }
8823 }
8824 if (Opcode == -1 &&
8825 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8826 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx8,
8827 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
8828 if (Opcode == -1)
8829 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx6,
8830 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
8831 }
8832 if (Opcode == -1)
8833 return Op;
8834
8835 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, dl: DL, ResultTys: ResultTypes, Ops);
8836 if (auto *MemOp = dyn_cast<MemSDNode>(Val&: Op)) {
8837 MachineMemOperand *MemRef = MemOp->getMemOperand();
8838 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
8839 }
8840
8841 if (BaseOpcode->AtomicX2) {
8842 SmallVector<SDValue, 1> Elt;
8843 DAG.ExtractVectorElements(Op: SDValue(NewNode, 0), Args&: Elt, Start: 0, Count: 1);
8844 return DAG.getMergeValues(Ops: {Elt[0], SDValue(NewNode, 1)}, dl: DL);
8845 }
8846 if (BaseOpcode->NoReturn)
8847 return SDValue(NewNode, 0);
8848 return constructRetValue(DAG, Result: NewNode, ResultTypes: OrigResultTypes, IsTexFail,
8849 Unpacked: Subtarget->hasUnpackedD16VMem(), IsD16, DMaskPop: DMaskLanes,
8850 NumVDataDwords, IsAtomicPacked16Bit, DL);
8851}
8852
8853SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8854 SDValue Offset, SDValue CachePolicy,
8855 SelectionDAG &DAG) const {
8856 MachineFunction &MF = DAG.getMachineFunction();
8857
8858 const DataLayout &DataLayout = DAG.getDataLayout();
8859 Align Alignment =
8860 DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext()));
8861
8862 MachineMemOperand *MMO = MF.getMachineMemOperand(
8863 PtrInfo: MachinePointerInfo(),
8864 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
8865 MachineMemOperand::MOInvariant,
8866 Size: VT.getStoreSize(), BaseAlignment: Alignment);
8867
8868 if (!Offset->isDivergent()) {
8869 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8870
8871 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8872 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8873 // loads. Later, the DAG combiner tries to combine s_buffer_load_u16 with
8874 // sext and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8875 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8876 SDValue BufferLoad =
8877 DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD_USHORT, dl: DL,
8878 VTList: DAG.getVTList(VT: MVT::i32), Ops, MemVT: VT, MMO);
8879 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
8880 }
8881
8882 // Widen vec3 load to vec4.
8883 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8884 !Subtarget->hasScalarDwordx3Loads()) {
8885 EVT WidenedVT =
8886 EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: 4);
8887 auto WidenedOp = DAG.getMemIntrinsicNode(
8888 Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL, VTList: DAG.getVTList(VT: WidenedVT), Ops, MemVT: WidenedVT,
8889 MMO: MF.getMachineMemOperand(MMO, Offset: 0, Size: WidenedVT.getStoreSize()));
8890 auto Subvector = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: WidenedOp,
8891 N2: DAG.getVectorIdxConstant(Val: 0, DL));
8892 return Subvector;
8893 }
8894
8895 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL,
8896 VTList: DAG.getVTList(VT), Ops, MemVT: VT, MMO);
8897 }
8898
8899 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8900 // assume that the buffer is unswizzled.
8901 SDValue Ops[] = {
8902 DAG.getEntryNode(), // Chain
8903 Rsrc, // rsrc
8904 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
8905 {}, // voffset
8906 {}, // soffset
8907 {}, // offset
8908 CachePolicy, // cachepolicy
8909 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
8910 };
8911 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8912 setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], Alignment: Align(4));
8913 return handleByteShortBufferLoads(DAG, LoadVT: VT, DL, Ops, MMO);
8914 }
8915
8916 SmallVector<SDValue, 4> Loads;
8917 unsigned NumLoads = 1;
8918 MVT LoadVT = VT.getSimpleVT();
8919 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8920 assert((LoadVT.getScalarType() == MVT::i32 ||
8921 LoadVT.getScalarType() == MVT::f32));
8922
8923 if (NumElts == 8 || NumElts == 16) {
8924 NumLoads = NumElts / 4;
8925 LoadVT = MVT::getVectorVT(VT: LoadVT.getScalarType(), NumElements: 4);
8926 }
8927
8928 SDVTList VTList = DAG.getVTList(VTs: {LoadVT, MVT::Other});
8929
8930 // Use the alignment to ensure that the required offsets will fit into the
8931 // immediate offsets.
8932 setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3],
8933 Alignment: NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8934
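// Wide results are split into 4-dword loads issued 16 bytes apart and
// concatenated back together below.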
8935 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8936 for (unsigned i = 0; i < NumLoads; ++i) {
8937 Ops[5] = DAG.getTargetConstant(Val: InstOffset + 16 * i, DL, VT: MVT::i32);
8938 Loads.push_back(Elt: getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8939 MemVT: LoadVT, MMO, DAG));
8940 }
8941
8942 if (NumElts == 8 || NumElts == 16)
8943 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, Ops: Loads);
8944
8945 return Loads[0];
8946}
8947
8948SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8949 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8950 if (!Subtarget->hasArchitectedSGPRs())
8951 return {};
8952 SDLoc SL(Op);
8953 MVT VT = MVT::i32;
8954 SDValue TTMP8 = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: AMDGPU::TTMP8, VT);
8955 return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT, N1: TTMP8,
8956 N2: DAG.getConstant(Val: 25, DL: SL, VT), N3: DAG.getConstant(Val: 5, DL: SL, VT));
8957}
8958
8959SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8960 unsigned Dim,
8961 const ArgDescriptor &Arg) const {
8962 SDLoc SL(Op);
8963 MachineFunction &MF = DAG.getMachineFunction();
8964 unsigned MaxID = Subtarget->getMaxWorkitemID(Kernel: MF.getFunction(), Dimension: Dim);
8965 if (MaxID == 0)
8966 return DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
8967
8968 // It's undefined behavior if a function marked with the amdgpu-no-*
8969 // attributes uses the corresponding intrinsic.
8970 if (!Arg)
8971 return DAG.getPOISON(VT: Op->getValueType(ResNo: 0));
8972
8973 SDValue Val = loadInputValue(DAG, RC: &AMDGPU::VGPR_32RegClass, VT: MVT::i32,
8974 SL: SDLoc(DAG.getEntryNode()), Arg);
8975
8976 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8977 // masking operations anyway.
8978 //
8979 // TODO: We could assert the top bit is 0 for the source copy.
8980 if (Arg.isMasked())
8981 return Val;
8982
8983 // Preserve the known bits after expansion to a copy.
8984 EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: llvm::bit_width(Value: MaxID));
8985 return DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: MVT::i32, N1: Val,
8986 N2: DAG.getValueType(SmallVT));
8987}
8988
8989SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8990 SelectionDAG &DAG) const {
8991 MachineFunction &MF = DAG.getMachineFunction();
8992 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
8993
8994 EVT VT = Op.getValueType();
8995 SDLoc DL(Op);
8996 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
8997
8998 // TODO: Should this propagate fast-math-flags?
8999
9000 switch (IntrinsicID) {
9001 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9002 if (getSubtarget()->isAmdHsaOrMesa(F: MF.getFunction()))
9003 return emitNonHSAIntrinsicError(DAG, DL, VT);
9004 return getPreloadedValue(DAG, MFI: *MFI, VT,
9005 PVID: AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
9006 }
9007 case Intrinsic::amdgcn_dispatch_ptr:
9008 case Intrinsic::amdgcn_queue_ptr: {
9009 if (!Subtarget->isAmdHsaOrMesa(F: MF.getFunction())) {
9010 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9011 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9012 DL.getDebugLoc()));
9013 return DAG.getPOISON(VT);
9014 }
9015
9016 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9017 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
9018 : AMDGPUFunctionArgInfo::QUEUE_PTR;
9019 return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: RegID);
9020 }
9021 case Intrinsic::amdgcn_implicitarg_ptr: {
9022 if (MFI->isEntryFunction())
9023 return getImplicitArgPtr(DAG, SL: DL);
9024 return getPreloadedValue(DAG, MFI: *MFI, VT,
9025 PVID: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
9026 }
9027 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9028 if (!AMDGPU::isKernel(CC: MF.getFunction().getCallingConv())) {
9029 // This only makes sense to call in a kernel, so just lower to null.
9030 return DAG.getConstant(Val: 0, DL, VT);
9031 }
9032
9033 return getPreloadedValue(DAG, MFI: *MFI, VT,
9034 PVID: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
9035 }
9036 case Intrinsic::amdgcn_dispatch_id: {
9037 return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: AMDGPUFunctionArgInfo::DISPATCH_ID);
9038 }
9039 case Intrinsic::amdgcn_rcp:
9040 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT, Operand: Op.getOperand(i: 1));
9041 case Intrinsic::amdgcn_rsq:
9042 return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: 1));
9043 case Intrinsic::amdgcn_rsq_legacy:
9044 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9045 return emitRemovedIntrinsicError(DAG, DL, VT);
9046 return SDValue();
9047 case Intrinsic::amdgcn_rcp_legacy:
9048 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9049 return emitRemovedIntrinsicError(DAG, DL, VT);
9050 return DAG.getNode(Opcode: AMDGPUISD::RCP_LEGACY, DL, VT, Operand: Op.getOperand(i: 1));
9051 case Intrinsic::amdgcn_rsq_clamp: {
9052 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9053 return DAG.getNode(Opcode: AMDGPUISD::RSQ_CLAMP, DL, VT, Operand: Op.getOperand(i: 1));
9054
9055 Type *Type = VT.getTypeForEVT(Context&: *DAG.getContext());
9056 APFloat Max = APFloat::getLargest(Sem: Type->getFltSemantics());
9057 APFloat Min = APFloat::getLargest(Sem: Type->getFltSemantics(), Negative: true);
9058
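// rsq_clamp is expanded as rsq clamped to the largest finite values of the
// type: min(rsq(x), +max) followed by max(..., -max).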
9059 SDValue Rsq = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: 1));
9060 SDValue Tmp =
9061 DAG.getNode(Opcode: ISD::FMINNUM, DL, VT, N1: Rsq, N2: DAG.getConstantFP(Val: Max, DL, VT));
9062 return DAG.getNode(Opcode: ISD::FMAXNUM, DL, VT, N1: Tmp,
9063 N2: DAG.getConstantFP(Val: Min, DL, VT));
9064 }
9065 case Intrinsic::r600_read_ngroups_x:
9066 if (Subtarget->isAmdHsaOS())
9067 return emitNonHSAIntrinsicError(DAG, DL, VT);
9068
9069 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
9070 Offset: SI::KernelInputOffsets::NGROUPS_X, Alignment: Align(4),
9071 Signed: false);
9072 case Intrinsic::r600_read_ngroups_y:
9073 if (Subtarget->isAmdHsaOS())
9074 return emitNonHSAIntrinsicError(DAG, DL, VT);
9075
9076 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
9077 Offset: SI::KernelInputOffsets::NGROUPS_Y, Alignment: Align(4),
9078 Signed: false);
9079 case Intrinsic::r600_read_ngroups_z:
9080 if (Subtarget->isAmdHsaOS())
9081 return emitNonHSAIntrinsicError(DAG, DL, VT);
9082
9083 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
9084 Offset: SI::KernelInputOffsets::NGROUPS_Z, Alignment: Align(4),
9085 Signed: false);
9086 case Intrinsic::r600_read_local_size_x:
9087 if (Subtarget->isAmdHsaOS())
9088 return emitNonHSAIntrinsicError(DAG, DL, VT);
9089
9090 return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
9091 Offset: SI::KernelInputOffsets::LOCAL_SIZE_X);
9092 case Intrinsic::r600_read_local_size_y:
9093 if (Subtarget->isAmdHsaOS())
9094 return emitNonHSAIntrinsicError(DAG, DL, VT);
9095
9096 return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
9097 Offset: SI::KernelInputOffsets::LOCAL_SIZE_Y);
9098 case Intrinsic::r600_read_local_size_z:
9099 if (Subtarget->isAmdHsaOS())
9100 return emitNonHSAIntrinsicError(DAG, DL, VT);
9101
9102 return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
9103 Offset: SI::KernelInputOffsets::LOCAL_SIZE_Z);
9104 case Intrinsic::amdgcn_workgroup_id_x:
9105 return getPreloadedValue(DAG, MFI: *MFI, VT,
9106 PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
9107 case Intrinsic::amdgcn_workgroup_id_y:
9108 return getPreloadedValue(DAG, MFI: *MFI, VT,
9109 PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
9110 case Intrinsic::amdgcn_workgroup_id_z:
9111 return getPreloadedValue(DAG, MFI: *MFI, VT,
9112 PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
9113 case Intrinsic::amdgcn_wave_id:
9114 return lowerWaveID(DAG, Op);
9115 case Intrinsic::amdgcn_lds_kernel_id: {
9116 if (MFI->isEntryFunction())
9117 return getLDSKernelId(DAG, SL: DL);
9118 return getPreloadedValue(DAG, MFI: *MFI, VT,
9119 PVID: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
9120 }
9121 case Intrinsic::amdgcn_workitem_id_x:
9122 return lowerWorkitemID(DAG, Op, Dim: 0, Arg: MFI->getArgInfo().WorkItemIDX);
9123 case Intrinsic::amdgcn_workitem_id_y:
9124 return lowerWorkitemID(DAG, Op, Dim: 1, Arg: MFI->getArgInfo().WorkItemIDY);
9125 case Intrinsic::amdgcn_workitem_id_z:
9126 return lowerWorkitemID(DAG, Op, Dim: 2, Arg: MFI->getArgInfo().WorkItemIDZ);
9127 case Intrinsic::amdgcn_wavefrontsize:
9128 return DAG.getConstant(Val: MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
9129 DL: SDLoc(Op), VT: MVT::i32);
9130 case Intrinsic::amdgcn_s_buffer_load: {
9131 unsigned CPol = Op.getConstantOperandVal(i: 3);
9132 // s_buffer_load, because of how it's optimized, can't be volatile,
9133 // so reject ones with the volatile bit set.
9134 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
9135 ? AMDGPU::CPol::ALL
9136 : AMDGPU::CPol::ALL_pregfx12))
9137 return Op;
9138 return lowerSBuffer(VT, DL, Rsrc: Op.getOperand(i: 1), Offset: Op.getOperand(i: 2),
9139 CachePolicy: Op.getOperand(i: 3), DAG);
9140 }
9141 case Intrinsic::amdgcn_fdiv_fast:
9142 return lowerFDIV_FAST(Op, DAG);
9143 case Intrinsic::amdgcn_sin:
9144 return DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL, VT, Operand: Op.getOperand(i: 1));
9145
9146 case Intrinsic::amdgcn_cos:
9147 return DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL, VT, Operand: Op.getOperand(i: 1));
9148
9149 case Intrinsic::amdgcn_mul_u24:
9150 return DAG.getNode(Opcode: AMDGPUISD::MUL_U24, DL, VT, N1: Op.getOperand(i: 1),
9151 N2: Op.getOperand(i: 2));
9152 case Intrinsic::amdgcn_mul_i24:
9153 return DAG.getNode(Opcode: AMDGPUISD::MUL_I24, DL, VT, N1: Op.getOperand(i: 1),
9154 N2: Op.getOperand(i: 2));
9155
9156 case Intrinsic::amdgcn_log_clamp: {
9157 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9158 return SDValue();
9159
9160 return emitRemovedIntrinsicError(DAG, DL, VT);
9161 }
9162 case Intrinsic::amdgcn_fract:
9163 return DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: Op.getOperand(i: 1));
9164
9165 case Intrinsic::amdgcn_class:
9166 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT, N1: Op.getOperand(i: 1),
9167 N2: Op.getOperand(i: 2));
9168 case Intrinsic::amdgcn_div_fmas:
9169 return DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL, VT, N1: Op.getOperand(i: 1),
9170 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3), N4: Op.getOperand(i: 4));
9171
9172 case Intrinsic::amdgcn_div_fixup:
9173 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL, VT, N1: Op.getOperand(i: 1),
9174 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
9175
9176 case Intrinsic::amdgcn_div_scale: {
9177 const ConstantSDNode *Param = cast<ConstantSDNode>(Val: Op.getOperand(i: 3));
9178
9179 // Translate to the operands expected by the machine instruction. The
9180 // first parameter must be the same as the first instruction.
9181 SDValue Numerator = Op.getOperand(i: 1);
9182 SDValue Denominator = Op.getOperand(i: 2);
9183
9184 // Note this order is the opposite of the machine instruction's operands,
9185 // which are s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
9186 // intrinsic has the numerator as the first operand to match a normal
9187 // division operation.
9188
9189 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9190
9191 return DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL, VTList: Op->getVTList(), N1: Src0,
9192 N2: Denominator, N3: Numerator);
9193 }
9194 case Intrinsic::amdgcn_icmp: {
9195 // There is a Pat that handles this variant, so return it as-is.
9196 if (Op.getOperand(i: 1).getValueType() == MVT::i1 &&
9197 Op.getConstantOperandVal(i: 2) == 0 &&
9198 Op.getConstantOperandVal(i: 3) == ICmpInst::Predicate::ICMP_NE)
9199 return Op;
9200 return lowerICMPIntrinsic(TLI: *this, N: Op.getNode(), DAG);
9201 }
9202 case Intrinsic::amdgcn_fcmp: {
9203 return lowerFCMPIntrinsic(TLI: *this, N: Op.getNode(), DAG);
9204 }
9205 case Intrinsic::amdgcn_ballot:
9206 return lowerBALLOTIntrinsic(TLI: *this, N: Op.getNode(), DAG);
9207 case Intrinsic::amdgcn_fmed3:
9208 return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL, VT, N1: Op.getOperand(i: 1),
9209 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
9210 case Intrinsic::amdgcn_fdot2:
9211 return DAG.getNode(Opcode: AMDGPUISD::FDOT2, DL, VT, N1: Op.getOperand(i: 1),
9212 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3), N4: Op.getOperand(i: 4));
9213 case Intrinsic::amdgcn_fmul_legacy:
9214 return DAG.getNode(Opcode: AMDGPUISD::FMUL_LEGACY, DL, VT, N1: Op.getOperand(i: 1),
9215 N2: Op.getOperand(i: 2));
9216 case Intrinsic::amdgcn_sffbh:
9217 return DAG.getNode(Opcode: AMDGPUISD::FFBH_I32, DL, VT, Operand: Op.getOperand(i: 1));
9218 case Intrinsic::amdgcn_sbfe:
9219 return DAG.getNode(Opcode: AMDGPUISD::BFE_I32, DL, VT, N1: Op.getOperand(i: 1),
9220 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
9221 case Intrinsic::amdgcn_ubfe:
9222 return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL, VT, N1: Op.getOperand(i: 1),
9223 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
9224 case Intrinsic::amdgcn_cvt_pkrtz:
9225 case Intrinsic::amdgcn_cvt_pknorm_i16:
9226 case Intrinsic::amdgcn_cvt_pknorm_u16:
9227 case Intrinsic::amdgcn_cvt_pk_i16:
9228 case Intrinsic::amdgcn_cvt_pk_u16: {
9229 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
9230 EVT VT = Op.getValueType();
9231 unsigned Opcode;
9232
9233 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
9234 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
9235 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
9236 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
9237 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
9238 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
9239 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
9240 Opcode = AMDGPUISD::CVT_PK_I16_I32;
9241 else
9242 Opcode = AMDGPUISD::CVT_PK_U16_U32;
9243
9244 if (isTypeLegal(VT))
9245 return DAG.getNode(Opcode, DL, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
9246
9247 SDValue Node =
9248 DAG.getNode(Opcode, DL, VT: MVT::i32, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
9249 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Node);
9250 }
9251 case Intrinsic::amdgcn_fmad_ftz:
9252 return DAG.getNode(Opcode: AMDGPUISD::FMAD_FTZ, DL, VT, N1: Op.getOperand(i: 1),
9253 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
9254
9255 case Intrinsic::amdgcn_if_break:
9256 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::SI_IF_BREAK, dl: DL, VT,
9257 Op1: Op->getOperand(Num: 1), Op2: Op->getOperand(Num: 2)),
9258 0);
9259
9260 case Intrinsic::amdgcn_groupstaticsize: {
9261 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
9262 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
9263 return Op;
9264
9265 const Module *M = MF.getFunction().getParent();
9266 const GlobalValue *GV =
9267 Intrinsic::getDeclarationIfExists(M, id: Intrinsic::amdgcn_groupstaticsize);
9268 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: 0,
9269 TargetFlags: SIInstrInfo::MO_ABS32_LO);
9270 return {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: GA), 0};
9271 }
9272 case Intrinsic::amdgcn_is_shared:
9273 case Intrinsic::amdgcn_is_private: {
9274 SDLoc SL(Op);
9275 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
9276 ? AMDGPUAS::LOCAL_ADDRESS
9277 : AMDGPUAS::PRIVATE_ADDRESS;
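// A flat pointer points into the given segment iff the high 32 bits of the
// pointer equal that segment's aperture base.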
9278 SDValue Aperture = getSegmentAperture(AS, DL: SL, DAG);
9279 SDValue SrcVec =
9280 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1));
9281
9282 SDValue SrcHi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: SrcVec,
9283 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
9284 return DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: SrcHi, RHS: Aperture, Cond: ISD::SETEQ);
9285 }
9286 case Intrinsic::amdgcn_perm:
9287 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: Op.getOperand(i: 1),
9288 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
9289 case Intrinsic::amdgcn_reloc_constant: {
9290 Module *M = const_cast<Module *>(MF.getFunction().getParent());
9291 const MDNode *Metadata = cast<MDNodeSDNode>(Val: Op.getOperand(i: 1))->getMD();
9292 auto SymbolName = cast<MDString>(Val: Metadata->getOperand(I: 0))->getString();
9293 auto *RelocSymbol = cast<GlobalVariable>(
9294 Val: M->getOrInsertGlobal(Name: SymbolName, Ty: Type::getInt32Ty(C&: M->getContext())));
9295 SDValue GA = DAG.getTargetGlobalAddress(GV: RelocSymbol, DL, VT: MVT::i32, offset: 0,
9296 TargetFlags: SIInstrInfo::MO_ABS32_LO);
9297 return {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: GA), 0};
9298 }
9299 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
9300 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
9301 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
9302 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
9303 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
9304 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
9305 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
9306 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
9307 if (Op.getOperand(i: 4).getValueType() == MVT::i32)
9308 return SDValue();
9309
9310 SDLoc SL(Op);
9311 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 4), DL: SL, VT: MVT::i32);
9312 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
9313 N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1), N3: Op.getOperand(i: 2),
9314 N4: Op.getOperand(i: 3), N5: IndexKeyi32);
9315 }
9316 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
9317 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
9318 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
9319 if (Op.getOperand(i: 6).getValueType() == MVT::i32)
9320 return SDValue();
9321
9322 SDLoc SL(Op);
9323 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 6), DL: SL, VT: MVT::i32);
9324 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
9325 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1), Op.getOperand(i: 2),
9326 Op.getOperand(i: 3), Op.getOperand(i: 4), Op.getOperand(i: 5),
9327 IndexKeyi32, Op.getOperand(i: 7)});
9328 }
9329 case Intrinsic::amdgcn_addrspacecast_nonnull:
9330 return lowerADDRSPACECAST(Op, DAG);
9331 case Intrinsic::amdgcn_readlane:
9332 case Intrinsic::amdgcn_readfirstlane:
9333 case Intrinsic::amdgcn_writelane:
9334 case Intrinsic::amdgcn_permlane16:
9335 case Intrinsic::amdgcn_permlanex16:
9336 case Intrinsic::amdgcn_permlane64:
9337 case Intrinsic::amdgcn_set_inactive:
9338 case Intrinsic::amdgcn_set_inactive_chain_arg:
9339 case Intrinsic::amdgcn_mov_dpp8:
9340 case Intrinsic::amdgcn_update_dpp:
9341 return lowerLaneOp(TLI: *this, N: Op.getNode(), DAG);
9342 case Intrinsic::amdgcn_dead: {
9343 SmallVector<SDValue, 8> Poisons;
9344 for (const EVT ValTy : Op.getNode()->values())
9345 Poisons.push_back(Elt: DAG.getPOISON(VT: ValTy));
9346 return DAG.getMergeValues(Ops: Poisons, dl: SDLoc(Op));
9347 }
9348 default:
9349 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9350 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID))
9351 return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: false);
9352
9353 return Op;
9354 }
9355}
9356
9357 // On targets that do not support a constant in the soffset field, turn a
9358 // zero soffset into SGPR_NULL to avoid generating an extra s_mov of zero.
9359static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
9360 const GCNSubtarget *Subtarget) {
9361 if (Subtarget->hasRestrictedSOffset() && isNullConstant(V: SOffset))
9362 return DAG.getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32);
9363 return SOffset;
9364}
9365
9366SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
9367 SelectionDAG &DAG,
9368 unsigned NewOpcode) const {
9369 SDLoc DL(Op);
9370
9371 SDValue VData = Op.getOperand(i: 2);
9372 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
9373 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
9374 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
9375 SDValue Ops[] = {
9376 Op.getOperand(i: 0), // Chain
9377 VData, // vdata
9378 Rsrc, // rsrc
9379 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
9380 VOffset, // voffset
9381 SOffset, // soffset
9382 Offset, // offset
9383 Op.getOperand(i: 6), // cachepolicy
9384 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
9385 };
9386
9387 auto *M = cast<MemSDNode>(Val&: Op);
9388
9389 EVT MemVT = VData.getValueType();
9390 return DAG.getMemIntrinsicNode(Opcode: NewOpcode, dl: DL, VTList: Op->getVTList(), Ops, MemVT,
9391 MMO: M->getMemOperand());
9392}
9393
9394SDValue
9395SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
9396 unsigned NewOpcode) const {
9397 SDLoc DL(Op);
9398
9399 SDValue VData = Op.getOperand(i: 2);
9400 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
9401 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
9402 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
9403 SDValue Ops[] = {
9404 Op.getOperand(i: 0), // Chain
9405 VData, // vdata
9406 Rsrc, // rsrc
9407 Op.getOperand(i: 4), // vindex
9408 VOffset, // voffset
9409 SOffset, // soffset
9410 Offset, // offset
9411 Op.getOperand(i: 7), // cachepolicy
9412 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
9413 };
9414
9415 auto *M = cast<MemSDNode>(Val&: Op);
9416
9417 EVT MemVT = VData.getValueType();
9418 return DAG.getMemIntrinsicNode(Opcode: NewOpcode, dl: DL, VTList: Op->getVTList(), Ops, MemVT,
9419 MMO: M->getMemOperand());
9420}
9421
9422SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
9423 SelectionDAG &DAG) const {
9424 unsigned IntrID = Op.getConstantOperandVal(i: 1);
9425 SDLoc DL(Op);
9426
9427 switch (IntrID) {
9428 case Intrinsic::amdgcn_ds_ordered_add:
9429 case Intrinsic::amdgcn_ds_ordered_swap: {
9430 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9431 SDValue Chain = M->getOperand(Num: 0);
9432 SDValue M0 = M->getOperand(Num: 2);
9433 SDValue Value = M->getOperand(Num: 3);
9434 unsigned IndexOperand = M->getConstantOperandVal(Num: 7);
9435 unsigned WaveRelease = M->getConstantOperandVal(Num: 8);
9436 unsigned WaveDone = M->getConstantOperandVal(Num: 9);
9437
9438 unsigned OrderedCountIndex = IndexOperand & 0x3f;
9439 IndexOperand &= ~0x3f;
9440 unsigned CountDw = 0;
9441
9442 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
9443 CountDw = (IndexOperand >> 24) & 0xf;
9444 IndexOperand &= ~(0xf << 24);
9445
9446 if (CountDw < 1 || CountDw > 4) {
9447 const Function &Fn = DAG.getMachineFunction().getFunction();
9448 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9449 Fn, "ds_ordered_count: dword count must be between 1 and 4",
9450 DL.getDebugLoc()));
9451 CountDw = 1;
9452 }
9453 }
9454
9455 if (IndexOperand) {
9456 const Function &Fn = DAG.getMachineFunction().getFunction();
9457 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9458 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
9459 }
9460
9461 if (WaveDone && !WaveRelease) {
9462 // TODO: Move this to IR verifier
9463 const Function &Fn = DAG.getMachineFunction().getFunction();
9464 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9465 Fn, "ds_ordered_count: wave_done requires wave_release",
9466 DL.getDebugLoc()));
9467 }
9468
9469 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
9470 unsigned ShaderType =
9471 SIInstrInfo::getDSShaderTypeValue(MF: DAG.getMachineFunction());
9472 unsigned Offset0 = OrderedCountIndex << 2;
9473 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
9474
9475 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
9476 Offset1 |= (CountDw - 1) << 6;
9477
9478 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
9479 Offset1 |= ShaderType << 2;
9480
9481 unsigned Offset = Offset0 | (Offset1 << 8);
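// As built above, the combined 16-bit offset encodes: bits [7:2] the ordered
// count index, bit 8 wave_release, bit 9 wave_done, bits [11:10] the shader
// type (pre-GFX11), bit 12 the instruction (add/swap), and bits [15:14] the
// dword count minus one (GFX10+).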
9482
9483 SDValue Ops[] = {
9484 Chain, Value, DAG.getTargetConstant(Val: Offset, DL, VT: MVT::i16),
9485 copyToM0(DAG, Chain, DL, V: M0).getValue(R: 1), // Glue
9486 };
9487 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::DS_ORDERED_COUNT, dl: DL,
9488 VTList: M->getVTList(), Ops, MemVT: M->getMemoryVT(),
9489 MMO: M->getMemOperand());
9490 }
9491 case Intrinsic::amdgcn_raw_buffer_load:
9492 case Intrinsic::amdgcn_raw_ptr_buffer_load:
9493 case Intrinsic::amdgcn_raw_atomic_buffer_load:
9494 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
9495 case Intrinsic::amdgcn_raw_buffer_load_format:
9496 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
9497 const bool IsFormat =
9498 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
9499 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
9500
9501 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
9502 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 3), DAG);
9503 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 4), DAG, Subtarget);
9504 SDValue Ops[] = {
9505 Op.getOperand(i: 0), // Chain
9506 Rsrc, // rsrc
9507 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
9508 VOffset, // voffset
9509 SOffset, // soffset
9510 Offset, // offset
9511 Op.getOperand(i: 5), // cachepolicy, swizzled buffer
9512 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
9513 };
9514
9515 auto *M = cast<MemSDNode>(Val&: Op);
9516 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
9517 }
9518 case Intrinsic::amdgcn_struct_buffer_load:
9519 case Intrinsic::amdgcn_struct_ptr_buffer_load:
9520 case Intrinsic::amdgcn_struct_buffer_load_format:
9521 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
9522 case Intrinsic::amdgcn_struct_atomic_buffer_load:
9523 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
9524 const bool IsFormat =
9525 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
9526 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
9527
9528 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
9529 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
9530 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
9531 SDValue Ops[] = {
9532 Op.getOperand(i: 0), // Chain
9533 Rsrc, // rsrc
9534 Op.getOperand(i: 3), // vindex
9535 VOffset, // voffset
9536 SOffset, // soffset
9537 Offset, // offset
9538 Op.getOperand(i: 6), // cachepolicy, swizzled buffer
9539 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
9540 };
9541
9542 return lowerIntrinsicLoad(M: cast<MemSDNode>(Val&: Op), IsFormat, DAG, Ops);
9543 }
9544 case Intrinsic::amdgcn_raw_tbuffer_load:
9545 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
9546 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9547 EVT LoadVT = Op.getValueType();
9548 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
9549 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 3), DAG);
9550 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 4), DAG, Subtarget);
9551
9552 SDValue Ops[] = {
9553 Op.getOperand(i: 0), // Chain
9554 Rsrc, // rsrc
9555 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
9556 VOffset, // voffset
9557 SOffset, // soffset
9558 Offset, // offset
9559 Op.getOperand(i: 5), // format
9560 Op.getOperand(i: 6), // cachepolicy, swizzled buffer
9561 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
9562 };
9563
9564 if (LoadVT.getScalarType() == MVT::f16)
9565 return adjustLoadValueType(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9566 Ops);
9567 return getMemIntrinsicNode(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9568 VTList: Op->getVTList(), Ops, MemVT: LoadVT, MMO: M->getMemOperand(),
9569 DAG);
9570 }
9571 case Intrinsic::amdgcn_struct_tbuffer_load:
9572 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9573 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9574 EVT LoadVT = Op.getValueType();
9575 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
9576 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
9577 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
9578
9579 SDValue Ops[] = {
9580 Op.getOperand(i: 0), // Chain
9581 Rsrc, // rsrc
9582 Op.getOperand(i: 3), // vindex
9583 VOffset, // voffset
9584 SOffset, // soffset
9585 Offset, // offset
9586 Op.getOperand(i: 6), // format
9587 Op.getOperand(i: 7), // cachepolicy, swizzled buffer
9588 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
9589 };
9590
9591 if (LoadVT.getScalarType() == MVT::f16)
9592 return adjustLoadValueType(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
9593 Ops);
9594 return getMemIntrinsicNode(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9595 VTList: Op->getVTList(), Ops, MemVT: LoadVT, MMO: M->getMemOperand(),
9596 DAG);
9597 }
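// Buffer atomic RMW intrinsics: each raw/struct (and *_ptr) variant maps onto
// the corresponding AMDGPUISD::BUFFER_ATOMIC_* node; the shared
// lowerRaw/StructBufferAtomicIntrin helpers rebuild the operand list and
// split the buffer offsets.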
9598 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9599 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9600 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD);
9601 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9602 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9603 return lowerStructBufferAtomicIntrin(Op, DAG,
9604 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD);
9605 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9606 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9607 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN);
9608 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9609 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9610 return lowerStructBufferAtomicIntrin(Op, DAG,
9611 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN);
9612 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9613 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9614 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX);
9615 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9616 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9617 return lowerStructBufferAtomicIntrin(Op, DAG,
9618 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX);
9619 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9620 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9621 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP);
9622 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9623 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9624 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD);
9625 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9626 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9627 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB);
9628 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9629 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9630 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN);
9631 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9632 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9633 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN);
9634 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9635 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9636 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX);
9637 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9638 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9639 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX);
9640 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9641 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9642 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND);
9643 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9644 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9645 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR);
9646 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9647 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9648 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR);
9649 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9650 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9651 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC);
9652 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9653 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9654 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC);
9655 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9656 return lowerRawBufferAtomicIntrin(Op, DAG,
9657 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9658 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9659 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9660 return lowerStructBufferAtomicIntrin(Op, DAG,
9661 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP);
9662 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9663 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9664 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD);
9665 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9666 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9667 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB);
9668 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9669 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9670 return lowerStructBufferAtomicIntrin(Op, DAG,
9671 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN);
9672 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9673 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9674 return lowerStructBufferAtomicIntrin(Op, DAG,
9675 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN);
9676 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9677 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9678 return lowerStructBufferAtomicIntrin(Op, DAG,
9679 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX);
9680 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9681 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9682 return lowerStructBufferAtomicIntrin(Op, DAG,
9683 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX);
9684 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9685 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9686 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND);
9687 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9688 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9689 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR);
9690 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9691 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9692 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR);
9693 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9694 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9695 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC);
9696 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9697 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9698 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC);
9699 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9700 return lowerStructBufferAtomicIntrin(Op, DAG,
9701 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9702
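// Buffer atomic compare-and-swap additionally carries the compare value as an
// explicit operand; both the raw and struct forms lower to
// BUFFER_ATOMIC_CMPSWAP.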
9703 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9704 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9705 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 4), DAG);
9706 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
9707 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
9708 SDValue Ops[] = {
9709 Op.getOperand(i: 0), // Chain
9710 Op.getOperand(i: 2), // src
9711 Op.getOperand(i: 3), // cmp
9712 Rsrc, // rsrc
9713 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
9714 VOffset, // voffset
9715 SOffset, // soffset
9716 Offset, // offset
9717 Op.getOperand(i: 7), // cachepolicy
9718 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
9719 };
9720 EVT VT = Op.getValueType();
9721 auto *M = cast<MemSDNode>(Val&: Op);
9722
9723 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, dl: DL,
9724 VTList: Op->getVTList(), Ops, MemVT: VT,
9725 MMO: M->getMemOperand());
9726 }
9727 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9728 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9729 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op->getOperand(Num: 4), DAG);
9730 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 6), DAG);
9731 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 7), DAG, Subtarget);
9732 SDValue Ops[] = {
9733 Op.getOperand(i: 0), // Chain
9734 Op.getOperand(i: 2), // src
9735 Op.getOperand(i: 3), // cmp
9736 Rsrc, // rsrc
9737 Op.getOperand(i: 5), // vindex
9738 VOffset, // voffset
9739 SOffset, // soffset
9740 Offset, // offset
9741 Op.getOperand(i: 8), // cachepolicy
9742 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
9743 };
9744 EVT VT = Op.getValueType();
9745 auto *M = cast<MemSDNode>(Val&: Op);
9746
9747 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, dl: DL,
9748 VTList: Op->getVTList(), Ops, MemVT: VT,
9749 MMO: M->getMemOperand());
9750 }
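// The dual-ray / BVH8 intersect intrinsics require the GFX12 BVH instructions
// and are selected straight to MIMG machine nodes. The ray extent and
// instance mask are packed together into a single v2i32 vaddr operand, and
// the original memory operand is re-attached to the new node.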
9751 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
9752 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
9753 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9754 SDValue NodePtr = M->getOperand(Num: 2);
9755 SDValue RayExtent = M->getOperand(Num: 3);
9756 SDValue InstanceMask = M->getOperand(Num: 4);
9757 SDValue RayOrigin = M->getOperand(Num: 5);
9758 SDValue RayDir = M->getOperand(Num: 6);
9759 SDValue Offsets = M->getOperand(Num: 7);
9760 SDValue TDescr = M->getOperand(Num: 8);
9761
9762 assert(NodePtr.getValueType() == MVT::i64);
9763 assert(RayDir.getValueType() == MVT::v3f32);
9764
9765 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
9766 emitRemovedIntrinsicError(DAG, DL, VT: Op.getValueType());
9767 return SDValue();
9768 }
9769
9770 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
9771 const unsigned NumVDataDwords = 10;
9772 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
9773 int Opcode = AMDGPU::getMIMGOpcode(
9774 BaseOpcode: IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
9775 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
9776 MIMGEncoding: AMDGPU::MIMGEncGfx12, VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9777 assert(Opcode != -1);
9778
9779 SmallVector<SDValue, 7> Ops;
9780 Ops.push_back(Elt: NodePtr);
9781 Ops.push_back(Elt: DAG.getBuildVector(
9782 VT: MVT::v2i32, DL,
9783 Ops: {DAG.getBitcast(VT: MVT::i32, V: RayExtent),
9784 DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: InstanceMask)}));
9785 Ops.push_back(Elt: RayOrigin);
9786 Ops.push_back(Elt: RayDir);
9787 Ops.push_back(Elt: Offsets);
9788 Ops.push_back(Elt: TDescr);
9789 Ops.push_back(Elt: M->getChain());
9790
9791 auto *NewNode = DAG.getMachineNode(Opcode, dl: DL, VTs: M->getVTList(), Ops);
9792 MachineMemOperand *MemRef = M->getMemOperand();
9793 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
9794 return SDValue(NewNode, 0);
9795 }
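// The single-ray BVH intersect intrinsic comes in several flavours: 32- or
// 64-bit node pointer and full- or half-precision (A16) ray data. The code
// below picks an NSA or contiguous-vaddr MIMG encoding; for the older operand
// layouts, packLanes() packs the three-element origin/direction vectors into
// dwords (pairing f16 lanes into v2f16 when A16 is used), and in the non-NSA
// case all address operands are merged into one build_vector.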
9796 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9797 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9798 SDValue NodePtr = M->getOperand(Num: 2);
9799 SDValue RayExtent = M->getOperand(Num: 3);
9800 SDValue RayOrigin = M->getOperand(Num: 4);
9801 SDValue RayDir = M->getOperand(Num: 5);
9802 SDValue RayInvDir = M->getOperand(Num: 6);
9803 SDValue TDescr = M->getOperand(Num: 7);
9804
9805 assert(NodePtr.getValueType() == MVT::i32 ||
9806 NodePtr.getValueType() == MVT::i64);
9807 assert(RayDir.getValueType() == MVT::v3f16 ||
9808 RayDir.getValueType() == MVT::v3f32);
9809
9810 if (!Subtarget->hasGFX10_AEncoding()) {
9811 emitRemovedIntrinsicError(DAG, DL, VT: Op.getValueType());
9812 return SDValue();
9813 }
9814
9815 const bool IsGFX11 = AMDGPU::isGFX11(STI: *Subtarget);
9816 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: *Subtarget);
9817 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
9818 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9819 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9820 const unsigned NumVDataDwords = 4;
9821 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9822 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9823 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9824 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9825 IsGFX12Plus;
9826 const unsigned BaseOpcodes[2][2] = {
9827 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9828 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9829 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9830 int Opcode;
9831 if (UseNSA) {
9832 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
9833 MIMGEncoding: IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9834 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9835 : AMDGPU::MIMGEncGfx10NSA,
9836 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9837 } else {
9838 assert(!IsGFX12Plus);
9839 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
9840 MIMGEncoding: IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9841 : AMDGPU::MIMGEncGfx10Default,
9842 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9843 }
9844 assert(Opcode != -1);
9845
9846 SmallVector<SDValue, 16> Ops;
9847
9848 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
9849 SmallVector<SDValue, 3> Lanes;
9850 DAG.ExtractVectorElements(Op, Args&: Lanes, Start: 0, Count: 3);
9851 if (Lanes[0].getValueSizeInBits() == 32) {
9852 for (unsigned I = 0; I < 3; ++I)
9853 Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: Lanes[I]));
9854 } else {
9855 if (IsAligned) {
9856 Ops.push_back(Elt: DAG.getBitcast(
9857 VT: MVT::i32,
9858 V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Lanes[0], Lanes[1]})));
9859 Ops.push_back(Elt: Lanes[2]);
9860 } else {
9861 SDValue Elt0 = Ops.pop_back_val();
9862 Ops.push_back(Elt: DAG.getBitcast(
9863 VT: MVT::i32, V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Elt0, Lanes[0]})));
9864 Ops.push_back(Elt: DAG.getBitcast(
9865 VT: MVT::i32,
9866 V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Lanes[1], Lanes[2]})));
9867 }
9868 }
9869 };
9870
9871 if (UseNSA && IsGFX11Plus) {
9872 Ops.push_back(Elt: NodePtr);
9873 Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: RayExtent));
9874 Ops.push_back(Elt: RayOrigin);
9875 if (IsA16) {
9876 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9877 DAG.ExtractVectorElements(Op: RayDir, Args&: DirLanes, Start: 0, Count: 3);
9878 DAG.ExtractVectorElements(Op: RayInvDir, Args&: InvDirLanes, Start: 0, Count: 3);
9879 for (unsigned I = 0; I < 3; ++I) {
9880 MergedLanes.push_back(Elt: DAG.getBitcast(
9881 VT: MVT::i32, V: DAG.getBuildVector(VT: MVT::v2f16, DL,
9882 Ops: {DirLanes[I], InvDirLanes[I]})));
9883 }
9884 Ops.push_back(Elt: DAG.getBuildVector(VT: MVT::v3i32, DL, Ops: MergedLanes));
9885 } else {
9886 Ops.push_back(Elt: RayDir);
9887 Ops.push_back(Elt: RayInvDir);
9888 }
9889 } else {
9890 if (Is64)
9891 DAG.ExtractVectorElements(Op: DAG.getBitcast(VT: MVT::v2i32, V: NodePtr), Args&: Ops, Start: 0,
9892 Count: 2);
9893 else
9894 Ops.push_back(Elt: NodePtr);
9895
9896 Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: RayExtent));
9897 packLanes(RayOrigin, true);
9898 packLanes(RayDir, true);
9899 packLanes(RayInvDir, false);
9900 }
9901
9902 if (!UseNSA) {
9903 // Build a single vector containing all the operands so far prepared.
9904 if (NumVAddrDwords > 12) {
9905 SDValue Undef = DAG.getPOISON(VT: MVT::i32);
9906 Ops.append(NumInputs: 16 - Ops.size(), Elt: Undef);
9907 }
9908 assert(Ops.size() >= 8 && Ops.size() <= 12);
9909 SDValue MergedOps =
9910 DAG.getBuildVector(VT: MVT::getVectorVT(VT: MVT::i32, NumElements: Ops.size()), DL, Ops);
9911 Ops.clear();
9912 Ops.push_back(Elt: MergedOps);
9913 }
9914
9915 Ops.push_back(Elt: TDescr);
9916 Ops.push_back(Elt: DAG.getTargetConstant(Val: IsA16, DL, VT: MVT::i1));
9917 Ops.push_back(Elt: M->getChain());
9918
9919 auto *NewNode = DAG.getMachineNode(Opcode, dl: DL, VTs: M->getVTList(), Ops);
9920 MachineMemOperand *MemRef = M->getMemOperand();
9921 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
9922 return SDValue(NewNode, 0);
9923 }
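// Floating-point min/max atomics on global and flat pointers lower to the
// generic ISD::ATOMIC_LOAD_FMIN/FMAX nodes, reusing the intrinsic's memory
// operand.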
9924 case Intrinsic::amdgcn_global_atomic_fmin_num:
9925 case Intrinsic::amdgcn_global_atomic_fmax_num:
9926 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9927 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9928 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9929 SDValue Ops[] = {
9930 M->getOperand(Num: 0), // Chain
9931 M->getOperand(Num: 2), // Ptr
9932 M->getOperand(Num: 3) // Value
9933 };
9934 unsigned Opcode = 0;
9935 switch (IntrID) {
9936 case Intrinsic::amdgcn_global_atomic_fmin_num:
9937 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9938 Opcode = ISD::ATOMIC_LOAD_FMIN;
9939 break;
9940 }
9941 case Intrinsic::amdgcn_global_atomic_fmax_num:
9942 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9943 Opcode = ISD::ATOMIC_LOAD_FMAX;
9944 break;
9945 }
9946 default:
9947 llvm_unreachable("unhandled atomic opcode");
9948 }
9949 return DAG.getAtomic(Opcode, dl: SDLoc(Op), MemVT: M->getMemoryVT(), VTList: M->getVTList(),
9950 Ops, MMO: M->getMemOperand());
9951 }
9952 case Intrinsic::amdgcn_s_get_barrier_state:
9953 case Intrinsic::amdgcn_s_get_named_barrier_state: {
9954 SDValue Chain = Op->getOperand(Num: 0);
9955 SmallVector<SDValue, 2> Ops;
9956 unsigned Opc;
9957
9958 if (isa<ConstantSDNode>(Val: Op->getOperand(Num: 2))) {
9959 uint64_t BarID = cast<ConstantSDNode>(Val: Op->getOperand(Num: 2))->getZExtValue();
9960 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
9961 BarID = (BarID >> 4) & 0x3F;
9962 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9963 SDValue K = DAG.getTargetConstant(Val: BarID, DL, VT: MVT::i32);
9964 Ops.push_back(Elt: K);
9965 Ops.push_back(Elt: Chain);
9966 } else {
9967 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9968 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
9969 SDValue M0Val;
9970 M0Val = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: Op->getOperand(Num: 2),
9971 N2: DAG.getShiftAmountConstant(Val: 4, VT: MVT::i32, DL));
9972 M0Val = SDValue(
9973 DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: M0Val,
9974 Op2: DAG.getTargetConstant(Val: 0x3F, DL, VT: MVT::i32)),
9975 0);
9976 Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: 0));
9977 } else
9978 Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: Op->getOperand(Num: 2)).getValue(R: 0));
9979 }
9980
9981 auto *NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
9982 return SDValue(NewMI, 0);
9983 }
9984 default:
9985
9986 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9987 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID))
9988 return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true);
9989
9990 return SDValue();
9991 }
9992}
9993
9994// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9995// dwordx4 if on SI and handle TFE loads.
9996SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9997 SDVTList VTList,
9998 ArrayRef<SDValue> Ops, EVT MemVT,
9999 MachineMemOperand *MMO,
10000 SelectionDAG &DAG) const {
10001 LLVMContext &C = *DAG.getContext();
10002 MachineFunction &MF = DAG.getMachineFunction();
10003 EVT VT = VTList.VTs[0];
10004
10005 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10006 bool IsTFE = VTList.NumVTs == 3;
10007 if (IsTFE) {
10008 unsigned NumValueDWords = divideCeil(Numerator: VT.getSizeInBits(), Denominator: 32);
10009 unsigned NumOpDWords = NumValueDWords + 1;
10010 EVT OpDWordsVT = EVT::getVectorVT(Context&: C, VT: MVT::i32, NumElements: NumOpDWords);
10011 SDVTList OpDWordsVTList = DAG.getVTList(VT1: OpDWordsVT, VT2: VTList.VTs[2]);
10012 MachineMemOperand *OpDWordsMMO =
10013 MF.getMachineMemOperand(MMO, Offset: 0, Size: NumOpDWords * 4);
10014 SDValue Op = getMemIntrinsicNode(Opcode, DL, VTList: OpDWordsVTList, Ops,
10015 MemVT: OpDWordsVT, MMO: OpDWordsMMO, DAG);
10016 SDValue Status = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
10017 N2: DAG.getVectorIdxConstant(Val: NumValueDWords, DL));
10018 SDValue ZeroIdx = DAG.getVectorIdxConstant(Val: 0, DL);
10019 SDValue ValueDWords =
10020 NumValueDWords == 1
10021 ? DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op, N2: ZeroIdx)
10022 : DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL,
10023 VT: EVT::getVectorVT(Context&: C, VT: MVT::i32, NumElements: NumValueDWords), N1: Op,
10024 N2: ZeroIdx);
10025 SDValue Value = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: ValueDWords);
10026 return DAG.getMergeValues(Ops: {Value, Status, SDValue(Op.getNode(), 1)}, dl: DL);
10027 }
10028
10029 if (!Subtarget->hasDwordx3LoadStores() &&
10030 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10031 EVT WidenedVT = EVT::getVectorVT(Context&: C, VT: VT.getVectorElementType(), NumElements: 4);
10032 EVT WidenedMemVT = EVT::getVectorVT(Context&: C, VT: MemVT.getVectorElementType(), NumElements: 4);
10033 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, Offset: 0, Size: 16);
10034 SDVTList WidenedVTList = DAG.getVTList(VT1: WidenedVT, VT2: VTList.VTs[1]);
10035 SDValue Op = DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList: WidenedVTList, Ops,
10036 MemVT: WidenedMemVT, MMO: WidenedMMO);
10037 SDValue Value = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Op,
10038 N2: DAG.getVectorIdxConstant(Val: 0, DL));
10039 return DAG.getMergeValues(Ops: {Value, SDValue(Op.getNode(), 1)}, dl: DL);
10040 }
10041
10042 return DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList, Ops, MemVT, MMO);
10043}
10044
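// handleD16VData repacks 16-bit vector data into the layout expected by D16
// buffer/image stores: zero-extending each element to 32 bits on subtargets
// with unpacked D16 VMEM, regrouping into bitcast v2i16 pairs to work around
// the gfx8.1 image-store bug, or widening v3 data to the next legal size.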
10045SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10046 bool ImageStore) const {
10047 EVT StoreVT = VData.getValueType();
10048
10049 // No change for f16 and legal vector D16 types.
10050 if (!StoreVT.isVector())
10051 return VData;
10052
10053 SDLoc DL(VData);
10054 unsigned NumElements = StoreVT.getVectorNumElements();
10055
10056 if (Subtarget->hasUnpackedD16VMem()) {
10057 // We need to unpack the packed data to store.
10058 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10059 SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
10060
10061 EVT EquivStoreVT =
10062 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements);
10063 SDValue ZExt = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: EquivStoreVT, Operand: IntVData);
10064 return DAG.UnrollVectorOp(N: ZExt.getNode());
10065 }
10066
10067 // The sq block of gfx8.1 does not estimate register use correctly for d16
10068 // image store instructions. The data operand is computed as if it were not a
10069 // d16 image instruction.
10070 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10071 // Bitcast to i16
10072 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10073 SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
10074
10075 // Decompose into scalars
10076 SmallVector<SDValue, 4> Elts;
10077 DAG.ExtractVectorElements(Op: IntVData, Args&: Elts);
10078
10079 // Group pairs of i16 into v2i16 and bitcast to i32
10080 SmallVector<SDValue, 4> PackedElts;
10081 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10082 SDValue Pair =
10083 DAG.getBuildVector(VT: MVT::v2i16, DL, Ops: {Elts[I * 2], Elts[I * 2 + 1]});
10084 SDValue IntPair = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Pair);
10085 PackedElts.push_back(Elt: IntPair);
10086 }
10087 if ((NumElements % 2) == 1) {
10088 // Handle v3i16
10089 unsigned I = Elts.size() / 2;
10090 SDValue Pair = DAG.getBuildVector(VT: MVT::v2i16, DL,
10091 Ops: {Elts[I * 2], DAG.getPOISON(VT: MVT::i16)});
10092 SDValue IntPair = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Pair);
10093 PackedElts.push_back(Elt: IntPair);
10094 }
10095
10096 // Pad with poison values up to the unpacked element count
10097 PackedElts.resize(N: Elts.size(), NV: DAG.getPOISON(VT: MVT::i32));
10098
10099 // Build final vector
10100 EVT VecVT =
10101 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: PackedElts.size());
10102 return DAG.getBuildVector(VT: VecVT, DL, Ops: PackedElts);
10103 }
10104
10105 if (NumElements == 3) {
10106 EVT IntStoreVT =
10107 EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: StoreVT.getStoreSizeInBits());
10108 SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
10109
10110 EVT WidenedStoreVT = EVT::getVectorVT(
10111 Context&: *DAG.getContext(), VT: StoreVT.getVectorElementType(), NumElements: NumElements + 1);
10112 EVT WidenedIntVT = EVT::getIntegerVT(Context&: *DAG.getContext(),
10113 BitWidth: WidenedStoreVT.getStoreSizeInBits());
10114 SDValue ZExt = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: WidenedIntVT, Operand: IntVData);
10115 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: WidenedStoreVT, Operand: ZExt);
10116 }
10117
10118 assert(isTypeLegal(StoreVT));
10119 return VData;
10120}
10121
10122SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
10123 SelectionDAG &DAG) const {
10124 SDLoc DL(Op);
10125 SDValue Chain = Op.getOperand(i: 0);
10126 unsigned IntrinsicID = Op.getConstantOperandVal(i: 1);
10127 MachineFunction &MF = DAG.getMachineFunction();
10128
10129 switch (IntrinsicID) {
10130 case Intrinsic::amdgcn_exp_compr: {
10131 if (!Subtarget->hasCompressedExport()) {
10132 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
10133 DAG.getMachineFunction().getFunction(),
10134 "intrinsic not supported on subtarget", DL.getDebugLoc()));
10135 }
10136 SDValue Src0 = Op.getOperand(i: 4);
10137 SDValue Src1 = Op.getOperand(i: 5);
10138 // Hack around illegal type on SI by directly selecting it.
10139 if (isTypeLegal(VT: Src0.getValueType()))
10140 return SDValue();
10141
10142 const ConstantSDNode *Done = cast<ConstantSDNode>(Val: Op.getOperand(i: 6));
10143 SDValue Undef = DAG.getPOISON(VT: MVT::f32);
10144 const SDValue Ops[] = {
10145 Op.getOperand(i: 2), // tgt
10146 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: Src0), // src0
10147 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: Src1), // src1
10148 Undef, // src2
10149 Undef, // src3
10150 Op.getOperand(i: 7), // vm
10151 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // compr
10152 Op.getOperand(i: 3), // en
10153 Op.getOperand(i: 0) // Chain
10154 };
10155
10156 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10157 return SDValue(DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops), 0);
10158 }
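// Workgroup barriers: when optimizing and the whole workgroup fits in a
// single wave, s_barrier_signal is dropped and s_barrier/s_barrier_wait
// degrade to a WAVE_BARRIER. On subtargets with split barriers, a plain
// s_barrier expands into an S_BARRIER_SIGNAL_IMM / S_BARRIER_WAIT pair.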
10159 case Intrinsic::amdgcn_s_barrier:
10160 case Intrinsic::amdgcn_s_barrier_signal:
10161 case Intrinsic::amdgcn_s_barrier_wait: {
10162 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
10163 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
10164 unsigned WGSize = ST.getFlatWorkGroupSizes(F: MF.getFunction()).second;
10165 if (WGSize <= ST.getWavefrontSize()) {
10166 // If the workgroup fits in a wave, remove s_barrier_signal and lower
10167 // s_barrier/s_barrier_wait to wave_barrier.
10168 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
10169 return Op.getOperand(i: 0);
10170 else
10171 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::WAVE_BARRIER, dl: DL,
10172 VT: MVT::Other, Op1: Op.getOperand(i: 0)),
10173 0);
10174 }
10175 }
10176
10177 if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
10178 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
10179 SDValue K =
10180 DAG.getSignedTargetConstant(Val: AMDGPU::Barrier::WORKGROUP, DL, VT: MVT::i32);
10181 SDValue BarSignal =
10182 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_BARRIER_SIGNAL_IMM, dl: DL,
10183 VT: MVT::Other, Op1: K, Op2: Op.getOperand(i: 0)),
10184 0);
10185 SDValue BarWait =
10186 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_BARRIER_WAIT, dl: DL, VT: MVT::Other, Op1: K,
10187 Op2: BarSignal.getValue(R: 0)),
10188 0);
10189 return BarWait;
10190 }
10191
10192 return SDValue();
10193 }
10194
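// Typed buffer stores: f16 data is repacked with handleD16VData() and the
// store is emitted as a TBUFFER_STORE_FORMAT(_D16) memory intrinsic node.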
10195 case Intrinsic::amdgcn_struct_tbuffer_store:
10196 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10197 SDValue VData = Op.getOperand(i: 2);
10198 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10199 if (IsD16)
10200 VData = handleD16VData(VData, DAG);
10201 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
10202 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
10203 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
10204 SDValue Ops[] = {
10205 Chain,
10206 VData, // vdata
10207 Rsrc, // rsrc
10208 Op.getOperand(i: 4), // vindex
10209 VOffset, // voffset
10210 SOffset, // soffset
10211 Offset, // offset
10212 Op.getOperand(i: 7), // format
10213 Op.getOperand(i: 8), // cachepolicy, swizzled buffer
10214 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
10215 };
10216 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10217 : AMDGPUISD::TBUFFER_STORE_FORMAT;
10218 MemSDNode *M = cast<MemSDNode>(Val&: Op);
10219 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
10220 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
10221 }
10222
10223 case Intrinsic::amdgcn_raw_tbuffer_store:
10224 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
10225 SDValue VData = Op.getOperand(i: 2);
10226 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10227 if (IsD16)
10228 VData = handleD16VData(VData, DAG);
10229 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
10230 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
10231 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
10232 SDValue Ops[] = {
10233 Chain,
10234 VData, // vdata
10235 Rsrc, // rsrc
10236 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
10237 VOffset, // voffset
10238 SOffset, // soffset
10239 Offset, // offset
10240 Op.getOperand(i: 6), // format
10241 Op.getOperand(i: 7), // cachepolicy, swizzled buffer
10242 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
10243 };
10244 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10245 : AMDGPUISD::TBUFFER_STORE_FORMAT;
10246 MemSDNode *M = cast<MemSDNode>(Val&: Op);
10247 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
10248 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
10249 }
10250
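// Untyped buffer stores: besides the D16 repacking, illegal data types are
// bitcast to an equivalent legal memory type, and sub-dword scalar stores are
// routed to the BYTE/SHORT store opcodes via handleByteShortBufferStores().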
10251 case Intrinsic::amdgcn_raw_buffer_store:
10252 case Intrinsic::amdgcn_raw_ptr_buffer_store:
10253 case Intrinsic::amdgcn_raw_buffer_store_format:
10254 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
10255 const bool IsFormat =
10256 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
10257 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
10258
10259 SDValue VData = Op.getOperand(i: 2);
10260 EVT VDataVT = VData.getValueType();
10261 EVT EltType = VDataVT.getScalarType();
10262 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
10263 if (IsD16) {
10264 VData = handleD16VData(VData, DAG);
10265 VDataVT = VData.getValueType();
10266 }
10267
10268 if (!isTypeLegal(VT: VDataVT)) {
10269 VData =
10270 DAG.getNode(Opcode: ISD::BITCAST, DL,
10271 VT: getEquivalentMemType(Context&: *DAG.getContext(), VT: VDataVT), Operand: VData);
10272 }
10273
10274 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
10275 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
10276 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
10277 SDValue Ops[] = {
10278 Chain,
10279 VData,
10280 Rsrc,
10281 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
10282 VOffset, // voffset
10283 SOffset, // soffset
10284 Offset, // offset
10285 Op.getOperand(i: 6), // cachepolicy, swizzled buffer
10286 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
10287 };
10288 unsigned Opc =
10289 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
10290 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
10291 MemSDNode *M = cast<MemSDNode>(Val&: Op);
10292
10293 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
10294 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
10295 return handleByteShortBufferStores(DAG, VDataType: VDataVT, DL, Ops, M);
10296
10297 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
10298 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
10299 }
10300
10301 case Intrinsic::amdgcn_struct_buffer_store:
10302 case Intrinsic::amdgcn_struct_ptr_buffer_store:
10303 case Intrinsic::amdgcn_struct_buffer_store_format:
10304 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
10305 const bool IsFormat =
10306 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
10307 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
10308
10309 SDValue VData = Op.getOperand(i: 2);
10310 EVT VDataVT = VData.getValueType();
10311 EVT EltType = VDataVT.getScalarType();
10312 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
10313
10314 if (IsD16) {
10315 VData = handleD16VData(VData, DAG);
10316 VDataVT = VData.getValueType();
10317 }
10318
10319 if (!isTypeLegal(VT: VDataVT)) {
10320 VData =
10321 DAG.getNode(Opcode: ISD::BITCAST, DL,
10322 VT: getEquivalentMemType(Context&: *DAG.getContext(), VT: VDataVT), Operand: VData);
10323 }
10324
10325 auto Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
10326 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
10327 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
10328 SDValue Ops[] = {
10329 Chain,
10330 VData,
10331 Rsrc,
10332 Op.getOperand(i: 4), // vindex
10333 VOffset, // voffset
10334 SOffset, // soffset
10335 Offset, // offset
10336 Op.getOperand(i: 7), // cachepolicy, swizzled buffer
10337 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
10338 };
10339 unsigned Opc =
10340 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
10341 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
10342 MemSDNode *M = cast<MemSDNode>(Val&: Op);
10343
10344 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
10345 EVT VDataType = VData.getValueType().getScalarType();
10346 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
10347 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
10348
10349 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
10350 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
10351 }
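// Buffer-to-LDS loads: the LDS destination pointer is placed in M0 and the
// operation is selected directly to a BUFFER_LOAD_*_LDS_* machine node chosen
// by transfer size and addressing mode. Since the instruction reads the
// buffer and writes LDS, the memory operand is split into a load MMO (global
// address space) and a store MMO (local address space) on the new node.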
10352 case Intrinsic::amdgcn_raw_buffer_load_lds:
10353 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
10354 case Intrinsic::amdgcn_struct_buffer_load_lds:
10355 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
10356 if (!Subtarget->hasVMemToLDSLoad())
10357 return SDValue();
10358 unsigned Opc;
10359 bool HasVIndex =
10360 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
10361 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
10362 unsigned OpOffset = HasVIndex ? 1 : 0;
10363 SDValue VOffset = Op.getOperand(i: 5 + OpOffset);
10364 bool HasVOffset = !isNullConstant(V: VOffset);
10365 unsigned Size = Op->getConstantOperandVal(Num: 4);
10366
10367 switch (Size) {
10368 default:
10369 return SDValue();
10370 case 1:
10371 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
10372 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
10373 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
10374 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
10375 break;
10376 case 2:
10377 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
10378 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
10379 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
10380 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
10381 break;
10382 case 4:
10383 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
10384 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
10385 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
10386 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
10387 break;
10388 case 12:
10389 if (!Subtarget->hasLDSLoadB96_B128())
10390 return SDValue();
10391 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
10392 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
10393 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
10394 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
10395 break;
10396 case 16:
10397 if (!Subtarget->hasLDSLoadB96_B128())
10398 return SDValue();
10399 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
10400 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
10401 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
10402 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
10403 break;
10404 }
10405
10406 SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 3));
10407
10408 SmallVector<SDValue, 8> Ops;
10409
10410 if (HasVIndex && HasVOffset)
10411 Ops.push_back(Elt: DAG.getBuildVector(VT: MVT::v2i32, DL,
10412 Ops: {Op.getOperand(i: 5), // VIndex
10413 VOffset}));
10414 else if (HasVIndex)
10415 Ops.push_back(Elt: Op.getOperand(i: 5));
10416 else if (HasVOffset)
10417 Ops.push_back(Elt: VOffset);
10418
10419 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
10420 Ops.push_back(Elt: Rsrc);
10421 Ops.push_back(Elt: Op.getOperand(i: 6 + OpOffset)); // soffset
10422 Ops.push_back(Elt: Op.getOperand(i: 7 + OpOffset)); // imm offset
10423 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
10424 unsigned Aux = Op.getConstantOperandVal(i: 8 + OpOffset);
10425 Ops.push_back(Elt: DAG.getTargetConstant(
10426 Val: Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
10427 DL, VT: MVT::i8)); // cpol
10428 Ops.push_back(Elt: DAG.getTargetConstant(
10429 Val: Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
10430 ? 1
10431 : 0,
10432 DL, VT: MVT::i8)); // swz
10433 Ops.push_back(Elt: M0Val.getValue(R: 0)); // Chain
10434 Ops.push_back(Elt: M0Val.getValue(R: 1)); // Glue
10435
10436 auto *M = cast<MemSDNode>(Val&: Op);
10437 MachineMemOperand *LoadMMO = M->getMemOperand();
10438 // Don't set the offset value here because the pointer points to the base of
10439 // the buffer.
10440 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10441
10442 MachinePointerInfo StorePtrI = LoadPtrI;
10443 LoadPtrI.V = PoisonValue::get(
10444 T: PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
10445 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10446 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10447
10448 auto F = LoadMMO->getFlags() &
10449 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10450 LoadMMO =
10451 MF.getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad, Size,
10452 BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo());
10453
10454 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10455 PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore, Size: sizeof(int32_t),
10456 BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo());
10457
10458 auto *Load = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: M->getVTList(), Ops);
10459 DAG.setNodeMemRefs(N: Load, NewMemRefs: {LoadMMO, StoreMMO});
10460
10461 return SDValue(Load, 0);
10462 }
10463 // Buffers are handled by LowerBufferFatPointers, and we're going to go
10464 // for "trust me" that the remaining cases are global pointers until
10465 // such time as we can put two mem operands on an intrinsic.
10466 case Intrinsic::amdgcn_load_to_lds:
10467 case Intrinsic::amdgcn_global_load_lds: {
10468 if (!Subtarget->hasVMemToLDSLoad())
10469 return SDValue();
10470
10471 unsigned Opc;
10472 unsigned Size = Op->getConstantOperandVal(Num: 4);
10473 switch (Size) {
10474 default:
10475 return SDValue();
10476 case 1:
10477 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
10478 break;
10479 case 2:
10480 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
10481 break;
10482 case 4:
10483 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
10484 break;
10485 case 12:
10486 if (!Subtarget->hasLDSLoadB96_B128())
10487 return SDValue();
10488 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
10489 break;
10490 case 16:
10491 if (!Subtarget->hasLDSLoadB96_B128())
10492 return SDValue();
10493 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
10494 break;
10495 }
10496
10497 SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 3));
10498
10499 SmallVector<SDValue, 6> Ops;
10500
10501 SDValue Addr = Op.getOperand(i: 2); // Global ptr
10502 SDValue VOffset;
10503 // Try to split SAddr and VOffset. Global and LDS pointers share the same
10504 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
10505 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
10506 SDValue LHS = Addr.getOperand(i: 0);
10507 SDValue RHS = Addr.getOperand(i: 1);
10508
10509 if (LHS->isDivergent())
10510 std::swap(a&: LHS, b&: RHS);
10511
10512 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
10513 RHS.getOperand(i: 0).getValueType() == MVT::i32) {
10514 // add (i64 sgpr), (zero_extend (i32 vgpr))
10515 Addr = LHS;
10516 VOffset = RHS.getOperand(i: 0);
10517 }
10518 }
10519
10520 Ops.push_back(Elt: Addr);
10521 if (!Addr->isDivergent()) {
10522 Opc = AMDGPU::getGlobalSaddrOp(Opcode: Opc);
10523 if (!VOffset)
10524 VOffset =
10525 SDValue(DAG.getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32, dl: DL, VT: MVT::i32,
10526 Op1: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
10527 0);
10528 Ops.push_back(Elt: VOffset);
10529 }
10530
10531 Ops.push_back(Elt: Op.getOperand(i: 5)); // Offset
10532 Ops.push_back(Elt: Op.getOperand(i: 6)); // CPol
10533 Ops.push_back(Elt: M0Val.getValue(R: 0)); // Chain
10534 Ops.push_back(Elt: M0Val.getValue(R: 1)); // Glue
10535
10536 auto *M = cast<MemSDNode>(Val&: Op);
10537 MachineMemOperand *LoadMMO = M->getMemOperand();
10538 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
10539 LoadPtrI.Offset = Op->getConstantOperandVal(Num: 5);
10540 MachinePointerInfo StorePtrI = LoadPtrI;
10541 LoadPtrI.V = PoisonValue::get(
10542 T: PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
10543 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
10544 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
10545 auto F = LoadMMO->getFlags() &
10546 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
10547 LoadMMO =
10548 MF.getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad, Size,
10549 BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo());
10550 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
10551 PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore, Size: sizeof(int32_t), BaseAlignment: Align(4),
10552 AAInfo: LoadMMO->getAAInfo());
10553
10554 auto *Load = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
10555 DAG.setNodeMemRefs(N: Load, NewMemRefs: {LoadMMO, StoreMMO});
10556
10557 return SDValue(Load, 0);
10558 }
10559 case Intrinsic::amdgcn_end_cf:
10560 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::SI_END_CF, dl: DL, VT: MVT::Other,
10561 Op1: Op->getOperand(Num: 2), Op2: Chain),
10562 0);
10563 case Intrinsic::amdgcn_s_barrier_signal_var: {
10564 // This intrinsic has two operands: the barrier pointer and the member count.
10565 SDValue Chain = Op->getOperand(Num: 0);
10566 SmallVector<SDValue, 2> Ops;
10567 SDValue BarOp = Op->getOperand(Num: 2);
10568 SDValue CntOp = Op->getOperand(Num: 3);
10569 SDValue M0Val;
10570 // Extract the barrier ID from bits 4..9 of BarOp.
10571 SDValue BarID;
10572 BarID = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: BarOp,
10573 N2: DAG.getShiftAmountConstant(Val: 4, VT: MVT::i32, DL));
10574 BarID =
10575 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: BarID,
10576 Op2: DAG.getTargetConstant(Val: 0x3F, DL, VT: MVT::i32)),
10577 0);
10578 // The member count goes into the 6 bits of M0 starting at bit ShAmt (M0[ShAmt+5:ShAmt]).
10579 // The barrier ID goes into M0[5:0].
10580 M0Val =
10581 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: CntOp,
10582 Op2: DAG.getTargetConstant(Val: 0x3F, DL, VT: MVT::i32)),
10583 0);
10584 constexpr unsigned ShAmt = 16;
10585 M0Val = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: M0Val,
10586 N2: DAG.getShiftAmountConstant(Val: ShAmt, VT: MVT::i32, DL));
10587
10588 M0Val = SDValue(
10589 DAG.getMachineNode(Opcode: AMDGPU::S_OR_B32, dl: DL, VT: MVT::i32, Op1: M0Val, Op2: BarID), 0);
10590
10591 Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: 0));
10592
10593 auto *NewMI = DAG.getMachineNode(Opcode: AMDGPU::S_BARRIER_SIGNAL_M0, dl: DL,
10594 VTs: Op->getVTList(), Ops);
10595 return SDValue(NewMI, 0);
10596 }
10597 case Intrinsic::amdgcn_s_prefetch_data: {
10598 // For a non-global address space, preserve the chain and remove the call.
10599 if (!AMDGPU::isFlatGlobalAddrSpace(AS: cast<MemSDNode>(Val&: Op)->getAddressSpace()))
10600 return Op.getOperand(i: 0);
10601 return Op;
10602 }
10603 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
10604 SDValue Ops[] = {
10605 Chain, bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG),
10606 Op.getOperand(i: 3), // offset
10607 Op.getOperand(i: 4), // length
10608 };
10609
10610 MemSDNode *M = cast<MemSDNode>(Val&: Op);
10611 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_PREFETCH_DATA, dl: DL,
10612 VTList: Op->getVTList(), Ops, MemVT: M->getMemoryVT(),
10613 MMO: M->getMemOperand());
10614 }
10615 default: {
10616 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10617 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID))
10618 return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true);
10619
10620 return Op;
10621 }
10622 }
10623}
10624
10625bool SITargetLowering::shouldPreservePtrArith(const Function &F,
10626 EVT PtrVT) const {
10627 return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
10628}
10629
10630// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
10631// offset (the offset that is included in bounds checking and swizzling, to be
10632// split between the instruction's voffset and immoffset fields) and soffset
10633// (the offset that is excluded from bounds checking and swizzling, to go in
10634// the instruction's soffset field). This function takes the first kind of
10635// offset and figures out how to split it between voffset and immoffset.
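// For example, assuming a maximum immediate offset of 4095 (the pre-GFX12
// MUBUF limit), a combined constant offset of 8200 is returned as an 8192
// voffset contribution plus an immediate offset of 8.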
10636std::pair<SDValue, SDValue>
10637SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
10638 SDLoc DL(Offset);
10639 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget);
10640 SDValue N0 = Offset;
10641 ConstantSDNode *C1 = nullptr;
10642
10643 if ((C1 = dyn_cast<ConstantSDNode>(Val&: N0)))
10644 N0 = SDValue();
10645 else if (DAG.isBaseWithConstantOffset(Op: N0)) {
10646 C1 = cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
10647 N0 = N0.getOperand(i: 0);
10648 }
10649
10650 if (C1) {
10651 unsigned ImmOffset = C1->getZExtValue();
10652 // If the immediate value is too big for the immoffset field, put only bits
10653 // that would normally fit in the immoffset field. The remaining value that
10654 // is copied/added for the voffset field is a large power of 2, and it
10655 // stands more chance of being CSEd with the copy/add for another similar
10656 // load/store.
10657 // However, do not do that rounding down if the value left for the VGPR would
10658 // be negative when interpreted as a signed 32-bit offset, as it appears to be
10659 // illegal to have a negative offset in the vgpr, even if adding the immediate offset makes it positive.
10660 unsigned Overflow = ImmOffset & ~MaxImm;
10661 ImmOffset -= Overflow;
10662 if ((int32_t)Overflow < 0) {
10663 Overflow += ImmOffset;
10664 ImmOffset = 0;
10665 }
10666 C1 = cast<ConstantSDNode>(Val: DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32));
10667 if (Overflow) {
10668 auto OverflowVal = DAG.getConstant(Val: Overflow, DL, VT: MVT::i32);
10669 if (!N0)
10670 N0 = OverflowVal;
10671 else {
10672 SDValue Ops[] = {N0, OverflowVal};
10673 N0 = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, Ops);
10674 }
10675 }
10676 }
10677 if (!N0)
10678 N0 = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
10679 if (!C1)
10680 C1 = cast<ConstantSDNode>(Val: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
10681 return {N0, SDValue(C1, 0)};
10682}
10683
10684// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10685// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10686// pointed to by Offsets.
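// Three forms are tried, in order: a fully constant offset folded into
// soffset/instoffset, a base-plus-constant split, and finally everything in
// voffset with a zero soffset (SGPR_NULL on subtargets with a restricted
// soffset field).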
10687void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10688 SelectionDAG &DAG, SDValue *Offsets,
10689 Align Alignment) const {
10690 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10691 SDLoc DL(CombinedOffset);
10692 if (auto *C = dyn_cast<ConstantSDNode>(Val&: CombinedOffset)) {
10693 uint32_t Imm = C->getZExtValue();
10694 uint32_t SOffset, ImmOffset;
10695 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10696 Offsets[0] = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
10697 Offsets[1] = DAG.getConstant(Val: SOffset, DL, VT: MVT::i32);
10698 Offsets[2] = DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32);
10699 return;
10700 }
10701 }
10702 if (DAG.isBaseWithConstantOffset(Op: CombinedOffset)) {
10703 SDValue N0 = CombinedOffset.getOperand(i: 0);
10704 SDValue N1 = CombinedOffset.getOperand(i: 1);
10705 uint32_t SOffset, ImmOffset;
10706 int Offset = cast<ConstantSDNode>(Val&: N1)->getSExtValue();
10707 if (Offset >= 0 &&
10708 TII->splitMUBUFOffset(Imm: Offset, SOffset, ImmOffset, Alignment)) {
10709 Offsets[0] = N0;
10710 Offsets[1] = DAG.getConstant(Val: SOffset, DL, VT: MVT::i32);
10711 Offsets[2] = DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32);
10712 return;
10713 }
10714 }
10715
10716 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10717 ? DAG.getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32)
10718 : DAG.getConstant(Val: 0, DL, VT: MVT::i32);
10719
10720 Offsets[0] = CombinedOffset;
10721 Offsets[1] = SOffsetZero;
10722 Offsets[2] = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32);
10723}
10724
10725SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10726 SelectionDAG &DAG) const {
10727 if (!MaybePointer.getValueType().isScalarInteger())
10728 return MaybePointer;
10729
10730 SDValue Rsrc = DAG.getBitcast(VT: MVT::v4i32, V: MaybePointer);
10731 return Rsrc;
10732}
10733
10734// Wrap a global or flat pointer into a buffer intrinsic using the flags
10735// specified in the intrinsic.
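// The resulting V# is built as: word0 = pointer[31:0], word1 = pointer[47:32]
// with the 16-bit stride in its upper half, word2 = NumRecords,
// word3 = Flags.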
10736SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10737 SelectionDAG &DAG) const {
10738 SDLoc Loc(Op);
10739
10740 SDValue Pointer = Op->getOperand(Num: 1);
10741 SDValue Stride = Op->getOperand(Num: 2);
10742 SDValue NumRecords = Op->getOperand(Num: 3);
10743 SDValue Flags = Op->getOperand(Num: 4);
10744
10745 auto [LowHalf, HighHalf] = DAG.SplitScalar(N: Pointer, DL: Loc, LoVT: MVT::i32, HiVT: MVT::i32);
10746 SDValue Mask = DAG.getConstant(Val: 0x0000ffff, DL: Loc, VT: MVT::i32);
10747 SDValue Masked = DAG.getNode(Opcode: ISD::AND, DL: Loc, VT: MVT::i32, N1: HighHalf, N2: Mask);
10748 std::optional<uint32_t> ConstStride = std::nullopt;
10749 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Val&: Stride))
10750 ConstStride = ConstNode->getZExtValue();
10751
10752 SDValue NewHighHalf = Masked;
10753 if (!ConstStride || *ConstStride != 0) {
10754 SDValue ShiftedStride;
10755 if (ConstStride) {
10756 ShiftedStride = DAG.getConstant(Val: *ConstStride << 16, DL: Loc, VT: MVT::i32);
10757 } else {
10758 SDValue ExtStride = DAG.getAnyExtOrTrunc(Op: Stride, DL: Loc, VT: MVT::i32);
10759 ShiftedStride =
10760 DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i32, N1: ExtStride,
10761 N2: DAG.getShiftAmountConstant(Val: 16, VT: MVT::i32, DL: Loc));
10762 }
10763 NewHighHalf = DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i32, N1: Masked, N2: ShiftedStride);
10764 }
10765
10766 SDValue Rsrc = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v4i32, N1: LowHalf,
10767 N2: NewHighHalf, N3: NumRecords, N4: Flags);
10768 SDValue RsrcPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: Loc, VT: MVT::i128, Operand: Rsrc);
10769 return RsrcPtr;
10770}
10771
10772 // Handle 8-bit and 16-bit buffer loads
10773SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
10774 EVT LoadVT, SDLoc DL,
10775 ArrayRef<SDValue> Ops,
10776 MachineMemOperand *MMO,
10777 bool IsTFE) const {
10778 EVT IntVT = LoadVT.changeTypeToInteger();
10779
10780 if (IsTFE) {
10781 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
10782 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
10783 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
10784 MachineFunction &MF = DAG.getMachineFunction();
10785 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, Offset: 0, Size: 8);
10786 SDVTList VTs = DAG.getVTList(VT1: MVT::v2i32, VT2: MVT::Other);
10787 SDValue Op = getMemIntrinsicNode(Opcode: Opc, DL, VTList: VTs, Ops, MemVT: MVT::v2i32, MMO: OpMMO, DAG);
10788 SDValue Status = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
10789 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
10790 SDValue Data = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
10791 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i32));
10792 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: Data);
10793 SDValue Value = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: Trunc);
10794 return DAG.getMergeValues(Ops: {Value, Status, SDValue(Op.getNode(), 1)}, dl: DL);
10795 }
10796
10797 unsigned Opc = LoadVT.getScalarType() == MVT::i8
10798 ? AMDGPUISD::BUFFER_LOAD_UBYTE
10799 : AMDGPUISD::BUFFER_LOAD_USHORT;
10800
10801 SDVTList ResList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other);
10802 SDValue BufferLoad =
10803 DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: IntVT, MMO);
10804 SDValue LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: BufferLoad);
10805 LoadVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: LoadVal);
10806
10807 return DAG.getMergeValues(Ops: {LoadVal, BufferLoad.getValue(R: 1)}, dl: DL);
10808}
10809
10810 // Handle 8-bit and 16-bit buffer stores
10811SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10812 EVT VDataType, SDLoc DL,
10813 SDValue Ops[],
10814 MemSDNode *M) const {
10815 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10816 Ops[1] = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i16, Operand: Ops[1]);
10817
10818 SDValue BufferStoreExt = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Ops[1]);
10819 Ops[1] = BufferStoreExt;
10820 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
10821 : AMDGPUISD::BUFFER_STORE_SHORT;
10822 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10823 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: M->getVTList(), Ops: OpsRef, MemVT: VDataType,
10824 MMO: M->getMemOperand());
10825}
10826
10827static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
10828 SDValue Op, const SDLoc &SL, EVT VT) {
10829 if (VT.bitsLT(VT: Op.getValueType()))
10830 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Op);
10831
10832 switch (ExtType) {
10833 case ISD::SEXTLOAD:
10834 return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: SL, VT, Operand: Op);
10835 case ISD::ZEXTLOAD:
10836 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT, Operand: Op);
10837 case ISD::EXTLOAD:
10838 return DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT, Operand: Op);
10839 case ISD::NON_EXTLOAD:
10840 return Op;
10841 }
10842
10843 llvm_unreachable("invalid ext type");
10844}
10845
10846 // Try to turn 8-bit and 16-bit scalar loads into SMEM-eligible 32-bit loads.
10847// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
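// The widened form is a naturally aligned i32 load followed by a
// sign/zero-extend-in-reg that re-creates the original extload semantics,
// plus a bitcast back to the original (possibly floating-point) type.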
10848SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
10849 DAGCombinerInfo &DCI) const {
10850 SelectionDAG &DAG = DCI.DAG;
10851 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10852 return SDValue();
10853
10854 // FIXME: Constant loads should all be marked invariant.
10855 unsigned AS = Ld->getAddressSpace();
10856 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10857 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10858 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10859 return SDValue();
10860
10861 // Don't do this early, since it may interfere with adjacent load merging for
10862 // illegal types. We can avoid losing alignment information for exotic types
10863 // pre-legalize.
10864 EVT MemVT = Ld->getMemoryVT();
10865 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10866 MemVT.getSizeInBits() >= 32)
10867 return SDValue();
10868
10869 SDLoc SL(Ld);
10870
10871 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10872 "unexpected vector extload");
10873
10874 // TODO: Drop only high part of range.
10875 SDValue Ptr = Ld->getBasePtr();
10876 SDValue NewLoad = DAG.getLoad(
10877 AM: ISD::UNINDEXED, ExtType: ISD::NON_EXTLOAD, VT: MVT::i32, dl: SL, Chain: Ld->getChain(), Ptr,
10878 Offset: Ld->getOffset(), PtrInfo: Ld->getPointerInfo(), MemVT: MVT::i32, Alignment: Ld->getAlign(),
10879 MMOFlags: Ld->getMemOperand()->getFlags(), AAInfo: Ld->getAAInfo(),
10880 Ranges: nullptr); // Drop ranges
10881
10882 EVT TruncVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
10883 if (MemVT.isFloatingPoint()) {
10884 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
10885 "unexpected fp extload");
10886 TruncVT = MemVT.changeTypeToInteger();
10887 }
10888
10889 SDValue Cvt = NewLoad;
10890 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10891 Cvt = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL: SL, VT: MVT::i32, N1: NewLoad,
10892 N2: DAG.getValueType(TruncVT));
10893 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10894 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10895 Cvt = DAG.getZeroExtendInReg(Op: NewLoad, DL: SL, VT: TruncVT);
10896 } else {
10897 assert(Ld->getExtensionType() == ISD::EXTLOAD);
10898 }
10899
10900 EVT VT = Ld->getValueType(ResNo: 0);
10901 EVT IntVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: VT.getSizeInBits());
10902
10903 DCI.AddToWorklist(N: Cvt.getNode());
10904
10905 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10906 // the appropriate extension from the 32-bit load.
10907 Cvt = getLoadExtOrTrunc(DAG, ExtType: Ld->getExtensionType(), Op: Cvt, SL, VT: IntVT);
10908 DCI.AddToWorklist(N: Cvt.getNode());
10909
10910 // Handle conversion back to floating point if necessary.
10911 Cvt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Cvt);
10912
10913 return DAG.getMergeValues(Ops: {Cvt, NewLoad.getValue(R: 1)}, dl: SL);
10914}
10915
10916static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10917 const SIMachineFunctionInfo &Info) {
10918 // TODO: Should check if the address can definitely not access stack.
10919 if (Info.isEntryFunction())
10920 return Info.getUserSGPRInfo().hasFlatScratchInit();
10921 return true;
10922}
10923
10924SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10925 SDLoc DL(Op);
10926 LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
10927 ISD::LoadExtType ExtType = Load->getExtensionType();
10928 EVT MemVT = Load->getMemoryVT();
10929 MachineMemOperand *MMO = Load->getMemOperand();
10930
10931 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10932 if (MemVT == MVT::i16 && isTypeLegal(VT: MVT::i16))
10933 return SDValue();
10934
10935 // FIXME: Copied from PPC
10936 // First, load into 32 bits, then truncate to 1 bit.
10937
10938 SDValue Chain = Load->getChain();
10939 SDValue BasePtr = Load->getBasePtr();
10940
10941 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10942
10943 SDValue NewLD = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl: DL, VT: MVT::i32, Chain, Ptr: BasePtr,
10944 MemVT: RealMemVT, MMO);
10945
10946 if (!MemVT.isVector()) {
10947 SDValue Ops[] = {DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVT, Operand: NewLD),
10948 NewLD.getValue(R: 1)};
10949
10950 return DAG.getMergeValues(Ops, dl: DL);
10951 }
10952
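// For a vector of i1 (a sketch of the loop below): element I of the result
// lives in bit I of the widened 32-bit load, so each element is recovered
// with a logical shift right by I followed by a truncate to i1.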
10953 SmallVector<SDValue, 3> Elts;
10954 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10955 SDValue Elt = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: NewLD,
10956 N2: DAG.getConstant(Val: I, DL, VT: MVT::i32));
10957
10958 Elts.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Elt));
10959 }
10960
10961 SDValue Ops[] = {DAG.getBuildVector(VT: MemVT, DL, Ops: Elts), NewLD.getValue(R: 1)};
10962
10963 return DAG.getMergeValues(Ops, dl: DL);
10964 }
10965
10966 if (!MemVT.isVector())
10967 return SDValue();
10968
10969 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10970 "Custom lowering for non-i32 vectors hasn't been implemented.");
10971
10972 Align Alignment = Load->getAlign();
10973 unsigned AS = Load->getAddressSpace();
10974 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10975 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10976 return SplitVectorLoad(Op, DAG);
10977 }
10978
10979 MachineFunction &MF = DAG.getMachineFunction();
10980 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10981 // If there is a possibility that a flat instruction may access scratch memory,
10982 // then we need to use the same legalization rules we use for private accesses.
10983 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10984 !Subtarget->hasMultiDwordFlatScratchAddressing())
10985 AS = addressMayBeAccessedAsPrivate(MMO: Load->getMemOperand(), Info: *MFI)
10986 ? AMDGPUAS::PRIVATE_ADDRESS
10987 : AMDGPUAS::GLOBAL_ADDRESS;
10988
10989 unsigned NumElements = MemVT.getVectorNumElements();
10990
10991 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10992 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10993 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
10994 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
10995 isMemOpHasNoClobberedMemOperand(N: Load))) {
10996 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
10997 Alignment >= Align(4) && NumElements < 32) {
10998 if (MemVT.isPow2VectorType() ||
10999 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11000 return SDValue();
11001 return WidenOrSplitVectorLoad(Op, DAG);
11002 }
11003 // Non-uniform loads will be selected to MUBUF instructions, so they
11004 // have the same legalization requirements as global and private
11005 // loads.
11006 //
11007 }
11008 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11009 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11010 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11011 if (NumElements > 4)
11012 return SplitVectorLoad(Op, DAG);
11013 // v3 loads not supported on SI.
11014 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11015 return WidenOrSplitVectorLoad(Op, DAG);
11016
11017 // v3 and v4 loads are supported for private and global memory.
11018 return SDValue();
11019 }
11020 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11021 // Depending on the setting of the private_element_size field in the
11022 // resource descriptor, we can only make private accesses up to a certain
11023 // size.
11024 switch (Subtarget->getMaxPrivateElementSize()) {
11025 case 4: {
11026 auto [Op0, Op1] = scalarizeVectorLoad(LD: Load, DAG);
11027 return DAG.getMergeValues(Ops: {Op0, Op1}, dl: DL);
11028 }
11029 case 8:
11030 if (NumElements > 2)
11031 return SplitVectorLoad(Op, DAG);
11032 return SDValue();
11033 case 16:
11034 // Same as global/flat
11035 if (NumElements > 4)
11036 return SplitVectorLoad(Op, DAG);
11037 // v3 loads not supported on SI.
11038 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11039 return WidenOrSplitVectorLoad(Op, DAG);
11040
11041 return SDValue();
11042 default:
11043 llvm_unreachable("unsupported private_element_size");
11044 }
11045 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11046 unsigned Fast = 0;
11047 auto Flags = Load->getMemOperand()->getFlags();
11048 if (allowsMisalignedMemoryAccessesImpl(Size: MemVT.getSizeInBits(), AddrSpace: AS,
11049 Alignment: Load->getAlign(), Flags, IsFast: &Fast) &&
11050 Fast > 1)
11051 return SDValue();
11052
11053 if (MemVT.isVector())
11054 return SplitVectorLoad(Op, DAG);
11055 }
11056
11057 if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
11058 VT: MemVT, MMO: *Load->getMemOperand())) {
11059 auto [Op0, Op1] = expandUnalignedLoad(LD: Load, DAG);
11060 return DAG.getMergeValues(Ops: {Op0, Op1}, dl: DL);
11061 }
11062
11063 return SDValue();
11064}
11065
11066SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
11067 EVT VT = Op.getValueType();
11068 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
11069 VT.getSizeInBits() == 512)
11070 return splitTernaryVectorOp(Op, DAG);
11071
11072 assert(VT.getSizeInBits() == 64);
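  // A sketch of the decomposition performed below: a 64-bit select is lowered
  // as two 32-bit selects on the low and high halves,
  //   select c, x:i64, y:i64
  //     -> bitcast (build_vector (select c, x.lo, y.lo),
  //                              (select c, x.hi, y.hi)) to i64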
11073
11074 SDLoc DL(Op);
11075 SDValue Cond = Op.getOperand(i: 0);
11076
11077 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
11078 SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
11079
11080 SDValue LHS = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1));
11081 SDValue RHS = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 2));
11082
11083 SDValue Lo0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: LHS, N2: Zero);
11084 SDValue Lo1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: RHS, N2: Zero);
11085
11086 SDValue Lo = DAG.getSelect(DL, VT: MVT::i32, Cond, LHS: Lo0, RHS: Lo1);
11087
11088 SDValue Hi0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: LHS, N2: One);
11089 SDValue Hi1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: RHS, N2: One);
11090
11091 SDValue Hi = DAG.getSelect(DL, VT: MVT::i32, Cond, LHS: Hi0, RHS: Hi1);
11092
11093 SDValue Res = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Lo, Hi});
11094 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Res);
11095}
11096
11097// Catch division cases where we can use shortcuts with rcp and rsq
11098// instructions.
11099SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
11100 SelectionDAG &DAG) const {
11101 SDLoc SL(Op);
11102 SDValue LHS = Op.getOperand(i: 0);
11103 SDValue RHS = Op.getOperand(i: 1);
11104 EVT VT = Op.getValueType();
11105 const SDNodeFlags Flags = Op->getFlags();
11106
11107 bool AllowInaccurateRcp =
11108 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
11109
11110 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) {
11111 // Without !fpmath accuracy information, we can't do more because we don't
11112 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
11113 // f16 is always accurate enough
11114 if (!AllowInaccurateRcp && VT != MVT::f16)
11115 return SDValue();
11116
11117 if (CLHS->isExactlyValue(V: 1.0)) {
11118 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
11119 // the CI documentation have a worst-case error of 1 ulp.
11120 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
11121 // use them as long as we aren't trying to use denormals.
11122 //
11123 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
11124
11125 // 1.0 / sqrt(x) -> rsq(x)
11126
11127 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
11128 // error seems really high at 2^29 ULP.
11129 // 1.0 / x -> rcp(x)
11130 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS);
11131 }
11132
11133 // Same as for 1.0, but expand the sign out of the constant.
11134 if (CLHS->isExactlyValue(V: -1.0)) {
11135 // -1.0 / x -> rcp (fneg x)
11136 SDValue FNegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
11137 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: FNegRHS);
11138 }
11139 }
11140
11141 // For f16 require afn or arcp.
11142 // For f32 require afn.
11143 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
11144 return SDValue();
11145
11146 // Turn into multiply by the reciprocal.
11147 // x / y -> x * (1.0 / y)
11148 SDValue Recip = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS);
11149 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LHS, N2: Recip, Flags);
11150}
11151
11152SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
11153 SelectionDAG &DAG) const {
11154 SDLoc SL(Op);
11155 SDValue X = Op.getOperand(i: 0);
11156 SDValue Y = Op.getOperand(i: 1);
11157 EVT VT = Op.getValueType();
11158 const SDNodeFlags Flags = Op->getFlags();
11159
11160 bool AllowInaccurateDiv =
11161 Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
11162 if (!AllowInaccurateDiv)
11163 return SDValue();
11164
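  // A sketch of the math implemented by the node sequence below: refine
  // r ~= 1/y with two Newton-Raphson steps, then correct the quotient once:
  //   r'  = fma(fma(-y, r, 1), r, r)     // r * (2 - y*r), applied twice
  //   q   = x * r''
  //   res = fma(fma(-y, q, x), r'', q)   // q + r'' * (x - y*q)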
11165 SDValue NegY = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Y);
11166 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
11167
11168 SDValue R = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: Y);
11169 SDValue Tmp0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One);
11170
11171 R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp0, N2: R, N3: R);
11172 SDValue Tmp1 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One);
11173 R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp1, N2: R, N3: R);
11174 SDValue Ret = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: R);
11175 SDValue Tmp2 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: Ret, N3: X);
11176 return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp2, N2: R, N3: Ret);
11177}
11178
11179static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
11180 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
11181 SDNodeFlags Flags) {
11182 if (GlueChain->getNumValues() <= 1) {
11183 return DAG.getNode(Opcode, DL: SL, VT, N1: A, N2: B, Flags);
11184 }
11185
11186 assert(GlueChain->getNumValues() == 3);
11187
11188 SDVTList VTList = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue);
11189 switch (Opcode) {
11190 default:
11191 llvm_unreachable("no chain equivalent for opcode");
11192 case ISD::FMUL:
11193 Opcode = AMDGPUISD::FMUL_W_CHAIN;
11194 break;
11195 }
11196
11197 return DAG.getNode(Opcode, DL: SL, VTList,
11198 Ops: {GlueChain.getValue(R: 1), A, B, GlueChain.getValue(R: 2)},
11199 Flags);
11200}
11201
11202static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
11203 EVT VT, SDValue A, SDValue B, SDValue C,
11204 SDValue GlueChain, SDNodeFlags Flags) {
11205 if (GlueChain->getNumValues() <= 1) {
11206 return DAG.getNode(Opcode, DL: SL, VT, Ops: {A, B, C}, Flags);
11207 }
11208
11209 assert(GlueChain->getNumValues() == 3);
11210
11211 SDVTList VTList = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue);
11212 switch (Opcode) {
11213 default:
11214 llvm_unreachable("no chain equivalent for opcode");
11215 case ISD::FMA:
11216 Opcode = AMDGPUISD::FMA_W_CHAIN;
11217 break;
11218 }
11219
11220 return DAG.getNode(Opcode, DL: SL, VTList,
11221 Ops: {GlueChain.getValue(R: 1), A, B, C, GlueChain.getValue(R: 2)},
11222 Flags);
11223}
11224
11225SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
11226 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11227 return FastLowered;
11228
11229 SDLoc SL(Op);
11230 SDValue LHS = Op.getOperand(i: 0);
11231 SDValue RHS = Op.getOperand(i: 1);
11232
11233 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
11234 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
11235 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
11236 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
11237 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
11238 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
11239 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
11240 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
11241 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
11242 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
11243 // q16.u = opx(V_CVT_F16_F32, q32.u);
11244 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
11245
11246 // We will use ISD::FMA on targets that don't support ISD::FMAD.
11247 unsigned FMADOpCode =
11248 isOperationLegal(Op: ISD::FMAD, VT: MVT::f32) ? ISD::FMAD : ISD::FMA;
11249
11250 SDValue LHSExt = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: LHS);
11251 SDValue RHSExt = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: RHS);
11252 SDValue NegRHSExt = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: RHSExt);
11253 SDValue Rcp =
11254 DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: RHSExt, Flags: Op->getFlags());
11255 SDValue Quot =
11256 DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: LHSExt, N2: Rcp, Flags: Op->getFlags());
11257 SDValue Err = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: NegRHSExt, N2: Quot, N3: LHSExt,
11258 Flags: Op->getFlags());
11259 Quot = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: Err, N2: Rcp, N3: Quot, Flags: Op->getFlags());
11260 Err = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: NegRHSExt, N2: Quot, N3: LHSExt,
11261 Flags: Op->getFlags());
11262 SDValue Tmp = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: Err, N2: Rcp, Flags: Op->getFlags());
11263 SDValue TmpCast = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Tmp);
11264 TmpCast = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: TmpCast,
11265 N2: DAG.getConstant(Val: 0xff800000, DL: SL, VT: MVT::i32));
11266 Tmp = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: TmpCast);
11267 Quot = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f32, N1: Tmp, N2: Quot, Flags: Op->getFlags());
11268 SDValue RDst = DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::f16, N1: Quot,
11269 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32));
11270 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f16, N1: RDst, N2: RHS, N3: LHS,
11271 Flags: Op->getFlags());
11272}
11273
11274// Faster 2.5 ULP division that does not support denormals.
11275SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
11276 SDNodeFlags Flags = Op->getFlags();
11277 SDLoc SL(Op);
11278 SDValue LHS = Op.getOperand(i: 1);
11279 SDValue RHS = Op.getOperand(i: 2);
11280
11281 SDValue r1 = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: MVT::f32, Operand: RHS, Flags);
11282
11283 const APFloat K0Val(0x1p+96f);
11284 const SDValue K0 = DAG.getConstantFP(Val: K0Val, DL: SL, VT: MVT::f32);
11285
11286 const APFloat K1Val(0x1p-32f);
11287 const SDValue K1 = DAG.getConstantFP(Val: K1Val, DL: SL, VT: MVT::f32);
11288
11289 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f32);
11290
11291 EVT SetCCVT =
11292 getSetCCResultType(DL: DAG.getDataLayout(), Ctx&: *DAG.getContext(), VT: MVT::f32);
11293
11294 SDValue r2 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: r1, RHS: K0, Cond: ISD::SETOGT);
11295
11296 SDValue r3 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f32, N1: r2, N2: K1, N3: One, Flags);
11297
11298 r1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: RHS, N2: r3, Flags);
11299
11300 // rcp does not support denormals.
11301 SDValue r0 = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: r1, Flags);
11302
11303 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: LHS, N2: r0, Flags);
11304
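  // A note on why the scaling above is correct (sketch): r3 is 2^-32 when
  // |rhs| > 2^96 and 1.0 otherwise, so the value returned below is
  //   (lhs * rcp(rhs * r3)) * r3 == lhs / rhs,
  // with the pre-scale keeping rcp's operand in range and the post-scale
  // cancelling it.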
11305 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: r3, N2: Mul, Flags);
11306}
11307
11308// Returns immediate value for setting the F32 denorm mode when using the
11309// S_DENORM_MODE instruction.
11310static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
11311 const SIMachineFunctionInfo *Info,
11312 const GCNSubtarget *ST) {
11313 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
11314 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
11315 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
11316 return DAG.getTargetConstant(Val: Mode, DL: SDLoc(), VT: MVT::i32);
11317}
11318
11319SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
11320 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
11321 return FastLowered;
11322
11323 // The selection matcher assumes anything with a chain selects to a
11324 // mayRaiseFPException machine instruction. Since we're introducing a chain
11325 // here, we need to explicitly report nofpexcept for the regular fdiv
11326 // lowering.
11327 SDNodeFlags Flags = Op->getFlags();
11328 Flags.setNoFPExcept(true);
11329
11330 SDLoc SL(Op);
11331 SDValue LHS = Op.getOperand(i: 0);
11332 SDValue RHS = Op.getOperand(i: 1);
11333
11334 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f32);
11335
11336 SDVTList ScaleVT = DAG.getVTList(VT1: MVT::f32, VT2: MVT::i1);
11337
11338 SDValue DenominatorScaled =
11339 DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, Ops: {RHS, RHS, LHS}, Flags);
11340 SDValue NumeratorScaled =
11341 DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, Ops: {LHS, RHS, LHS}, Flags);
11342
11343 // Denominator is scaled to not be denormal, so using rcp is ok.
11344 SDValue ApproxRcp =
11345 DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: DenominatorScaled, Flags);
11346 SDValue NegDivScale0 =
11347 DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: DenominatorScaled, Flags);
11348
11349 using namespace AMDGPU::Hwreg;
11350 const unsigned Denorm32Reg = HwregEncoding::encode(Values: ID_MODE, Values: 4, Values: 2);
11351 const SDValue BitField = DAG.getTargetConstant(Val: Denorm32Reg, DL: SL, VT: MVT::i32);
11352
11353 const MachineFunction &MF = DAG.getMachineFunction();
11354 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
11355 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
11356
11357 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
11358 const bool HasDynamicDenormals =
11359 (DenormMode.Input == DenormalMode::Dynamic) ||
11360 (DenormMode.Output == DenormalMode::Dynamic);
11361
11362 SDValue SavedDenormMode;
11363
11364 if (!PreservesDenormals) {
11365 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
11366 // lowering. The chain dependence is insufficient, and we need glue. We do
11367 // not need the glue variants in a strictfp function.
11368
11369 SDVTList BindParamVTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
11370
11371 SDValue Glue = DAG.getEntryNode();
11372 if (HasDynamicDenormals) {
11373 SDNode *GetReg = DAG.getMachineNode(Opcode: AMDGPU::S_GETREG_B32, dl: SL,
11374 VTs: DAG.getVTList(VT1: MVT::i32, VT2: MVT::Glue),
11375 Ops: {BitField, Glue});
11376 SavedDenormMode = SDValue(GetReg, 0);
11377
11378 Glue = DAG.getMergeValues(
11379 Ops: {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, dl: SL);
11380 }
11381
11382 SDNode *EnableDenorm;
11383 if (Subtarget->hasDenormModeInst()) {
11384 const SDValue EnableDenormValue =
11385 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, ST: Subtarget);
11386
11387 EnableDenorm = DAG.getNode(Opcode: AMDGPUISD::DENORM_MODE, DL: SL, VTList: BindParamVTs, N1: Glue,
11388 N2: EnableDenormValue)
11389 .getNode();
11390 } else {
11391 const SDValue EnableDenormValue =
11392 DAG.getConstant(FP_DENORM_FLUSH_NONE, DL: SL, VT: MVT::i32);
11393 EnableDenorm = DAG.getMachineNode(Opcode: AMDGPU::S_SETREG_B32, dl: SL, VTs: BindParamVTs,
11394 Ops: {EnableDenormValue, BitField, Glue});
11395 }
11396
11397 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
11398 SDValue(EnableDenorm, 1)};
11399
11400 NegDivScale0 = DAG.getMergeValues(Ops, dl: SL);
11401 }
11402
11403 SDValue Fma0 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0,
11404 B: ApproxRcp, C: One, GlueChain: NegDivScale0, Flags);
11405
11406 SDValue Fma1 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: Fma0, B: ApproxRcp,
11407 C: ApproxRcp, GlueChain: Fma0, Flags);
11408
11409 SDValue Mul = getFPBinOp(DAG, Opcode: ISD::FMUL, SL, VT: MVT::f32, A: NumeratorScaled, B: Fma1,
11410 GlueChain: Fma1, Flags);
11411
11412 SDValue Fma2 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0, B: Mul,
11413 C: NumeratorScaled, GlueChain: Mul, Flags);
11414
11415 SDValue Fma3 =
11416 getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: Fma2, B: Fma1, C: Mul, GlueChain: Fma2, Flags);
11417
11418 SDValue Fma4 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0, B: Fma3,
11419 C: NumeratorScaled, GlueChain: Fma3, Flags);
11420
11421 if (!PreservesDenormals) {
11422 SDNode *DisableDenorm;
11423 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
11424 const SDValue DisableDenormValue = getSPDenormModeValue(
11425 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, ST: Subtarget);
11426
11427 SDVTList BindParamVTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
11428 DisableDenorm =
11429 DAG.getNode(Opcode: AMDGPUISD::DENORM_MODE, DL: SL, VTList: BindParamVTs,
11430 N1: Fma4.getValue(R: 1), N2: DisableDenormValue, N3: Fma4.getValue(R: 2))
11431 .getNode();
11432 } else {
11433 assert(HasDynamicDenormals == (bool)SavedDenormMode);
11434 const SDValue DisableDenormValue =
11435 HasDynamicDenormals
11436 ? SavedDenormMode
11437 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, DL: SL, VT: MVT::i32);
11438
11439 DisableDenorm = DAG.getMachineNode(
11440 Opcode: AMDGPU::S_SETREG_B32, dl: SL, VT: MVT::Other,
11441 Ops: {DisableDenormValue, BitField, Fma4.getValue(R: 1), Fma4.getValue(R: 2)});
11442 }
11443
11444 SDValue OutputChain = DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other,
11445 N1: SDValue(DisableDenorm, 0), N2: DAG.getRoot());
11446 DAG.setRoot(OutputChain);
11447 }
11448
11449 SDValue Scale = NumeratorScaled.getValue(R: 1);
11450 SDValue Fmas = DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL: SL, VT: MVT::f32,
11451 Ops: {Fma4, Fma1, Fma3, Scale}, Flags);
11452
11453 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f32, N1: Fmas, N2: RHS, N3: LHS, Flags);
11454}
11455
11456SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
11457 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
11458 return FastLowered;
11459
11460 SDLoc SL(Op);
11461 SDValue X = Op.getOperand(i: 0);
11462 SDValue Y = Op.getOperand(i: 1);
11463
11464 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f64);
11465
11466 SDVTList ScaleVT = DAG.getVTList(VT1: MVT::f64, VT2: MVT::i1);
11467
11468 SDValue DivScale0 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: Y, N2: Y, N3: X);
11469
11470 SDValue NegDivScale0 = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f64, Operand: DivScale0);
11471
11472 SDValue Rcp = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f64, Operand: DivScale0);
11473
11474 SDValue Fma0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Rcp, N3: One);
11475
11476 SDValue Fma1 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: Rcp, N2: Fma0, N3: Rcp);
11477
11478 SDValue Fma2 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Fma1, N3: One);
11479
11480 SDValue DivScale1 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: X, N2: Y, N3: X);
11481
11482 SDValue Fma3 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: Fma1, N2: Fma2, N3: Fma1);
11483 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f64, N1: DivScale1, N2: Fma3);
11484
11485 SDValue Fma4 =
11486 DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Mul, N3: DivScale1);
11487
11488 SDValue Scale;
11489
11490 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
11491 // Workaround a hardware bug on SI where the condition output from div_scale
11492 // is not usable.
11493
11494 const SDValue Hi = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
11495
11496 // Figure out which scale to use for div_fmas.
11497 SDValue NumBC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: X);
11498 SDValue DenBC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Y);
11499 SDValue Scale0BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: DivScale0);
11500 SDValue Scale1BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: DivScale1);
11501
11502 SDValue NumHi =
11503 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: NumBC, N2: Hi);
11504 SDValue DenHi =
11505 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: DenBC, N2: Hi);
11506
11507 SDValue Scale0Hi =
11508 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Scale0BC, N2: Hi);
11509 SDValue Scale1Hi =
11510 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Scale1BC, N2: Hi);
11511
11512 SDValue CmpDen = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: DenHi, RHS: Scale0Hi, Cond: ISD::SETEQ);
11513 SDValue CmpNum = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: NumHi, RHS: Scale1Hi, Cond: ISD::SETEQ);
11514 Scale = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: CmpNum, N2: CmpDen);
11515 } else {
11516 Scale = DivScale1.getValue(R: 1);
11517 }
11518
11519 SDValue Fmas =
11520 DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL: SL, VT: MVT::f64, N1: Fma4, N2: Fma3, N3: Mul, N4: Scale);
11521
11522 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f64, N1: Fmas, N2: Y, N3: X);
11523}
11524
11525SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
11526 EVT VT = Op.getValueType();
11527
11528 if (VT == MVT::f32)
11529 return LowerFDIV32(Op, DAG);
11530
11531 if (VT == MVT::f64)
11532 return LowerFDIV64(Op, DAG);
11533
11534 if (VT == MVT::f16)
11535 return LowerFDIV16(Op, DAG);
11536
11537 llvm_unreachable("Unexpected type for fdiv");
11538}
11539
11540SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
11541 SDLoc dl(Op);
11542 SDValue Val = Op.getOperand(i: 0);
11543 EVT VT = Val.getValueType();
11544 EVT ResultExpVT = Op->getValueType(ResNo: 1);
11545 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
11546
11547 SDValue Mant = DAG.getNode(
11548 Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT,
11549 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_frexp_mant, DL: dl, VT: MVT::i32), N2: Val);
11550
11551 SDValue Exp = DAG.getNode(
11552 Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: InstrExpVT,
11553 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_frexp_exp, DL: dl, VT: MVT::i32), N2: Val);
11554
11555 if (Subtarget->hasFractBug()) {
11556 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: dl, VT, Operand: Val);
11557 SDValue Inf =
11558 DAG.getConstantFP(Val: APFloat::getInf(Sem: VT.getFltSemantics()), DL: dl, VT);
11559
11560 SDValue IsFinite = DAG.getSetCC(DL: dl, VT: MVT::i1, LHS: Fabs, RHS: Inf, Cond: ISD::SETOLT);
11561 SDValue Zero = DAG.getConstant(Val: 0, DL: dl, VT: InstrExpVT);
11562 Exp = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: InstrExpVT, N1: IsFinite, N2: Exp, N3: Zero);
11563 Mant = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT, N1: IsFinite, N2: Mant, N3: Val);
11564 }
11565
11566 SDValue CastExp = DAG.getSExtOrTrunc(Op: Exp, DL: dl, VT: ResultExpVT);
11567 return DAG.getMergeValues(Ops: {Mant, CastExp}, dl);
11568}
11569
11570SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
11571 SDLoc DL(Op);
11572 StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
11573 EVT VT = Store->getMemoryVT();
11574
11575 if (VT == MVT::i1) {
11576 return DAG.getTruncStore(
11577 Chain: Store->getChain(), dl: DL,
11578 Val: DAG.getSExtOrTrunc(Op: Store->getValue(), DL, VT: MVT::i32),
11579 Ptr: Store->getBasePtr(), SVT: MVT::i1, MMO: Store->getMemOperand());
11580 }
11581
11582 assert(VT.isVector() &&
11583 Store->getValue().getValueType().getScalarType() == MVT::i32);
11584
11585 unsigned AS = Store->getAddressSpace();
11586 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11587 Store->getAlign().value() < VT.getStoreSize() &&
11588 VT.getSizeInBits() > 32) {
11589 return SplitVectorStore(Op, DAG);
11590 }
11591
11592 MachineFunction &MF = DAG.getMachineFunction();
11593 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11594 // If there is a possibility that a flat instruction may access scratch memory,
11595 // then we need to use the same legalization rules we use for private accesses.
11596 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11597 !Subtarget->hasMultiDwordFlatScratchAddressing())
11598 AS = addressMayBeAccessedAsPrivate(MMO: Store->getMemOperand(), Info: *MFI)
11599 ? AMDGPUAS::PRIVATE_ADDRESS
11600 : AMDGPUAS::GLOBAL_ADDRESS;
11601
11602 unsigned NumElements = VT.getVectorNumElements();
11603 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11604 if (NumElements > 4)
11605 return SplitVectorStore(Op, DAG);
11606 // v3 stores not supported on SI.
11607 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11608 return SplitVectorStore(Op, DAG);
11609
11610 if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
11611 VT, MMO: *Store->getMemOperand()))
11612 return expandUnalignedStore(ST: Store, DAG);
11613
11614 return SDValue();
11615 }
11616 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11617 switch (Subtarget->getMaxPrivateElementSize()) {
11618 case 4:
11619 return scalarizeVectorStore(ST: Store, DAG);
11620 case 8:
11621 if (NumElements > 2)
11622 return SplitVectorStore(Op, DAG);
11623 return SDValue();
11624 case 16:
11625 if (NumElements > 4 ||
11626 (NumElements == 3 && !Subtarget->enableFlatScratch()))
11627 return SplitVectorStore(Op, DAG);
11628 return SDValue();
11629 default:
11630 llvm_unreachable("unsupported private_element_size");
11631 }
11632 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11633 unsigned Fast = 0;
11634 auto Flags = Store->getMemOperand()->getFlags();
11635 if (allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace: AS,
11636 Alignment: Store->getAlign(), Flags, IsFast: &Fast) &&
11637 Fast > 1)
11638 return SDValue();
11639
11640 if (VT.isVector())
11641 return SplitVectorStore(Op, DAG);
11642
11643 return expandUnalignedStore(ST: Store, DAG);
11644 }
11645
11646 // Probably an invalid store. If so we'll end up emitting a selection error.
11647 return SDValue();
11648}
11649
11650// Avoid the full correct expansion for f32 sqrt when promoting from f16.
11651SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
11652 SDLoc SL(Op);
11653 assert(!Subtarget->has16BitInsts());
11654 SDNodeFlags Flags = Op->getFlags();
11655 SDValue Ext =
11656 DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Op.getOperand(i: 0), Flags);
11657
11658 SDValue SqrtID = DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL: SL, VT: MVT::i32);
11659 SDValue Sqrt =
11660 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::f32, N1: SqrtID, N2: Ext, Flags);
11661
11662 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::f16, N1: Sqrt,
11663 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
11664}
11665
11666SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
11667 SDLoc DL(Op);
11668 SDNodeFlags Flags = Op->getFlags();
11669 MVT VT = Op.getValueType().getSimpleVT();
11670 const SDValue X = Op.getOperand(i: 0);
11671
11672 if (allowApproxFunc(DAG, Flags)) {
11673 // The instruction is accurate to 1 ulp but ignores denormals.
11674 return DAG.getNode(
11675 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT,
11676 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL, VT: MVT::i32), N2: X, Flags);
11677 }
11678
11679 SDValue ScaleThreshold = DAG.getConstantFP(Val: 0x1.0p-96f, DL, VT);
11680 SDValue NeedScale = DAG.getSetCC(DL, VT: MVT::i1, LHS: X, RHS: ScaleThreshold, Cond: ISD::SETOLT);
11681
11682 SDValue ScaleUpFactor = DAG.getConstantFP(Val: 0x1.0p+32f, DL, VT);
11683
11684 SDValue ScaledX = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: X, N2: ScaleUpFactor, Flags);
11685
11686 SDValue SqrtX =
11687 DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledX, N3: X, Flags);
11688
11689 SDValue SqrtS;
11690 if (needsDenormHandlingF32(DAG, Src: X, Flags)) {
11691 SDValue SqrtID =
11692 DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL, VT: MVT::i32);
11693 SqrtS = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: SqrtID, N2: SqrtX, Flags);
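    // The code below nudges the 1 ulp estimate s toward the correct result (a
    // sketch): it forms the neighbouring floats s-1ulp and s+1ulp via integer
    // adds on the bit pattern, computes the residuals fma(-(s±1ulp), s, x),
    // and selects the neighbour whose residual indicates the estimate is off.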
11694
11695 SDValue SqrtSAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: SqrtS);
11696 SDValue SqrtSNextDownInt =
11697 DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SqrtSAsInt,
11698 N2: DAG.getAllOnesConstant(DL, VT: MVT::i32));
11699 SDValue SqrtSNextDown = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextDownInt);
11700
11701 SDValue NegSqrtSNextDown =
11702 DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextDown, Flags);
11703
11704 SDValue SqrtVP =
11705 DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextDown, N2: SqrtS, N3: SqrtX, Flags);
11706
11707 SDValue SqrtSNextUpInt = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SqrtSAsInt,
11708 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
11709 SDValue SqrtSNextUp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextUpInt);
11710
11711 SDValue NegSqrtSNextUp = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextUp, Flags);
11712 SDValue SqrtVS =
11713 DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextUp, N2: SqrtS, N3: SqrtX, Flags);
11714
11715 SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL, VT);
11716 SDValue SqrtVPLE0 = DAG.getSetCC(DL, VT: MVT::i1, LHS: SqrtVP, RHS: Zero, Cond: ISD::SETOLE);
11717
11718 SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPLE0, N2: SqrtSNextDown, N3: SqrtS,
11719 Flags);
11720
11721 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, VT: MVT::i1, LHS: SqrtVS, RHS: Zero, Cond: ISD::SETOGT);
11722 SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPVSGT0, N2: SqrtSNextUp, N3: SqrtS,
11723 Flags);
11724 } else {
11725 SDValue SqrtR = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: SqrtX, Flags);
11726
11727 SqrtS = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtX, N2: SqrtR, Flags);
11728
11729 SDValue Half = DAG.getConstantFP(Val: 0.5f, DL, VT);
11730 SDValue SqrtH = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtR, N2: Half, Flags);
11731 SDValue NegSqrtH = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtH, Flags);
11732
11733 SDValue SqrtE = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtH, N2: SqrtS, N3: Half, Flags);
11734 SqrtH = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtH, N2: SqrtE, N3: SqrtH, Flags);
11735 SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtS, N2: SqrtE, N3: SqrtS, Flags);
11736
11737 SDValue NegSqrtS = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtS, Flags);
11738 SDValue SqrtD =
11739 DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtS, N2: SqrtS, N3: SqrtX, Flags);
11740 SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtD, N2: SqrtH, N3: SqrtS, Flags);
11741 }
11742
11743 SDValue ScaleDownFactor = DAG.getConstantFP(Val: 0x1.0p-16f, DL, VT);
11744
11745 SDValue ScaledDown =
11746 DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtS, N2: ScaleDownFactor, Flags);
11747
11748 SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledDown, N3: SqrtS, Flags);
11749 SDValue IsZeroOrInf =
11750 DAG.getNode(Opcode: ISD::IS_FPCLASS, DL, VT: MVT::i1, N1: SqrtX,
11751 N2: DAG.getTargetConstant(Val: fcZero | fcPosInf, DL, VT: MVT::i32));
11752
11753 return DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsZeroOrInf, N2: SqrtX, N3: SqrtS, Flags);
11754}
11755
11756SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11757 // For the double type, the SQRT and RSQ instructions don't have the required
11758 // precision, so we apply Goldschmidt's algorithm to improve the result:
11759 //
11760 // y0 = rsq(x)
11761 // g0 = x * y0
11762 // h0 = 0.5 * y0
11763 //
11764 // r0 = 0.5 - h0 * g0
11765 // g1 = g0 * r0 + g0
11766 // h1 = h0 * r0 + h0
11767 //
11768 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11769 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11770 // h2 = h1 * r1 + h1
11771 //
11772 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11773 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11774 //
11775 // sqrt(x) = g3
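  //
  // A note on the scaling added below (not part of the algorithm sketch
  // above): inputs under the threshold are pre-scaled by 2^256 and the final
  // result is re-scaled by 2^-128, using sqrt(x * 2^256) == sqrt(x) * 2^128,
  // which keeps rsq away from the denormal range.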
11776
11777 SDNodeFlags Flags = Op->getFlags();
11778
11779 SDLoc DL(Op);
11780
11781 SDValue X = Op.getOperand(i: 0);
11782 SDValue ScaleConstant = DAG.getConstantFP(Val: 0x1.0p-767, DL, VT: MVT::f64);
11783
11784 SDValue Scaling = DAG.getSetCC(DL, VT: MVT::i1, LHS: X, RHS: ScaleConstant, Cond: ISD::SETOLT);
11785
11786 SDValue ZeroInt = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
11787
11788 // Scale up input if it is too small.
11789 SDValue ScaleUpFactor = DAG.getConstant(Val: 256, DL, VT: MVT::i32);
11790 SDValue ScaleUp =
11791 DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::i32, N1: Scaling, N2: ScaleUpFactor, N3: ZeroInt);
11792 SDValue SqrtX = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: X, N2: ScaleUp, Flags);
11793
11794 SDValue SqrtY = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT: MVT::f64, Operand: SqrtX);
11795
11796 SDValue SqrtS0 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: SqrtX, N2: SqrtY);
11797
11798 SDValue Half = DAG.getConstantFP(Val: 0.5, DL, VT: MVT::f64);
11799 SDValue SqrtH0 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: SqrtY, N2: Half);
11800
11801 SDValue NegSqrtH0 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtH0);
11802 SDValue SqrtR0 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtH0, N2: SqrtS0, N3: Half);
11803
11804 SDValue SqrtH1 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtH0, N2: SqrtR0, N3: SqrtH0);
11805
11806 SDValue SqrtS1 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtS0, N2: SqrtR0, N3: SqrtS0);
11807
11808 SDValue NegSqrtS1 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtS1);
11809 SDValue SqrtD0 =
11810 DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtS1, N2: SqrtS1, N3: SqrtX);
11811
11812 SDValue SqrtS2 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtD0, N2: SqrtH1, N3: SqrtS1);
11813
11814 SDValue NegSqrtS2 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtS2);
11815 SDValue SqrtD1 =
11816 DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtS2, N2: SqrtS2, N3: SqrtX);
11817
11818 SDValue SqrtRet = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtD1, N2: SqrtH1, N3: SqrtS2);
11819
11820 SDValue ScaleDownFactor = DAG.getSignedConstant(Val: -128, DL, VT: MVT::i32);
11821 SDValue ScaleDown =
11822 DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::i32, N1: Scaling, N2: ScaleDownFactor, N3: ZeroInt);
11823 SqrtRet = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: SqrtRet, N2: ScaleDown, Flags);
11824
11825 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11826 // with finite only or nsz because rsq(+/-0) = +/-inf
11827
11828 // TODO: Check for DAZ and expand to subnormals
11829 SDValue IsZeroOrInf =
11830 DAG.getNode(Opcode: ISD::IS_FPCLASS, DL, VT: MVT::i1, N1: SqrtX,
11831 N2: DAG.getTargetConstant(Val: fcZero | fcPosInf, DL, VT: MVT::i32));
11832
11833 // If x is +INF, +0, or -0, use its original value
11834 return DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::f64, N1: IsZeroOrInf, N2: SqrtX, N3: SqrtRet,
11835 Flags);
11836}
11837
11838SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11839 SDLoc DL(Op);
11840 EVT VT = Op.getValueType();
11841 SDValue Arg = Op.getOperand(i: 0);
11842 SDValue TrigVal;
11843
11844 // Propagate fast-math flags so that the multiply we introduce can be folded
11845 // if Arg is already the result of a multiply by constant.
11846 auto Flags = Op->getFlags();
11847
11848 SDValue OneOver2Pi = DAG.getConstantFP(Val: 0.5 * numbers::inv_pi, DL, VT);
11849
11850 if (Subtarget->hasTrigReducedRange()) {
11851 SDValue MulVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags);
11852 TrigVal = DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: MulVal, Flags);
11853 } else {
11854 TrigVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags);
11855 }
11856
11857 switch (Op.getOpcode()) {
11858 case ISD::FCOS:
11859 return DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL: SDLoc(Op), VT, Operand: TrigVal, Flags);
11860 case ISD::FSIN:
11861 return DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL: SDLoc(Op), VT, Operand: TrigVal, Flags);
11862 default:
11863 llvm_unreachable("Wrong trig opcode");
11864 }
11865}
11866
11867SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
11868 SelectionDAG &DAG) const {
11869 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Val&: Op);
11870 assert(AtomicNode->isCompareAndSwap());
11871 unsigned AS = AtomicNode->getAddressSpace();
11872
11873 // No custom lowering required for local address space
11874 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11875 return Op;
11876
11877 // Non-local address spaces require custom lowering for atomic compare and
11878 // swap; the cmp and swap values are packed into a v2i32, or a v2i64 for the _X2 variants.
11879 SDLoc DL(Op);
11880 SDValue ChainIn = Op.getOperand(i: 0);
11881 SDValue Addr = Op.getOperand(i: 1);
11882 SDValue Old = Op.getOperand(i: 2);
11883 SDValue New = Op.getOperand(i: 3);
11884 EVT VT = Op.getValueType();
11885 MVT SimpleVT = VT.getSimpleVT();
11886 MVT VecType = MVT::getVectorVT(VT: SimpleVT, NumElements: 2);
11887
11888 SDValue NewOld = DAG.getBuildVector(VT: VecType, DL, Ops: {New, Old});
11889 SDValue Ops[] = {ChainIn, Addr, NewOld};
11890
11891 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::ATOMIC_CMP_SWAP, dl: DL,
11892 VTList: Op->getVTList(), Ops, MemVT: VT,
11893 MMO: AtomicNode->getMemOperand());
11894}
11895
11896//===----------------------------------------------------------------------===//
11897// Custom DAG optimizations
11898//===----------------------------------------------------------------------===//
11899
11900SDValue
11901SITargetLowering::performUCharToFloatCombine(SDNode *N,
11902 DAGCombinerInfo &DCI) const {
11903 EVT VT = N->getValueType(ResNo: 0);
11904 EVT ScalarVT = VT.getScalarType();
11905 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11906 return SDValue();
11907
11908 SelectionDAG &DAG = DCI.DAG;
11909 SDLoc DL(N);
11910
11911 SDValue Src = N->getOperand(Num: 0);
11912 EVT SrcVT = Src.getValueType();
11913
11914 // TODO: We could try to match extracting the higher bytes, which would be
11915 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11916 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11917 // about in practice.
11918 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11919 if (DAG.MaskedValueIsZero(Op: Src, Mask: APInt::getHighBitsSet(numBits: 32, hiBitsSet: 24))) {
11920 SDValue Cvt = DAG.getNode(Opcode: AMDGPUISD::CVT_F32_UBYTE0, DL, VT: MVT::f32, Operand: Src);
11921 DCI.AddToWorklist(N: Cvt.getNode());
11922
11923 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11924 if (ScalarVT != MVT::f32) {
11925 Cvt = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Cvt,
11926 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
11927 }
11928 return Cvt;
11929 }
11930 }
11931
11932 return SDValue();
11933}
11934
11935SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11936 DAGCombinerInfo &DCI) const {
11937 SDValue MagnitudeOp = N->getOperand(Num: 0);
11938 SDValue SignOp = N->getOperand(Num: 1);
11939
11940 // The generic combine for fcopysign + fp cast is too conservative with
11941 // vectors, and also gets confused by the splitting we will perform here, so
11942 // peek through FP casts.
11943 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
11944 SignOp.getOpcode() == ISD::FP_ROUND)
11945 SignOp = SignOp.getOperand(i: 0);
11946
11947 SelectionDAG &DAG = DCI.DAG;
11948 SDLoc DL(N);
11949 EVT SignVT = SignOp.getValueType();
11950
11951 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11952 // lower half with a copy.
11953 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11954 EVT MagVT = MagnitudeOp.getValueType();
11955
11956 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
11957
11958 if (MagVT.getScalarType() == MVT::f64) {
11959 EVT F32VT = MagVT.isVector()
11960 ? EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::f32, NumElements: 2 * NumElts)
11961 : MVT::v2f32;
11962
11963 SDValue MagAsVector = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: F32VT, Operand: MagnitudeOp);
11964
11965 SmallVector<SDValue, 8> NewElts;
11966 for (unsigned I = 0; I != NumElts; ++I) {
11967 SDValue MagLo =
11968 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: MagAsVector,
11969 N2: DAG.getConstant(Val: 2 * I, DL, VT: MVT::i32));
11970 SDValue MagHi =
11971 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: MagAsVector,
11972 N2: DAG.getConstant(Val: 2 * I + 1, DL, VT: MVT::i32));
11973
11974 SDValue SignOpElt =
11975 MagVT.isVector()
11976 ? DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: SignVT.getScalarType(),
11977 N1: SignOp, N2: DAG.getConstant(Val: I, DL, VT: MVT::i32))
11978 : SignOp;
11979
11980 SDValue HiOp =
11981 DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: MVT::f32, N1: MagHi, N2: SignOpElt);
11982
11983 SDValue Vector =
11984 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: MVT::v2f32, N1: MagLo, N2: HiOp);
11985
11986 SDValue NewElt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f64, Operand: Vector);
11987 NewElts.push_back(Elt: NewElt);
11988 }
11989
11990 if (NewElts.size() == 1)
11991 return NewElts[0];
11992
11993 return DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: MagVT, Ops: NewElts);
11994 }
11995
11996 if (SignVT.getScalarType() != MVT::f64)
11997 return SDValue();
11998
11999 // Reduce width of sign operand, we only need the highest bit.
12000 //
12001 // fcopysign f64:x, f64:y ->
12002 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
12003 // TODO: In some cases it might make sense to go all the way to f16.
12004
12005 EVT F32VT = MagVT.isVector()
12006 ? EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::f32, NumElements: 2 * NumElts)
12007 : MVT::v2f32;
12008
12009 SDValue SignAsVector = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: F32VT, Operand: SignOp);
12010
12011 SmallVector<SDValue, 8> F32Signs;
12012 for (unsigned I = 0; I != NumElts; ++I) {
12013 // Take sign from odd elements of cast vector
12014 SDValue SignAsF32 =
12015 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: SignAsVector,
12016 N2: DAG.getConstant(Val: 2 * I + 1, DL, VT: MVT::i32));
12017 F32Signs.push_back(Elt: SignAsF32);
12018 }
12019
12020 SDValue NewSign =
12021 NumElts == 1
12022 ? F32Signs.back()
12023 : DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL,
12024 VT: EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::f32, NumElements: NumElts),
12025 Ops: F32Signs);
12026
12027 return DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 0),
12028 N2: NewSign);
12029}
12030
12031// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
12032// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
12033// bits
12034
12035// This is a variant of
12036// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
12037//
12038// The normal DAG combiner will do this, but only if the add has one use since
12039// that would increase the number of instructions.
12040//
12041// This prevents us from seeing a constant offset that can be folded into a
12042// memory instruction's addressing mode. If we know the resulting add offset of
12043 // a pointer can be folded into an addressing offset, we can replace the pointer
12044 // operand with the add of the new constant offset. This eliminates one of the uses,
12045// and may allow the remaining use to also be simplified.
12046//
12047SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12048 EVT MemVT,
12049 DAGCombinerInfo &DCI) const {
12050 SDValue N0 = N->getOperand(Num: 0);
12051 SDValue N1 = N->getOperand(Num: 1);
12052
12053 // We only do this here for the multiple-use case, where it is profitable;
12054 // with a single use of the add, defer to the standard combine.
12055 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
12056 N0->hasOneUse())
12057 return SDValue();
12058
12059 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Val&: N1);
12060 if (!CN1)
12061 return SDValue();
12062
12063 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
12064 if (!CAdd)
12065 return SDValue();
12066
12067 SelectionDAG &DAG = DCI.DAG;
12068
12069 if (N0->getOpcode() == ISD::OR &&
12070 !DAG.haveNoCommonBitsSet(A: N0.getOperand(i: 0), B: N0.getOperand(i: 1)))
12071 return SDValue();
12072
12073 // If the resulting offset is too large, we can't fold it into the
12074 // addressing mode offset.
12075 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
12076 Type *Ty = MemVT.getTypeForEVT(Context&: *DCI.DAG.getContext());
12077
12078 AddrMode AM;
12079 AM.HasBaseReg = true;
12080 AM.BaseOffs = Offset.getSExtValue();
12081 if (!isLegalAddressingMode(DL: DCI.DAG.getDataLayout(), AM, Ty, AS: AddrSpace))
12082 return SDValue();
12083
12084 SDLoc SL(N);
12085 EVT VT = N->getValueType(ResNo: 0);
12086
12087 SDValue ShlX = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: N0.getOperand(i: 0), N2: N1);
12088 SDValue COffset = DAG.getConstant(Val: Offset, DL: SL, VT);
12089
12090 SDNodeFlags Flags;
12091 Flags.setNoUnsignedWrap(
12092 N->getFlags().hasNoUnsignedWrap() &&
12093 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
12094
12095 return DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ShlX, N2: COffset, Flags);
12096}
12097
12098 /// MemSDNode::getBasePtr() does not work for intrinsics, whose base pointer
12099 /// operand is offset by the chain and intrinsic ID. Theoretically we would also
12100 /// need to check the specific intrinsic, but they all place the pointer operand first.
12101static unsigned getBasePtrIndex(const MemSDNode *N) {
12102 switch (N->getOpcode()) {
12103 case ISD::STORE:
12104 case ISD::INTRINSIC_W_CHAIN:
12105 case ISD::INTRINSIC_VOID:
12106 return 2;
12107 default:
12108 return 1;
12109 }
12110}
12111
12112SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
12113 DAGCombinerInfo &DCI) const {
12114 SelectionDAG &DAG = DCI.DAG;
12115
12116 unsigned PtrIdx = getBasePtrIndex(N);
12117 SDValue Ptr = N->getOperand(Num: PtrIdx);
12118
12119 // TODO: We could also do this for multiplies.
12120 if (Ptr.getOpcode() == ISD::SHL) {
12121 SDValue NewPtr = performSHLPtrCombine(N: Ptr.getNode(), AddrSpace: N->getAddressSpace(),
12122 MemVT: N->getMemoryVT(), DCI);
12123 if (NewPtr) {
12124 SmallVector<SDValue, 8> NewOps(N->ops());
12125
12126 NewOps[PtrIdx] = NewPtr;
12127 return SDValue(DAG.UpdateNodeOperands(N, Ops: NewOps), 0);
12128 }
12129 }
12130
12131 return SDValue();
12132}
12133
12134static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
12135 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
12136 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
12137 (Opc == ISD::XOR && Val == 0);
12138}
12139
12140 // Break up a 64-bit bitwise operation with a constant into two 32-bit and/or/xor
12141 // operations. This will typically happen anyway for a VALU 64-bit and. This exposes
12142 // other 32-bit integer combine opportunities, since most 64-bit operations are
12143 // decomposed this way. TODO: We won't want this for SALU, especially if it is an
12144 // inline immediate.
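// For example (an illustrative constant): (and i64 %x, 0x00000000ffffffff)
// splits into (and i32 %x.lo, 0xffffffff) and (and i32 %x.hi, 0x0), both of
// which are trivially reducible, which is what bitOpWithConstantIsReducible
// checks for each half below.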
12145SDValue SITargetLowering::splitBinaryBitConstantOp(
12146 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
12147 const ConstantSDNode *CRHS) const {
12148 uint64_t Val = CRHS->getZExtValue();
12149 uint32_t ValLo = Lo_32(Value: Val);
12150 uint32_t ValHi = Hi_32(Value: Val);
12151 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12152
12153 if ((bitOpWithConstantIsReducible(Opc, Val: ValLo) ||
12154 bitOpWithConstantIsReducible(Opc, Val: ValHi)) ||
12155 (CRHS->hasOneUse() && !TII->isInlineConstant(Imm: CRHS->getAPIntValue()))) {
12156 // If we need to materialize a 64-bit immediate, it will be split up later
12157 // anyway. Avoid creating the harder to understand 64-bit immediate
12158 // materialization.
12159 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
12160 }
12161
12162 return SDValue();
12163}
12164
12165bool llvm::isBoolSGPR(SDValue V) {
12166 if (V.getValueType() != MVT::i1)
12167 return false;
12168 switch (V.getOpcode()) {
12169 default:
12170 break;
12171 case ISD::SETCC:
12172 case ISD::IS_FPCLASS:
12173 case AMDGPUISD::FP_CLASS:
12174 return true;
12175 case ISD::AND:
12176 case ISD::OR:
12177 case ISD::XOR:
12178 return isBoolSGPR(V: V.getOperand(i: 0)) && isBoolSGPR(V: V.getOperand(i: 1));
12179 case ISD::SADDO:
12180 case ISD::UADDO:
12181 case ISD::SSUBO:
12182 case ISD::USUBO:
12183 case ISD::SMULO:
12184 case ISD::UMULO:
12185 return V.getResNo() == 1;
12186 case ISD::INTRINSIC_WO_CHAIN: {
12187 unsigned IntrinsicID = V.getConstantOperandVal(i: 0);
12188 switch (IntrinsicID) {
12189 case Intrinsic::amdgcn_is_shared:
12190 case Intrinsic::amdgcn_is_private:
12191 return true;
12192 default:
12193 return false;
12194 }
12195
12196 return false;
12197 }
12198 }
12199 return false;
12200}
12201
// If every byte of the constant is either all zeroes (0x00) or all ones (0xff),
// return the constant unchanged. Otherwise return 0.
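// For example (illustrative values): C = 0x00ff00ff consists only of 0x00 and
// 0xff bytes and is returned unchanged, while C = 0x00000f00 has a partially
// set byte and yields 0.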
12204static uint32_t getConstantPermuteMask(uint32_t C) {
12205 // 0xff for any zero byte in the mask
12206 uint32_t ZeroByteMask = 0;
12207 if (!(C & 0x000000ff))
12208 ZeroByteMask |= 0x000000ff;
12209 if (!(C & 0x0000ff00))
12210 ZeroByteMask |= 0x0000ff00;
12211 if (!(C & 0x00ff0000))
12212 ZeroByteMask |= 0x00ff0000;
12213 if (!(C & 0xff000000))
12214 ZeroByteMask |= 0xff000000;
12215 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
12216 if ((NonZeroByteMask & C) != NonZeroByteMask)
12217 return 0; // Partial bytes selected.
12218 return C;
12219}
12220
// Check if a node selects whole bytes from its operand 0 starting at a byte
// boundary while masking the rest. Returns the select mask as used by
// v_perm_b32, or ~0 if the node does not match.
12224// Note byte select encoding:
12225// value 0-3 selects corresponding source byte;
12226// value 0xc selects zero;
12227// value 0xff selects 0xff.
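// Worked examples (hypothetical inputs): (and x, 0x0000ffff) yields the mask
// 0x0c0c0100 (low two bytes taken from x, high two bytes zero), and
// (shl x, 16) yields 0x01000c0c (bytes 0 and 1 of x moved to bytes 2 and 3,
// zeros below).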
12228static uint32_t getPermuteMask(SDValue V) {
12229 assert(V.getValueSizeInBits() == 32);
12230
12231 if (V.getNumOperands() != 2)
12232 return ~0;
12233
12234 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(Val: V.getOperand(i: 1));
12235 if (!N1)
12236 return ~0;
12237
12238 uint32_t C = N1->getZExtValue();
12239
12240 switch (V.getOpcode()) {
12241 default:
12242 break;
12243 case ISD::AND:
12244 if (uint32_t ConstMask = getConstantPermuteMask(C))
12245 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
12246 break;
12247
12248 case ISD::OR:
12249 if (uint32_t ConstMask = getConstantPermuteMask(C))
12250 return (0x03020100 & ~ConstMask) | ConstMask;
12251 break;
12252
12253 case ISD::SHL:
12254 if (C % 8)
12255 return ~0;
12256
12257 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
12258
12259 case ISD::SRL:
12260 if (C % 8)
12261 return ~0;
12262
12263 return uint32_t(0x0c0c0c0c03020100ull >> C);
12264 }
12265
12266 return ~0;
12267}
12268
12269SDValue SITargetLowering::performAndCombine(SDNode *N,
12270 DAGCombinerInfo &DCI) const {
12271 if (DCI.isBeforeLegalize())
12272 return SDValue();
12273
12274 SelectionDAG &DAG = DCI.DAG;
12275 EVT VT = N->getValueType(ResNo: 0);
12276 SDValue LHS = N->getOperand(Num: 0);
12277 SDValue RHS = N->getOperand(Num: 1);
12278
12279 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
12280 if (VT == MVT::i64 && CRHS) {
12281 if (SDValue Split =
12282 splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::AND, LHS, CRHS))
12283 return Split;
12284 }
12285
12286 if (CRHS && VT == MVT::i32) {
12287 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
12288 // nb = number of trailing zeroes in mask
12289 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
    // given that we are selecting 8- or 16-bit fields starting at a byte
    // boundary.
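    // Illustrative case (values invented for exposition): (and (srl x, 8),
    // 0xff00) has Mask = 0xff00, Bits = 8, NB = 8 and Shift = 8, so
    // Offset = 16 and the combine emits (shl (bfe_u32 x, 16, 8), 8).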
12291 uint64_t Mask = CRHS->getZExtValue();
12292 unsigned Bits = llvm::popcount(Value: Mask);
12293 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
12294 (Bits == 8 || Bits == 16) && isShiftedMask_64(Value: Mask) && !(Mask & 1)) {
12295 if (auto *CShift = dyn_cast<ConstantSDNode>(Val: LHS->getOperand(Num: 1))) {
12296 unsigned Shift = CShift->getZExtValue();
12297 unsigned NB = CRHS->getAPIntValue().countr_zero();
12298 unsigned Offset = NB + Shift;
12299 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
12300 SDLoc SL(N);
12301 SDValue BFE =
12302 DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT: MVT::i32, N1: LHS->getOperand(Num: 0),
12303 N2: DAG.getConstant(Val: Offset, DL: SL, VT: MVT::i32),
12304 N3: DAG.getConstant(Val: Bits, DL: SL, VT: MVT::i32));
12305 EVT NarrowVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: Bits);
12306 SDValue Ext = DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT, N1: BFE,
12307 N2: DAG.getValueType(NarrowVT));
12308 SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL: SDLoc(LHS), VT, N1: Ext,
12309 N2: DAG.getConstant(Val: NB, DL: SDLoc(CRHS), VT: MVT::i32));
12310 return Shl;
12311 }
12312 }
12313 }
12314
12315 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12316 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
12317 isa<ConstantSDNode>(Val: LHS.getOperand(i: 2))) {
12318 uint32_t Sel = getConstantPermuteMask(C: Mask);
12319 if (!Sel)
12320 return SDValue();
12321
12322 // Select 0xc for all zero bytes
12323 Sel = (LHS.getConstantOperandVal(i: 2) & Sel) | (~Sel & 0x0c0c0c0c);
12324 SDLoc DL(N);
12325 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
12326 N2: LHS.getOperand(i: 1), N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
12327 }
12328 }
12329
12330 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
12331 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
12332 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
12333 ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: 2))->get();
12334 ISD::CondCode RCC = cast<CondCodeSDNode>(Val: RHS.getOperand(i: 2))->get();
12335
12336 SDValue X = LHS.getOperand(i: 0);
12337 SDValue Y = RHS.getOperand(i: 0);
12338 if (Y.getOpcode() != ISD::FABS || Y.getOperand(i: 0) != X ||
12339 !isTypeLegal(VT: X.getValueType()))
12340 return SDValue();
12341
12342 if (LCC == ISD::SETO) {
12343 if (X != LHS.getOperand(i: 1))
12344 return SDValue();
12345
12346 if (RCC == ISD::SETUNE) {
12347 const ConstantFPSDNode *C1 =
12348 dyn_cast<ConstantFPSDNode>(Val: RHS.getOperand(i: 1));
12349 if (!C1 || !C1->isInfinity() || C1->isNegative())
12350 return SDValue();
12351
12352 const uint32_t Mask = SIInstrFlags::N_NORMAL |
12353 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
12354 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
12355 SIInstrFlags::P_NORMAL;
12356
12357 static_assert(
12358 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
12359 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
12360 0x3ff) == Mask,
12361 "mask not equal");
12362
12363 SDLoc DL(N);
12364 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: X,
12365 N2: DAG.getConstant(Val: Mask, DL, VT: MVT::i32));
12366 }
12367 }
12368 }
12369
12370 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
12371 std::swap(a&: LHS, b&: RHS);
12372
12373 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12374 RHS.hasOneUse()) {
12375 ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: 2))->get();
12376 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
12377 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
12378 // | n_nan)
12379 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: 1));
12380 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
12381 (RHS.getOperand(i: 0) == LHS.getOperand(i: 0) &&
12382 LHS.getOperand(i: 0) == LHS.getOperand(i: 1))) {
12383 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
12384 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
12385 : Mask->getZExtValue() & OrdMask;
12386
12387 SDLoc DL(N);
12388 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: RHS.getOperand(i: 0),
12389 N2: DAG.getConstant(Val: NewMask, DL, VT: MVT::i32));
12390 }
12391 }
12392
12393 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
12394 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
12395 // and x, (sext cc from i1) => select cc, x, 0
12396 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
12397 std::swap(a&: LHS, b&: RHS);
12398 if (isBoolSGPR(V: RHS.getOperand(i: 0)))
12399 return DAG.getSelect(DL: SDLoc(N), VT: MVT::i32, Cond: RHS.getOperand(i: 0), LHS,
12400 RHS: DAG.getConstant(Val: 0, DL: SDLoc(N), VT: MVT::i32));
12401 }
12402
12403 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12404 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12405 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12406 N->isDivergent() && TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {
12407 uint32_t LHSMask = getPermuteMask(V: LHS);
12408 uint32_t RHSMask = getPermuteMask(V: RHS);
12409 if (LHSMask != ~0u && RHSMask != ~0u) {
12410 // Canonicalize the expression in an attempt to have fewer unique masks
12411 // and therefore fewer registers used to hold the masks.
12412 if (LHSMask > RHSMask) {
12413 std::swap(a&: LHSMask, b&: RHSMask);
12414 std::swap(a&: LHS, b&: RHS);
12415 }
12416
      // Mark with 0xc each byte lane that actually selects a source byte. Zero
      // bytes have 0xc in the mask, 0xff bytes have 0xff, and real source bytes
      // use selectors in the 0-3 range.
12419 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12420 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12421
      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we just select the high word and the low word, keep it for SDWA.
12425 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12426 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Each byte of each mask is either a selector in the 0-3 range, or has
        // higher bits set: 0xff for a 0xff byte and 0x0c for a zero byte. If
        // either mask has 0x0c for a byte, the result byte must be zero (0x0c);
        // otherwise the mask that is not 0xff wins. ANDing the two masks gives
        // the correct result, except that bytes which should be 0x0c may need
        // to be corrected back to exactly 0x0c below.
12432 uint32_t Mask = LHSMask & RHSMask;
12433 for (unsigned I = 0; I < 32; I += 8) {
12434 uint32_t ByteSel = 0xff << I;
12435 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
12436 Mask &= (0x0c << I) & 0xffffffff;
12437 }
12438
12439 // Add 4 to each active LHS lane. It will not affect any existing 0xff
12440 // or 0x0c.
12441 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
12442 SDLoc DL(N);
12443
12444 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
12445 N2: RHS.getOperand(i: 0),
12446 N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
12447 }
12448 }
12449 }
12450
12451 return SDValue();
12452}
12453
// A key component of v_perm is the mapping between the byte positions of its
// source operands and the byte positions of the destination. To build it we
// need: 1. the node that provides byte x of the dest of the OR, and 2. the byte
// of that node which provides it. calculateByteProvider finds which node
// provides a given byte of the dest of the OR, and calculateSrcByte takes that
// node and finds the ultimate source and byte position. For example, the
// supported LoadCombine pattern for vector loads is as follows:
12460// LoadCombine pattern for vector loads is as follows
12461// t1
12462// or
12463// / \
12464// t2 t3
12465// zext shl
12466// | | \
12467// t4 t5 16
12468// or anyext
12469// / \ |
12470// t6 t7 t8
12471// srl shl or
12472// / | / \ / \
12473// t9 t10 t11 t12 t13 t14
12474// trunc* 8 trunc* 8 and and
12475// | | / | | \
12476// t15 t16 t17 t18 t19 t20
12477// trunc* 255 srl -256
12478// | / \
12479// t15 t15 16
12480//
12481// *In this example, the truncs are from i32->i16
12482//
12483// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
// respectively. calculateSrcByte would find (given node) -> ultimate src &
// byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
12486// After finding the mapping, we can combine the tree into vperm t15, t16,
12487// 0x05000407
12488
12489// Find the source and byte position from a node.
12490// \p DestByte is the byte position of the dest of the or that the src
// ultimately provides. \p SrcIndex is the byte of the src that maps to this
// byte of the dest of the or. \p Depth tracks how many recursive iterations we
// have
12493// performed.
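// As a hypothetical example, calling calculateSrcByte on (srl t, 16) with
// SrcIndex = 0 adds 16 / 8 = 2 to SrcIndex and recurses into t, ultimately
// reporting byte 2 of t as the source.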
12494static const std::optional<ByteProvider<SDValue>>
12495calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
12496 unsigned Depth = 0) {
12497 // We may need to recursively traverse a series of SRLs
12498 if (Depth >= 6)
12499 return std::nullopt;
12500
12501 if (Op.getValueSizeInBits() < 8)
12502 return std::nullopt;
12503
12504 if (Op.getValueType().isVector())
12505 return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex);
12506
12507 switch (Op->getOpcode()) {
12508 case ISD::TRUNCATE: {
12509 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
12510 }
12511
12512 case ISD::SIGN_EXTEND:
12513 case ISD::ZERO_EXTEND:
12514 case ISD::SIGN_EXTEND_INREG: {
12515 SDValue NarrowOp = Op->getOperand(Num: 0);
12516 auto NarrowVT = NarrowOp.getValueType();
12517 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
12518 auto *VTSign = cast<VTSDNode>(Val: Op->getOperand(Num: 1));
12519 NarrowVT = VTSign->getVT();
12520 }
12521 if (!NarrowVT.isByteSized())
12522 return std::nullopt;
12523 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
12524
12525 if (SrcIndex >= NarrowByteWidth)
12526 return std::nullopt;
12527 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
12528 }
12529
12530 case ISD::SRA:
12531 case ISD::SRL: {
12532 auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
12533 if (!ShiftOp)
12534 return std::nullopt;
12535
12536 uint64_t BitShift = ShiftOp->getZExtValue();
12537
12538 if (BitShift % 8 != 0)
12539 return std::nullopt;
12540
12541 SrcIndex += BitShift / 8;
12542
12543 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
12544 }
12545
12546 default: {
12547 return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex);
12548 }
12549 }
12550 llvm_unreachable("fully handled switch");
12551}
12552
12553// For a byte position in the result of an Or, traverse the tree and find the
12554// node (and the byte of the node) which ultimately provides this {Or,
12555// BytePosition}. \p Op is the operand we are currently examining. \p Index is
12556// the byte position of the Op that corresponds with the originally requested
// byte of the Or. \p Depth tracks how many recursive iterations we have
// performed. \p StartingIndex is the originally requested byte of the Or.
12559static const std::optional<ByteProvider<SDValue>>
12560calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
12561 unsigned StartingIndex = 0) {
  // Finding the Src tree of the RHS of an or typically requires at least one
  // additional level of depth.
12564 if (Depth > 6)
12565 return std::nullopt;
12566
12567 unsigned BitWidth = Op.getScalarValueSizeInBits();
12568 if (BitWidth % 8 != 0)
12569 return std::nullopt;
12570 if (Index > BitWidth / 8 - 1)
12571 return std::nullopt;
12572
12573 bool IsVec = Op.getValueType().isVector();
12574 switch (Op.getOpcode()) {
12575 case ISD::OR: {
12576 if (IsVec)
12577 return std::nullopt;
12578
12579 auto RHS = calculateByteProvider(Op: Op.getOperand(i: 1), Index, Depth: Depth + 1,
12580 StartingIndex);
12581 if (!RHS)
12582 return std::nullopt;
12583 auto LHS = calculateByteProvider(Op: Op.getOperand(i: 0), Index, Depth: Depth + 1,
12584 StartingIndex);
12585 if (!LHS)
12586 return std::nullopt;
12587 // A well formed Or will have two ByteProviders for each byte, one of which
12588 // is constant zero
12589 if (!LHS->isConstantZero() && !RHS->isConstantZero())
12590 return std::nullopt;
12591 if (!LHS || LHS->isConstantZero())
12592 return RHS;
12593 if (!RHS || RHS->isConstantZero())
12594 return LHS;
12595 return std::nullopt;
12596 }
12597
12598 case ISD::AND: {
12599 if (IsVec)
12600 return std::nullopt;
12601
12602 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
12603 if (!BitMaskOp)
12604 return std::nullopt;
12605
12606 uint32_t BitMask = BitMaskOp->getZExtValue();
    // Bits we expect for our Index.
12608 uint32_t IndexMask = 0xFF << (Index * 8);
12609
12610 if ((IndexMask & BitMask) != IndexMask) {
12611 // If the result of the and partially provides the byte, then it
12612 // is not well formatted
12613 if (IndexMask & BitMask)
12614 return std::nullopt;
12615 return ByteProvider<SDValue>::getConstantZero();
12616 }
12617
12618 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte: StartingIndex, SrcIndex: Index);
12619 }
12620
12621 case ISD::FSHR: {
12622 if (IsVec)
12623 return std::nullopt;
12624
12625 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
12626 auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 2));
12627 if (!ShiftOp || Op.getValueType().isVector())
12628 return std::nullopt;
12629
12630 uint64_t BitsProvided = Op.getValueSizeInBits();
12631 if (BitsProvided % 8 != 0)
12632 return std::nullopt;
12633
12634 uint64_t BitShift = ShiftOp->getAPIntValue().urem(RHS: BitsProvided);
12635 if (BitShift % 8)
12636 return std::nullopt;
12637
12638 uint64_t ConcatSizeInBytes = BitsProvided / 4;
12639 uint64_t ByteShift = BitShift / 8;
12640
12641 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
12642 uint64_t BytesProvided = BitsProvided / 8;
12643 SDValue NextOp = Op.getOperand(i: NewIndex >= BytesProvided ? 0 : 1);
12644 NewIndex %= BytesProvided;
12645 return calculateByteProvider(Op: NextOp, Index: NewIndex, Depth: Depth + 1, StartingIndex);
12646 }
12647
12648 case ISD::SRA:
12649 case ISD::SRL: {
12650 if (IsVec)
12651 return std::nullopt;
12652
12653 auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
12654 if (!ShiftOp)
12655 return std::nullopt;
12656
12657 uint64_t BitShift = ShiftOp->getZExtValue();
12658 if (BitShift % 8)
12659 return std::nullopt;
12660
12661 auto BitsProvided = Op.getScalarValueSizeInBits();
12662 if (BitsProvided % 8 != 0)
12663 return std::nullopt;
12664
12665 uint64_t BytesProvided = BitsProvided / 8;
12666 uint64_t ByteShift = BitShift / 8;
    // The dest of the shift has good bytes in positions [0, BytesProvided -
    // ByteShift). If the byte we are trying to provide (as tracked by Index)
    // falls in this range, then the SRL provides the byte. The byte of interest
    // of the src of the SRL is Index + ByteShift.
12671 return BytesProvided - ByteShift > Index
12672 ? calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte: StartingIndex,
12673 SrcIndex: Index + ByteShift)
12674 : ByteProvider<SDValue>::getConstantZero();
12675 }
12676
12677 case ISD::SHL: {
12678 if (IsVec)
12679 return std::nullopt;
12680
12681 auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
12682 if (!ShiftOp)
12683 return std::nullopt;
12684
12685 uint64_t BitShift = ShiftOp->getZExtValue();
12686 if (BitShift % 8 != 0)
12687 return std::nullopt;
12688 uint64_t ByteShift = BitShift / 8;
12689
    // If we are shifting by an amount greater than the index we are trying to
    // provide, then it provides 0s. If not, then these bytes are not
    // definitively 0s, and the corresponding byte of interest is
    // Index - ByteShift of the src.
12694 return Index < ByteShift
12695 ? ByteProvider<SDValue>::getConstantZero()
12696 : calculateByteProvider(Op: Op.getOperand(i: 0), Index: Index - ByteShift,
12697 Depth: Depth + 1, StartingIndex);
12698 }
12699 case ISD::ANY_EXTEND:
12700 case ISD::SIGN_EXTEND:
12701 case ISD::ZERO_EXTEND:
12702 case ISD::SIGN_EXTEND_INREG:
12703 case ISD::AssertZext:
12704 case ISD::AssertSext: {
12705 if (IsVec)
12706 return std::nullopt;
12707
12708 SDValue NarrowOp = Op->getOperand(Num: 0);
12709 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
12710 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
12711 Op->getOpcode() == ISD::AssertZext ||
12712 Op->getOpcode() == ISD::AssertSext) {
12713 auto *VTSign = cast<VTSDNode>(Val: Op->getOperand(Num: 1));
12714 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12715 }
12716 if (NarrowBitWidth % 8 != 0)
12717 return std::nullopt;
12718 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12719
12720 if (Index >= NarrowByteWidth)
12721 return Op.getOpcode() == ISD::ZERO_EXTEND
12722 ? std::optional<ByteProvider<SDValue>>(
12723 ByteProvider<SDValue>::getConstantZero())
12724 : std::nullopt;
12725 return calculateByteProvider(Op: NarrowOp, Index, Depth: Depth + 1, StartingIndex);
12726 }
12727
12728 case ISD::TRUNCATE: {
12729 if (IsVec)
12730 return std::nullopt;
12731
12732 uint64_t NarrowByteWidth = BitWidth / 8;
12733
12734 if (NarrowByteWidth >= Index) {
12735 return calculateByteProvider(Op: Op.getOperand(i: 0), Index, Depth: Depth + 1,
12736 StartingIndex);
12737 }
12738
12739 return std::nullopt;
12740 }
12741
12742 case ISD::CopyFromReg: {
12743 if (BitWidth / 8 > Index)
12744 return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index);
12745
12746 return std::nullopt;
12747 }
12748
12749 case ISD::LOAD: {
12750 auto *L = cast<LoadSDNode>(Val: Op.getNode());
12751
12752 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12753 if (NarrowBitWidth % 8 != 0)
12754 return std::nullopt;
12755 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12756
    // If the width of the load does not reach the byte we are trying to provide
    // and it is not a ZEXTLOAD, then the load does not provide for the byte in
    // question.
12760 if (Index >= NarrowByteWidth) {
12761 return L->getExtensionType() == ISD::ZEXTLOAD
12762 ? std::optional<ByteProvider<SDValue>>(
12763 ByteProvider<SDValue>::getConstantZero())
12764 : std::nullopt;
12765 }
12766
12767 if (NarrowByteWidth > Index) {
12768 return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index);
12769 }
12770
12771 return std::nullopt;
12772 }
12773
12774 case ISD::BSWAP: {
12775 if (IsVec)
12776 return std::nullopt;
12777
12778 return calculateByteProvider(Op: Op->getOperand(Num: 0), Index: BitWidth / 8 - Index - 1,
12779 Depth: Depth + 1, StartingIndex);
12780 }
12781
12782 case ISD::EXTRACT_VECTOR_ELT: {
12783 auto *IdxOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
12784 if (!IdxOp)
12785 return std::nullopt;
12786 auto VecIdx = IdxOp->getZExtValue();
12787 auto ScalarSize = Op.getScalarValueSizeInBits();
12788 if (ScalarSize < 32)
12789 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12790 return calculateSrcByte(Op: ScalarSize >= 32 ? Op : Op.getOperand(i: 0),
12791 DestByte: StartingIndex, SrcIndex: Index);
12792 }
12793
12794 case AMDGPUISD::PERM: {
12795 if (IsVec)
12796 return std::nullopt;
12797
12798 auto *PermMask = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 2));
12799 if (!PermMask)
12800 return std::nullopt;
12801
12802 auto IdxMask =
12803 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12804 if (IdxMask > 0x07 && IdxMask != 0x0c)
12805 return std::nullopt;
12806
12807 auto NextOp = Op.getOperand(i: IdxMask > 0x03 ? 0 : 1);
12808 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12809
12810 return IdxMask != 0x0c ? calculateSrcByte(Op: NextOp, DestByte: StartingIndex, SrcIndex: NextIndex)
12811 : ByteProvider<SDValue>(
12812 ByteProvider<SDValue>::getConstantZero());
12813 }
12814
12815 default: {
12816 return std::nullopt;
12817 }
12818 }
12819
12820 llvm_unreachable("fully handled switch");
12821}
12822
// Returns true if Operand is a scalar value that is extended or loaded from a
// 16-bit type.
12824static bool isExtendedFrom16Bits(SDValue &Operand) {
12825
12826 switch (Operand.getOpcode()) {
12827 case ISD::ANY_EXTEND:
12828 case ISD::SIGN_EXTEND:
12829 case ISD::ZERO_EXTEND: {
12830 auto OpVT = Operand.getOperand(i: 0).getValueType();
12831 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12832 }
12833 case ISD::LOAD: {
12834 LoadSDNode *L = cast<LoadSDNode>(Val: Operand.getNode());
    auto ExtType = L->getExtensionType();
12836 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12837 ExtType == ISD::EXTLOAD) {
12838 auto MemVT = L->getMemoryVT();
12839 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12840 }
12841 return L->getMemoryVT().getSizeInBits() == 16;
12842 }
12843 default:
12844 return false;
12845 }
12846}
12847
// Returns true if the mask selects consecutive bytes and the first byte begins
// at an even (16-bit aligned) offset from byte 0.
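// For instance (illustrative selectors): 0x0504 (Low8 = 4, Hi8 = 5) is
// consecutive and starts at an even byte, so it qualifies; 0x0201
// (Low8 = 1, Hi8 = 2) is consecutive but starts at an odd byte and does not.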
12850static bool addresses16Bits(int Mask) {
12851 int Low8 = Mask & 0xff;
12852 int Hi8 = (Mask & 0xff00) >> 8;
12853
12854 assert(Low8 < 8 && Hi8 < 8);
12855 // Are the bytes contiguous in the order of increasing addresses.
12856 bool IsConsecutive = (Hi8 - Low8 == 1);
  // Is the first byte at a location that is aligned for 16-bit instructions?
  // A counterexample is taking two consecutive bytes starting at byte offset 1
  // (bit 8). In that case we still need code to extract the 16-bit operand, so
  // it is better to use a byte-wise v_perm.
12861 bool Is16Aligned = !(Low8 % 2);
12862
12863 return IsConsecutive && Is16Aligned;
12864}
12865
12866// Do not lower into v_perm if the operands are actually 16 bit
12867// and the selected bits (based on PermMask) correspond with two
12868// easily addressable 16 bit operands.
12869static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12870 SDValue &OtherOp) {
12871 int Low16 = PermMask & 0xffff;
12872 int Hi16 = (PermMask & 0xffff0000) >> 16;
12873
12874 auto TempOp = peekThroughBitcasts(V: Op);
12875 auto TempOtherOp = peekThroughBitcasts(V: OtherOp);
12876
12877 auto OpIs16Bit =
12878 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(Operand&: TempOp);
12879 if (!OpIs16Bit)
12880 return true;
12881
12882 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12883 isExtendedFrom16Bits(Operand&: TempOtherOp);
12884 if (!OtherOpIs16Bit)
12885 return true;
12886
12887 // Do we cleanly address both
12888 return !addresses16Bits(Mask: Low16) || !addresses16Bits(Mask: Hi16);
12889}
12890
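// A rough sketch of getDWordFromOffset below, under assumed inputs: for a
// scalar i64 source with DWordOffset = 1 it produces (trunc (srl Src, 32)) as
// an i32, and for a v4i16 source with DWordOffset = 1 it rebuilds a v2i16 from
// elements 2 and 3 and bitcasts the result to i32.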
12891static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12892 unsigned DWordOffset) {
12893 SDValue Ret;
12894
12895 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12896 // ByteProvider must be at least 8 bits
12897 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12898
12899 if (TypeSize <= 32)
12900 return DAG.getBitcastedAnyExtOrTrunc(Op: Src, DL: SL, VT: MVT::i32);
12901
12902 if (Src.getValueType().isVector()) {
12903 auto ScalarTySize = Src.getScalarValueSizeInBits();
12904 auto ScalarTy = Src.getValueType().getScalarType();
12905 if (ScalarTySize == 32) {
12906 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Src,
12907 N2: DAG.getConstant(Val: DWordOffset, DL: SL, VT: MVT::i32));
12908 }
12909 if (ScalarTySize > 32) {
12910 Ret = DAG.getNode(
12911 Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ScalarTy, N1: Src,
12912 N2: DAG.getConstant(Val: DWordOffset / (ScalarTySize / 32), DL: SL, VT: MVT::i32));
12913 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12914 if (ShiftVal)
12915 Ret = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: Ret.getValueType(), N1: Ret,
12916 N2: DAG.getConstant(Val: ShiftVal, DL: SL, VT: MVT::i32));
12917 return DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
12918 }
12919
12920 assert(ScalarTySize < 32);
12921 auto NumElements = TypeSize / ScalarTySize;
12922 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12923 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12924 auto NumElementsIn32 = 32 / ScalarTySize;
12925 auto NumAvailElements = DWordOffset < Trunc32Elements
12926 ? NumElementsIn32
12927 : NumElements - NormalizedTrunc;
12928
12929 SmallVector<SDValue, 4> VecSrcs;
12930 DAG.ExtractVectorElements(Op: Src, Args&: VecSrcs, Start: DWordOffset * NumElementsIn32,
12931 Count: NumAvailElements);
12932
12933 Ret = DAG.getBuildVector(
12934 VT: MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: ScalarTySize), NumElements: NumAvailElements), DL: SL,
12935 Ops: VecSrcs);
12936 return Ret = DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
12937 }
12938
  // Scalar source.
12940 auto ShiftVal = 32 * DWordOffset;
12941 Ret = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: Src.getValueType(), N1: Src,
12942 N2: DAG.getConstant(Val: ShiftVal, DL: SL, VT: MVT::i32));
12943 return DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
12944}
12945
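// An illustrative mask for matchPERM below (values are hypothetical): if result
// bytes 0-1 come from bytes 0-1 of the first source and result bytes 2-3 come
// from bytes 0-1 of a second source, the per-byte selectors are 0x04, 0x05,
// 0x00, 0x01, giving PermMask = 0x01000504 under the "first source offset by 4"
// convention used here.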
12946static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12947 SelectionDAG &DAG = DCI.DAG;
12948 [[maybe_unused]] EVT VT = N->getValueType(ResNo: 0);
12949 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12950
12951 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12952 assert(VT == MVT::i32);
12953 for (int i = 0; i < 4; i++) {
12954 // Find the ByteProvider that provides the ith byte of the result of OR
12955 std::optional<ByteProvider<SDValue>> P =
12956 calculateByteProvider(Op: SDValue(N, 0), Index: i, Depth: 0, /*StartingIndex = */ i);
12957 // TODO support constantZero
12958 if (!P || P->isConstantZero())
12959 return SDValue();
12960
12961 PermNodes.push_back(Elt: *P);
12962 }
12963 if (PermNodes.size() != 4)
12964 return SDValue();
12965
12966 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12967 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12968 uint64_t PermMask = 0x00000000;
12969 for (size_t i = 0; i < PermNodes.size(); i++) {
12970 auto PermOp = PermNodes[i];
12971 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12972 // by sizeof(Src2) = 4
12973 int SrcByteAdjust = 4;
12974
    // If the Src uses a byte from a different DWORD, then it corresponds
    // with a different source.
12977 if (!PermOp.hasSameSrc(Other: PermNodes[FirstSrc.first]) ||
12978 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12979 if (SecondSrc)
12980 if (!PermOp.hasSameSrc(Other: PermNodes[SecondSrc->first]) ||
12981 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12982 return SDValue();
12983
12984 // Set the index of the second distinct Src node
12985 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12986 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12987 SrcByteAdjust = 0;
12988 }
12989 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12990 assert(!DAG.getDataLayout().isBigEndian());
12991 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12992 }
12993 SDLoc DL(N);
12994 SDValue Op = *PermNodes[FirstSrc.first].Src;
12995 Op = getDWordFromOffset(DAG, SL: DL, Src: Op, DWordOffset: FirstSrc.second);
12996 assert(Op.getValueSizeInBits() == 32);
12997
12998 // Check that we are not just extracting the bytes in order from an op
12999 if (!SecondSrc) {
13000 int Low16 = PermMask & 0xffff;
13001 int Hi16 = (PermMask & 0xffff0000) >> 16;
13002
13003 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13004 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13005
13006 // The perm op would really just produce Op. So combine into Op
13007 if (WellFormedLow && WellFormedHi)
13008 return DAG.getBitcast(VT: MVT::getIntegerVT(BitWidth: 32), V: Op);
13009 }
13010
13011 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13012
13013 if (SecondSrc) {
13014 OtherOp = getDWordFromOffset(DAG, SL: DL, Src: OtherOp, DWordOffset: SecondSrc->second);
13015 assert(OtherOp.getValueSizeInBits() == 32);
13016 }
13017
13018 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
13019
13020 assert(Op.getValueType().isByteSized() &&
13021 OtherOp.getValueType().isByteSized());
13022
    // If the ultimate src is less than 32 bits, then we will only be
    // using bytes 0 .. (size of Op in bytes) - 1 in the or.
    // calculateByteProvider would not have returned Op as a source if we
    // used a byte that is outside its ValueType. Thus, we are free to
    // ANY_EXTEND as the extended bits are don't-cares.
13028 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, VT: MVT::i32);
13029 OtherOp = DAG.getBitcastedAnyExtOrTrunc(Op: OtherOp, DL, VT: MVT::i32);
13030
13031 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: Op, N2: OtherOp,
13032 N3: DAG.getConstant(Val: PermMask, DL, VT: MVT::i32));
13033 }
13034 return SDValue();
13035}
13036
13037SDValue SITargetLowering::performOrCombine(SDNode *N,
13038 DAGCombinerInfo &DCI) const {
13039 SelectionDAG &DAG = DCI.DAG;
13040 SDValue LHS = N->getOperand(Num: 0);
13041 SDValue RHS = N->getOperand(Num: 1);
13042
13043 EVT VT = N->getValueType(ResNo: 0);
13044 if (VT == MVT::i1) {
13045 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
13046 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13047 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
13048 SDValue Src = LHS.getOperand(i: 0);
13049 if (Src != RHS.getOperand(i: 0))
13050 return SDValue();
13051
13052 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
13053 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: 1));
13054 if (!CLHS || !CRHS)
13055 return SDValue();
13056
13057 // Only 10 bits are used.
13058 static const uint32_t MaxMask = 0x3ff;
13059
13060 uint32_t NewMask =
13061 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
13062 SDLoc DL(N);
13063 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: Src,
13064 N2: DAG.getConstant(Val: NewMask, DL, VT: MVT::i32));
13065 }
13066
13067 return SDValue();
13068 }
13069
13070 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13071 if (isa<ConstantSDNode>(Val: RHS) && LHS.hasOneUse() &&
13072 LHS.getOpcode() == AMDGPUISD::PERM &&
13073 isa<ConstantSDNode>(Val: LHS.getOperand(i: 2))) {
13074 uint32_t Sel = getConstantPermuteMask(C: N->getConstantOperandVal(Num: 1));
13075 if (!Sel)
13076 return SDValue();
13077
13078 Sel |= LHS.getConstantOperandVal(i: 2);
13079 SDLoc DL(N);
13080 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
13081 N2: LHS.getOperand(i: 1), N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
13082 }
13083
13084 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13085 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13086 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13087 N->isDivergent() && TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {
13088
13089 // If all the uses of an or need to extract the individual elements, do not
13090 // attempt to lower into v_perm
13091 auto usesCombinedOperand = [](SDNode *OrUse) {
      // If the use itself is not a bitcast to a vector, it is a candidate for
      // v_perm.
13093 if (OrUse->getOpcode() != ISD::BITCAST ||
13094 !OrUse->getValueType(ResNo: 0).isVector())
13095 return true;
13096
13097 // If we have any non-vectorized use, then it is a candidate for v_perm
13098 for (auto *VUser : OrUse->users()) {
13099 if (!VUser->getValueType(ResNo: 0).isVector())
13100 return true;
13101
13102 // If the use of a vector is a store, then combining via a v_perm
13103 // is beneficial.
13104 // TODO -- whitelist more uses
13105 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
13106 if (VUser->getOpcode() == VectorwiseOp)
13107 return true;
13108 }
13109 return false;
13110 };
13111
13112 if (!any_of(Range: N->users(), P: usesCombinedOperand))
13113 return SDValue();
13114
13115 uint32_t LHSMask = getPermuteMask(V: LHS);
13116 uint32_t RHSMask = getPermuteMask(V: RHS);
13117
13118 if (LHSMask != ~0u && RHSMask != ~0u) {
13119 // Canonicalize the expression in an attempt to have fewer unique masks
13120 // and therefore fewer registers used to hold the masks.
13121 if (LHSMask > RHSMask) {
13122 std::swap(a&: LHSMask, b&: RHSMask);
13123 std::swap(a&: LHS, b&: RHS);
13124 }
13125
      // Mark with 0xc each byte lane that actually selects a source byte. Zero
      // bytes have 0xc in the mask, 0xff bytes have 0xff, and real source bytes
      // use selectors in the 0-3 range.
13128 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13129 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13130
      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we just select the high word and the low word, keep it for SDWA.
13134 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13135 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13136 // Kill zero bytes selected by other mask. Zero value is 0xc.
13137 LHSMask &= ~RHSUsedLanes;
13138 RHSMask &= ~LHSUsedLanes;
13139 // Add 4 to each active LHS lane
13140 LHSMask |= LHSUsedLanes & 0x04040404;
13141 // Combine masks
13142 uint32_t Sel = LHSMask | RHSMask;
13143 SDLoc DL(N);
13144
13145 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
13146 N2: RHS.getOperand(i: 0),
13147 N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
13148 }
13149 }
13150 if (LHSMask == ~0u || RHSMask == ~0u) {
13151 if (SDValue Perm = matchPERM(N, DCI))
13152 return Perm;
13153 }
13154 }
13155
13156 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
13157 return SDValue();
13158
13159 // TODO: This could be a generic combine with a predicate for extracting the
13160 // high half of an integer being free.
13161
13162 // (or i64:x, (zero_extend i32:y)) ->
13163 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
13164 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
13165 RHS.getOpcode() != ISD::ZERO_EXTEND)
13166 std::swap(a&: LHS, b&: RHS);
13167
13168 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
13169 SDValue ExtSrc = RHS.getOperand(i: 0);
13170 EVT SrcVT = ExtSrc.getValueType();
13171 if (SrcVT == MVT::i32) {
13172 SDLoc SL(N);
13173 auto [LowLHS, HiBits] = split64BitValue(Op: LHS, DAG);
13174 SDValue LowOr = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: LowLHS, N2: ExtSrc);
13175
13176 DCI.AddToWorklist(N: LowOr.getNode());
13177 DCI.AddToWorklist(N: HiBits.getNode());
13178
13179 SDValue Vec =
13180 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: LowOr, N2: HiBits);
13181 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
13182 }
13183 }
13184
13185 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
13186 if (CRHS) {
13187 if (SDValue Split = splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::OR,
13188 LHS: N->getOperand(Num: 0), CRHS))
13189 return Split;
13190 }
13191
13192 return SDValue();
13193}
13194
13195SDValue SITargetLowering::performXorCombine(SDNode *N,
13196 DAGCombinerInfo &DCI) const {
13197 if (SDValue RV = reassociateScalarOps(N, DAG&: DCI.DAG))
13198 return RV;
13199
13200 SDValue LHS = N->getOperand(Num: 0);
13201 SDValue RHS = N->getOperand(Num: 1);
13202
13203 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
13204 SelectionDAG &DAG = DCI.DAG;
13205
13206 EVT VT = N->getValueType(ResNo: 0);
13207 if (CRHS && VT == MVT::i64) {
13208 if (SDValue Split =
13209 splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::XOR, LHS, CRHS))
13210 return Split;
13211 }
13212
13213 // Make sure to apply the 64-bit constant splitting fold before trying to fold
13214 // fneg-like xors into 64-bit select.
13215 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
13216 // This looks like an fneg, try to fold as a source modifier.
13217 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
13218 shouldFoldFNegIntoSrc(FNeg: N, FNegSrc: LHS)) {
13219 // xor (select c, a, b), 0x80000000 ->
13220 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
13221 SDLoc DL(N);
13222 SDValue CastLHS =
13223 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: LHS->getOperand(Num: 1));
13224 SDValue CastRHS =
13225 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: LHS->getOperand(Num: 2));
13226 SDValue FNegLHS = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f32, Operand: CastLHS);
13227 SDValue FNegRHS = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f32, Operand: CastRHS);
13228 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::f32,
13229 N1: LHS->getOperand(Num: 0), N2: FNegLHS, N3: FNegRHS);
13230 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewSelect);
13231 }
13232 }
13233
13234 return SDValue();
13235}
13236
13237SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
13238 DAGCombinerInfo &DCI) const {
13239 if (!Subtarget->has16BitInsts() ||
13240 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
13241 return SDValue();
13242
13243 EVT VT = N->getValueType(ResNo: 0);
13244 if (VT != MVT::i32)
13245 return SDValue();
13246
13247 SDValue Src = N->getOperand(Num: 0);
13248 if (Src.getValueType() != MVT::i16)
13249 return SDValue();
13250
13251 return SDValue();
13252}
13253
13254SDValue
13255SITargetLowering::performSignExtendInRegCombine(SDNode *N,
13256 DAGCombinerInfo &DCI) const {
13257 SDValue Src = N->getOperand(Num: 0);
13258 auto *VTSign = cast<VTSDNode>(Val: N->getOperand(Num: 1));
13259
13260 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
13261 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
13262 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
13263 VTSign->getVT() == MVT::i8) ||
13264 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
13265 VTSign->getVT() == MVT::i16))) {
13266 assert(Subtarget->hasScalarSubwordLoads() &&
13267 "s_buffer_load_{u8, i8} are supported "
13268 "in GFX12 (or newer) architectures.");
13269 EVT VT = Src.getValueType();
13270 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
13271 ? AMDGPUISD::SBUFFER_LOAD_BYTE
13272 : AMDGPUISD::SBUFFER_LOAD_SHORT;
13273 SDLoc DL(N);
13274 SDVTList ResList = DCI.DAG.getVTList(VT: MVT::i32);
13275 SDValue Ops[] = {
13276 Src.getOperand(i: 0), // source register
13277 Src.getOperand(i: 1), // offset
13278 Src.getOperand(i: 2) // cachePolicy
13279 };
13280 auto *M = cast<MemSDNode>(Val&: Src);
13281 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
13282 Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
13283 SDValue LoadVal = DCI.DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
13284 return LoadVal;
13285 }
13286 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
13287 VTSign->getVT() == MVT::i8) ||
13288 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
13289 VTSign->getVT() == MVT::i16)) &&
13290 Src.hasOneUse()) {
13291 auto *M = cast<MemSDNode>(Val&: Src);
13292 SDValue Ops[] = {Src.getOperand(i: 0), // Chain
13293 Src.getOperand(i: 1), // rsrc
13294 Src.getOperand(i: 2), // vindex
13295 Src.getOperand(i: 3), // voffset
13296 Src.getOperand(i: 4), // soffset
13297 Src.getOperand(i: 5), // offset
13298 Src.getOperand(i: 6), Src.getOperand(i: 7)};
13299 // replace with BUFFER_LOAD_BYTE/SHORT
13300 SDVTList ResList =
13301 DCI.DAG.getVTList(VT1: MVT::i32, VT2: Src.getOperand(i: 0).getValueType());
13302 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
13303 ? AMDGPUISD::BUFFER_LOAD_BYTE
13304 : AMDGPUISD::BUFFER_LOAD_SHORT;
13305 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
13306 Opcode: Opc, dl: SDLoc(N), VTList: ResList, Ops, MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
13307 return DCI.DAG.getMergeValues(
13308 Ops: {BufferLoadSignExt, BufferLoadSignExt.getValue(R: 1)}, dl: SDLoc(N));
13309 }
13310 return SDValue();
13311}
13312
13313SDValue SITargetLowering::performClassCombine(SDNode *N,
13314 DAGCombinerInfo &DCI) const {
13315 SelectionDAG &DAG = DCI.DAG;
13316 SDValue Mask = N->getOperand(Num: 1);
13317
13318 // fp_class x, 0 -> false
13319 if (isNullConstant(V: Mask))
13320 return DAG.getConstant(Val: 0, DL: SDLoc(N), VT: MVT::i1);
13321
13322 if (N->getOperand(Num: 0).isUndef())
13323 return DAG.getUNDEF(VT: MVT::i1);
13324
13325 return SDValue();
13326}
13327
13328SDValue SITargetLowering::performRcpCombine(SDNode *N,
13329 DAGCombinerInfo &DCI) const {
13330 EVT VT = N->getValueType(ResNo: 0);
13331 SDValue N0 = N->getOperand(Num: 0);
13332
13333 if (N0.isUndef()) {
13334 return DCI.DAG.getConstantFP(Val: APFloat::getQNaN(Sem: VT.getFltSemantics()),
13335 DL: SDLoc(N), VT);
13336 }
13337
13338 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
13339 N0.getOpcode() == ISD::SINT_TO_FP)) {
13340 return DCI.DAG.getNode(Opcode: AMDGPUISD::RCP_IFLAG, DL: SDLoc(N), VT, Operand: N0,
13341 Flags: N->getFlags());
13342 }
13343
13344 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
13345 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
13346 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
13347 return DCI.DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc(N), VT, Operand: N0.getOperand(i: 0),
13348 Flags: N->getFlags());
13349 }
13350
13351 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
13352}
13353
13354bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
13355 unsigned MaxDepth) const {
13356 unsigned Opcode = Op.getOpcode();
13357 if (Opcode == ISD::FCANONICALIZE)
13358 return true;
13359
13360 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
13361 const auto &F = CFP->getValueAPF();
13362 if (F.isNaN() && F.isSignaling())
13363 return false;
13364 if (!F.isDenormal())
13365 return true;
13366
13367 DenormalMode Mode =
13368 DAG.getMachineFunction().getDenormalMode(FPType: F.getSemantics());
13369 return Mode == DenormalMode::getIEEE();
13370 }
13371
13372 // If source is a result of another standard FP operation it is already in
13373 // canonical form.
13374 if (MaxDepth == 0)
13375 return false;
13376
13377 switch (Opcode) {
13378 // These will flush denorms if required.
13379 case ISD::FADD:
13380 case ISD::FSUB:
13381 case ISD::FMUL:
13382 case ISD::FCEIL:
13383 case ISD::FFLOOR:
13384 case ISD::FMA:
13385 case ISD::FMAD:
13386 case ISD::FSQRT:
13387 case ISD::FDIV:
13388 case ISD::FREM:
13389 case ISD::FP_ROUND:
13390 case ISD::FP_EXTEND:
13391 case ISD::FP16_TO_FP:
13392 case ISD::FP_TO_FP16:
13393 case ISD::BF16_TO_FP:
13394 case ISD::FP_TO_BF16:
13395 case ISD::FLDEXP:
13396 case AMDGPUISD::FMUL_LEGACY:
13397 case AMDGPUISD::FMAD_FTZ:
13398 case AMDGPUISD::RCP:
13399 case AMDGPUISD::RSQ:
13400 case AMDGPUISD::RSQ_CLAMP:
13401 case AMDGPUISD::RCP_LEGACY:
13402 case AMDGPUISD::RCP_IFLAG:
13403 case AMDGPUISD::LOG:
13404 case AMDGPUISD::EXP:
13405 case AMDGPUISD::DIV_SCALE:
13406 case AMDGPUISD::DIV_FMAS:
13407 case AMDGPUISD::DIV_FIXUP:
13408 case AMDGPUISD::FRACT:
13409 case AMDGPUISD::CVT_PKRTZ_F16_F32:
13410 case AMDGPUISD::CVT_F32_UBYTE0:
13411 case AMDGPUISD::CVT_F32_UBYTE1:
13412 case AMDGPUISD::CVT_F32_UBYTE2:
13413 case AMDGPUISD::CVT_F32_UBYTE3:
13414 case AMDGPUISD::FP_TO_FP16:
13415 case AMDGPUISD::SIN_HW:
13416 case AMDGPUISD::COS_HW:
13417 return true;
13418
13419 // It can/will be lowered or combined as a bit operation.
13420 // Need to check their input recursively to handle.
13421 case ISD::FNEG:
13422 case ISD::FABS:
13423 case ISD::FCOPYSIGN:
13424 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
13425
13426 case ISD::AND:
13427 if (Op.getValueType() == MVT::i32) {
13428 // Be careful as we only know it is a bitcast floating point type. It
13429 // could be f32, v2f16, we have no way of knowing. Luckily the constant
13430 // value that we optimize for, which comes up in fp32 to bf16 conversions,
13431 // is valid to optimize for all types.
13432 if (auto *RHS = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
13433 if (RHS->getZExtValue() == 0xffff0000) {
13434 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
13435 }
13436 }
13437 }
13438 break;
13439
13440 case ISD::FSIN:
13441 case ISD::FCOS:
13442 case ISD::FSINCOS:
13443 return Op.getValueType().getScalarType() != MVT::f16;
13444
13445 case ISD::FMINNUM:
13446 case ISD::FMAXNUM:
13447 case ISD::FMINNUM_IEEE:
13448 case ISD::FMAXNUM_IEEE:
13449 case ISD::FMINIMUM:
13450 case ISD::FMAXIMUM:
13451 case ISD::FMINIMUMNUM:
13452 case ISD::FMAXIMUMNUM:
13453 case AMDGPUISD::CLAMP:
13454 case AMDGPUISD::FMED3:
13455 case AMDGPUISD::FMAX3:
13456 case AMDGPUISD::FMIN3:
13457 case AMDGPUISD::FMAXIMUM3:
13458 case AMDGPUISD::FMINIMUM3: {
    // FIXME: Shouldn't treat the generic operations differently based on these.
    // However, we aren't really required to flush the result from
    // minnum/maxnum.
13462
13463 // snans will be quieted, so we only need to worry about denormals.
13464 if (Subtarget->supportsMinMaxDenormModes() ||
13465 // FIXME: denormalsEnabledForType is broken for dynamic
13466 denormalsEnabledForType(DAG, VT: Op.getValueType()))
13467 return true;
13468
13469 // Flushing may be required.
    // On pre-GFX9 targets V_MIN_F32 and others do not flush denorms, so for
    // such targets we need to check their inputs recursively.
13472
13473 // FIXME: Does this apply with clamp? It's implemented with max.
13474 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
13475 if (!isCanonicalized(DAG, Op: Op.getOperand(i: I), MaxDepth: MaxDepth - 1))
13476 return false;
13477 }
13478
13479 return true;
13480 }
13481 case ISD::SELECT: {
13482 return isCanonicalized(DAG, Op: Op.getOperand(i: 1), MaxDepth: MaxDepth - 1) &&
13483 isCanonicalized(DAG, Op: Op.getOperand(i: 2), MaxDepth: MaxDepth - 1);
13484 }
13485 case ISD::BUILD_VECTOR: {
13486 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
13487 SDValue SrcOp = Op.getOperand(i);
13488 if (!isCanonicalized(DAG, Op: SrcOp, MaxDepth: MaxDepth - 1))
13489 return false;
13490 }
13491
13492 return true;
13493 }
13494 case ISD::EXTRACT_VECTOR_ELT:
13495 case ISD::EXTRACT_SUBVECTOR: {
13496 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
13497 }
13498 case ISD::INSERT_VECTOR_ELT: {
13499 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1) &&
13500 isCanonicalized(DAG, Op: Op.getOperand(i: 1), MaxDepth: MaxDepth - 1);
13501 }
13502 case ISD::UNDEF:
13503 // Could be anything.
13504 return false;
13505
13506 case ISD::BITCAST:
13507 // TODO: This is incorrect as it loses track of the operand's type. We may
13508 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
13509 // same bits that are canonicalized in one type need not be in the other.
13510 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
13511 case ISD::TRUNCATE: {
    // Hack around the mess we make when legalizing extract_vector_elt.
13513 if (Op.getValueType() == MVT::i16) {
13514 SDValue TruncSrc = Op.getOperand(i: 0);
13515 if (TruncSrc.getValueType() == MVT::i32 &&
13516 TruncSrc.getOpcode() == ISD::BITCAST &&
13517 TruncSrc.getOperand(i: 0).getValueType() == MVT::v2f16) {
13518 return isCanonicalized(DAG, Op: TruncSrc.getOperand(i: 0), MaxDepth: MaxDepth - 1);
13519 }
13520 }
13521 return false;
13522 }
13523 case ISD::INTRINSIC_WO_CHAIN: {
13524 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
13525 // TODO: Handle more intrinsics
13526 switch (IntrinsicID) {
13527 case Intrinsic::amdgcn_cvt_pkrtz:
13528 case Intrinsic::amdgcn_cubeid:
13529 case Intrinsic::amdgcn_frexp_mant:
13530 case Intrinsic::amdgcn_fdot2:
13531 case Intrinsic::amdgcn_rcp:
13532 case Intrinsic::amdgcn_rsq:
13533 case Intrinsic::amdgcn_rsq_clamp:
13534 case Intrinsic::amdgcn_rcp_legacy:
13535 case Intrinsic::amdgcn_rsq_legacy:
13536 case Intrinsic::amdgcn_trig_preop:
13537 case Intrinsic::amdgcn_log:
13538 case Intrinsic::amdgcn_exp2:
13539 case Intrinsic::amdgcn_sqrt:
13540 return true;
13541 default:
13542 break;
13543 }
13544
13545 break;
13546 }
13547 default:
13548 break;
13549 }
13550
13551 // FIXME: denormalsEnabledForType is broken for dynamic
13552 return denormalsEnabledForType(DAG, VT: Op.getValueType()) &&
13553 DAG.isKnownNeverSNaN(Op);
13554}
13555
13556bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
13557 unsigned MaxDepth) const {
13558 const MachineRegisterInfo &MRI = MF.getRegInfo();
13559 MachineInstr *MI = MRI.getVRegDef(Reg);
13560 unsigned Opcode = MI->getOpcode();
13561
13562 if (Opcode == AMDGPU::G_FCANONICALIZE)
13563 return true;
13564
13565 std::optional<FPValueAndVReg> FCR;
13566 // Constant splat (can be padded with undef) or scalar constant.
13567 if (mi_match(R: Reg, MRI, P: MIPatternMatch::m_GFCstOrSplat(FPValReg&: FCR))) {
13568 if (FCR->Value.isSignaling())
13569 return false;
13570 if (!FCR->Value.isDenormal())
13571 return true;
13572
13573 DenormalMode Mode = MF.getDenormalMode(FPType: FCR->Value.getSemantics());
13574 return Mode == DenormalMode::getIEEE();
13575 }
13576
13577 if (MaxDepth == 0)
13578 return false;
13579
13580 switch (Opcode) {
13581 case AMDGPU::G_FADD:
13582 case AMDGPU::G_FSUB:
13583 case AMDGPU::G_FMUL:
13584 case AMDGPU::G_FCEIL:
13585 case AMDGPU::G_FFLOOR:
13586 case AMDGPU::G_FRINT:
13587 case AMDGPU::G_FNEARBYINT:
13588 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
13589 case AMDGPU::G_INTRINSIC_TRUNC:
13590 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
13591 case AMDGPU::G_FMA:
13592 case AMDGPU::G_FMAD:
13593 case AMDGPU::G_FSQRT:
13594 case AMDGPU::G_FDIV:
13595 case AMDGPU::G_FREM:
13596 case AMDGPU::G_FPOW:
13597 case AMDGPU::G_FPEXT:
13598 case AMDGPU::G_FLOG:
13599 case AMDGPU::G_FLOG2:
13600 case AMDGPU::G_FLOG10:
13601 case AMDGPU::G_FPTRUNC:
13602 case AMDGPU::G_AMDGPU_RCP_IFLAG:
13603 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
13604 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
13605 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
13606 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
13607 return true;
13608 case AMDGPU::G_FNEG:
13609 case AMDGPU::G_FABS:
13610 case AMDGPU::G_FCOPYSIGN:
13611 return isCanonicalized(Reg: MI->getOperand(i: 1).getReg(), MF, MaxDepth: MaxDepth - 1);
13612 case AMDGPU::G_FMINNUM:
13613 case AMDGPU::G_FMAXNUM:
13614 case AMDGPU::G_FMINNUM_IEEE:
13615 case AMDGPU::G_FMAXNUM_IEEE:
13616 case AMDGPU::G_FMINIMUM:
13617 case AMDGPU::G_FMAXIMUM:
13618 case AMDGPU::G_FMINIMUMNUM:
13619 case AMDGPU::G_FMAXIMUMNUM: {
13620 if (Subtarget->supportsMinMaxDenormModes() ||
13621 // FIXME: denormalsEnabledForType is broken for dynamic
13622 denormalsEnabledForType(Ty: MRI.getType(Reg), MF))
13623 return true;
13624
13625 [[fallthrough]];
13626 }
13627 case AMDGPU::G_BUILD_VECTOR:
13628 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI->operands()))
13629 if (!isCanonicalized(Reg: MO.getReg(), MF, MaxDepth: MaxDepth - 1))
13630 return false;
13631 return true;
13632 case AMDGPU::G_INTRINSIC:
13633 case AMDGPU::G_INTRINSIC_CONVERGENT:
13634 switch (cast<GIntrinsic>(Val: MI)->getIntrinsicID()) {
13635 case Intrinsic::amdgcn_fmul_legacy:
13636 case Intrinsic::amdgcn_fmad_ftz:
13637 case Intrinsic::amdgcn_sqrt:
13638 case Intrinsic::amdgcn_fmed3:
13639 case Intrinsic::amdgcn_sin:
13640 case Intrinsic::amdgcn_cos:
13641 case Intrinsic::amdgcn_log:
13642 case Intrinsic::amdgcn_exp2:
13643 case Intrinsic::amdgcn_log_clamp:
13644 case Intrinsic::amdgcn_rcp:
13645 case Intrinsic::amdgcn_rcp_legacy:
13646 case Intrinsic::amdgcn_rsq:
13647 case Intrinsic::amdgcn_rsq_clamp:
13648 case Intrinsic::amdgcn_rsq_legacy:
13649 case Intrinsic::amdgcn_div_scale:
13650 case Intrinsic::amdgcn_div_fmas:
13651 case Intrinsic::amdgcn_div_fixup:
13652 case Intrinsic::amdgcn_fract:
13653 case Intrinsic::amdgcn_cvt_pkrtz:
13654 case Intrinsic::amdgcn_cubeid:
13655 case Intrinsic::amdgcn_cubema:
13656 case Intrinsic::amdgcn_cubesc:
13657 case Intrinsic::amdgcn_cubetc:
13658 case Intrinsic::amdgcn_frexp_mant:
13659 case Intrinsic::amdgcn_fdot2:
13660 case Intrinsic::amdgcn_trig_preop:
13661 return true;
13662 default:
13663 break;
13664 }
13665
13666 [[fallthrough]];
13667 default:
13668 return false;
13669 }
13670
13671 llvm_unreachable("invalid operation");
13672}
13673
13674// Constant fold canonicalize.
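// For example (given the assumed denormal-mode settings): with FP32 denormals
// in preserve-sign mode an f32 denormal constant folds to +/-0.0, and a
// signaling-NaN constant folds to the canonical quiet NaN bit pattern.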
13675SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
13676 const SDLoc &SL, EVT VT,
13677 const APFloat &C) const {
13678 // Flush denormals to 0 if not enabled.
13679 if (C.isDenormal()) {
13680 DenormalMode Mode =
13681 DAG.getMachineFunction().getDenormalMode(FPType: C.getSemantics());
13682 if (Mode == DenormalMode::getPreserveSign()) {
13683 return DAG.getConstantFP(
13684 Val: APFloat::getZero(Sem: C.getSemantics(), Negative: C.isNegative()), DL: SL, VT);
13685 }
13686
13687 if (Mode != DenormalMode::getIEEE())
13688 return SDValue();
13689 }
13690
13691 if (C.isNaN()) {
13692 APFloat CanonicalQNaN = APFloat::getQNaN(Sem: C.getSemantics());
13693 if (C.isSignaling()) {
13694 // Quiet a signaling NaN.
13695 // FIXME: Is this supposed to preserve payload bits?
13696 return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT);
13697 }
13698
13699 // Make sure it is the canonical NaN bitpattern.
13700 //
13701 // TODO: Can we use -1 as the canonical NaN value since it's an inline
13702 // immediate?
13703 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
13704 return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT);
13705 }
13706
13707 // Already canonical.
13708 return DAG.getConstantFP(Val: C, DL: SL, VT);
13709}
13710
13711static bool vectorEltWillFoldAway(SDValue Op) {
13712 return Op.isUndef() || isa<ConstantFPSDNode>(Val: Op);
13713}
13714
13715SDValue
13716SITargetLowering::performFCanonicalizeCombine(SDNode *N,
13717 DAGCombinerInfo &DCI) const {
13718 SelectionDAG &DAG = DCI.DAG;
13719 SDValue N0 = N->getOperand(Num: 0);
13720 EVT VT = N->getValueType(ResNo: 0);
13721
13722 // fcanonicalize undef -> qnan
13723 if (N0.isUndef()) {
13724 APFloat QNaN = APFloat::getQNaN(Sem: VT.getFltSemantics());
13725 return DAG.getConstantFP(Val: QNaN, DL: SDLoc(N), VT);
13726 }
13727
13728 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N: N0)) {
13729 EVT VT = N->getValueType(ResNo: 0);
13730 return getCanonicalConstantFP(DAG, SL: SDLoc(N), VT, C: CFP->getValueAPF());
13731 }
13732
13733 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
13734 // (fcanonicalize k)
13735 //
13736 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
13737
13738 // TODO: This could be better with wider vectors that will be split to v2f16,
13739 // and to consider uses since there aren't that many packed operations.
13740 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
13741 isTypeLegal(VT: MVT::v2f16)) {
13742 SDLoc SL(N);
13743 SDValue NewElts[2];
13744 SDValue Lo = N0.getOperand(i: 0);
13745 SDValue Hi = N0.getOperand(i: 1);
13746 EVT EltVT = Lo.getValueType();
13747
13748 if (vectorEltWillFoldAway(Op: Lo) || vectorEltWillFoldAway(Op: Hi)) {
13749 for (unsigned I = 0; I != 2; ++I) {
13750 SDValue Op = N0.getOperand(i: I);
13751 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
13752 NewElts[I] =
13753 getCanonicalConstantFP(DAG, SL, VT: EltVT, C: CFP->getValueAPF());
13754 } else if (Op.isUndef()) {
13755 // Handled below based on what the other operand is.
13756 NewElts[I] = Op;
13757 } else {
13758 NewElts[I] = DAG.getNode(Opcode: ISD::FCANONICALIZE, DL: SL, VT: EltVT, Operand: Op);
13759 }
13760 }
13761
13762 // If one half is undef, and one is constant, prefer a splat vector rather
13763 // than the normal qNaN. If it's a register, prefer 0.0 since that's
13764 // cheaper to use and may be free with a packed operation.
      if (NewElts[0].isUndef()) {
        NewElts[0] = isa<ConstantFPSDNode>(Val: NewElts[1])
                         ? NewElts[1]
                         : DAG.getConstantFP(Val: 0.0f, DL: SL, VT: EltVT);
      }
13771
13772 if (NewElts[1].isUndef()) {
13773 NewElts[1] = isa<ConstantFPSDNode>(Val: NewElts[0])
13774 ? NewElts[0]
13775 : DAG.getConstantFP(Val: 0.0f, DL: SL, VT: EltVT);
13776 }
13777
13778 return DAG.getBuildVector(VT, DL: SL, Ops: NewElts);
13779 }
13780 }
13781
13782 return SDValue();
13783}
13784
13785static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13786 switch (Opc) {
13787 case ISD::FMAXNUM:
13788 case ISD::FMAXNUM_IEEE:
13789 case ISD::FMAXIMUMNUM:
13790 return AMDGPUISD::FMAX3;
13791 case ISD::FMAXIMUM:
13792 return AMDGPUISD::FMAXIMUM3;
13793 case ISD::SMAX:
13794 return AMDGPUISD::SMAX3;
13795 case ISD::UMAX:
13796 return AMDGPUISD::UMAX3;
13797 case ISD::FMINNUM:
13798 case ISD::FMINNUM_IEEE:
13799 case ISD::FMINIMUMNUM:
13800 return AMDGPUISD::FMIN3;
13801 case ISD::FMINIMUM:
13802 return AMDGPUISD::FMINIMUM3;
13803 case ISD::SMIN:
13804 return AMDGPUISD::SMIN3;
13805 case ISD::UMIN:
13806 return AMDGPUISD::UMIN3;
13807 default:
13808 llvm_unreachable("Not a min/max opcode");
13809 }
13810}
13811
13812SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13813 const SDLoc &SL, SDValue Src,
13814 SDValue MinVal,
13815 SDValue MaxVal,
13816 bool Signed) const {
13817
13818 // med3 comes from
13819 // min(max(x, K0), K1), K0 < K1
13820 // max(min(x, K0), K1), K1 < K0
13821 //
13822 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13823 // min/max op.
13824 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(Val&: MinVal);
13825 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(Val&: MaxVal);
13826
13827 if (!MinK || !MaxK)
13828 return SDValue();
13829
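  // Only fold when MaxVal < MinVal (recall MaxVal is K0 and MinVal is K1 in
  // the patterns above); otherwise the pattern does not describe a med3 range.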
13830 if (Signed) {
13831 if (MaxK->getAPIntValue().sge(RHS: MinK->getAPIntValue()))
13832 return SDValue();
13833 } else {
13834 if (MaxK->getAPIntValue().uge(RHS: MinK->getAPIntValue()))
13835 return SDValue();
13836 }
13837
13838 EVT VT = MinK->getValueType(ResNo: 0);
13839 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13840 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13841 return DAG.getNode(Opcode: Med3Opc, DL: SL, VT, N1: Src, N2: MaxVal, N3: MinVal);
13842
13843 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13844 // not available, but this is unlikely to be profitable as constants
13845 // will often need to be materialized & extended, especially on
13846 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13847 return SDValue();
13848}
13849
13850static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13851 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op))
13852 return C;
13853
13854 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Val&: Op)) {
13855 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13856 return C;
13857 }
13858
13859 return nullptr;
13860}
13861
13862SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13863 const SDLoc &SL, SDValue Op0,
13864 SDValue Op1) const {
13865 ConstantFPSDNode *K1 = getSplatConstantFP(Op: Op1);
13866 if (!K1)
13867 return SDValue();
13868
13869 ConstantFPSDNode *K0 = getSplatConstantFP(Op: Op0.getOperand(i: 1));
13870 if (!K0)
13871 return SDValue();
13872
13873 // Ordered >= (although NaN inputs should have folded away by now).
13874 if (K0->getValueAPF() > K1->getValueAPF())
13875 return SDValue();
13876
13877 // med3 with a nan input acts like
13878 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
13879 //
13880 // So the result depends on whether the IEEE mode bit is enabled or not with a
13881 // signaling nan input.
13882 // ieee=1
13883 // s0 snan: yields s2
13884 // s1 snan: yields s2
13885 // s2 snan: qnan
13886
13887 // s0 qnan: min(s1, s2)
13888 // s1 qnan: min(s0, s2)
13889 // s2 qnan: min(s0, s1)
13890
13891 // ieee=0
13892 // s0 snan: min(s1, s2)
13893 // s1 snan: min(s0, s2)
13894 // s2 snan: qnan
13895
13896 // s0 qnan: min(s1, s2)
13897 // s1 qnan: min(s0, s2)
13898 // s2 qnan: min(s0, s1)
13899 const MachineFunction &MF = DAG.getMachineFunction();
13900 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13901
  // TODO: Check whether the IEEE bit is enabled. With IEEE=0 we can form fmed3
  // regardless of whether the input is a signaling nan if op0 is fmaximum or
  // fmaximumnum. If op0 is fmaxnum_ieee, we can only form it when IEEE=1.
13905 EVT VT = Op0.getValueType();
13906 if (Info->getMode().DX10Clamp) {
13907 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13908 // hardware fmed3 behavior converting to a min.
13909 // FIXME: Should this be allowing -0.0?
13910 if (K1->isExactlyValue(V: 1.0) && K0->isExactlyValue(V: 0.0))
13911 return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Op0.getOperand(i: 0));
13912 }
13913
13914 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13915 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13916 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13917 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13918 // then give the other result, which is different from med3 with a NaN
13919 // input.
13920 SDValue Var = Op0.getOperand(i: 0);
13921 if (!DAG.isKnownNeverSNaN(Op: Var))
13922 return SDValue();
13923
13924 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13925
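    // Only fold if each constant is an inline immediate or already has other
    // uses; a single-use literal that is not inlineable would likely have to
    // be materialized in a register just for the med3 (VOP3 could not take
    // literal operands before GFX10).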
13926 if ((!K0->hasOneUse() || TII->isInlineConstant(Imm: K0->getValueAPF())) &&
13927 (!K1->hasOneUse() || TII->isInlineConstant(Imm: K1->getValueAPF()))) {
13928 return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT: K0->getValueType(ResNo: 0), N1: Var,
13929 N2: SDValue(K0, 0), N3: SDValue(K1, 0));
13930 }
13931 }
13932
13933 return SDValue();
13934}
13935
13936/// \return true if the subtarget supports minimum3 and maximum3 with the given
13937/// base min/max opcode \p Opc for type \p VT.
13938static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13939 EVT VT) {
13940 switch (Opc) {
13941 case ISD::FMINNUM:
13942 case ISD::FMAXNUM:
13943 case ISD::FMINNUM_IEEE:
13944 case ISD::FMAXNUM_IEEE:
13945 case ISD::FMINIMUMNUM:
13946 case ISD::FMAXIMUMNUM:
13947 case AMDGPUISD::FMIN_LEGACY:
13948 case AMDGPUISD::FMAX_LEGACY:
13949 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13950 case ISD::FMINIMUM:
13951 case ISD::FMAXIMUM:
13952 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
13953 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
13954 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
13955 case ISD::SMAX:
13956 case ISD::SMIN:
13957 case ISD::UMAX:
13958 case ISD::UMIN:
13959 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13960 default:
13961 return false;
13962 }
13963
13964 llvm_unreachable("not a min/max opcode");
13965}
13966
13967SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13968 DAGCombinerInfo &DCI) const {
13969 SelectionDAG &DAG = DCI.DAG;
13970
13971 EVT VT = N->getValueType(ResNo: 0);
13972 unsigned Opc = N->getOpcode();
13973 SDValue Op0 = N->getOperand(Num: 0);
13974 SDValue Op1 = N->getOperand(Num: 1);
13975
  // Only do this if the inner op has one use, since otherwise this just
  // increases register pressure for no benefit.
13978
13979 if (supportsMin3Max3(Subtarget: *Subtarget, Opc, VT)) {
13980 // max(max(a, b), c) -> max3(a, b, c)
13981 // min(min(a, b), c) -> min3(a, b, c)
13982 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13983 SDLoc DL(N);
13984 return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc), DL, VT: N->getValueType(ResNo: 0),
13985 N1: Op0.getOperand(i: 0), N2: Op0.getOperand(i: 1), N3: Op1);
13986 }
13987
13988 // Try commuted.
13989 // max(a, max(b, c)) -> max3(a, b, c)
13990 // min(a, min(b, c)) -> min3(a, b, c)
13991 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13992 SDLoc DL(N);
13993 return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc), DL, VT: N->getValueType(ResNo: 0),
13994 N1: Op0, N2: Op1.getOperand(i: 0), N3: Op1.getOperand(i: 1));
13995 }
13996 }
13997
13998 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13999 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
14000 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
14001 if (SDValue Med3 = performIntMed3ImmCombine(
14002 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op1, MaxVal: Op0->getOperand(Num: 1), Signed: true))
14003 return Med3;
14004 }
14005 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
14006 if (SDValue Med3 = performIntMed3ImmCombine(
14007 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op0->getOperand(Num: 1), MaxVal: Op1, Signed: true))
14008 return Med3;
14009 }
14010
14011 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
14012 if (SDValue Med3 = performIntMed3ImmCombine(
14013 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op1, MaxVal: Op0->getOperand(Num: 1), Signed: false))
14014 return Med3;
14015 }
14016 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
14017 if (SDValue Med3 = performIntMed3ImmCombine(
14018 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op0->getOperand(Num: 1), MaxVal: Op1, Signed: false))
14019 return Med3;
14020 }
14021
14022 // if !is_snan(x):
14023 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14024 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14025 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14026 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14027 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14028 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14029 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14030 (Opc == AMDGPUISD::FMIN_LEGACY &&
14031 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
14032 (VT == MVT::f32 || VT == MVT::f64 ||
14033 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14034 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14035 Op0.hasOneUse()) {
14036 if (SDValue Res = performFPMed3ImmCombine(DAG, SL: SDLoc(N), Op0, Op1))
14037 return Res;
14038 }
14039
14040 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
14041 // for some types, but at a higher cost since it's implemented with a 3
14042 // operand form.
14043 const SDNodeFlags Flags = N->getFlags();
14044 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14045 !Subtarget->hasIEEEMinMax() && Flags.hasNoNaNs()) {
14046 unsigned NewOpc =
14047 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
14048 return DAG.getNode(Opcode: NewOpc, DL: SDLoc(N), VT, N1: Op0, N2: Op1, Flags);
14049 }
14050
14051 return SDValue();
14052}
14053
14054static bool isClampZeroToOne(SDValue A, SDValue B) {
14055 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(Val&: A)) {
14056 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(Val&: B)) {
14057 // FIXME: Should this be allowing -0.0?
14058 return (CA->isExactlyValue(V: 0.0) && CB->isExactlyValue(V: 1.0)) ||
14059 (CA->isExactlyValue(V: 1.0) && CB->isExactlyValue(V: 0.0));
14060 }
14061 }
14062
14063 return false;
14064}
14065
14066// FIXME: Should only worry about snans for version with chain.
14067SDValue SITargetLowering::performFMed3Combine(SDNode *N,
14068 DAGCombinerInfo &DCI) const {
14069 EVT VT = N->getValueType(ResNo: 0);
14070 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
14071 // NaNs. With a NaN input, the order of the operands may change the result.
14072
14073 SelectionDAG &DAG = DCI.DAG;
14074 SDLoc SL(N);
14075
14076 SDValue Src0 = N->getOperand(Num: 0);
14077 SDValue Src1 = N->getOperand(Num: 1);
14078 SDValue Src2 = N->getOperand(Num: 2);
14079
14080 if (isClampZeroToOne(A: Src0, B: Src1)) {
14081 // const_a, const_b, x -> clamp is safe in all cases including signaling
14082 // nans.
14083 // FIXME: Should this be allowing -0.0?
14084 return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src2);
14085 }
14086
14087 const MachineFunction &MF = DAG.getMachineFunction();
14088 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14089
14090 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
14091 // handling no dx10-clamp?
14092 if (Info->getMode().DX10Clamp) {
    // If NaNs are clamped to 0, we are free to reorder the inputs.
14094
14095 if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1))
14096 std::swap(a&: Src0, b&: Src1);
14097
14098 if (isa<ConstantFPSDNode>(Val: Src1) && !isa<ConstantFPSDNode>(Val: Src2))
14099 std::swap(a&: Src1, b&: Src2);
14100
14101 if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1))
14102 std::swap(a&: Src0, b&: Src1);
14103
14104 if (isClampZeroToOne(A: Src1, B: Src2))
14105 return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src0);
14106 }
14107
14108 return SDValue();
14109}
14110
14111SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
14112 DAGCombinerInfo &DCI) const {
14113 SDValue Src0 = N->getOperand(Num: 0);
14114 SDValue Src1 = N->getOperand(Num: 1);
14115 if (Src0.isUndef() && Src1.isUndef())
14116 return DCI.DAG.getUNDEF(VT: N->getValueType(ResNo: 0));
14117 return SDValue();
14118}
14119
14120// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
14121// expanded into a set of cmp/select instructions.
14122bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
14123 unsigned NumElem,
14124 bool IsDivergentIdx,
14125 const GCNSubtarget *Subtarget) {
14126 if (UseDivergentRegisterIndexing)
14127 return false;
14128
14129 unsigned VecSize = EltSize * NumElem;
14130
  // Sub-dword vectors no larger than two dwords have a better implementation.
14132 if (VecSize <= 64 && EltSize < 32)
14133 return false;
14134
  // Always expand the remaining sub-dword cases, otherwise they will be
  // lowered via memory.
14137 if (EltSize < 32)
14138 return true;
14139
14140 // Always do this if var-idx is divergent, otherwise it will become a loop.
14141 if (IsDivergentIdx)
14142 return true;
14143
14144 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
14145 unsigned NumInsts = NumElem /* Number of compares */ +
14146 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
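  // For example, a dynamic extract from v4i64 costs 4 compares plus 2 * 4
  // cndmasks, i.e. NumInsts == 12.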
14147
14148 // On some architectures (GFX9) movrel is not available and it's better
14149 // to expand.
14150 if (Subtarget->useVGPRIndexMode())
14151 return NumInsts <= 16;
14152
14153 // If movrel is available, use it instead of expanding for vector of 8
14154 // elements.
14155 if (Subtarget->hasMovrel())
14156 return NumInsts <= 15;
14157
14158 return true;
14159}
14160
14161bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
14162 SDValue Idx = N->getOperand(Num: N->getNumOperands() - 1);
14163 if (isa<ConstantSDNode>(Val: Idx))
14164 return false;
14165
14166 SDValue Vec = N->getOperand(Num: 0);
14167 EVT VecVT = Vec.getValueType();
14168 EVT EltVT = VecVT.getVectorElementType();
14169 unsigned EltSize = EltVT.getSizeInBits();
14170 unsigned NumElem = VecVT.getVectorNumElements();
14171
14172 return SITargetLowering::shouldExpandVectorDynExt(
14173 EltSize, NumElem, IsDivergentIdx: Idx->isDivergent(), Subtarget: getSubtarget());
14174}
14175
14176SDValue
14177SITargetLowering::performExtractVectorEltCombine(SDNode *N,
14178 DAGCombinerInfo &DCI) const {
14179 SDValue Vec = N->getOperand(Num: 0);
14180 SelectionDAG &DAG = DCI.DAG;
14181
14182 EVT VecVT = Vec.getValueType();
14183 EVT VecEltVT = VecVT.getVectorElementType();
14184 EVT ResVT = N->getValueType(ResNo: 0);
14185
14186 unsigned VecSize = VecVT.getSizeInBits();
14187 unsigned VecEltSize = VecEltVT.getSizeInBits();
14188
14189 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
14190 allUsesHaveSourceMods(N)) {
14191 SDLoc SL(N);
14192 SDValue Idx = N->getOperand(Num: 1);
14193 SDValue Elt =
14194 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec.getOperand(i: 0), N2: Idx);
14195 return DAG.getNode(Opcode: Vec.getOpcode(), DL: SL, VT: ResVT, Operand: Elt);
14196 }
14197
14198 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
14199 // =>
14200 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
14201 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
14202 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
14203 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
14204 SDLoc SL(N);
14205 SDValue Idx = N->getOperand(Num: 1);
14206 unsigned Opc = Vec.getOpcode();
14207
14208 switch (Opc) {
14209 default:
14210 break;
14211 // TODO: Support other binary operations.
14212 case ISD::FADD:
14213 case ISD::FSUB:
14214 case ISD::FMUL:
14215 case ISD::ADD:
14216 case ISD::UMIN:
14217 case ISD::UMAX:
14218 case ISD::SMIN:
14219 case ISD::SMAX:
14220 case ISD::FMAXNUM:
14221 case ISD::FMINNUM:
14222 case ISD::FMAXNUM_IEEE:
14223 case ISD::FMINNUM_IEEE:
14224 case ISD::FMAXIMUM:
14225 case ISD::FMINIMUM: {
14226 SDValue Elt0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT,
14227 N1: Vec.getOperand(i: 0), N2: Idx);
14228 SDValue Elt1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT,
14229 N1: Vec.getOperand(i: 1), N2: Idx);
14230
14231 DCI.AddToWorklist(N: Elt0.getNode());
14232 DCI.AddToWorklist(N: Elt1.getNode());
14233 return DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT, N1: Elt0, N2: Elt1, Flags: Vec->getFlags());
14234 }
14235 }
14236 }
14237
14238 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
14239 if (shouldExpandVectorDynExt(N)) {
14240 SDLoc SL(N);
14241 SDValue Idx = N->getOperand(Num: 1);
14242 SDValue V;
14243 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
14244 SDValue IC = DAG.getVectorIdxConstant(Val: I, DL: SL);
14245 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec, N2: IC);
14246 if (I == 0)
14247 V = Elt;
14248 else
14249 V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Elt, False: V, Cond: ISD::SETEQ);
14250 }
14251 return V;
14252 }
14253
14254 if (!DCI.isBeforeLegalize())
14255 return SDValue();
14256
14257 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
14258 // elements. This exposes more load reduction opportunities by replacing
14259 // multiple small extract_vector_elements with a single 32-bit extract.
14260 auto *Idx = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
14261 if (isa<MemSDNode>(Val: Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
14262 VecSize > 32 && VecSize % 32 == 0 && Idx) {
14263 EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: VecVT);
14264
14265 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
14266 unsigned EltIdx = BitIndex / 32;
14267 unsigned LeftoverBitIdx = BitIndex % 32;
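    // For example, extracting element 5 of a loaded v8i16 gives BitIndex = 80,
    // so we read dword 2 of the bitcast value and shift right by 16 bits.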
14268 SDLoc SL(N);
14269
14270 SDValue Cast = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: Vec);
14271 DCI.AddToWorklist(N: Cast.getNode());
14272
14273 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Cast,
14274 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
14275 DCI.AddToWorklist(N: Elt.getNode());
14276 SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: Elt,
14277 N2: DAG.getConstant(Val: LeftoverBitIdx, DL: SL, VT: MVT::i32));
14278 DCI.AddToWorklist(N: Srl.getNode());
14279
14280 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
14281 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: VecEltAsIntVT, Operand: Srl);
14282 DCI.AddToWorklist(N: Trunc.getNode());
14283
14284 if (VecEltVT == ResVT) {
14285 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecEltVT, Operand: Trunc);
14286 }
14287
14288 assert(ResVT.isScalarInteger());
14289 return DAG.getAnyExtOrTrunc(Op: Trunc, DL: SL, VT: ResVT);
14290 }
14291
14292 return SDValue();
14293}
14294
14295SDValue
14296SITargetLowering::performInsertVectorEltCombine(SDNode *N,
14297 DAGCombinerInfo &DCI) const {
14298 SDValue Vec = N->getOperand(Num: 0);
14299 SDValue Idx = N->getOperand(Num: 2);
14300 EVT VecVT = Vec.getValueType();
14301 EVT EltVT = VecVT.getVectorElementType();
14302
14303 // INSERT_VECTOR_ELT (<n x e>, var-idx)
14304 // => BUILD_VECTOR n x select (e, const-idx)
14305 if (!shouldExpandVectorDynExt(N))
14306 return SDValue();
14307
14308 SelectionDAG &DAG = DCI.DAG;
14309 SDLoc SL(N);
14310 SDValue Ins = N->getOperand(Num: 1);
14311 EVT IdxVT = Idx.getValueType();
14312
14313 SmallVector<SDValue, 16> Ops;
14314 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
14315 SDValue IC = DAG.getConstant(Val: I, DL: SL, VT: IdxVT);
14316 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec, N2: IC);
14317 SDValue V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Ins, False: Elt, Cond: ISD::SETEQ);
14318 Ops.push_back(Elt: V);
14319 }
14320
14321 return DAG.getBuildVector(VT: VecVT, DL: SL, Ops);
14322}
14323
14324/// Return the source of an fp_extend from f16 to f32, or a converted FP
14325/// constant.
14326static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
14327 if (Src.getOpcode() == ISD::FP_EXTEND &&
14328 Src.getOperand(i: 0).getValueType() == MVT::f16) {
14329 return Src.getOperand(i: 0);
14330 }
14331
14332 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Src)) {
14333 APFloat Val = CFP->getValueAPF();
14334 bool LosesInfo = true;
14335 Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
14336 if (!LosesInfo)
14337 return DAG.getConstantFP(Val, DL: SDLoc(Src), VT: MVT::f16);
14338 }
14339
14340 return SDValue();
14341}
14342
14343SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
14344 DAGCombinerInfo &DCI) const {
14345 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
14346 "combine only useful on gfx8");
14347
14348 SDValue TruncSrc = N->getOperand(Num: 0);
14349 EVT VT = N->getValueType(ResNo: 0);
14350 if (VT != MVT::f16)
14351 return SDValue();
14352
14353 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
14354 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
14355 return SDValue();
14356
14357 SelectionDAG &DAG = DCI.DAG;
14358 SDLoc SL(N);
14359
14360 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
14361 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
14362 // casting back.
14363
14364 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
14365 // fmin(fmax(a, b), fmax(fmin(a, b), c))
14366 SDValue A = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 0));
14367 if (!A)
14368 return SDValue();
14369
14370 SDValue B = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 1));
14371 if (!B)
14372 return SDValue();
14373
14374 SDValue C = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 2));
14375 if (!C)
14376 return SDValue();
14377
14378 // This changes signaling nan behavior. If an input is a signaling nan, it
14379 // would have been quieted by the fpext originally. We don't care because
14380 // these are unconstrained ops. If we needed to insert quieting canonicalizes
14381 // we would be worse off than just doing the promotion.
14382 SDValue A1 = DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: A, N2: B);
14383 SDValue B1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A, N2: B);
14384 SDValue C1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A1, N2: C);
14385 return DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: B1, N2: C1);
14386}
14387
14388unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
14389 const SDNode *N0,
14390 const SDNode *N1) const {
14391 EVT VT = N0->getValueType(ResNo: 0);
14392
14393 // Only do this if we are not trying to support denormals. v_mad_f32 does not
14394 // support denormals ever.
14395 if (((VT == MVT::f32 &&
14396 denormalModeIsFlushAllF32(MF: DAG.getMachineFunction())) ||
14397 (VT == MVT::f16 && Subtarget->hasMadF16() &&
14398 denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction()))) &&
14399 isOperationLegal(Op: ISD::FMAD, VT))
14400 return ISD::FMAD;
14401
14402 const TargetOptions &Options = DAG.getTarget().Options;
14403 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14404 (N0->getFlags().hasAllowContract() &&
14405 N1->getFlags().hasAllowContract())) &&
14406 isFMAFasterThanFMulAndFAdd(MF: DAG.getMachineFunction(), VT)) {
14407 return ISD::FMA;
14408 }
14409
14410 return 0;
14411}
14412
14413// For a reassociatable opcode perform:
14414// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
14415SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
14416 SelectionDAG &DAG) const {
14417 EVT VT = N->getValueType(ResNo: 0);
14418 if (VT != MVT::i32 && VT != MVT::i64)
14419 return SDValue();
14420
14421 if (DAG.isBaseWithConstantOffset(Op: SDValue(N, 0)))
14422 return SDValue();
14423
14424 unsigned Opc = N->getOpcode();
14425 SDValue Op0 = N->getOperand(Num: 0);
14426 SDValue Op1 = N->getOperand(Num: 1);
14427
14428 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
14429 return SDValue();
14430
14431 if (Op0->isDivergent())
14432 std::swap(a&: Op0, b&: Op1);
14433
14434 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
14435 return SDValue();
14436
14437 SDValue Op2 = Op1.getOperand(i: 1);
14438 Op1 = Op1.getOperand(i: 0);
14439 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
14440 return SDValue();
14441
14442 if (Op1->isDivergent())
14443 std::swap(a&: Op1, b&: Op2);
14444
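  // At this point Op0 and Op1 are the two uniform values and Op2 is the
  // divergent one; group the uniform operands first so the inner op can be
  // kept scalar.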
14445 SDLoc SL(N);
14446 SDValue Add1 = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Op0, N2: Op1);
14447 return DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Add1, N2: Op2);
14448}
14449
14450static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
14451 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
14452 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
14453 SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT::i1);
14454 SDValue Mad = DAG.getNode(Opcode: MadOpc, DL: SL, VTList: VTs, N1: N0, N2: N1, N3: N2);
14455 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Mad);
14456}
14457
14458// Fold
14459// y = lshr i64 x, 32
14460// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
14461// with Const.hi == -1
14462// To
//   res = mad_u64_u32 y.lo, Const.lo, x.lo
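//
// This holds because x == (y << 32) + x.lo and Const.hi == -1 implies
// Const == Const.lo - (1 << 32) (mod 2^64), so
//   y * Const + x == y * Const.lo - (y << 32) + (y << 32) + x.lo
//                 == y * Const.lo + x.lo.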
14464static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
14465 SDValue MulLHS, SDValue MulRHS,
14466 SDValue AddRHS) {
14467 if (MulRHS.getOpcode() == ISD::SRL)
14468 std::swap(a&: MulLHS, b&: MulRHS);
14469
14470 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
14471 return SDValue();
14472
14473 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(Val: MulLHS.getOperand(i: 1));
14474 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
14475 MulLHS.getOperand(i: 0) != AddRHS)
14476 return SDValue();
14477
14478 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Val: MulRHS.getNode());
14479 if (!Const || Hi_32(Value: Const->getZExtValue()) != uint32_t(-1))
14480 return SDValue();
14481
14482 SDValue ConstMul =
14483 DAG.getConstant(Val: Lo_32(Value: Const->getZExtValue()), DL: SL, VT: MVT::i32);
14484 return getMad64_32(DAG, SL, VT: MVT::i64,
14485 N0: DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulLHS), N1: ConstMul,
14486 N2: DAG.getZeroExtendInReg(Op: AddRHS, DL: SL, VT: MVT::i32), Signed: false);
14487}
14488
14489// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
14490// multiplies, if any.
14491//
14492// Full 64-bit multiplies that feed into an addition are lowered here instead
14493// of using the generic expansion. The generic expansion ends up with
14494// a tree of ADD nodes that prevents us from using the "add" part of the
14495// MAD instruction. The expansion produced here results in a chain of ADDs
14496// instead of a tree.
14497SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
14498 DAGCombinerInfo &DCI) const {
14499 assert(N->getOpcode() == ISD::ADD);
14500
14501 SelectionDAG &DAG = DCI.DAG;
14502 EVT VT = N->getValueType(ResNo: 0);
14503 SDLoc SL(N);
14504 SDValue LHS = N->getOperand(Num: 0);
14505 SDValue RHS = N->getOperand(Num: 1);
14506
14507 if (VT.isVector())
14508 return SDValue();
14509
14510 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
14511 // result in scalar registers for uniform values.
14512 if (!N->isDivergent() && Subtarget->hasSMulHi())
14513 return SDValue();
14514
14515 unsigned NumBits = VT.getScalarSizeInBits();
14516 if (NumBits <= 32 || NumBits > 64)
14517 return SDValue();
14518
14519 if (LHS.getOpcode() != ISD::MUL) {
14520 assert(RHS.getOpcode() == ISD::MUL);
14521 std::swap(a&: LHS, b&: RHS);
14522 }
14523
14524 // Avoid the fold if it would unduly increase the number of multiplies due to
14525 // multiple uses, except on hardware with full-rate multiply-add (which is
14526 // part of full-rate 64-bit ops).
14527 if (!Subtarget->hasFullRate64Ops()) {
14528 unsigned NumUsers = 0;
14529 for (SDNode *User : LHS->users()) {
14530 // There is a use that does not feed into addition, so the multiply can't
14531 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
14532 if (User->getOpcode() != ISD::ADD)
14533 return SDValue();
14534
14535 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
14536 // MUL + 3xADD + 3xADDC over 3xMAD.
14537 ++NumUsers;
14538 if (NumUsers >= 3)
14539 return SDValue();
14540 }
14541 }
14542
14543 SDValue MulLHS = LHS.getOperand(i: 0);
14544 SDValue MulRHS = LHS.getOperand(i: 1);
14545 SDValue AddRHS = RHS;
14546
14547 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
14548 return FoldedMAD;
14549
14550 // Always check whether operands are small unsigned values, since that
14551 // knowledge is useful in more cases. Check for small signed values only if
14552 // doing so can unlock a shorter code sequence.
14553 bool MulLHSUnsigned32 = numBitsUnsigned(Op: MulLHS, DAG) <= 32;
14554 bool MulRHSUnsigned32 = numBitsUnsigned(Op: MulRHS, DAG) <= 32;
14555
14556 bool MulSignedLo = false;
14557 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
14558 MulSignedLo =
14559 numBitsSigned(Op: MulLHS, DAG) <= 32 && numBitsSigned(Op: MulRHS, DAG) <= 32;
14560 }
14561
14562 // The operands and final result all have the same number of bits. If
14563 // operands need to be extended, they can be extended with garbage. The
14564 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
14565 // truncated away in the end.
14566 if (VT != MVT::i64) {
14567 MulLHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: MulLHS);
14568 MulRHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: MulRHS);
14569 AddRHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: AddRHS);
14570 }
14571
14572 // The basic code generated is conceptually straightforward. Pseudo code:
14573 //
14574 // accum = mad_64_32 lhs.lo, rhs.lo, accum
14575 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
14576 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
14577 //
14578 // The second and third lines are optional, depending on whether the factors
14579 // are {sign,zero}-extended or not.
14580 //
14581 // The actual DAG is noisier than the pseudo code, but only due to
14582 // instructions that disassemble values into low and high parts, and
14583 // assemble the final result.
14584 SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
14585
14586 auto MulLHSLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulLHS);
14587 auto MulRHSLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulRHS);
14588 SDValue Accum =
14589 getMad64_32(DAG, SL, VT: MVT::i64, N0: MulLHSLo, N1: MulRHSLo, N2: AddRHS, Signed: MulSignedLo);
14590
14591 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
14592 auto [AccumLo, AccumHi] = DAG.SplitScalar(N: Accum, DL: SL, LoVT: MVT::i32, HiVT: MVT::i32);
14593
14594 if (!MulLHSUnsigned32) {
14595 auto MulLHSHi =
14596 DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: SL, VT: MVT::i32, N1: MulLHS, N2: One);
14597 SDValue MulHi = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT: MVT::i32, N1: MulLHSHi, N2: MulRHSLo);
14598 AccumHi = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: MulHi, N2: AccumHi);
14599 }
14600
14601 if (!MulRHSUnsigned32) {
14602 auto MulRHSHi =
14603 DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: SL, VT: MVT::i32, N1: MulRHS, N2: One);
14604 SDValue MulHi = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT: MVT::i32, N1: MulLHSLo, N2: MulRHSHi);
14605 AccumHi = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: MulHi, N2: AccumHi);
14606 }
14607
14608 Accum = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {AccumLo, AccumHi});
14609 Accum = DAG.getBitcast(VT: MVT::i64, V: Accum);
14610 }
14611
14612 if (VT != MVT::i64)
14613 Accum = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Accum);
14614 return Accum;
14615}
14616
14617SDValue
14618SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
14619 DAGCombinerInfo &DCI) const {
14620 SDValue RHS = N->getOperand(Num: 1);
14621 auto *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
14622 if (!CRHS)
14623 return SDValue();
14624
14625 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
14626 // common.
14627 uint64_t Val = CRHS->getZExtValue();
14628 if (countr_zero(Val) >= 32) {
14629 SelectionDAG &DAG = DCI.DAG;
14630 SDLoc SL(N);
14631 SDValue LHS = N->getOperand(Num: 0);
14632
14633 // Avoid carry machinery if we know the low half of the add does not
14634 // contribute to the final result.
14635 //
14636 // add i64:x, K if computeTrailingZeros(K) >= 32
14637 // => build_pair (add x.hi, K.hi), x.lo
14638
14639 // Breaking the 64-bit add here with this strange constant is unlikely
14640 // to interfere with addressing mode patterns.
14641
14642 SDValue Hi = getHiHalf64(Op: LHS, DAG);
14643 SDValue ConstHi32 = DAG.getConstant(Val: Hi_32(Value: Val), DL: SL, VT: MVT::i32);
14644 SDValue AddHi =
14645 DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: MVT::i32, N1: Hi, N2: ConstHi32, Flags: N->getFlags());
14646
14647 SDValue Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: LHS);
14648 return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SL, VT: MVT::i64, N1: Lo, N2: AddHi);
14649 }
14650
14651 return SDValue();
14652}
14653
// Collect the ultimate src of each of the mul node's operands, and confirm
// each operand is an 8-bit value.
14656static std::optional<ByteProvider<SDValue>>
14657handleMulOperand(const SDValue &MulOperand) {
14658 auto Byte0 = calculateByteProvider(Op: MulOperand, Index: 0, Depth: 0);
14659 if (!Byte0 || Byte0->isConstantZero()) {
14660 return std::nullopt;
14661 }
14662 auto Byte1 = calculateByteProvider(Op: MulOperand, Index: 1, Depth: 0);
14663 if (Byte1 && !Byte1->isConstantZero()) {
14664 return std::nullopt;
14665 }
14666 return Byte0;
14667}
14668
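// Merge two v_perm select masks in which a byte value of 0x0c means "select a
// constant zero". Non-zero selects from either mask are kept, and a byte stays
// 0x0c only if both masks have 0x0c there; the asserts check that no byte
// position carries a real select in both masks. For example, merging
// 0x0c0c0c01 with 0x0c0c020c yields 0x0c0c0201.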
14669static unsigned addPermMasks(unsigned First, unsigned Second) {
14670 unsigned FirstCs = First & 0x0c0c0c0c;
14671 unsigned SecondCs = Second & 0x0c0c0c0c;
14672 unsigned FirstNoCs = First & ~0x0c0c0c0c;
14673 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
14674
14675 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
14676 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
14677 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
14678 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
14679
14680 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
14681}
14682
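// One dword-sized piece of a dot4 operand: dword DWordOffset of SrcOp,
// together with the v_perm mask that places (or zeroes) its bytes.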
14683struct DotSrc {
14684 SDValue SrcOp;
14685 int64_t PermMask;
14686 int64_t DWordOffset;
14687};
14688
14689static void placeSources(ByteProvider<SDValue> &Src0,
14690 ByteProvider<SDValue> &Src1,
14691 SmallVectorImpl<DotSrc> &Src0s,
14692 SmallVectorImpl<DotSrc> &Src1s, int Step) {
14693
14694 assert(Src0.Src.has_value() && Src1.Src.has_value());
14695 // Src0s and Src1s are empty, just place arbitrarily.
14696 if (Step == 0) {
14697 Src0s.push_back(Elt: {.SrcOp: *Src0.Src, .PermMask: ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
14698 .DWordOffset: Src0.SrcOffset / 4});
14699 Src1s.push_back(Elt: {.SrcOp: *Src1.Src, .PermMask: ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
14700 .DWordOffset: Src1.SrcOffset / 4});
14701 return;
14702 }
14703
14704 for (int BPI = 0; BPI < 2; BPI++) {
14705 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
14706 if (BPI == 1) {
14707 BPP = {Src1, Src0};
14708 }
14709 unsigned ZeroMask = 0x0c0c0c0c;
14710 unsigned FMask = 0xFF << (8 * (3 - Step));
14711
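    // Build masks that place the selected source byte (SrcOffset % 4) into
    // byte (3 - Step) of the dword and select zero (0x0c) everywhere else.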
14712 unsigned FirstMask =
14713 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
14714 unsigned SecondMask =
14715 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    // Attempt to find the source vector that already contains our SDValue; if
    // found, merge our perm mask into the existing one. If we are unable to
    // find a match for the first SDValue, attempt to find a match for the
    // second.
14719 int FirstGroup = -1;
14720 for (int I = 0; I < 2; I++) {
14721 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
14722 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
14723 return IterElt.SrcOp == *BPP.first.Src &&
14724 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
14725 };
14726
14727 auto *Match = llvm::find_if(Range&: Srcs, P: MatchesFirst);
14728 if (Match != Srcs.end()) {
14729 Match->PermMask = addPermMasks(First: FirstMask, Second: Match->PermMask);
14730 FirstGroup = I;
14731 break;
14732 }
14733 }
14734 if (FirstGroup != -1) {
14735 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
14736 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
14737 return IterElt.SrcOp == *BPP.second.Src &&
14738 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
14739 };
14740 auto *Match = llvm::find_if(Range&: Srcs, P: MatchesSecond);
14741 if (Match != Srcs.end()) {
14742 Match->PermMask = addPermMasks(First: SecondMask, Second: Match->PermMask);
14743 } else
14744 Srcs.push_back(Elt: {.SrcOp: *BPP.second.Src, .PermMask: SecondMask, .DWordOffset: BPP.second.SrcOffset / 4});
14745 return;
14746 }
14747 }
14748
14749 // If we have made it here, then we could not find a match in Src0s or Src1s
14750 // for either Src0 or Src1, so just place them arbitrarily.
14751
14752 unsigned ZeroMask = 0x0c0c0c0c;
14753 unsigned FMask = 0xFF << (8 * (3 - Step));
14754
14755 Src0s.push_back(
14756 Elt: {.SrcOp: *Src0.Src,
14757 .PermMask: ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14758 .DWordOffset: Src0.SrcOffset / 4});
14759 Src1s.push_back(
14760 Elt: {.SrcOp: *Src1.Src,
14761 .PermMask: ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
14762 .DWordOffset: Src1.SrcOffset / 4});
14763}
14764
14765static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
14766 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
14767 bool IsAny) {
14768
14769 // If we just have one source, just permute it accordingly.
14770 if (Srcs.size() == 1) {
14771 auto *Elt = Srcs.begin();
14772 auto EltOp = getDWordFromOffset(DAG, SL, Src: Elt->SrcOp, DWordOffset: Elt->DWordOffset);
14773
14774 // v_perm will produce the original value
14775 if (Elt->PermMask == 0x3020100)
14776 return EltOp;
14777
14778 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: EltOp, N2: EltOp,
14779 N3: DAG.getConstant(Val: Elt->PermMask, DL: SL, VT: MVT::i32));
14780 }
14781
14782 auto *FirstElt = Srcs.begin();
14783 auto *SecondElt = std::next(x: FirstElt);
14784
14785 SmallVector<SDValue, 2> Perms;
14786
  // If we have multiple sources in the chain, combine them via perms (using
  // the calculated perm masks) and ORs.
14789 while (true) {
14790 auto FirstMask = FirstElt->PermMask;
14791 auto SecondMask = SecondElt->PermMask;
14792
14793 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
14794 unsigned FirstPlusFour = FirstMask | 0x04040404;
    // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
    // original 0x0C.
14797 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
14798
14799 auto PermMask = addPermMasks(First: FirstMask, Second: SecondMask);
14800 auto FirstVal =
14801 getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
14802 auto SecondVal =
14803 getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp, DWordOffset: SecondElt->DWordOffset);
14804
14805 Perms.push_back(Elt: DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: FirstVal,
14806 N2: SecondVal,
14807 N3: DAG.getConstant(Val: PermMask, DL: SL, VT: MVT::i32)));
14808
14809 FirstElt = std::next(x: SecondElt);
14810 if (FirstElt == Srcs.end())
14811 break;
14812
14813 SecondElt = std::next(x: FirstElt);
14814 // If we only have a FirstElt, then just combine that into the cumulative
14815 // source node.
14816 if (SecondElt == Srcs.end()) {
14817 auto EltOp =
14818 getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
14819
14820 Perms.push_back(
14821 Elt: DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: EltOp, N2: EltOp,
14822 N3: DAG.getConstant(Val: FirstElt->PermMask, DL: SL, VT: MVT::i32)));
14823 break;
14824 }
14825 }
14826
14827 assert(Perms.size() == 1 || Perms.size() == 2);
14828 return Perms.size() == 2
14829 ? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Perms[0], N2: Perms[1])
14830 : Perms[0];
14831}
14832
14833static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
14834 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14835 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14836 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14837 EntryMask += ZeroMask;
14838 }
14839}
14840
14841static bool isMul(const SDValue Op) {
14842 auto Opcode = Op.getOpcode();
14843
14844 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
14845 Opcode == AMDGPUISD::MUL_I24);
14846}
14847
14848static std::optional<bool>
14849checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
14850 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
14851 const SDValue &S1Op, const SelectionDAG &DAG) {
  // If both ops are i8s (pre legalize-dag), then the signedness semantics
  // of the dot4 are irrelevant.
14854 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
14855 return false;
14856
14857 auto Known0 = DAG.computeKnownBits(Op: S0Op, Depth: 0);
14858 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
14859 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14860 auto Known1 = DAG.computeKnownBits(Op: S1Op, Depth: 0);
14861 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
14862 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14863
14864 assert(!(S0IsUnsigned && S0IsSigned));
14865 assert(!(S1IsUnsigned && S1IsSigned));
14866
14867 // There are 9 possible permutations of
14868 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
14869
14870 // In two permutations, the sign bits are known to be the same for both Ops,
14871 // so simply return Signed / Unsigned corresponding to the MSB
14872
14873 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14874 return S0IsSigned;
14875
14876 // In another two permutations, the sign bits are known to be opposite. In
14877 // this case return std::nullopt to indicate a bad match.
14878
14879 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14880 return std::nullopt;
14881
  // In the remaining five permutations, we don't know the value of the sign
  // bit for at least one Op. Since we have a valid ByteProvider, we know that
  // the upper bits must be extension bits. Thus, the only ways for the sign
  // bit to be unknown are if it was sign extended from an unknown value or if
  // it was any-extended. In either case, it is correct to use the signed
  // version of dot4.
14888
  // In two such permutations, we know the sign bit is set for one op and
  // unknown for the other. It is okay to use the signed version of dot4.
14892 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14893 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14894 return true;
14895
  // In one such permutation, we don't know either of the sign bits. It is okay
  // to use the signed version of dot4.
14898 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14899 return true;
14900
  // In two such permutations, we know the sign bit is unset for one op and
  // unknown for the other. Return std::nullopt to indicate a bad match.
14904 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14905 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14906 return std::nullopt;
14907
14908 llvm_unreachable("Fully covered condition");
14909}
14910
14911SDValue SITargetLowering::performAddCombine(SDNode *N,
14912 DAGCombinerInfo &DCI) const {
14913 SelectionDAG &DAG = DCI.DAG;
14914 EVT VT = N->getValueType(ResNo: 0);
14915 SDLoc SL(N);
14916 SDValue LHS = N->getOperand(Num: 0);
14917 SDValue RHS = N->getOperand(Num: 1);
14918
14919 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14920 if (Subtarget->hasMad64_32()) {
14921 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14922 return Folded;
14923 }
14924 }
14925
14926 if (SDValue V = reassociateScalarOps(N, DAG)) {
14927 return V;
14928 }
14929
14930 if (VT == MVT::i64) {
14931 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
14932 return Folded;
14933 }
14934
14935 if ((isMul(Op: LHS) || isMul(Op: RHS)) && Subtarget->hasDot7Insts() &&
14936 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14937 SDValue TempNode(N, 0);
14938 std::optional<bool> IsSigned;
14939 SmallVector<DotSrc, 4> Src0s;
14940 SmallVector<DotSrc, 4> Src1s;
14941 SmallVector<SDValue, 4> Src2s;
14942
14943 // Match the v_dot4 tree, while collecting src nodes.
14944 int ChainLength = 0;
14945 for (int I = 0; I < 4; I++) {
14946 auto MulIdx = isMul(Op: LHS) ? 0 : isMul(Op: RHS) ? 1 : -1;
14947 if (MulIdx == -1)
14948 break;
14949 auto Src0 = handleMulOperand(MulOperand: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 0));
14950 if (!Src0)
14951 break;
14952 auto Src1 = handleMulOperand(MulOperand: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 1));
14953 if (!Src1)
14954 break;
14955
14956 auto IterIsSigned = checkDot4MulSignedness(
14957 N: TempNode->getOperand(Num: MulIdx), Src0&: *Src0, Src1&: *Src1,
14958 S0Op: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 0),
14959 S1Op: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 1), DAG);
14960 if (!IterIsSigned)
14961 break;
14962 if (!IsSigned)
14963 IsSigned = *IterIsSigned;
14964 if (*IterIsSigned != *IsSigned)
14965 break;
14966 placeSources(Src0&: *Src0, Src1&: *Src1, Src0s, Src1s, Step: I);
14967 auto AddIdx = 1 - MulIdx;
      // Allow the special case where add (add (mul24, 0), mul24) was folded
      // into add (mul24, mul24).
14970 if (I == 2 && isMul(Op: TempNode->getOperand(Num: AddIdx))) {
14971 Src2s.push_back(Elt: TempNode->getOperand(Num: AddIdx));
14972 auto Src0 =
14973 handleMulOperand(MulOperand: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 0));
14974 if (!Src0)
14975 break;
14976 auto Src1 =
14977 handleMulOperand(MulOperand: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 1));
14978 if (!Src1)
14979 break;
14980 auto IterIsSigned = checkDot4MulSignedness(
14981 N: TempNode->getOperand(Num: AddIdx), Src0&: *Src0, Src1&: *Src1,
14982 S0Op: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 0),
14983 S1Op: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 1), DAG);
14984 if (!IterIsSigned)
14985 break;
14986 assert(IsSigned);
14987 if (*IterIsSigned != *IsSigned)
14988 break;
14989 placeSources(Src0&: *Src0, Src1&: *Src1, Src0s, Src1s, Step: I + 1);
14990 Src2s.push_back(Elt: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
14991 ChainLength = I + 2;
14992 break;
14993 }
14994
14995 TempNode = TempNode->getOperand(Num: AddIdx);
14996 Src2s.push_back(Elt: TempNode);
14997 ChainLength = I + 1;
14998 if (TempNode->getNumOperands() < 2)
14999 break;
15000 LHS = TempNode->getOperand(Num: 0);
15001 RHS = TempNode->getOperand(Num: 1);
15002 }
15003
15004 if (ChainLength < 2)
15005 return SDValue();
15006
    // Masks were constructed with the assumption that we would find a chain of
    // length 4. If not, we need to zero out the unused high bytes (via a perm
    // select of 0x0c) so they do not affect the dot calculation.
15010 if (ChainLength < 4) {
15011 fixMasks(Srcs&: Src0s, ChainLength);
15012 fixMasks(Srcs&: Src1s, ChainLength);
15013 }
15014
15015 SDValue Src0, Src1;
15016
15017 // If we are just using a single source for both, and have permuted the
15018 // bytes consistently, we can just use the sources without permuting
15019 // (commutation).
15020 bool UseOriginalSrc = false;
15021 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
15022 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
15023 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
15024 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
15025 SmallVector<unsigned, 4> SrcBytes;
15026 auto Src0Mask = Src0s.begin()->PermMask;
15027 SrcBytes.push_back(Elt: Src0Mask & 0xFF000000);
15028 bool UniqueEntries = true;
15029 for (auto I = 1; I < 4; I++) {
15030 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
15031
15032 if (is_contained(Range&: SrcBytes, Element: NextByte)) {
15033 UniqueEntries = false;
15034 break;
15035 }
15036 SrcBytes.push_back(Elt: NextByte);
15037 }
15038
15039 if (UniqueEntries) {
15040 UseOriginalSrc = true;
15041
15042 auto *FirstElt = Src0s.begin();
15043 auto FirstEltOp =
15044 getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
15045
15046 auto *SecondElt = Src1s.begin();
15047 auto SecondEltOp = getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp,
15048 DWordOffset: SecondElt->DWordOffset);
15049
15050 Src0 = DAG.getBitcastedAnyExtOrTrunc(Op: FirstEltOp, DL: SL,
15051 VT: MVT::getIntegerVT(BitWidth: 32));
15052 Src1 = DAG.getBitcastedAnyExtOrTrunc(Op: SecondEltOp, DL: SL,
15053 VT: MVT::getIntegerVT(BitWidth: 32));
15054 }
15055 }
15056
15057 if (!UseOriginalSrc) {
15058 Src0 = resolveSources(DAG, SL, Srcs&: Src0s, IsSigned: false, IsAny: true);
15059 Src1 = resolveSources(DAG, SL, Srcs&: Src1s, IsSigned: false, IsAny: true);
15060 }
15061
15062 assert(IsSigned);
15063 SDValue Src2 =
15064 DAG.getExtOrTrunc(IsSigned: *IsSigned, Op: Src2s[ChainLength - 1], DL: SL, VT: MVT::i32);
15065
15066 SDValue IID = DAG.getTargetConstant(Val: *IsSigned ? Intrinsic::amdgcn_sdot4
15067 : Intrinsic::amdgcn_udot4,
15068 DL: SL, VT: MVT::i64);
15069
15070 assert(!VT.isVector());
15071 auto Dot = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32, N1: IID, N2: Src0,
15072 N3: Src1, N4: Src2, N5: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i1));
15073
15074 return DAG.getExtOrTrunc(IsSigned: *IsSigned, Op: Dot, DL: SL, VT);
15075 }
15076
15077 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
15078 return SDValue();
15079
15080 // add x, zext (setcc) => uaddo_carry x, 0, setcc
15081 // add x, sext (setcc) => usubo_carry x, 0, setcc
15082 unsigned Opc = LHS.getOpcode();
15083 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
15084 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
15085 std::swap(a&: RHS, b&: LHS);
15086
15087 Opc = RHS.getOpcode();
15088 switch (Opc) {
15089 default:
15090 break;
15091 case ISD::ZERO_EXTEND:
15092 case ISD::SIGN_EXTEND:
15093 case ISD::ANY_EXTEND: {
15094 auto Cond = RHS.getOperand(i: 0);
15095 // If this won't be a real VOPC output, we would still need to insert an
15096 // extra instruction anyway.
15097 if (!isBoolSGPR(V: Cond))
15098 break;
15099 SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1);
15100 SDValue Args[] = {LHS, DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), Cond};
15101 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
15102 return DAG.getNode(Opcode: Opc, DL: SL, VTList, Ops: Args);
15103 }
15104 case ISD::UADDO_CARRY: {
15105 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
15106 if (!isNullConstant(V: RHS.getOperand(i: 1)))
15107 break;
15108 SDValue Args[] = {LHS, RHS.getOperand(i: 0), RHS.getOperand(i: 2)};
15109 return DAG.getNode(Opcode: ISD::UADDO_CARRY, DL: SDLoc(N), VTList: RHS->getVTList(), Ops: Args);
15110 }
15111 }
15112 return SDValue();
15113}
15114
15115SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
15116 DAGCombinerInfo &DCI) const {
15117 SelectionDAG &DAG = DCI.DAG;
15118 SDLoc DL(N);
15119 SDValue N0 = N->getOperand(Num: 0);
15120 SDValue N1 = N->getOperand(Num: 1);
15121
15122 if (N1.getOpcode() == ISD::ADD) {
15123 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
15124 // y is not, and (add y, z) is used only once.
15125 // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
15126 // z is not, and (add y, z) is used only once.
15127 // The goal is to move constant offsets to the outermost ptradd, to create
15128 // more opportunities to fold offsets into memory instructions.
15129 // Together with the generic combines in DAGCombiner.cpp, this also
15130 // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
15131 //
15132 // This transform is here instead of in the general DAGCombiner as it can
15133 // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
15134 // AArch64's CPA.
15135 SDValue X = N0;
15136 SDValue Y = N1.getOperand(i: 0);
15137 SDValue Z = N1.getOperand(i: 1);
15138 if (N1.hasOneUse()) {
15139 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(N: Y);
15140 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(N: Z);
15141 if (ZIsConstant != YIsConstant) {
15142 // If both additions in the original were NUW, the new ones are as well.
15143 SDNodeFlags Flags =
15144 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
15145 if (YIsConstant)
15146 std::swap(a&: Y, b&: Z);
15147
15148 SDValue Inner = DAG.getMemBasePlusOffset(Base: X, Offset: Y, DL, Flags);
15149 DCI.AddToWorklist(N: Inner.getNode());
15150 return DAG.getMemBasePlusOffset(Base: Inner, Offset: Z, DL, Flags);
15151 }
15152 }
15153 }
15154
15155 return SDValue();
15156}
15157
15158SDValue SITargetLowering::performSubCombine(SDNode *N,
15159 DAGCombinerInfo &DCI) const {
15160 SelectionDAG &DAG = DCI.DAG;
15161 EVT VT = N->getValueType(ResNo: 0);
15162
15163 if (VT == MVT::i64) {
15164 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15165 return Folded;
15166 }
15167
15168 if (VT != MVT::i32)
15169 return SDValue();
15170
15171 SDLoc SL(N);
15172 SDValue LHS = N->getOperand(Num: 0);
15173 SDValue RHS = N->getOperand(Num: 1);
15174
15175 // sub x, zext (setcc) => usubo_carry x, 0, setcc
15176 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
15177 unsigned Opc = RHS.getOpcode();
15178 switch (Opc) {
15179 default:
15180 break;
15181 case ISD::ZERO_EXTEND:
15182 case ISD::SIGN_EXTEND:
15183 case ISD::ANY_EXTEND: {
15184 auto Cond = RHS.getOperand(i: 0);
15185 // If this won't be a real VOPC output, we would still need to insert an
15186 // extra instruction anyway.
15187 if (!isBoolSGPR(V: Cond))
15188 break;
15189 SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1);
15190 SDValue Args[] = {LHS, DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), Cond};
15191 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
15192 return DAG.getNode(Opcode: Opc, DL: SL, VTList, Ops: Args);
15193 }
15194 }
15195
15196 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
15197 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
15198 if (!isNullConstant(V: LHS.getOperand(i: 1)))
15199 return SDValue();
15200 SDValue Args[] = {LHS.getOperand(i: 0), RHS, LHS.getOperand(i: 2)};
15201 return DAG.getNode(Opcode: ISD::USUBO_CARRY, DL: SDLoc(N), VTList: LHS->getVTList(), Ops: Args);
15202 }
15203 return SDValue();
15204}
15205
15206SDValue
15207SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
15208 DAGCombinerInfo &DCI) const {
15209
15210 if (N->getValueType(ResNo: 0) != MVT::i32)
15211 return SDValue();
15212
15213 if (!isNullConstant(V: N->getOperand(Num: 1)))
15214 return SDValue();
15215
15216 SelectionDAG &DAG = DCI.DAG;
15217 SDValue LHS = N->getOperand(Num: 0);
15218
15219 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
15220 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
15221 unsigned LHSOpc = LHS.getOpcode();
15222 unsigned Opc = N->getOpcode();
15223 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
15224 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
15225 SDValue Args[] = {LHS.getOperand(i: 0), LHS.getOperand(i: 1), N->getOperand(Num: 2)};
15226 return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VTList: N->getVTList(), Ops: Args);
15227 }
15228 return SDValue();
15229}
15230
15231SDValue SITargetLowering::performFAddCombine(SDNode *N,
15232 DAGCombinerInfo &DCI) const {
15233 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
15234 return SDValue();
15235
15236 SelectionDAG &DAG = DCI.DAG;
15237 EVT VT = N->getValueType(ResNo: 0);
15238
15239 SDLoc SL(N);
15240 SDValue LHS = N->getOperand(Num: 0);
15241 SDValue RHS = N->getOperand(Num: 1);
15242
15243 // These should really be instruction patterns, but writing patterns with
15244 // source modifiers is a pain.
15245
15246 // fadd (fadd (a, a), b) -> mad 2.0, a, b
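// Since (fadd a, a) is 2.0 * a, the whole expression is a multiply-add with
// a constant 2.0 operand; getFusedOpcode decides which fused opcode (if any)
// may be used here.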
15247 if (LHS.getOpcode() == ISD::FADD) {
15248 SDValue A = LHS.getOperand(i: 0);
15249 if (A == LHS.getOperand(i: 1)) {
15250 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode());
15251 if (FusedOp != 0) {
15252 const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
15253 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: RHS);
15254 }
15255 }
15256 }
15257
15258 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
15259 if (RHS.getOpcode() == ISD::FADD) {
15260 SDValue A = RHS.getOperand(i: 0);
15261 if (A == RHS.getOperand(i: 1)) {
15262 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode());
15263 if (FusedOp != 0) {
15264 const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
15265 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: LHS);
15266 }
15267 }
15268 }
15269
15270 return SDValue();
15271}
15272
15273SDValue SITargetLowering::performFSubCombine(SDNode *N,
15274 DAGCombinerInfo &DCI) const {
15275 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
15276 return SDValue();
15277
15278 SelectionDAG &DAG = DCI.DAG;
15279 SDLoc SL(N);
15280 EVT VT = N->getValueType(ResNo: 0);
15281 assert(!VT.isVector());
15282
15283 // Try to get the fneg to fold into the source modifier. This undoes generic
15284 // DAG combines and folds them into the mad.
15285 //
15286 // Only do this if we are not trying to support denormals. v_mad_f32 does
15287 // not support denormals ever.
15288 SDValue LHS = N->getOperand(Num: 0);
15289 SDValue RHS = N->getOperand(Num: 1);
15290 if (LHS.getOpcode() == ISD::FADD) {
15291 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
15292 SDValue A = LHS.getOperand(i: 0);
15293 if (A == LHS.getOperand(i: 1)) {
15294 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode());
15295 if (FusedOp != 0) {
15296 const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
15297 SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
15298
15299 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: NegRHS);
15300 }
15301 }
15302 }
15303
15304 if (RHS.getOpcode() == ISD::FADD) {
15305 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
15306
15307 SDValue A = RHS.getOperand(i: 0);
15308 if (A == RHS.getOperand(i: 1)) {
15309 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode());
15310 if (FusedOp != 0) {
15311 const SDValue NegTwo = DAG.getConstantFP(Val: -2.0, DL: SL, VT);
15312 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: NegTwo, N3: LHS);
15313 }
15314 }
15315 }
15316
15317 return SDValue();
15318}
15319
15320SDValue SITargetLowering::performFDivCombine(SDNode *N,
15321 DAGCombinerInfo &DCI) const {
15322 SelectionDAG &DAG = DCI.DAG;
15323 SDLoc SL(N);
15324 EVT VT = N->getValueType(ResNo: 0);
15325 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
15326 return SDValue();
15327
15328 SDValue LHS = N->getOperand(Num: 0);
15329 SDValue RHS = N->getOperand(Num: 1);
15330
15331 SDNodeFlags Flags = N->getFlags();
15332 SDNodeFlags RHSFlags = RHS->getFlags();
15333 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
15334 !RHS->hasOneUse())
15335 return SDValue();
15336
15337 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) {
15338 bool IsNegative = false;
15339 if (CLHS->isExactlyValue(V: 1.0) ||
15340 (IsNegative = CLHS->isExactlyValue(V: -1.0))) {
15341 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
15342 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
15343 if (RHS.getOpcode() == ISD::FSQRT) {
15344 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
15345 SDValue Rsq =
15346 DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SL, VT, Operand: RHS.getOperand(i: 0), Flags);
15347 return IsNegative ? DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Rsq, Flags) : Rsq;
15348 }
15349 }
15350 }
15351
15352 return SDValue();
15353}
15354
15355SDValue SITargetLowering::performFMulCombine(SDNode *N,
15356 DAGCombinerInfo &DCI) const {
15357 SelectionDAG &DAG = DCI.DAG;
15358 EVT VT = N->getValueType(ResNo: 0);
15359 EVT ScalarVT = VT.getScalarType();
15360 EVT IntVT = VT.changeElementType(EltVT: MVT::i32);
15361
15362 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
15363 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
15364 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
15365 return SDValue();
15366 }
15367
15368 SDValue LHS = N->getOperand(Num: 0);
15369 SDValue RHS = N->getOperand(Num: 1);
15370
15371 // It is cheaper to materialize i32 inline constants than to materialize
15372 // f16 or f64 (or even non-inline f32) values; this can be done via ldexp,
15373 // as shown below:
15374 //
15375 // Given: A = 2^a and B = 2^b, where a and b are integers.
15376 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
15377 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
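// For example, fmul x, (select y, 8.0, 0.5) becomes
// ldexp(x, (select i32 y, 3, -1)), since 8.0 = 2^3 and 0.5 = 2^-1.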
15378 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
15379 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
15380 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(N: RHS.getOperand(i: 1));
15381 if (!TrueNode)
15382 return SDValue();
15383 const ConstantFPSDNode *FalseNode =
15384 isConstOrConstSplatFP(N: RHS.getOperand(i: 2));
15385 if (!FalseNode)
15386 return SDValue();
15387
15388 if (TrueNode->isNegative() != FalseNode->isNegative())
15389 return SDValue();
15390
15391 // For f32, only non-inline constants should be transformed.
15392 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15393 if (ScalarVT == MVT::f32 &&
15394 TII->isInlineConstant(Imm: TrueNode->getValueAPF()) &&
15395 TII->isInlineConstant(Imm: FalseNode->getValueAPF()))
15396 return SDValue();
15397
15398 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
15399 if (TrueNodeExpVal == INT_MIN)
15400 return SDValue();
15401 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
15402 if (FalseNodeExpVal == INT_MIN)
15403 return SDValue();
15404
15405 SDLoc SL(N);
15406 SDValue SelectNode =
15407 DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: IntVT, N1: RHS.getOperand(i: 0),
15408 N2: DAG.getSignedConstant(Val: TrueNodeExpVal, DL: SL, VT: IntVT),
15409 N3: DAG.getSignedConstant(Val: FalseNodeExpVal, DL: SL, VT: IntVT));
15410
15411 LHS = TrueNode->isNegative()
15412 ? DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS, Flags: LHS->getFlags())
15413 : LHS;
15414
15415 return DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT, N1: LHS, N2: SelectNode, Flags: N->getFlags());
15416 }
15417
15418 return SDValue();
15419}
15420
15421SDValue SITargetLowering::performFMACombine(SDNode *N,
15422 DAGCombinerInfo &DCI) const {
15423 SelectionDAG &DAG = DCI.DAG;
15424 EVT VT = N->getValueType(ResNo: 0);
15425 SDLoc SL(N);
15426
15427 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
15428 return SDValue();
15429
15430 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
15431 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
15432 SDValue Op1 = N->getOperand(Num: 0);
15433 SDValue Op2 = N->getOperand(Num: 1);
15434 SDValue FMA = N->getOperand(Num: 2);
15435
15436 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
15437 Op2.getOpcode() != ISD::FP_EXTEND)
15438 return SDValue();
15439
15440 // fdot2_f32_f16 always flushes fp32 denormal operands and the output to
15441 // zero, regardless of the denorm mode setting. Therefore,
15442 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
15443 const TargetOptions &Options = DAG.getTarget().Options;
15444 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
15445 (N->getFlags().hasAllowContract() &&
15446 FMA->getFlags().hasAllowContract())) {
15447 Op1 = Op1.getOperand(i: 0);
15448 Op2 = Op2.getOperand(i: 0);
15449 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15450 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15451 return SDValue();
15452
15453 SDValue Vec1 = Op1.getOperand(i: 0);
15454 SDValue Idx1 = Op1.getOperand(i: 1);
15455 SDValue Vec2 = Op2.getOperand(i: 0);
15456
15457 SDValue FMAOp1 = FMA.getOperand(i: 0);
15458 SDValue FMAOp2 = FMA.getOperand(i: 1);
15459 SDValue FMAAcc = FMA.getOperand(i: 2);
15460
15461 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
15462 FMAOp2.getOpcode() != ISD::FP_EXTEND)
15463 return SDValue();
15464
15465 FMAOp1 = FMAOp1.getOperand(i: 0);
15466 FMAOp2 = FMAOp2.getOperand(i: 0);
15467 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15468 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15469 return SDValue();
15470
15471 SDValue Vec3 = FMAOp1.getOperand(i: 0);
15472 SDValue Vec4 = FMAOp2.getOperand(i: 0);
15473 SDValue Idx2 = FMAOp1.getOperand(i: 1);
15474
15475 if (Idx1 != Op2.getOperand(i: 1) || Idx2 != FMAOp2.getOperand(i: 1) ||
15476 // Idx1 and Idx2 cannot be the same.
15477 Idx1 == Idx2)
15478 return SDValue();
15479
15480 if (Vec1 == Vec2 || Vec3 == Vec4)
15481 return SDValue();
15482
15483 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
15484 return SDValue();
15485
15486 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
15487 return DAG.getNode(Opcode: AMDGPUISD::FDOT2, DL: SL, VT: MVT::f32, N1: Vec1, N2: Vec2, N3: FMAAcc,
15488 N4: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i1));
15489 }
15490 }
15491 return SDValue();
15492}
15493
15494SDValue SITargetLowering::performSetCCCombine(SDNode *N,
15495 DAGCombinerInfo &DCI) const {
15496 SelectionDAG &DAG = DCI.DAG;
15497 SDLoc SL(N);
15498
15499 SDValue LHS = N->getOperand(Num: 0);
15500 SDValue RHS = N->getOperand(Num: 1);
15501 EVT VT = LHS.getValueType();
15502 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
15503
15504 auto *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
15505 if (!CRHS) {
15506 CRHS = dyn_cast<ConstantSDNode>(Val&: LHS);
15507 if (CRHS) {
15508 std::swap(a&: LHS, b&: RHS);
15509 CC = getSetCCSwappedOperands(Operation: CC);
15510 }
15511 }
15512
15513 if (CRHS) {
15514 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
15515 isBoolSGPR(V: LHS.getOperand(i: 0))) {
15516 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
15517 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
15518 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
15519 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
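// Since (sext from i1 cc) is 0 or -1, comparing it against -1 or 0 reduces
// to cc itself or to its negation, depending on the condition code.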
15520 if ((CRHS->isAllOnes() &&
15521 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
15522 (CRHS->isZero() &&
15523 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
15524 return DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: 0),
15525 N2: DAG.getAllOnesConstant(DL: SL, VT: MVT::i1));
15526 if ((CRHS->isAllOnes() &&
15527 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
15528 (CRHS->isZero() &&
15529 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
15530 return LHS.getOperand(i: 0);
15531 }
15532
15533 const APInt &CRHSVal = CRHS->getAPIntValue();
15534 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
15535 LHS.getOpcode() == ISD::SELECT &&
15536 isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) &&
15537 isa<ConstantSDNode>(Val: LHS.getOperand(i: 2)) &&
15538 LHS.getConstantOperandVal(i: 1) != LHS.getConstantOperandVal(i: 2) &&
15539 isBoolSGPR(V: LHS.getOperand(i: 0))) {
15540 // Given CT != FT:
15541 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
15542 // setcc (select cc, CT, CF), CF, ne => cc
15543 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
15544 // setcc (select cc, CT, CF), CT, eq => cc
15545 const APInt &CT = LHS.getConstantOperandAPInt(i: 1);
15546 const APInt &CF = LHS.getConstantOperandAPInt(i: 2);
15547
15548 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
15549 (CT == CRHSVal && CC == ISD::SETNE))
15550 return DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: 0),
15551 N2: DAG.getAllOnesConstant(DL: SL, VT: MVT::i1));
15552 if ((CF == CRHSVal && CC == ISD::SETNE) ||
15553 (CT == CRHSVal && CC == ISD::SETEQ))
15554 return LHS.getOperand(i: 0);
15555 }
15556 }
15557
15558 if (VT != MVT::f32 && VT != MVT::f64 &&
15559 (!Subtarget->has16BitInsts() || VT != MVT::f16))
15560 return SDValue();
15561
15562 // Match isinf/isfinite pattern
15563 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
15564 // (fcmp one (fabs x), inf) -> (fp_class x,
15565 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
15566 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
15567 LHS.getOpcode() == ISD::FABS) {
15568 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
15569 if (!CRHS)
15570 return SDValue();
15571
15572 const APFloat &APF = CRHS->getValueAPF();
15573 if (APF.isInfinity() && !APF.isNegative()) {
15574 const unsigned IsInfMask =
15575 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
15576 const unsigned IsFiniteMask =
15577 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
15578 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
15579 SIInstrFlags::P_SUBNORMAL;
15580 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
15581 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: 0),
15582 N2: DAG.getConstant(Val: Mask, DL: SL, VT: MVT::i32));
15583 }
15584 }
15585
15586 return SDValue();
15587}
15588
15589SDValue
15590SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
15591 DAGCombinerInfo &DCI) const {
15592 SelectionDAG &DAG = DCI.DAG;
15593 SDLoc SL(N);
15594 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
15595
15596 SDValue Src = N->getOperand(Num: 0);
15597 SDValue Shift = N->getOperand(Num: 0);
15598
15599 // TODO: Extend type shouldn't matter (assuming legal types).
15600 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
15601 Shift = Shift.getOperand(i: 0);
15602
15603 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
15604 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
15605 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
15606 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
15607 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
15608 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
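// In general, the selected byte becomes Offset - (shift amount / 8) for shl
// and Offset + (shift amount / 8) for srl; e.g. for cvt_f32_ubyte0 (srl x, 8),
// ShiftOffset = 8 * 0 + 8 = 8, so we emit cvt_f32_ubyte1 of the unshifted
// value.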
15609 if (auto *C = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1))) {
15610 SDValue Shifted = DAG.getZExtOrTrunc(
15611 Op: Shift.getOperand(i: 0), DL: SDLoc(Shift.getOperand(i: 0)), VT: MVT::i32);
15612
15613 unsigned ShiftOffset = 8 * Offset;
15614 if (Shift.getOpcode() == ISD::SHL)
15615 ShiftOffset -= C->getZExtValue();
15616 else
15617 ShiftOffset += C->getZExtValue();
15618
15619 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
15620 return DAG.getNode(Opcode: AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, DL: SL,
15621 VT: MVT::f32, Operand: Shifted);
15622 }
15623 }
15624 }
15625
15626 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15627 APInt DemandedBits = APInt::getBitsSet(numBits: 32, loBit: 8 * Offset, hiBit: 8 * Offset + 8);
15628 if (TLI.SimplifyDemandedBits(Op: Src, DemandedBits, DCI)) {
15629 // We simplified Src. If this node is not dead, visit it again so it is
15630 // folded properly.
15631 if (N->getOpcode() != ISD::DELETED_NODE)
15632 DCI.AddToWorklist(N);
15633 return SDValue(N, 0);
15634 }
15635
15636 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
15637 if (SDValue DemandedSrc =
15638 TLI.SimplifyMultipleUseDemandedBits(Op: Src, DemandedBits, DAG))
15639 return DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: MVT::f32, Operand: DemandedSrc);
15640
15641 return SDValue();
15642}
15643
15644SDValue SITargetLowering::performClampCombine(SDNode *N,
15645 DAGCombinerInfo &DCI) const {
15646 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: 0));
15647 if (!CSrc)
15648 return SDValue();
15649
15650 const MachineFunction &MF = DCI.DAG.getMachineFunction();
15651 const APFloat &F = CSrc->getValueAPF();
15652 APFloat Zero = APFloat::getZero(Sem: F.getSemantics());
15653 if (F < Zero ||
15654 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
15655 return DCI.DAG.getConstantFP(Val: Zero, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
15656 }
15657
15658 APFloat One(F.getSemantics(), "1.0");
15659 if (F > One)
15660 return DCI.DAG.getConstantFP(Val: One, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
15661
15662 return SDValue(CSrc, 0);
15663}
15664
15665SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
15666 DAGCombinerInfo &DCI) const {
15667 switch (N->getOpcode()) {
15668 case ISD::ADD:
15669 case ISD::SUB:
15670 case ISD::SHL:
15671 case ISD::SRL:
15672 case ISD::SRA:
15673 case ISD::AND:
15674 case ISD::OR:
15675 case ISD::XOR:
15676 case ISD::MUL:
15677 case ISD::SETCC:
15678 case ISD::SELECT:
15679 case ISD::SMIN:
15680 case ISD::SMAX:
15681 case ISD::UMIN:
15682 case ISD::UMAX:
15683 if (auto Res = promoteUniformOpToI32(Op: SDValue(N, 0), DCI))
15684 return Res;
15685 break;
15686 default:
15687 break;
15688 }
15689
15690 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
15691 return SDValue();
15692
15693 switch (N->getOpcode()) {
15694 case ISD::ADD:
15695 return performAddCombine(N, DCI);
15696 case ISD::PTRADD:
15697 return performPtrAddCombine(N, DCI);
15698 case ISD::SUB:
15699 return performSubCombine(N, DCI);
15700 case ISD::UADDO_CARRY:
15701 case ISD::USUBO_CARRY:
15702 return performAddCarrySubCarryCombine(N, DCI);
15703 case ISD::FADD:
15704 return performFAddCombine(N, DCI);
15705 case ISD::FSUB:
15706 return performFSubCombine(N, DCI);
15707 case ISD::FDIV:
15708 return performFDivCombine(N, DCI);
15709 case ISD::FMUL:
15710 return performFMulCombine(N, DCI);
15711 case ISD::SETCC:
15712 return performSetCCCombine(N, DCI);
15713 case ISD::FMAXNUM:
15714 case ISD::FMINNUM:
15715 case ISD::FMAXNUM_IEEE:
15716 case ISD::FMINNUM_IEEE:
15717 case ISD::FMAXIMUM:
15718 case ISD::FMINIMUM:
15719 case ISD::FMAXIMUMNUM:
15720 case ISD::FMINIMUMNUM:
15721 case ISD::SMAX:
15722 case ISD::SMIN:
15723 case ISD::UMAX:
15724 case ISD::UMIN:
15725 case AMDGPUISD::FMIN_LEGACY:
15726 case AMDGPUISD::FMAX_LEGACY:
15727 return performMinMaxCombine(N, DCI);
15728 case ISD::FMA:
15729 return performFMACombine(N, DCI);
15730 case ISD::AND:
15731 return performAndCombine(N, DCI);
15732 case ISD::OR:
15733 return performOrCombine(N, DCI);
15734 case ISD::FSHR: {
15735 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15736 if (N->getValueType(ResNo: 0) == MVT::i32 && N->isDivergent() &&
15737 TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {
15738 return matchPERM(N, DCI);
15739 }
15740 break;
15741 }
15742 case ISD::XOR:
15743 return performXorCombine(N, DCI);
15744 case ISD::ZERO_EXTEND:
15745 return performZeroExtendCombine(N, DCI);
15746 case ISD::SIGN_EXTEND_INREG:
15747 return performSignExtendInRegCombine(N, DCI);
15748 case AMDGPUISD::FP_CLASS:
15749 return performClassCombine(N, DCI);
15750 case ISD::FCANONICALIZE:
15751 return performFCanonicalizeCombine(N, DCI);
15752 case AMDGPUISD::RCP:
15753 return performRcpCombine(N, DCI);
15754 case ISD::FLDEXP:
15755 case AMDGPUISD::FRACT:
15756 case AMDGPUISD::RSQ:
15757 case AMDGPUISD::RCP_LEGACY:
15758 case AMDGPUISD::RCP_IFLAG:
15759 case AMDGPUISD::RSQ_CLAMP: {
15760 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
15761 SDValue Src = N->getOperand(Num: 0);
15762 if (Src.isUndef())
15763 return Src;
15764 break;
15765 }
15766 case ISD::SINT_TO_FP:
15767 case ISD::UINT_TO_FP:
15768 return performUCharToFloatCombine(N, DCI);
15769 case ISD::FCOPYSIGN:
15770 return performFCopySignCombine(N, DCI);
15771 case AMDGPUISD::CVT_F32_UBYTE0:
15772 case AMDGPUISD::CVT_F32_UBYTE1:
15773 case AMDGPUISD::CVT_F32_UBYTE2:
15774 case AMDGPUISD::CVT_F32_UBYTE3:
15775 return performCvtF32UByteNCombine(N, DCI);
15776 case AMDGPUISD::FMED3:
15777 return performFMed3Combine(N, DCI);
15778 case AMDGPUISD::CVT_PKRTZ_F16_F32:
15779 return performCvtPkRTZCombine(N, DCI);
15780 case AMDGPUISD::CLAMP:
15781 return performClampCombine(N, DCI);
15782 case ISD::SCALAR_TO_VECTOR: {
15783 SelectionDAG &DAG = DCI.DAG;
15784 EVT VT = N->getValueType(ResNo: 0);
15785
15786 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
15787 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
15788 SDLoc SL(N);
15789 SDValue Src = N->getOperand(Num: 0);
15790 EVT EltVT = Src.getValueType();
15791 if (EltVT != MVT::i16)
15792 Src = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Src);
15793
15794 SDValue Ext = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Src);
15795 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Ext);
15796 }
15797
15798 break;
15799 }
15800 case ISD::EXTRACT_VECTOR_ELT:
15801 return performExtractVectorEltCombine(N, DCI);
15802 case ISD::INSERT_VECTOR_ELT:
15803 return performInsertVectorEltCombine(N, DCI);
15804 case ISD::FP_ROUND:
15805 return performFPRoundCombine(N, DCI);
15806 case ISD::LOAD: {
15807 if (SDValue Widened = widenLoad(Ld: cast<LoadSDNode>(Val: N), DCI))
15808 return Widened;
15809 [[fallthrough]];
15810 }
15811 default: {
15812 if (!DCI.isBeforeLegalize()) {
15813 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(Val: N))
15814 return performMemSDNodeCombine(N: MemNode, DCI);
15815 }
15816
15817 break;
15818 }
15819 }
15820
15821 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
15822}
15823
15824/// Helper function for adjustWritemask
15825static unsigned SubIdx2Lane(unsigned Idx) {
15826 switch (Idx) {
15827 default:
15828 return ~0u;
15829 case AMDGPU::sub0:
15830 return 0;
15831 case AMDGPU::sub1:
15832 return 1;
15833 case AMDGPU::sub2:
15834 return 2;
15835 case AMDGPU::sub3:
15836 return 3;
15837 case AMDGPU::sub4:
15838 return 4; // Possible with TFE/LWE
15839 }
15840}
15841
15842/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
15843SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
15844 SelectionDAG &DAG) const {
15845 unsigned Opcode = Node->getMachineOpcode();
15846
15847 // Subtract 1 because the vdata output is not a MachineSDNode operand.
15848 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::d16) - 1;
15849 if (D16Idx >= 0 && Node->getConstantOperandVal(Num: D16Idx))
15850 return Node; // not implemented for D16
15851
15852 SDNode *Users[5] = {nullptr};
15853 unsigned Lane = 0;
15854 unsigned DmaskIdx =
15855 AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::dmask) - 1;
15856 unsigned OldDmask = Node->getConstantOperandVal(Num: DmaskIdx);
15857 unsigned NewDmask = 0;
15858 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::tfe) - 1;
15859 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::lwe) - 1;
15860 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(Num: TFEIdx)) ||
15861 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(Num: LWEIdx));
15862 unsigned TFCLane = 0;
15863 bool HasChain = Node->getNumValues() > 1;
15864
15865 if (OldDmask == 0) {
15866 // These are folded out, but in case it happens, don't assert.
15867 return Node;
15868 }
15869
15870 unsigned OldBitsSet = llvm::popcount(Value: OldDmask);
15871 // Work out which is the TFE/LWE lane if that is enabled.
15872 if (UsesTFC) {
15873 TFCLane = OldBitsSet;
15874 }
15875
15876 // Try to figure out the used register components
15877 for (SDUse &Use : Node->uses()) {
15878
15879 // Don't look at users of the chain.
15880 if (Use.getResNo() != 0)
15881 continue;
15882
15883 SDNode *User = Use.getUser();
15884
15885 // Abort if we can't understand the usage
15886 if (!User->isMachineOpcode() ||
15887 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
15888 return Node;
15889
15890 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
15891 // Note that subregs are packed, i.e. Lane==0 is the first bit set
15892 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
15893 // set, etc.
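// For example, with OldDmask = 0b1010 (components Y and W enabled), Lane 0
// (sub0) corresponds to component Y and Lane 1 (sub1) to component W.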
15894 Lane = SubIdx2Lane(Idx: User->getConstantOperandVal(Num: 1));
15895 if (Lane == ~0u)
15896 return Node;
15897
15898 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
15899 if (UsesTFC && Lane == TFCLane) {
15900 Users[Lane] = User;
15901 } else {
15902 // Set which texture component corresponds to the lane.
15903 unsigned Comp;
15904 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
15905 Comp = llvm::countr_zero(Val: Dmask);
15906 Dmask &= ~(1 << Comp);
15907 }
15908
15909 // Abort if we have more than one user per component.
15910 if (Users[Lane])
15911 return Node;
15912
15913 Users[Lane] = User;
15914 NewDmask |= 1 << Comp;
15915 }
15916 }
15917
15918 // Don't allow 0 dmask, as hardware assumes one channel enabled.
15919 bool NoChannels = !NewDmask;
15920 if (NoChannels) {
15921 if (!UsesTFC) {
15922 // No uses of the result and not using TFC. Then do nothing.
15923 return Node;
15924 }
15925 // If the original dmask has only one channel, there is nothing to do.
15926 if (OldBitsSet == 1)
15927 return Node;
15928 // Use an arbitrary dmask - required for the instruction to work
15929 NewDmask = 1;
15930 }
15931 // Abort if there's no change
15932 if (NewDmask == OldDmask)
15933 return Node;
15934
15935 unsigned BitsSet = llvm::popcount(Value: NewDmask);
15936
15937 // Check for TFE or LWE - increase the number of channels by one to account
15938 // for the extra return value
15939 // This will need adjustment for D16 if D16 is also handled in
15940 // adjustWritemask (this function), but at present D16 is excluded.
15941 unsigned NewChannels = BitsSet + UsesTFC;
15942
15943 int NewOpcode =
15944 AMDGPU::getMaskedMIMGOp(Opc: Node->getMachineOpcode(), NewChannels);
15945 assert(NewOpcode != -1 &&
15946 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
15947 "failed to find equivalent MIMG op");
15948
15949 // Adjust the writemask in the node
15950 SmallVector<SDValue, 12> Ops;
15951 llvm::append_range(C&: Ops, R: Node->ops().take_front(N: DmaskIdx));
15952 Ops.push_back(Elt: DAG.getTargetConstant(Val: NewDmask, DL: SDLoc(Node), VT: MVT::i32));
15953 llvm::append_range(C&: Ops, R: Node->ops().drop_front(N: DmaskIdx + 1));
15954
15955 MVT SVT = Node->getValueType(ResNo: 0).getVectorElementType().getSimpleVT();
15956
15957 MVT ResultVT = NewChannels == 1
15958 ? SVT
15959 : MVT::getVectorVT(VT: SVT, NumElements: NewChannels == 3 ? 4
15960 : NewChannels == 5 ? 8
15961 : NewChannels);
15962 SDVTList NewVTList =
15963 HasChain ? DAG.getVTList(VT1: ResultVT, VT2: MVT::Other) : DAG.getVTList(VT: ResultVT);
15964
15965 MachineSDNode *NewNode =
15966 DAG.getMachineNode(Opcode: NewOpcode, dl: SDLoc(Node), VTs: NewVTList, Ops);
15967
15968 if (HasChain) {
15969 // Update chain.
15970 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: Node->memoperands());
15971 DAG.ReplaceAllUsesOfValueWith(From: SDValue(Node, 1), To: SDValue(NewNode, 1));
15972 }
15973
15974 if (NewChannels == 1) {
15975 assert(Node->hasNUsesOfValue(1, 0));
15976 SDNode *Copy =
15977 DAG.getMachineNode(Opcode: TargetOpcode::COPY, dl: SDLoc(Node),
15978 VT: Users[Lane]->getValueType(ResNo: 0), Op1: SDValue(NewNode, 0));
15979 DAG.ReplaceAllUsesWith(From: Users[Lane], To: Copy);
15980 return nullptr;
15981 }
15982
15983 // Update the users of the node with the new indices
15984 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
15985 SDNode *User = Users[i];
15986 if (!User) {
15987 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
15988 // Users[0] is still nullptr because channel 0 doesn't really have a use.
15989 if (i || !NoChannels)
15990 continue;
15991 } else {
15992 SDValue Op = DAG.getTargetConstant(Val: Idx, DL: SDLoc(User), VT: MVT::i32);
15993 SDNode *NewUser = DAG.UpdateNodeOperands(N: User, Op1: SDValue(NewNode, 0), Op2: Op);
15994 if (NewUser != User) {
15995 DAG.ReplaceAllUsesWith(From: SDValue(User, 0), To: SDValue(NewUser, 0));
15996 DAG.RemoveDeadNode(N: User);
15997 }
15998 }
15999
16000 switch (Idx) {
16001 default:
16002 break;
16003 case AMDGPU::sub0:
16004 Idx = AMDGPU::sub1;
16005 break;
16006 case AMDGPU::sub1:
16007 Idx = AMDGPU::sub2;
16008 break;
16009 case AMDGPU::sub2:
16010 Idx = AMDGPU::sub3;
16011 break;
16012 case AMDGPU::sub3:
16013 Idx = AMDGPU::sub4;
16014 break;
16015 }
16016 }
16017
16018 DAG.RemoveDeadNode(N: Node);
16019 return nullptr;
16020}
16021
16022static bool isFrameIndexOp(SDValue Op) {
16023 if (Op.getOpcode() == ISD::AssertZext)
16024 Op = Op.getOperand(i: 0);
16025
16026 return isa<FrameIndexSDNode>(Val: Op);
16027}
16028
16029/// Legalize target independent instructions (e.g. INSERT_SUBREG)
16030/// with frame index operands.
16031 /// LLVM assumes that inputs to these instructions are registers.
16032SDNode *
16033SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
16034 SelectionDAG &DAG) const {
16035 if (Node->getOpcode() == ISD::CopyToReg) {
16036 RegisterSDNode *DestReg = cast<RegisterSDNode>(Val: Node->getOperand(Num: 1));
16037 SDValue SrcVal = Node->getOperand(Num: 2);
16038
16039 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
16040 // to try understanding copies to physical registers.
16041 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
16042 SDLoc SL(Node);
16043 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
16044 SDValue VReg = DAG.getRegister(
16045 Reg: MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_1RegClass), VT: MVT::i1);
16046
16047 SDNode *Glued = Node->getGluedNode();
16048 SDValue ToVReg = DAG.getCopyToReg(
16049 Chain: Node->getOperand(Num: 0), dl: SL, Reg: VReg, N: SrcVal,
16050 Glue: SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
16051 SDValue ToResultReg = DAG.getCopyToReg(Chain: ToVReg, dl: SL, Reg: SDValue(DestReg, 0),
16052 N: VReg, Glue: ToVReg.getValue(R: 1));
16053 DAG.ReplaceAllUsesWith(From: Node, To: ToResultReg.getNode());
16054 DAG.RemoveDeadNode(N: Node);
16055 return ToResultReg.getNode();
16056 }
16057 }
16058
16059 SmallVector<SDValue, 8> Ops;
16060 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
16061 if (!isFrameIndexOp(Op: Node->getOperand(Num: i))) {
16062 Ops.push_back(Elt: Node->getOperand(Num: i));
16063 continue;
16064 }
16065
16066 SDLoc DL(Node);
16067 Ops.push_back(Elt: SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL,
16068 VT: Node->getOperand(Num: i).getValueType(),
16069 Op1: Node->getOperand(Num: i)),
16070 0));
16071 }
16072
16073 return DAG.UpdateNodeOperands(N: Node, Ops);
16074}
16075
16076/// Fold the instructions after selecting them.
16077/// Returns null if users were already updated.
16078SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
16079 SelectionDAG &DAG) const {
16080 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16081 unsigned Opcode = Node->getMachineOpcode();
16082
16083 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
16084 !TII->isGather4(Opcode) &&
16085 AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::dmask)) {
16086 return adjustWritemask(Node, DAG);
16087 }
16088
16089 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
16090 legalizeTargetIndependentNode(Node, DAG);
16091 return Node;
16092 }
16093
16094 switch (Opcode) {
16095 case AMDGPU::V_DIV_SCALE_F32_e64:
16096 case AMDGPU::V_DIV_SCALE_F64_e64: {
16097 // Satisfy the operand register constraint when one of the inputs is
16098 // undefined. Ordinarily each undef value will have its own implicit_def of
16099 // a vreg, so force these to use a single register.
16100 SDValue Src0 = Node->getOperand(Num: 1);
16101 SDValue Src1 = Node->getOperand(Num: 3);
16102 SDValue Src2 = Node->getOperand(Num: 5);
16103
16104 if ((Src0.isMachineOpcode() &&
16105 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
16106 (Src0 == Src1 || Src0 == Src2))
16107 break;
16108
16109 MVT VT = Src0.getValueType().getSimpleVT();
16110 const TargetRegisterClass *RC =
16111 getRegClassFor(VT, isDivergent: Src0.getNode()->isDivergent());
16112
16113 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
16114 SDValue UndefReg = DAG.getRegister(Reg: MRI.createVirtualRegister(RegClass: RC), VT);
16115
16116 SDValue ImpDef = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: SDLoc(Node), Reg: UndefReg,
16117 N: Src0, Glue: SDValue());
16118
16119 // src0 must be the same register as src1 or src2, even if the value is
16120 // undefined, so make sure we don't violate this constraint.
16121 if (Src0.isMachineOpcode() &&
16122 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
16123 if (Src1.isMachineOpcode() &&
16124 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
16125 Src0 = Src1;
16126 else if (Src2.isMachineOpcode() &&
16127 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
16128 Src0 = Src2;
16129 else {
16130 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
16131 Src0 = UndefReg;
16132 Src1 = UndefReg;
16133 }
16134 } else
16135 break;
16136
16137 SmallVector<SDValue, 9> Ops(Node->ops());
16138 Ops[1] = Src0;
16139 Ops[3] = Src1;
16140 Ops[5] = Src2;
16141 Ops.push_back(Elt: ImpDef.getValue(R: 1));
16142 return DAG.getMachineNode(Opcode, dl: SDLoc(Node), VTs: Node->getVTList(), Ops);
16143 }
16144 default:
16145 break;
16146 }
16147
16148 return Node;
16149}
16150
16151// Any MIMG instructions that use tfe or lwe require an initialization of the
16152// result register that will be written in the case of a memory access failure.
16153 // The required code is also added to tie this initialization code to the
16154 // result of the image instruction.
16155void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
16156 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16157 const SIRegisterInfo &TRI = TII->getRegisterInfo();
16158 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
16159 MachineBasicBlock &MBB = *MI.getParent();
16160
16161 int DstIdx =
16162 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::vdata);
16163 unsigned InitIdx = 0;
16164
16165 if (TII->isImage(MI)) {
16166 MachineOperand *TFE = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe);
16167 MachineOperand *LWE = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::lwe);
16168 MachineOperand *D16 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::d16);
16169
16170 if (!TFE && !LWE) // intersect_ray
16171 return;
16172
16173 unsigned TFEVal = TFE ? TFE->getImm() : 0;
16174 unsigned LWEVal = LWE ? LWE->getImm() : 0;
16175 unsigned D16Val = D16 ? D16->getImm() : 0;
16176
16177 if (!TFEVal && !LWEVal)
16178 return;
16179
16180 // At least one of TFE or LWE is non-zero.
16181 // We have to insert a suitable initialization of the result value and
16182 // tie this to the dest of the image instruction.
16183
16184 // Calculate which dword we have to initialize to 0.
16185 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dmask);
16186
16187 // Check that the dmask operand is found.
16188 assert(MO_Dmask && "Expected dmask operand in instruction");
16189
16190 unsigned dmask = MO_Dmask->getImm();
16191 // Determine the number of active lanes taking into account the
16192 // Gather4 special case
16193 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(Value: dmask);
16194
16195 bool Packed = !Subtarget->hasUnpackedD16VMem();
16196
16197 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
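// For example, dmask = 0xf with packed D16 needs ceil(4 / 2) = 2 data dwords
// plus one more for the TFE/LWE status, so InitIdx = 3; without D16 it would
// be 4 + 1 = 5.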
16198
16199 // Abandon the attempt if the dst size isn't large enough. This is in fact
16200 // an error, but it is picked up elsewhere and reported
16201 // correctly.
16202 uint32_t DstSize =
16203 TRI.getRegSizeInBits(RC: *TII->getOpRegClass(MI, OpNo: DstIdx)) / 32;
16204 if (DstSize < InitIdx)
16205 return;
16206 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(Opc: MI.getOpcode())) {
16207 InitIdx = TRI.getRegSizeInBits(RC: *TII->getOpRegClass(MI, OpNo: DstIdx)) / 32;
16208 } else {
16209 return;
16210 }
16211
16212 const DebugLoc &DL = MI.getDebugLoc();
16213
16214 // Create a register for the initialization value.
16215 Register PrevDst = MRI.cloneVirtualRegister(VReg: MI.getOperand(i: DstIdx).getReg());
16216 unsigned NewDst = 0; // Final initialized value will be in here
16217
16218 // If PRTStrictNull feature is enabled (the default) then initialize
16219 // all the result registers to 0, otherwise just the error indication
16220 // register (VGPRn+1)
16221 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
16222 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
16223
16224 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: PrevDst);
16225 for (; SizeLeft; SizeLeft--, CurrIdx++) {
16226 NewDst = MRI.createVirtualRegister(RegClass: TII->getOpRegClass(MI, OpNo: DstIdx));
16227 // Initialize dword
16228 Register SubReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
16229 // clang-format off
16230 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: SubReg)
16231 .addImm(Val: 0);
16232 // clang-format on
16233 // Insert into the super-reg
16234 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: NewDst)
16235 .addReg(RegNo: PrevDst)
16236 .addReg(RegNo: SubReg)
16237 .addImm(Val: SIRegisterInfo::getSubRegFromChannel(Channel: CurrIdx));
16238
16239 PrevDst = NewDst;
16240 }
16241
16242 // Add as an implicit operand
16243 MI.addOperand(Op: MachineOperand::CreateReg(Reg: NewDst, isDef: false, isImp: true));
16244
16245 // Tie the just added implicit operand to the dst
16246 MI.tieOperands(DefIdx: DstIdx, UseIdx: MI.getNumOperands() - 1);
16247}
16248
16249/// Assign the register class depending on the number of
16250/// bits set in the writemask
16251void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
16252 SDNode *Node) const {
16253 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16254
16255 MachineFunction *MF = MI.getParent()->getParent();
16256 MachineRegisterInfo &MRI = MF->getRegInfo();
16257 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
16258
16259 if (TII->isVOP3(Opcode: MI.getOpcode())) {
16260 // Make sure constant bus requirements are respected.
16261 TII->legalizeOperandsVOP3(MRI, MI);
16262
16263 // Prefer VGPRs over AGPRs in mAI instructions where possible.
16264 // This saves a chain-copy of registers and better balances register
16265 // use between VGPRs and AGPRs, as AGPR tuples tend to be big.
16266 if (!MI.getDesc().operands().empty()) {
16267 unsigned Opc = MI.getOpcode();
16268 bool HasAGPRs = Info->mayNeedAGPRs();
16269 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16270 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src2);
16271 for (auto I :
16272 {AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src0),
16273 AMDGPU::getNamedOperandIdx(Opcode: Opc, Name: AMDGPU::OpName::src1), Src2Idx}) {
16274 if (I == -1)
16275 break;
16276 if ((I == Src2Idx) && (HasAGPRs))
16277 break;
16278 MachineOperand &Op = MI.getOperand(i: I);
16279 if (!Op.isReg() || !Op.getReg().isVirtual())
16280 continue;
16281 auto *RC = TRI->getRegClassForReg(MRI, Reg: Op.getReg());
16282 if (!TRI->hasAGPRs(RC))
16283 continue;
16284 auto *Src = MRI.getUniqueVRegDef(Reg: Op.getReg());
16285 if (!Src || !Src->isCopy() ||
16286 !TRI->isSGPRReg(MRI, Reg: Src->getOperand(i: 1).getReg()))
16287 continue;
16288 auto *NewRC = TRI->getEquivalentVGPRClass(SRC: RC);
16289 // All uses of agpr64 and agpr32 can also accept vgpr except for
16290 // v_accvgpr_read, but we do not produce agpr reads during selection,
16291 // so no use checks are needed.
16292 MRI.setRegClass(Reg: Op.getReg(), RC: NewRC);
16293 }
16294
16295 if (TII->isMAI(MI)) {
16296 // The ordinary src0, src1, src2 were legalized above.
16297 //
16298 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
16299 // as a separate instruction.
16300 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
16301 Name: AMDGPU::OpName::scale_src0);
16302 if (Src0Idx != -1) {
16303 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
16304 Name: AMDGPU::OpName::scale_src1);
16305 if (TII->usesConstantBus(MRI, MI, OpIdx: Src0Idx) &&
16306 TII->usesConstantBus(MRI, MI, OpIdx: Src1Idx))
16307 TII->legalizeOpWithMove(MI, OpIdx: Src1Idx);
16308 }
16309 }
16310
16311 if (!HasAGPRs)
16312 return;
16313
16314 // Resolve the rest of AV operands to AGPRs.
16315 if (auto *Src2 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src2)) {
16316 if (Src2->isReg() && Src2->getReg().isVirtual()) {
16317 auto *RC = TRI->getRegClassForReg(MRI, Reg: Src2->getReg());
16318 if (TRI->isVectorSuperClass(RC)) {
16319 auto *NewRC = TRI->getEquivalentAGPRClass(SRC: RC);
16320 MRI.setRegClass(Reg: Src2->getReg(), RC: NewRC);
16321 if (Src2->isTied())
16322 MRI.setRegClass(Reg: MI.getOperand(i: 0).getReg(), RC: NewRC);
16323 }
16324 }
16325 }
16326 }
16327
16328 return;
16329 }
16330
16331 if (TII->isImage(MI))
16332 TII->enforceOperandRCAlignment(MI, OpName: AMDGPU::OpName::vaddr);
16333}
16334
16335static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
16336 uint64_t Val) {
16337 SDValue K = DAG.getTargetConstant(Val, DL, VT: MVT::i32);
16338 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: K), 0);
16339}
16340
16341MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
16342 const SDLoc &DL,
16343 SDValue Ptr) const {
16344 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16345
16346 // Build the half of the subregister with the constants before building the
16347 // full 128-bit register. If we are building multiple resource descriptors,
16348 // this will allow CSEing of the 2-component register.
16349 const SDValue Ops0[] = {
16350 DAG.getTargetConstant(Val: AMDGPU::SGPR_64RegClassID, DL, VT: MVT::i32),
16351 buildSMovImm32(DAG, DL, Val: 0),
16352 DAG.getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
16353 buildSMovImm32(DAG, DL, Val: TII->getDefaultRsrcDataFormat() >> 32),
16354 DAG.getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32)};
16355
16356 SDValue SubRegHi = SDValue(
16357 DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v2i32, Ops: Ops0), 0);
16358
16359 // Combine the constants and the pointer.
16360 const SDValue Ops1[] = {
16361 DAG.getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32), Ptr,
16362 DAG.getTargetConstant(Val: AMDGPU::sub0_sub1, DL, VT: MVT::i32), SubRegHi,
16363 DAG.getTargetConstant(Val: AMDGPU::sub2_sub3, DL, VT: MVT::i32)};
16364
16365 return DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v4i32, Ops: Ops1);
16366}
16367
16368/// Return a resource descriptor with the 'Add TID' bit enabled
16369/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
16370/// of the resource descriptor) to create an offset, which is added to
16371/// the resource pointer.
16372MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
16373 SDValue Ptr, uint32_t RsrcDword1,
16374 uint64_t RsrcDword2And3) const {
16375 SDValue PtrLo = DAG.getTargetExtractSubreg(SRIdx: AMDGPU::sub0, DL, VT: MVT::i32, Operand: Ptr);
16376 SDValue PtrHi = DAG.getTargetExtractSubreg(SRIdx: AMDGPU::sub1, DL, VT: MVT::i32, Operand: Ptr);
16377 if (RsrcDword1) {
16378 PtrHi =
16379 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_OR_B32, dl: DL, VT: MVT::i32, Op1: PtrHi,
16380 Op2: DAG.getConstant(Val: RsrcDword1, DL, VT: MVT::i32)),
16381 0);
16382 }
16383
16384 SDValue DataLo =
16385 buildSMovImm32(DAG, DL, Val: RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
16386 SDValue DataHi = buildSMovImm32(DAG, DL, Val: RsrcDword2And3 >> 32);
16387
16388 const SDValue Ops[] = {
16389 DAG.getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32),
16390 PtrLo,
16391 DAG.getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
16392 PtrHi,
16393 DAG.getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32),
16394 DataLo,
16395 DAG.getTargetConstant(Val: AMDGPU::sub2, DL, VT: MVT::i32),
16396 DataHi,
16397 DAG.getTargetConstant(Val: AMDGPU::sub3, DL, VT: MVT::i32)};
16398
16399 return DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v4i32, Ops);
16400}
16401
16402//===----------------------------------------------------------------------===//
16403// SI Inline Assembly Support
16404//===----------------------------------------------------------------------===//
16405
16406std::pair<unsigned, const TargetRegisterClass *>
16407SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
16408 StringRef Constraint,
16409 MVT VT) const {
16410 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
16411
16412 const TargetRegisterClass *RC = nullptr;
16413 if (Constraint.size() == 1) {
16414 const unsigned BitWidth = VT.getSizeInBits();
16415 switch (Constraint[0]) {
16416 default:
16417 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
16418 case 's':
16419 case 'r':
16420 switch (BitWidth) {
16421 case 16:
16422 RC = &AMDGPU::SReg_32RegClass;
16423 break;
16424 case 64:
16425 RC = &AMDGPU::SGPR_64RegClass;
16426 break;
16427 default:
16428 RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth);
16429 if (!RC)
16430 return std::pair(0U, nullptr);
16431 break;
16432 }
16433 break;
16434 case 'v':
16435 switch (BitWidth) {
16436 case 16:
16437 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
16438 : &AMDGPU::VGPR_32RegClass;
16439 break;
16440 default:
16441 RC = TRI->getVGPRClassForBitWidth(BitWidth);
16442 if (!RC)
16443 return std::pair(0U, nullptr);
16444 break;
16445 }
16446 break;
16447 case 'a':
16448 if (!Subtarget->hasMAIInsts())
16449 break;
16450 switch (BitWidth) {
16451 case 16:
16452 RC = &AMDGPU::AGPR_32RegClass;
16453 break;
16454 default:
16455 RC = TRI->getAGPRClassForBitWidth(BitWidth);
16456 if (!RC)
16457 return std::pair(0U, nullptr);
16458 break;
16459 }
16460 break;
16461 }
16462 // We actually support i128, i16 and f16 as inline parameters
16463 // even if they are not reported as legal
16464 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
16465 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
16466 return std::pair(0U, RC);
16467 }
16468
16469 if (Constraint.starts_with(Prefix: "{") && Constraint.ends_with(Suffix: "}")) {
16470 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
16471 if (RegName.consume_front(Prefix: "v")) {
16472 RC = &AMDGPU::VGPR_32RegClass;
16473 } else if (RegName.consume_front(Prefix: "s")) {
16474 RC = &AMDGPU::SGPR_32RegClass;
16475 } else if (RegName.consume_front(Prefix: "a")) {
16476 RC = &AMDGPU::AGPR_32RegClass;
16477 }
16478
16479 if (RC) {
16480 uint32_t Idx;
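// Handle explicit register ranges such as "{v[8:11]}"; with a 128-bit
// operand type this resolves to the 128-bit VGPR tuple starting at v8.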
16481 if (RegName.consume_front(Prefix: "[")) {
16482 uint32_t End;
16483 bool Failed = RegName.consumeInteger(Radix: 10, Result&: Idx);
16484 Failed |= !RegName.consume_front(Prefix: ":");
16485 Failed |= RegName.consumeInteger(Radix: 10, Result&: End);
16486 Failed |= !RegName.consume_back(Suffix: "]");
16487 if (!Failed) {
16488 uint32_t Width = (End - Idx + 1) * 32;
16489 // Prohibit constraints for register ranges with a width that does not
16490 // match the required type.
16491 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
16492 return std::pair(0U, nullptr);
16493 MCRegister Reg = RC->getRegister(i: Idx);
16494 if (SIRegisterInfo::isVGPRClass(RC))
16495 RC = TRI->getVGPRClassForBitWidth(BitWidth: Width);
16496 else if (SIRegisterInfo::isSGPRClass(RC))
16497 RC = TRI->getSGPRClassForBitWidth(BitWidth: Width);
16498 else if (SIRegisterInfo::isAGPRClass(RC))
16499 RC = TRI->getAGPRClassForBitWidth(BitWidth: Width);
16500 if (RC) {
16501 Reg = TRI->getMatchingSuperReg(Reg, SubIdx: AMDGPU::sub0, RC);
16502 if (!Reg) {
16503 // The register class does not contain the requested register,
16504 // e.g., because it is an SGPR pair that would violate alignment
16505 // requirements.
16506 return std::pair(0U, nullptr);
16507 }
16508 return std::pair(Reg, RC);
16509 }
16510 }
16511 } else {
16512 // Check for lossy scalar/vector conversions.
16513 if (VT.isVector() && VT.getSizeInBits() != 32)
16514 return std::pair(0U, nullptr);
16515 bool Failed = RegName.getAsInteger(Radix: 10, Result&: Idx);
16516 if (!Failed && Idx < RC->getNumRegs())
16517 return std::pair(RC->getRegister(i: Idx), RC);
16518 }
16519 }
16520 }
16521
16522 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
16523 if (Ret.first)
16524 Ret.second = TRI->getPhysRegBaseClass(Reg: Ret.first);
16525
16526 return Ret;
16527}
16528
16529static bool isImmConstraint(StringRef Constraint) {
16530 if (Constraint.size() == 1) {
16531 switch (Constraint[0]) {
16532 default:
16533 break;
16534 case 'I':
16535 case 'J':
16536 case 'A':
16537 case 'B':
16538 case 'C':
16539 return true;
16540 }
16541 } else if (Constraint == "DA" || Constraint == "DB") {
16542 return true;
16543 }
16544 return false;
16545}
16546
16547SITargetLowering::ConstraintType
16548SITargetLowering::getConstraintType(StringRef Constraint) const {
16549 if (Constraint.size() == 1) {
16550 switch (Constraint[0]) {
16551 default:
16552 break;
16553 case 's':
16554 case 'v':
16555 case 'a':
16556 return C_RegisterClass;
16557 }
16558 }
16559 if (isImmConstraint(Constraint)) {
16560 return C_Other;
16561 }
16562 return TargetLowering::getConstraintType(Constraint);
16563}
16564
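// Truncate the constant to the bit width of the asm operand, unless it is an
// inlinable integer literal (e.g. -1), which must keep its sign-extended form
// so the inline encoding can still be used.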
16565static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
16566 if (!AMDGPU::isInlinableIntLiteral(Literal: Val)) {
16567 Val = Val & maskTrailingOnes<uint64_t>(N: Size);
16568 }
16569 return Val;
16570}
16571
16572void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
16573 StringRef Constraint,
16574 std::vector<SDValue> &Ops,
16575 SelectionDAG &DAG) const {
16576 if (isImmConstraint(Constraint)) {
16577 uint64_t Val;
16578 if (getAsmOperandConstVal(Op, Val) &&
16579 checkAsmConstraintVal(Op, Constraint, Val)) {
16580 Val = clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits());
16581 Ops.push_back(x: DAG.getTargetConstant(Val, DL: SDLoc(Op), VT: MVT::i64));
16582 }
16583 } else {
16584 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
16585 }
16586}
16587
16588bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
16589 unsigned Size = Op.getScalarValueSizeInBits();
16590 if (Size > 64)
16591 return false;
16592
16593 if (Size == 16 && !Subtarget->has16BitInsts())
16594 return false;
16595
16596 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op)) {
16597 Val = C->getSExtValue();
16598 return true;
16599 }
16600 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
16601 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
16602 return true;
16603 }
16604 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Val&: Op)) {
16605 if (Size != 16 || Op.getNumOperands() != 2)
16606 return false;
16607 if (Op.getOperand(i: 0).isUndef() || Op.getOperand(i: 1).isUndef())
16608 return false;
16609 if (ConstantSDNode *C = V->getConstantSplatNode()) {
16610 Val = C->getSExtValue();
16611 return true;
16612 }
16613 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
16614 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
16615 return true;
16616 }
16617 }
16618
16619 return false;
16620}
16621
16622bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
16623 uint64_t Val) const {
16624 if (Constraint.size() == 1) {
16625 switch (Constraint[0]) {
16626 case 'I':
16627 return AMDGPU::isInlinableIntLiteral(Literal: Val);
16628 case 'J':
16629 return isInt<16>(x: Val);
16630 case 'A':
16631 return checkAsmConstraintValA(Op, Val);
16632 case 'B':
16633 return isInt<32>(x: Val);
16634 case 'C':
16635 return isUInt<32>(x: clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits())) ||
16636 AMDGPU::isInlinableIntLiteral(Literal: Val);
16637 default:
16638 break;
16639 }
16640 } else if (Constraint.size() == 2) {
16641 if (Constraint == "DA") {
16642 int64_t HiBits = static_cast<int32_t>(Val >> 32);
16643 int64_t LoBits = static_cast<int32_t>(Val);
16644 return checkAsmConstraintValA(Op, Val: HiBits, MaxSize: 32) &&
16645 checkAsmConstraintValA(Op, Val: LoBits, MaxSize: 32);
16646 }
16647 if (Constraint == "DB") {
16648 return true;
16649 }
16650 }
16651 llvm_unreachable("Invalid asm constraint");
16652}
16653
16654bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
16655 unsigned MaxSize) const {
16656 unsigned Size = std::min<unsigned>(a: Op.getScalarValueSizeInBits(), b: MaxSize);
16657 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
16658 if (Size == 16) {
16659 MVT VT = Op.getSimpleValueType();
16660 switch (VT.SimpleTy) {
16661 default:
16662 return false;
16663 case MVT::i16:
16664 return AMDGPU::isInlinableLiteralI16(Literal: Val, HasInv2Pi);
16665 case MVT::f16:
16666 return AMDGPU::isInlinableLiteralFP16(Literal: Val, HasInv2Pi);
16667 case MVT::bf16:
16668 return AMDGPU::isInlinableLiteralBF16(Literal: Val, HasInv2Pi);
16669 case MVT::v2i16:
16670 return AMDGPU::getInlineEncodingV2I16(Literal: Val).has_value();
16671 case MVT::v2f16:
16672 return AMDGPU::getInlineEncodingV2F16(Literal: Val).has_value();
16673 case MVT::v2bf16:
16674 return AMDGPU::getInlineEncodingV2BF16(Literal: Val).has_value();
16675 }
16676 }
16677 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Literal: Val, HasInv2Pi)) ||
16678 (Size == 64 && AMDGPU::isInlinableLiteral64(Literal: Val, HasInv2Pi)))
16679 return true;
16680 return false;
16681}
16682
16683static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
16684 switch (UnalignedClassID) {
16685 case AMDGPU::VReg_64RegClassID:
16686 return AMDGPU::VReg_64_Align2RegClassID;
16687 case AMDGPU::VReg_96RegClassID:
16688 return AMDGPU::VReg_96_Align2RegClassID;
16689 case AMDGPU::VReg_128RegClassID:
16690 return AMDGPU::VReg_128_Align2RegClassID;
16691 case AMDGPU::VReg_160RegClassID:
16692 return AMDGPU::VReg_160_Align2RegClassID;
16693 case AMDGPU::VReg_192RegClassID:
16694 return AMDGPU::VReg_192_Align2RegClassID;
16695 case AMDGPU::VReg_224RegClassID:
16696 return AMDGPU::VReg_224_Align2RegClassID;
16697 case AMDGPU::VReg_256RegClassID:
16698 return AMDGPU::VReg_256_Align2RegClassID;
16699 case AMDGPU::VReg_288RegClassID:
16700 return AMDGPU::VReg_288_Align2RegClassID;
16701 case AMDGPU::VReg_320RegClassID:
16702 return AMDGPU::VReg_320_Align2RegClassID;
16703 case AMDGPU::VReg_352RegClassID:
16704 return AMDGPU::VReg_352_Align2RegClassID;
16705 case AMDGPU::VReg_384RegClassID:
16706 return AMDGPU::VReg_384_Align2RegClassID;
16707 case AMDGPU::VReg_512RegClassID:
16708 return AMDGPU::VReg_512_Align2RegClassID;
16709 case AMDGPU::VReg_1024RegClassID:
16710 return AMDGPU::VReg_1024_Align2RegClassID;
16711 case AMDGPU::AReg_64RegClassID:
16712 return AMDGPU::AReg_64_Align2RegClassID;
16713 case AMDGPU::AReg_96RegClassID:
16714 return AMDGPU::AReg_96_Align2RegClassID;
16715 case AMDGPU::AReg_128RegClassID:
16716 return AMDGPU::AReg_128_Align2RegClassID;
16717 case AMDGPU::AReg_160RegClassID:
16718 return AMDGPU::AReg_160_Align2RegClassID;
16719 case AMDGPU::AReg_192RegClassID:
16720 return AMDGPU::AReg_192_Align2RegClassID;
16721 case AMDGPU::AReg_256RegClassID:
16722 return AMDGPU::AReg_256_Align2RegClassID;
16723 case AMDGPU::AReg_512RegClassID:
16724 return AMDGPU::AReg_512_Align2RegClassID;
16725 case AMDGPU::AReg_1024RegClassID:
16726 return AMDGPU::AReg_1024_Align2RegClassID;
16727 default:
16728 return -1;
16729 }
16730}
16731
16732// Figure out which registers should be reserved for stack access. Only after
16733// the function is legalized do we know all of the non-spill stack objects or if
16734// calls are present.
16735void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
16736 MachineRegisterInfo &MRI = MF.getRegInfo();
16737 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16738 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
16739 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16740 const SIInstrInfo *TII = ST.getInstrInfo();
16741
16742 if (Info->isEntryFunction()) {
16743 // Callable functions have fixed registers used for stack access.
16744 reservePrivateMemoryRegs(TM: getTargetMachine(), MF, TRI: *TRI, Info&: *Info);
16745 }
16746
16747 // TODO: Move this logic to getReservedRegs()
16748 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
16749 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
16750 Register SReg = ST.isWave32()
16751 ? AMDGPU::SGPR_32RegClass.getRegister(i: MaxNumSGPRs - 1)
16752 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
16753 RC: &AMDGPU::SGPR_64RegClass);
16754 Info->setSGPRForEXECCopy(SReg);
16755
16756 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
16757 Info->getStackPtrOffsetReg()));
16758 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
16759 MRI.replaceRegWith(FromReg: AMDGPU::SP_REG, ToReg: Info->getStackPtrOffsetReg());
16760
16761  // Guard against replacing the default register with itself, which can happen
16762  // for MIR testcases that are missing the MFI.
16763 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
16764 MRI.replaceRegWith(FromReg: AMDGPU::PRIVATE_RSRC_REG, ToReg: Info->getScratchRSrcReg());
16765
16766 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
16767 MRI.replaceRegWith(FromReg: AMDGPU::FP_REG, ToReg: Info->getFrameOffsetReg());
16768
16769 Info->limitOccupancy(MF);
16770
16771 if (ST.isWave32() && !MF.empty()) {
16772 for (auto &MBB : MF) {
16773 for (auto &MI : MBB) {
16774 TII->fixImplicitOperands(MI);
16775 }
16776 }
16777 }
16778
16779  // FIXME: This is a hack to fix up AGPR classes to use the properly aligned
16780 // classes if required. Ideally the register class constraints would differ
16781 // per-subtarget, but there's no easy way to achieve that right now. This is
16782 // not a problem for VGPRs because the correctly aligned VGPR class is implied
16783 // from using them as the register class for legal types.
16784 if (ST.needsAlignedVGPRs()) {
16785 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
16786 const Register Reg = Register::index2VirtReg(Index: I);
16787 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
16788 if (!RC)
16789 continue;
16790 int NewClassID = getAlignedAGPRClassID(UnalignedClassID: RC->getID());
16791 if (NewClassID != -1)
16792 MRI.setRegClass(Reg, RC: TRI->getRegClass(RCID: NewClassID));
16793 }
16794 }
16795
16796 TargetLoweringBase::finalizeLowering(MF);
16797}
16798
16799void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
16800 KnownBits &Known,
16801 const APInt &DemandedElts,
16802 const SelectionDAG &DAG,
16803 unsigned Depth) const {
16804 Known.resetAll();
16805 unsigned Opc = Op.getOpcode();
16806 switch (Opc) {
16807 case ISD::INTRINSIC_WO_CHAIN: {
16808 unsigned IID = Op.getConstantOperandVal(i: 0);
16809 switch (IID) {
16810 case Intrinsic::amdgcn_mbcnt_lo:
16811 case Intrinsic::amdgcn_mbcnt_hi: {
16812 const GCNSubtarget &ST =
16813 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
16814 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16815 // most 31 + src1.
16816 Known.Zero.setBitsFrom(
16817 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
16818 KnownBits Known2 = DAG.computeKnownBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
16819 Known = KnownBits::add(LHS: Known, RHS: Known2);
16820 return;
16821 }
16822 }
16823 break;
16824 }
16825 }
16826 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
16827 Op, Known, DemandedElts, DAG, Depth);
16828}
16829
16830void SITargetLowering::computeKnownBitsForFrameIndex(
16831 const int FI, KnownBits &Known, const MachineFunction &MF) const {
16832 TargetLowering::computeKnownBitsForFrameIndex(FIOp: FI, Known, MF);
16833
16834 // Set the high bits to zero based on the maximum allowed scratch size per
16835 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
16836 // calculation won't overflow, so assume the sign bit is never set.
16837 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
16838}
16839
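// Worked example for knownBitsForWorkitemID below (the bound is assumed, not
// taken from a real target): if getMaxWorkitemID returns 1023 for a dimension,
// countl_zero(1023) on the 32-bit value is 22, so the top 22 bits of the
// workitem id are reported as known zero.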
16840static void knownBitsForWorkitemID(const GCNSubtarget &ST,
16841 GISelValueTracking &VT, KnownBits &Known,
16842 unsigned Dim) {
16843 unsigned MaxValue =
16844 ST.getMaxWorkitemID(Kernel: VT.getMachineFunction().getFunction(), Dimension: Dim);
16845 Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue));
16846}
16847
16848void SITargetLowering::computeKnownBitsForTargetInstr(
16849 GISelValueTracking &VT, Register R, KnownBits &Known,
16850 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
16851 unsigned Depth) const {
16852 const MachineInstr *MI = MRI.getVRegDef(Reg: R);
16853 switch (MI->getOpcode()) {
16854 case AMDGPU::G_INTRINSIC:
16855 case AMDGPU::G_INTRINSIC_CONVERGENT: {
16856 Intrinsic::ID IID = cast<GIntrinsic>(Val: MI)->getIntrinsicID();
16857 switch (IID) {
16858 case Intrinsic::amdgcn_workitem_id_x:
16859 knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: 0);
16860 break;
16861 case Intrinsic::amdgcn_workitem_id_y:
16862 knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: 1);
16863 break;
16864 case Intrinsic::amdgcn_workitem_id_z:
16865 knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: 2);
16866 break;
16867 case Intrinsic::amdgcn_mbcnt_lo:
16868 case Intrinsic::amdgcn_mbcnt_hi: {
16869 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
16870 // most 31 + src1.
16871 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
16872 ? getSubtarget()->getWavefrontSizeLog2()
16873 : 5);
16874 KnownBits Known2;
16875 VT.computeKnownBitsImpl(R: MI->getOperand(i: 3).getReg(), Known&: Known2, DemandedElts,
16876 Depth: Depth + 1);
16877 Known = KnownBits::add(LHS: Known, RHS: Known2);
16878 break;
16879 }
16880 case Intrinsic::amdgcn_groupstaticsize: {
16881 // We can report everything over the maximum size as 0. We can't report
16882 // based on the actual size because we don't know if it's accurate or not
16883 // at any given point.
16884 Known.Zero.setHighBits(
16885 llvm::countl_zero(Val: getSubtarget()->getAddressableLocalMemorySize()));
16886 break;
16887 }
16888 }
16889 break;
16890 }
16891 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
16892 Known.Zero.setHighBits(24);
16893 break;
16894 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
16895 Known.Zero.setHighBits(16);
16896 break;
16897 case AMDGPU::G_AMDGPU_SMED3:
16898 case AMDGPU::G_AMDGPU_UMED3: {
16899 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
16900
16901 KnownBits Known2;
16902 VT.computeKnownBitsImpl(R: Src2, Known&: Known2, DemandedElts, Depth: Depth + 1);
16903 if (Known2.isUnknown())
16904 break;
16905
16906 KnownBits Known1;
16907 VT.computeKnownBitsImpl(R: Src1, Known&: Known1, DemandedElts, Depth: Depth + 1);
16908 if (Known1.isUnknown())
16909 break;
16910
16911 KnownBits Known0;
16912 VT.computeKnownBitsImpl(R: Src0, Known&: Known0, DemandedElts, Depth: Depth + 1);
16913 if (Known0.isUnknown())
16914 break;
16915
16916 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
16917 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
16918 Known.One = Known0.One & Known1.One & Known2.One;
16919 break;
16920 }
16921 }
16922}
16923
16924Align SITargetLowering::computeKnownAlignForTargetInstr(
16925 GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
16926 unsigned Depth) const {
16927 const MachineInstr *MI = MRI.getVRegDef(Reg: R);
16928 if (auto *GI = dyn_cast<GIntrinsic>(Val: MI)) {
16929 // FIXME: Can this move to generic code? What about the case where the call
16930 // site specifies a lower alignment?
16931 Intrinsic::ID IID = GI->getIntrinsicID();
16932 LLVMContext &Ctx = VT.getMachineFunction().getFunction().getContext();
16933 AttributeList Attrs =
16934 Intrinsic::getAttributes(C&: Ctx, id: IID, FT: Intrinsic::getType(Context&: Ctx, id: IID));
16935 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
16936 return *RetAlign;
16937 }
16938 return Align(1);
16939}
16940
16941Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
16942 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
16943 const Align CacheLineAlign = Align(64);
16944
16945  // Pre-GFX10 targets did not benefit from loop alignment
16946 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
16947 getSubtarget()->hasInstFwdPrefetchBug())
16948 return PrefAlign;
16949
16950  // On GFX10 the I$ consists of 4 x 64-byte cache lines.
16951  // By default the prefetcher keeps one cache line behind and reads two ahead.
16952  // We can modify it with S_INST_PREFETCH for larger loops to keep two lines
16953  // behind and one ahead.
16954  // Therefore we can benefit from aligning loop headers if the loop fits in
16955  // 192 bytes. If the loop fits in 64 bytes it always spans no more than two
16956  // cache lines and does not need an alignment.
16957  // Otherwise, if the loop is at most 128 bytes we do not need to modify the
16958  // prefetch; if it is at most 192 bytes we need two cache lines behind.
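  //
  // For example (illustrative sizes only): a 100-byte loop body would get the
  // 64-byte cache-line alignment with the default prefetch settings, while a
  // 160-byte loop would additionally get S_INST_PREFETCH instructions inserted
  // into its preheader and exit block by the code below.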
16959
16960 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16961 const MachineBasicBlock *Header = ML->getHeader();
16962 if (Header->getAlignment() != PrefAlign)
16963 return Header->getAlignment(); // Already processed.
16964
16965 unsigned LoopSize = 0;
16966 for (const MachineBasicBlock *MBB : ML->blocks()) {
16967    // If an inner loop block is aligned, assume on average half of the alignment
16968    // size is added as nops.
16969 if (MBB != Header)
16970 LoopSize += MBB->getAlignment().value() / 2;
16971
16972 for (const MachineInstr &MI : *MBB) {
16973 LoopSize += TII->getInstSizeInBytes(MI);
16974 if (LoopSize > 192)
16975 return PrefAlign;
16976 }
16977 }
16978
16979 if (LoopSize <= 64)
16980 return PrefAlign;
16981
16982 if (LoopSize <= 128)
16983 return CacheLineAlign;
16984
16985  // If any of the parent loops is surrounded by prefetch instructions, do not
16986  // insert new ones for the inner loop; that would reset the parent's settings.
16987 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
16988 if (MachineBasicBlock *Exit = P->getExitBlock()) {
16989 auto I = Exit->getFirstNonDebugInstr();
16990 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
16991 return CacheLineAlign;
16992 }
16993 }
16994
16995 MachineBasicBlock *Pre = ML->getLoopPreheader();
16996 MachineBasicBlock *Exit = ML->getExitBlock();
16997
16998 if (Pre && Exit) {
16999 auto PreTerm = Pre->getFirstTerminator();
17000 if (PreTerm == Pre->begin() ||
17001 std::prev(x: PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
17002 BuildMI(BB&: *Pre, I: PreTerm, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_INST_PREFETCH))
17003 .addImm(Val: 1); // prefetch 2 lines behind PC
17004
17005 auto ExitHead = Exit->getFirstNonDebugInstr();
17006 if (ExitHead == Exit->end() ||
17007 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
17008 BuildMI(BB&: *Exit, I: ExitHead, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_INST_PREFETCH))
17009 .addImm(Val: 2); // prefetch 1 line behind PC
17010 }
17011
17012 return CacheLineAlign;
17013}
17014
17015LLVM_ATTRIBUTE_UNUSED
17016static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
17017 assert(N->getOpcode() == ISD::CopyFromReg);
17018 do {
17019 // Follow the chain until we find an INLINEASM node.
17020 N = N->getOperand(Num: 0).getNode();
17021 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
17022 return true;
17023 } while (N->getOpcode() == ISD::CopyFromReg);
17024 return false;
17025}
17026
17027bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
17028 FunctionLoweringInfo *FLI,
17029 UniformityInfo *UA) const {
17030 switch (N->getOpcode()) {
17031 case ISD::CopyFromReg: {
17032 const RegisterSDNode *R = cast<RegisterSDNode>(Val: N->getOperand(Num: 1));
17033 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
17034 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17035 Register Reg = R->getReg();
17036
17037 // FIXME: Why does this need to consider isLiveIn?
17038 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
17039 return !TRI->isSGPRReg(MRI, Reg);
17040
17041 if (const Value *V = FLI->getValueFromVirtualReg(Vreg: R->getReg()))
17042 return UA->isDivergent(V);
17043
17044 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
17045 return !TRI->isSGPRReg(MRI, Reg);
17046 }
17047 case ISD::LOAD: {
17048 const LoadSDNode *L = cast<LoadSDNode>(Val: N);
17049 unsigned AS = L->getAddressSpace();
17050 // A flat load may access private memory.
17051 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
17052 }
17053 case ISD::CALLSEQ_END:
17054 return true;
17055 case ISD::INTRINSIC_WO_CHAIN:
17056 return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: 0));
17057 case ISD::INTRINSIC_W_CHAIN:
17058 return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: 1));
17059 case AMDGPUISD::ATOMIC_CMP_SWAP:
17060 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
17061 case AMDGPUISD::BUFFER_ATOMIC_ADD:
17062 case AMDGPUISD::BUFFER_ATOMIC_SUB:
17063 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
17064 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
17065 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
17066 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
17067 case AMDGPUISD::BUFFER_ATOMIC_AND:
17068 case AMDGPUISD::BUFFER_ATOMIC_OR:
17069 case AMDGPUISD::BUFFER_ATOMIC_XOR:
17070 case AMDGPUISD::BUFFER_ATOMIC_INC:
17071 case AMDGPUISD::BUFFER_ATOMIC_DEC:
17072 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
17073 case AMDGPUISD::BUFFER_ATOMIC_CSUB:
17074 case AMDGPUISD::BUFFER_ATOMIC_FADD:
17075 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
17076 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
17077 // Target-specific read-modify-write atomics are sources of divergence.
17078 return true;
17079 default:
17080 if (auto *A = dyn_cast<AtomicSDNode>(Val: N)) {
17081 // Generic read-modify-write atomics are sources of divergence.
17082 return A->readMem() && A->writeMem();
17083 }
17084 return false;
17085 }
17086}
17087
17088bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
17089 EVT VT) const {
17090 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
17091 case MVT::f32:
17092 return !denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
17093 case MVT::f64:
17094 case MVT::f16:
17095 return !denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction());
17096 default:
17097 return false;
17098 }
17099}
17100
17101bool SITargetLowering::denormalsEnabledForType(
17102 LLT Ty, const MachineFunction &MF) const {
17103 switch (Ty.getScalarSizeInBits()) {
17104 case 32:
17105 return !denormalModeIsFlushAllF32(MF);
17106 case 64:
17107 case 16:
17108 return !denormalModeIsFlushAllF64F16(MF);
17109 default:
17110 return false;
17111 }
17112}
17113
17114bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
17115 const APInt &DemandedElts,
17116 const SelectionDAG &DAG,
17117 bool SNaN,
17118 unsigned Depth) const {
17119 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
17120 const MachineFunction &MF = DAG.getMachineFunction();
17121 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
17122
17123 if (Info->getMode().DX10Clamp)
17124 return true; // Clamped to 0.
17125 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1);
17126 }
17127
17128 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
17129 DAG, SNaN, Depth);
17130}
17131
17132// On older subtargets, global FP atomic instructions have a hardcoded FP mode:
17133// they do not support FP32 denormals and only support v2f16/f64 denormals.
17134static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
17135 if (RMW->hasMetadata(Kind: "amdgpu.ignore.denormal.mode"))
17136 return true;
17137
17138 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
17139 auto DenormMode = RMW->getFunction()->getDenormalMode(FPType: Flt);
17140 if (DenormMode == DenormalMode::getPreserveSign())
17141 return true;
17142
17143 // TODO: Remove this.
17144 return RMW->getFunction()
17145 ->getFnAttribute(Kind: "amdgpu-unsafe-fp-atomics")
17146 .getValueAsBool();
17147}
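// Illustrative cases for atomicIgnoresDenormalModeOrFPModeIsFTZ above (the
// metadata and attribute spellings are shown only as examples):
//   - an atomicrmw carrying !amdgpu.ignore.denormal.mode !{} trivially
//     satisfies the check;
//   - a float atomicrmw fadd in a function compiled with
//     "denormal-fp-math-f32"="preserve-sign,preserve-sign" also satisfies it,
//     since the resolved denormal mode is preserve-sign (flush-to-zero).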
17148
17149static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
17150 LLVMContext &Ctx = RMW->getContext();
17151 StringRef MemScope =
17152 Ctx.getSyncScopeName(Id: RMW->getSyncScopeID()).value_or(u: "system");
17153
17154 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
17155 << "Hardware instruction generated for atomic "
17156 << RMW->getOperationName(Op: RMW->getOperation())
17157 << " operation at memory scope " << MemScope;
17158}
17159
17160static bool isV2F16OrV2BF16(Type *Ty) {
17161 if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
17162 Type *EltTy = VT->getElementType();
17163 return VT->getNumElements() == 2 &&
17164 (EltTy->isHalfTy() || EltTy->isBFloatTy());
17165 }
17166
17167 return false;
17168}
17169
17170static bool isV2F16(Type *Ty) {
17171 FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty);
17172 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
17173}
17174
17175static bool isV2BF16(Type *Ty) {
17176 FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty);
17177 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
17178}
17179
17180/// \return true if atomicrmw integer ops work for the type.
17181static bool isAtomicRMWLegalIntTy(Type *Ty) {
17182 if (auto *IT = dyn_cast<IntegerType>(Val: Ty)) {
17183 unsigned BW = IT->getBitWidth();
17184 return BW == 32 || BW == 64;
17185 }
17186
17187 return false;
17188}
17189
17190/// \return true if this atomicrmw xchg type can be selected.
17191static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
17192 Type *Ty = RMW->getType();
17193 if (isAtomicRMWLegalIntTy(Ty))
17194 return true;
17195
17196 if (PointerType *PT = dyn_cast<PointerType>(Val: Ty)) {
17197 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
17198 unsigned BW = DL.getPointerSizeInBits(AS: PT->getAddressSpace());
17199 return BW == 32 || BW == 64;
17200 }
17201
17202 if (Ty->isFloatTy() || Ty->isDoubleTy())
17203 return true;
17204
17205 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
17206 return VT->getNumElements() == 2 &&
17207 VT->getElementType()->getPrimitiveSizeInBits() == 16;
17208 }
17209
17210 return false;
17211}
17212
17213/// \returns true if it's valid to emit a native instruction for \p RMW, based
17214/// on the properties of the target memory.
17215static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
17216 const AtomicRMWInst *RMW,
17217 bool HasSystemScope) {
17218 // The remote/fine-grained access logic is different from the integer
17219 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
17220 // fine-grained access does not work, even for a device local allocation.
17221 //
17222 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
17223 // allocations work.
17224 if (HasSystemScope) {
17225 if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
17226 RMW->hasMetadata(Kind: "amdgpu.no.remote.memory"))
17227 return true;
17228 } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
17229 return true;
17230
17231 return RMW->hasMetadata(Kind: "amdgpu.no.fine.grained.memory");
17232}
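// Illustrative example (not from the upstream tests): on a subtarget without
// agent-scope fine-grained remote memory atomics, an agent-scope
//   %r = atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent")
//            monotonic, align 4, !amdgpu.no.fine.grained.memory !0
// (with !0 = !{}) passes this check, while the same operation without the
// metadata does not, and the caller falls back to a cmpxchg expansion.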
17233
17234/// \return Action to perform on AtomicRMWInsts for integer operations.
17235static TargetLowering::AtomicExpansionKind
17236atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
17237 return isAtomicRMWLegalIntTy(Ty: RMW->getType())
17238 ? TargetLowering::AtomicExpansionKind::None
17239 : TargetLowering::AtomicExpansionKind::CmpXChg;
17240}
17241
17242/// Return true if a flat address space atomicrmw may access private memory.
17243static bool flatInstrMayAccessPrivate(const Instruction *I) {
17244 const MDNode *NoaliasAddrSpaceMD =
17245 I->getMetadata(KindID: LLVMContext::MD_noalias_addrspace);
17246 if (!NoaliasAddrSpaceMD)
17247 return true;
17248
17249 for (unsigned I = 0, E = NoaliasAddrSpaceMD->getNumOperands() / 2; I != E;
17250 ++I) {
17251 auto *Low = mdconst::extract<ConstantInt>(
17252 MD: NoaliasAddrSpaceMD->getOperand(I: 2 * I + 0));
17253 if (Low->getValue().uge(RHS: AMDGPUAS::PRIVATE_ADDRESS)) {
17254 auto *High = mdconst::extract<ConstantInt>(
17255 MD: NoaliasAddrSpaceMD->getOperand(I: 2 * I + 1));
17256 return High->getValue().ule(RHS: AMDGPUAS::PRIVATE_ADDRESS);
17257 }
17258 }
17259
17260 return true;
17261}
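// Illustrative example: a flat atomic annotated with !noalias.addrspace !0,
// where !0 = !{i32 5, i32 6}, excludes the private address space (5), so
// flatInstrMayAccessPrivate returns false; with no such metadata it
// conservatively returns true. This is the same [5, 6) range that the
// expansion code below attaches after inserting its runtime check.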
17262
17263TargetLowering::AtomicExpansionKind
17264SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
17265 unsigned AS = RMW->getPointerAddressSpace();
17266 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
17267 return AtomicExpansionKind::NotAtomic;
17268
17269 // 64-bit flat atomics that dynamically reside in private memory will silently
17270 // be dropped.
17271 //
17272 // Note that we will emit a new copy of the original atomic in the expansion,
17273 // which will be incrementally relegalized.
17274 const DataLayout &DL = RMW->getFunction()->getDataLayout();
17275 if (AS == AMDGPUAS::FLAT_ADDRESS &&
17276 DL.getTypeSizeInBits(Ty: RMW->getType()) == 64 &&
17277 flatInstrMayAccessPrivate(I: RMW))
17278 return AtomicExpansionKind::Expand;
17279
17280 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
17281 OptimizationRemarkEmitter ORE(RMW->getFunction());
17282 ORE.emit(RemarkBuilder: [=]() {
17283 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
17284 });
17285 return Kind;
17286 };
17287
17288 auto SSID = RMW->getSyncScopeID();
17289 bool HasSystemScope =
17290 SSID == SyncScope::System ||
17291 SSID == RMW->getContext().getOrInsertSyncScopeID(SSN: "one-as");
17292
17293 auto Op = RMW->getOperation();
17294 switch (Op) {
17295 case AtomicRMWInst::Xchg: {
17296 // PCIe supports add and xchg for system atomics.
17297 return isAtomicRMWLegalXChgTy(RMW)
17298 ? TargetLowering::AtomicExpansionKind::None
17299 : TargetLowering::AtomicExpansionKind::CmpXChg;
17300 }
17301 case AtomicRMWInst::Add:
17302 case AtomicRMWInst::And:
17303 case AtomicRMWInst::UIncWrap:
17304 case AtomicRMWInst::UDecWrap:
17305 return atomicSupportedIfLegalIntType(RMW);
17306 case AtomicRMWInst::Sub:
17307 case AtomicRMWInst::Or:
17308 case AtomicRMWInst::Xor: {
17309 // Atomic sub/or/xor do not work over PCI express, but atomic add
17310 // does. InstCombine transforms these with 0 to or, so undo that.
17311 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
17312 if (Constant *ConstVal = dyn_cast<Constant>(Val: RMW->getValOperand());
17313 ConstVal && ConstVal->isNullValue())
17314 return AtomicExpansionKind::Expand;
17315 }
17316
17317 return atomicSupportedIfLegalIntType(RMW);
17318 }
17319 case AtomicRMWInst::FAdd: {
17320 Type *Ty = RMW->getType();
17321
17322 // TODO: Handle REGION_ADDRESS
17323 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
17324 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
17325 // is fixed to round-to-nearest-even.
17326 //
17327 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
17328 // round-to-nearest-even.
17329 //
17330 // We ignore the rounding mode problem, even in strictfp. The C++ standard
17331 // suggests it is OK if the floating-point mode may not match the calling
17332 // thread.
17333 if (Ty->isFloatTy()) {
17334 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
17335 : AtomicExpansionKind::CmpXChg;
17336 }
17337
17338 if (Ty->isDoubleTy()) {
17339 // Ignores denormal mode, but we don't consider flushing mandatory.
17340 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
17341 : AtomicExpansionKind::CmpXChg;
17342 }
17343
17344 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
17345 return AtomicExpansionKind::None;
17346
17347 return AtomicExpansionKind::CmpXChg;
17348 }
17349
17350 // LDS atomics respect the denormal mode from the mode register.
17351 //
17352 // Traditionally f32 global/buffer memory atomics would unconditionally
17353 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
17354 // flush.
17355 //
17356 // On targets with flat atomic fadd, denormals would flush depending on
17357 // whether the target address resides in LDS or global memory. We consider
17358 // this flat-maybe-flush as will-flush.
17359 if (Ty->isFloatTy() &&
17360 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
17361 !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
17362 return AtomicExpansionKind::CmpXChg;
17363
17364 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
17365 // safe. The message phrasing also should be better.
17366 if (globalMemoryFPAtomicIsLegal(Subtarget: *Subtarget, RMW, HasSystemScope)) {
17367 if (AS == AMDGPUAS::FLAT_ADDRESS) {
17368 // gfx942, gfx12
17369 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
17370 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17371 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
17372 // gfx90a, gfx942, gfx12
17373 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
17374 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17375
17376 // gfx942, gfx12
17377 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
17378 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17379 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
17380 // gfx90a, gfx942, gfx12
17381 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
17382 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17383
17384 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
17385 // buffer. gfx12 does have the buffer version.
17386 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
17387 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17388 }
17389
17390 // global and flat atomic fadd f64: gfx90a, gfx942.
17391 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
17392 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17393
17394 if (AS != AMDGPUAS::FLAT_ADDRESS) {
17395 if (Ty->isFloatTy()) {
17396 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
17397 // gfx11+.
17398 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
17399 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17400 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
17401 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
17402 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17403 } else {
17404 // gfx908
17405 if (RMW->use_empty() &&
17406 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
17407 isV2F16(Ty))
17408 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17409 }
17410 }
17411
17412 // flat atomic fadd f32: gfx942, gfx11+.
17413 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
17414 if (Subtarget->hasFlatAtomicFaddF32Inst())
17415 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17416
17417      // If the address is in the flat address space and the type is float, we
17418      // will try to expand the operation when the target supports both global
17419      // and LDS atomic fadd. The reason is that the expansion emits a runtime
17420      // check of the address space: if the address is in global memory, we
17421      // emit the global atomic fadd; if it is in shared memory, we emit the
17422      // LDS atomic fadd.
17423 if (Subtarget->hasLDSFPAtomicAddF32()) {
17424 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
17425 return AtomicExpansionKind::Expand;
17426 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
17427 return AtomicExpansionKind::Expand;
17428 }
17429 }
17430 }
17431
17432 return AtomicExpansionKind::CmpXChg;
17433 }
17434 case AtomicRMWInst::FMin:
17435 case AtomicRMWInst::FMax: {
17436 Type *Ty = RMW->getType();
17437
17438 // LDS float and double fmin/fmax were always supported.
17439 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
17440 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
17441 : AtomicExpansionKind::CmpXChg;
17442 }
17443
17444 if (globalMemoryFPAtomicIsLegal(Subtarget: *Subtarget, RMW, HasSystemScope)) {
17445 // For flat and global cases:
17446 // float, double in gfx7. Manual claims denormal support.
17447 // Removed in gfx8.
17448 // float, double restored in gfx10.
17449 // double removed again in gfx11, so only f32 for gfx11/gfx12.
17450 //
17451      // gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
17452      // not f32.
17453 if (AS == AMDGPUAS::FLAT_ADDRESS) {
17454 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
17455 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17456 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
17457 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17458 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
17459 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
17460 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
17461 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17462 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
17463 return ReportUnsafeHWInst(AtomicExpansionKind::None);
17464 }
17465 }
17466
17467 return AtomicExpansionKind::CmpXChg;
17468 }
17469 case AtomicRMWInst::Min:
17470 case AtomicRMWInst::Max:
17471 case AtomicRMWInst::UMin:
17472 case AtomicRMWInst::UMax: {
17473 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
17474 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
17475 // Always expand system scope min/max atomics.
17476 if (HasSystemScope)
17477 return AtomicExpansionKind::CmpXChg;
17478 }
17479
17480 return atomicSupportedIfLegalIntType(RMW);
17481 }
17482 case AtomicRMWInst::Nand:
17483 case AtomicRMWInst::FSub:
17484 default:
17485 return AtomicExpansionKind::CmpXChg;
17486 }
17487
17488 llvm_unreachable("covered atomicrmw op switch");
17489}
17490
17491TargetLowering::AtomicExpansionKind
17492SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
17493 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
17494 ? AtomicExpansionKind::NotAtomic
17495 : AtomicExpansionKind::None;
17496}
17497
17498TargetLowering::AtomicExpansionKind
17499SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
17500 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
17501 ? AtomicExpansionKind::NotAtomic
17502 : AtomicExpansionKind::None;
17503}
17504
17505TargetLowering::AtomicExpansionKind
17506SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
17507 unsigned AddrSpace = CmpX->getPointerAddressSpace();
17508 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
17509 return AtomicExpansionKind::NotAtomic;
17510
17511 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(I: CmpX))
17512 return AtomicExpansionKind::None;
17513
17514 const DataLayout &DL = CmpX->getDataLayout();
17515
17516 Type *ValTy = CmpX->getNewValOperand()->getType();
17517
17518 // If a 64-bit flat atomic may alias private, we need to avoid using the
17519 // atomic in the private case.
17520 return DL.getTypeSizeInBits(Ty: ValTy) == 64 ? AtomicExpansionKind::Expand
17521 : AtomicExpansionKind::None;
17522}
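// Illustrative example: a hypothetical
//   %res = cmpxchg ptr %flat.p, i64 %old, i64 %new seq_cst seq_cst
// with no !noalias.addrspace metadata excluding the private address space
// returns Expand, while the i32 form, or an i64 form whose metadata carries
// the [5, 6) range, returns None.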
17523
17524const TargetRegisterClass *
17525SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
17526 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, isDivergent: false);
17527 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17528 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
17529 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
17530 : &AMDGPU::SReg_32RegClass;
17531 if (!TRI->isSGPRClass(RC) && !isDivergent)
17532 return TRI->getEquivalentSGPRClass(VRC: RC);
17533 if (TRI->isSGPRClass(RC) && isDivergent)
17534 return TRI->getEquivalentVGPRClass(SRC: RC);
17535
17536 return RC;
17537}
17538
17539// FIXME: This is a workaround for DivergenceAnalysis not understanding always
17540// uniform values (as produced by the mask results of control flow intrinsics)
17541// used outside of divergent blocks. The phi users need to also be treated as
17542// always uniform.
17543//
17544// FIXME: DA is no longer in use. Does this still apply to UniformityAnalysis?
17545static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
17546 unsigned WaveSize) {
17547 // FIXME: We assume we never cast the mask results of a control flow
17548 // intrinsic.
17549  // Exit early, as a compile-time hack, if the type won't be consistent.
17550 IntegerType *IT = dyn_cast<IntegerType>(Val: V->getType());
17551 if (!IT || IT->getBitWidth() != WaveSize)
17552 return false;
17553
17554 if (!isa<Instruction>(Val: V))
17555 return false;
17556 if (!Visited.insert(Ptr: V).second)
17557 return false;
17558 bool Result = false;
17559 for (const auto *U : V->users()) {
17560 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: U)) {
17561 if (V == U->getOperand(i: 1)) {
17562 switch (Intrinsic->getIntrinsicID()) {
17563 default:
17564 Result = false;
17565 break;
17566 case Intrinsic::amdgcn_if_break:
17567 case Intrinsic::amdgcn_if:
17568 case Intrinsic::amdgcn_else:
17569 Result = true;
17570 break;
17571 }
17572 }
17573 if (V == U->getOperand(i: 0)) {
17574 switch (Intrinsic->getIntrinsicID()) {
17575 default:
17576 Result = false;
17577 break;
17578 case Intrinsic::amdgcn_end_cf:
17579 case Intrinsic::amdgcn_loop:
17580 Result = true;
17581 break;
17582 }
17583 }
17584 } else {
17585 Result = hasCFUser(V: U, Visited, WaveSize);
17586 }
17587 if (Result)
17588 break;
17589 }
17590 return Result;
17591}
17592
17593bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
17594 const Value *V) const {
17595 if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
17596 if (CI->isInlineAsm()) {
17597 // FIXME: This cannot give a correct answer. This should only trigger in
17598 // the case where inline asm returns mixed SGPR and VGPR results, used
17599 // outside the defining block. We don't have a specific result to
17600 // consider, so this assumes if any value is SGPR, the overall register
17601 // also needs to be SGPR.
17602 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
17603 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
17604 DL: MF.getDataLayout(), TRI: Subtarget->getRegisterInfo(), Call: *CI);
17605 for (auto &TC : TargetConstraints) {
17606 if (TC.Type == InlineAsm::isOutput) {
17607 ComputeConstraintToUse(OpInfo&: TC, Op: SDValue());
17608 const TargetRegisterClass *RC =
17609 getRegForInlineAsmConstraint(TRI_: SIRI, Constraint: TC.ConstraintCode,
17610 VT: TC.ConstraintVT)
17611 .second;
17612 if (RC && SIRI->isSGPRClass(RC))
17613 return true;
17614 }
17615 }
17616 }
17617 }
17618 SmallPtrSet<const Value *, 16> Visited;
17619 return hasCFUser(V, Visited, WaveSize: Subtarget->getWavefrontSize());
17620}
17621
17622bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
17623 for (SDUse &Use : N->uses()) {
17624 if (MemSDNode *M = dyn_cast<MemSDNode>(Val: Use.getUser())) {
17625 if (getBasePtrIndex(N: M) == Use.getOperandNo())
17626 return true;
17627 }
17628 }
17629 return false;
17630}
17631
17632bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
17633 SDValue N1) const {
17634 if (!N0.hasOneUse())
17635 return false;
17636  // Prefer to keep N0 uniform when possible.
17637 if (N0->isDivergent() || !N1->isDivergent())
17638 return true;
17639  // Check if we have a good chance of forming a memory access pattern from
17640  // this base and constant offset.
17641 return (DAG.isBaseWithConstantOffset(Op: N0) &&
17642 hasMemSDNodeUser(N: *N0->user_begin()));
17643}
17644
17645bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
17646 Register N0, Register N1) const {
17647 return MRI.hasOneNonDBGUse(RegNo: N0); // FIXME: handle regbanks
17648}
17649
17650MachineMemOperand::Flags
17651SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
17652 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
17653 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
17654 if (I.getMetadata(Kind: "amdgpu.noclobber"))
17655 Flags |= MONoClobber;
17656 if (I.getMetadata(Kind: "amdgpu.last.use"))
17657 Flags |= MOLastUse;
17658 return Flags;
17659}
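// Illustrative example: a load annotated as
//   %v = load i32, ptr addrspace(1) %p, align 4, !amdgpu.noclobber !0
// (with !0 = !{}) gets MONoClobber added to its MachineMemOperand flags, and
// !amdgpu.last.use similarly maps to MOLastUse; later code can consult these
// target-specific MMO flags when lowering the access.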
17660
17661bool SITargetLowering::checkForPhysRegDependency(
17662 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
17663 const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const {
17664 if (User->getOpcode() != ISD::CopyToReg)
17665 return false;
17666 if (!Def->isMachineOpcode())
17667 return false;
17668 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Val: Def);
17669 if (!MDef)
17670 return false;
17671
17672 unsigned ResNo = User->getOperand(Num: Op).getResNo();
17673 if (User->getOperand(Num: Op)->getValueType(ResNo) != MVT::i1)
17674 return false;
17675 const MCInstrDesc &II = TII->get(Opcode: MDef->getMachineOpcode());
17676 if (II.isCompare() && II.hasImplicitDefOfPhysReg(Reg: AMDGPU::SCC)) {
17677 PhysReg = AMDGPU::SCC;
17678 const TargetRegisterClass *RC =
17679 TRI->getMinimalPhysRegClass(Reg: PhysReg, VT: Def->getSimpleValueType(ResNo));
17680 Cost = RC->getCopyCost();
17681 return true;
17682 }
17683 return false;
17684}
17685
17686void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
17687 Instruction *AI) const {
17688 // Given: atomicrmw fadd ptr %addr, float %val ordering
17689 //
17690 // With this expansion we produce the following code:
17691 // [...]
17692 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
17693 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
17694 //
17695 // atomicrmw.shared:
17696 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
17697 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
17698 // float %val ordering
17699 // br label %atomicrmw.phi
17700 //
17701 // atomicrmw.check.private:
17702 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
17703 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
17704 //
17705 // atomicrmw.private:
17706 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
17707 // %loaded.private = load float, ptr addrspace(5) %cast.private
17708 // %val.new = fadd float %loaded.private, %val
17709 // store float %val.new, ptr addrspace(5) %cast.private
17710 // br label %atomicrmw.phi
17711 //
17712 // atomicrmw.global:
17713 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
17714 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
17715 // float %val ordering
17716 // br label %atomicrmw.phi
17717 //
17718 // atomicrmw.phi:
17719 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
17720 // [ %loaded.private, %atomicrmw.private ],
17721 // [ %loaded.global, %atomicrmw.global ]
17722 // br label %atomicrmw.end
17723 //
17724 // atomicrmw.end:
17725 // [...]
17726 //
17727 //
17728 // For 64-bit atomics which may reside in private memory, we perform a simpler
17729 // version that only inserts the private check, and uses the flat operation.
17730
17731 IRBuilder<> Builder(AI);
17732 LLVMContext &Ctx = Builder.getContext();
17733
17734 auto *RMW = dyn_cast<AtomicRMWInst>(Val: AI);
17735 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
17736 : AtomicCmpXchgInst::getPointerOperandIndex();
17737 Value *Addr = AI->getOperand(i: PtrOpIdx);
17738
17739 /// TODO: Only need to check private, then emit flat-known-not private (no
17740 /// need for shared block, or cast to global).
17741 AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(Val: AI);
17742
17743 Align Alignment;
17744 if (RMW)
17745 Alignment = RMW->getAlign();
17746 else if (CX)
17747 Alignment = CX->getAlign();
17748 else
17749 llvm_unreachable("unhandled atomic operation");
17750
17751 // FullFlatEmulation is true if we need to issue the private, shared, and
17752 // global cases.
17753 //
17754 // If this is false, we are only dealing with the flat-targeting-private case,
17755 // where we only insert a check for private and still use the flat instruction
17756 // for global and shared.
17757
17758 bool FullFlatEmulation =
17759 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
17760 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
17761 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
17762 RMW->getType()->isDoubleTy()));
17763
17764 // If the return value isn't used, do not introduce a false use in the phi.
17765 bool ReturnValueIsUsed = !AI->use_empty();
17766
17767 BasicBlock *BB = Builder.GetInsertBlock();
17768 Function *F = BB->getParent();
17769 BasicBlock *ExitBB =
17770 BB->splitBasicBlock(I: Builder.GetInsertPoint(), BBName: "atomicrmw.end");
17771 BasicBlock *SharedBB = nullptr;
17772
17773 BasicBlock *CheckPrivateBB = BB;
17774 if (FullFlatEmulation) {
17775 SharedBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.shared", Parent: F, InsertBefore: ExitBB);
17776 CheckPrivateBB =
17777 BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.check.private", Parent: F, InsertBefore: ExitBB);
17778 }
17779
17780 BasicBlock *PrivateBB =
17781 BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.private", Parent: F, InsertBefore: ExitBB);
17782 BasicBlock *GlobalBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.global", Parent: F, InsertBefore: ExitBB);
17783 BasicBlock *PhiBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.phi", Parent: F, InsertBefore: ExitBB);
17784
17785 std::prev(x: BB->end())->eraseFromParent();
17786 Builder.SetInsertPoint(BB);
17787
17788 Value *LoadedShared = nullptr;
17789 if (FullFlatEmulation) {
17790 CallInst *IsShared = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_is_shared,
17791 Args: {Addr}, FMFSource: nullptr, Name: "is.shared");
17792 Builder.CreateCondBr(Cond: IsShared, True: SharedBB, False: CheckPrivateBB);
17793 Builder.SetInsertPoint(SharedBB);
17794 Value *CastToLocal = Builder.CreateAddrSpaceCast(
17795 V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS));
17796
17797 Instruction *Clone = AI->clone();
17798 Clone->insertInto(ParentBB: SharedBB, It: SharedBB->end());
17799 Clone->getOperandUse(i: PtrOpIdx).set(CastToLocal);
17800 LoadedShared = Clone;
17801
17802 Builder.CreateBr(Dest: PhiBB);
17803 Builder.SetInsertPoint(CheckPrivateBB);
17804 }
17805
17806 CallInst *IsPrivate = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_is_private,
17807 Args: {Addr}, FMFSource: nullptr, Name: "is.private");
17808 Builder.CreateCondBr(Cond: IsPrivate, True: PrivateBB, False: GlobalBB);
17809
17810 Builder.SetInsertPoint(PrivateBB);
17811
17812 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
17813 V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::PRIVATE_ADDRESS));
17814
17815 Value *LoadedPrivate;
17816 if (RMW) {
17817 LoadedPrivate = Builder.CreateAlignedLoad(
17818 Ty: RMW->getType(), Ptr: CastToPrivate, Align: RMW->getAlign(), Name: "loaded.private");
17819
17820 Value *NewVal = buildAtomicRMWValue(Op: RMW->getOperation(), Builder,
17821 Loaded: LoadedPrivate, Val: RMW->getValOperand());
17822
17823 Builder.CreateAlignedStore(Val: NewVal, Ptr: CastToPrivate, Align: RMW->getAlign());
17824 } else {
17825 auto [ResultLoad, Equal] =
17826 buildCmpXchgValue(Builder, Ptr: CastToPrivate, Cmp: CX->getCompareOperand(),
17827 Val: CX->getNewValOperand(), Alignment: CX->getAlign());
17828
17829 Value *Insert = Builder.CreateInsertValue(Agg: PoisonValue::get(T: CX->getType()),
17830 Val: ResultLoad, Idxs: 0);
17831 LoadedPrivate = Builder.CreateInsertValue(Agg: Insert, Val: Equal, Idxs: 1);
17832 }
17833
17834 Builder.CreateBr(Dest: PhiBB);
17835
17836 Builder.SetInsertPoint(GlobalBB);
17837
17838 // Continue using a flat instruction if we only emitted the check for private.
17839 Instruction *LoadedGlobal = AI;
17840 if (FullFlatEmulation) {
17841 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
17842 V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
17843 AI->getOperandUse(i: PtrOpIdx).set(CastToGlobal);
17844 }
17845
17846 AI->removeFromParent();
17847 AI->insertInto(ParentBB: GlobalBB, It: GlobalBB->end());
17848
17849 // The new atomicrmw may go through another round of legalization later.
17850 if (!FullFlatEmulation) {
17851    // We already inserted the runtime check; make sure we do not try to
17852    // re-expand this.
17853 // TODO: Should union with any existing metadata.
17854 MDBuilder MDB(F->getContext());
17855 MDNode *RangeNotPrivate =
17856 MDB.createRange(Lo: APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
17857 Hi: APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
17858 LoadedGlobal->setMetadata(KindID: LLVMContext::MD_noalias_addrspace,
17859 Node: RangeNotPrivate);
17860 }
17861
17862 Builder.CreateBr(Dest: PhiBB);
17863
17864 Builder.SetInsertPoint(PhiBB);
17865
17866 if (ReturnValueIsUsed) {
17867 PHINode *Loaded = Builder.CreatePHI(Ty: AI->getType(), NumReservedValues: 3);
17868 AI->replaceAllUsesWith(V: Loaded);
17869 if (FullFlatEmulation)
17870 Loaded->addIncoming(V: LoadedShared, BB: SharedBB);
17871 Loaded->addIncoming(V: LoadedPrivate, BB: PrivateBB);
17872 Loaded->addIncoming(V: LoadedGlobal, BB: GlobalBB);
17873 Loaded->takeName(V: AI);
17874 }
17875
17876 Builder.CreateBr(Dest: ExitBB);
17877}
17878
17879void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
17880 AtomicRMWInst::BinOp Op = AI->getOperation();
17881
17882 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
17883 Op == AtomicRMWInst::Xor) {
17884 if (const auto *ConstVal = dyn_cast<Constant>(Val: AI->getValOperand());
17885 ConstVal && ConstVal->isNullValue()) {
17886 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
17887 AI->setOperation(AtomicRMWInst::Add);
17888
17889 // We may still need the private-alias-flat handling below.
17890
17891 // TODO: Skip this for cases where we cannot access remote memory.
17892 }
17893 }
17894
17895 // The non-flat expansions should only perform the de-canonicalization of
17896 // identity values.
17897 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
17898 return;
17899
17900 emitExpandAtomicAddrSpacePredicate(AI);
17901}
17902
17903void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
17904 emitExpandAtomicAddrSpacePredicate(AI: CI);
17905}
17906
17907LoadInst *
17908SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
17909 IRBuilder<> Builder(AI);
17910 auto Order = AI->getOrdering();
17911
17912  // The optimization removes the store aspect of the atomicrmw. Therefore, the
17913  // cache must be flushed if the atomic ordering has release semantics. This
17914  // does not strictly require a fence; a release fence just happens to perform
17915  // that flush. Avoid replacing an atomicrmw that has release semantics.
17916 if (isReleaseOrStronger(AO: Order))
17917 return nullptr;
17918
17919 LoadInst *LI = Builder.CreateAlignedLoad(
17920 Ty: AI->getType(), Ptr: AI->getPointerOperand(), Align: AI->getAlign());
17921 LI->setAtomic(Ordering: Order, SSID: AI->getSyncScopeID());
17922 LI->copyMetadata(SrcInst: *AI);
17923 LI->takeName(V: AI);
17924 AI->replaceAllUsesWith(V: LI);
17925 AI->eraseFromParent();
17926 return LI;
17927}
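// Illustrative example: an idempotent RMW such as
//   %old = atomicrmw or ptr addrspace(1) %p, i32 0 syncscope("agent") acquire
// is rewritten by the function above into
//   %old = load atomic i32, ptr addrspace(1) %p syncscope("agent") acquire,
//          align 4
// whereas a release or acq_rel ordering returns nullptr and the atomicrmw is
// kept.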
17928