1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
19#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20#include "SIMachineFunctionInfo.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
23#include "llvm/ADT/FloatingPointMode.h"
24#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/OptimizationRemarkEmitter.h"
26#include "llvm/Analysis/UniformityAnalysis.h"
27#include "llvm/BinaryFormat/ELF.h"
28#include "llvm/CodeGen/Analysis.h"
29#include "llvm/CodeGen/ByteProvider.h"
30#include "llvm/CodeGen/FunctionLoweringInfo.h"
31#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
32#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
33#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
34#include "llvm/CodeGen/MachineFrameInfo.h"
35#include "llvm/CodeGen/MachineFunction.h"
36#include "llvm/CodeGen/MachineLoopInfo.h"
37#include "llvm/IR/DiagnosticInfo.h"
38#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicInst.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/Support/CommandLine.h"
43#include "llvm/Support/KnownBits.h"
44#include "llvm/Support/ModRef.h"
45#include <optional>
46
47using namespace llvm;
48
49#define DEBUG_TYPE "si-lower"
50
51STATISTIC(NumTailCalls, "Number of tail calls");
52
53static cl::opt<bool> DisableLoopAlignment(
54 "amdgpu-disable-loop-alignment",
55 cl::desc("Do not align and prefetch loops"),
56 cl::init(Val: false));
57
58static cl::opt<bool> UseDivergentRegisterIndexing(
59 "amdgpu-use-divergent-register-indexing",
60 cl::Hidden,
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(Val: false));
63
64static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
65 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
69static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
70 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(Reg: AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
83
84SITargetLowering::SITargetLowering(const TargetMachine &TM,
85 const GCNSubtarget &STI)
86 : AMDGPUTargetLowering(TM, STI),
87 Subtarget(&STI) {
88 addRegisterClass(VT: MVT::i1, RC: &AMDGPU::VReg_1RegClass);
89 addRegisterClass(VT: MVT::i64, RC: &AMDGPU::SReg_64RegClass);
90
91 addRegisterClass(VT: MVT::i32, RC: &AMDGPU::SReg_32RegClass);
92 addRegisterClass(VT: MVT::f32, RC: &AMDGPU::VGPR_32RegClass);
93
94 addRegisterClass(VT: MVT::v2i32, RC: &AMDGPU::SReg_64RegClass);
95
96 const SIRegisterInfo *TRI = STI.getRegisterInfo();
97 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
98
99 addRegisterClass(VT: MVT::f64, RC: V64RegClass);
100 addRegisterClass(VT: MVT::v2f32, RC: V64RegClass);
101 addRegisterClass(VT: MVT::Untyped, RC: V64RegClass);
102
103 addRegisterClass(VT: MVT::v3i32, RC: &AMDGPU::SGPR_96RegClass);
104 addRegisterClass(VT: MVT::v3f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 96));
105
106 addRegisterClass(VT: MVT::v2i64, RC: &AMDGPU::SGPR_128RegClass);
107 addRegisterClass(VT: MVT::v2f64, RC: &AMDGPU::SGPR_128RegClass);
108
109 addRegisterClass(VT: MVT::v4i32, RC: &AMDGPU::SGPR_128RegClass);
110 addRegisterClass(VT: MVT::v4f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 128));
111
112 addRegisterClass(VT: MVT::v5i32, RC: &AMDGPU::SGPR_160RegClass);
113 addRegisterClass(VT: MVT::v5f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 160));
114
115 addRegisterClass(VT: MVT::v6i32, RC: &AMDGPU::SGPR_192RegClass);
116 addRegisterClass(VT: MVT::v6f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 192));
117
118 addRegisterClass(VT: MVT::v3i64, RC: &AMDGPU::SGPR_192RegClass);
119 addRegisterClass(VT: MVT::v3f64, RC: TRI->getVGPRClassForBitWidth(BitWidth: 192));
120
121 addRegisterClass(VT: MVT::v7i32, RC: &AMDGPU::SGPR_224RegClass);
122 addRegisterClass(VT: MVT::v7f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 224));
123
124 addRegisterClass(VT: MVT::v8i32, RC: &AMDGPU::SGPR_256RegClass);
125 addRegisterClass(VT: MVT::v8f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 256));
126
127 addRegisterClass(VT: MVT::v4i64, RC: &AMDGPU::SGPR_256RegClass);
128 addRegisterClass(VT: MVT::v4f64, RC: TRI->getVGPRClassForBitWidth(BitWidth: 256));
129
130 addRegisterClass(VT: MVT::v9i32, RC: &AMDGPU::SGPR_288RegClass);
131 addRegisterClass(VT: MVT::v9f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 288));
132
133 addRegisterClass(VT: MVT::v10i32, RC: &AMDGPU::SGPR_320RegClass);
134 addRegisterClass(VT: MVT::v10f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 320));
135
136 addRegisterClass(VT: MVT::v11i32, RC: &AMDGPU::SGPR_352RegClass);
137 addRegisterClass(VT: MVT::v11f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 352));
138
139 addRegisterClass(VT: MVT::v12i32, RC: &AMDGPU::SGPR_384RegClass);
140 addRegisterClass(VT: MVT::v12f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 384));
141
142 addRegisterClass(VT: MVT::v16i32, RC: &AMDGPU::SGPR_512RegClass);
143 addRegisterClass(VT: MVT::v16f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 512));
144
145 addRegisterClass(VT: MVT::v8i64, RC: &AMDGPU::SGPR_512RegClass);
146 addRegisterClass(VT: MVT::v8f64, RC: TRI->getVGPRClassForBitWidth(BitWidth: 512));
147
148 addRegisterClass(VT: MVT::v16i64, RC: &AMDGPU::SGPR_1024RegClass);
149 addRegisterClass(VT: MVT::v16f64, RC: TRI->getVGPRClassForBitWidth(BitWidth: 1024));
150
151 if (Subtarget->has16BitInsts()) {
152 if (Subtarget->useRealTrue16Insts()) {
153 addRegisterClass(VT: MVT::i16, RC: &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(VT: MVT::f16, RC: &AMDGPU::VGPR_16RegClass);
155 addRegisterClass(VT: MVT::bf16, RC: &AMDGPU::VGPR_16RegClass);
156 } else {
157 addRegisterClass(VT: MVT::i16, RC: &AMDGPU::SReg_32RegClass);
158 addRegisterClass(VT: MVT::f16, RC: &AMDGPU::SReg_32RegClass);
159 addRegisterClass(VT: MVT::bf16, RC: &AMDGPU::SReg_32RegClass);
160 }
161
162 // Unless there are also VOP3P operations, not operations are really legal.
163 addRegisterClass(VT: MVT::v2i16, RC: &AMDGPU::SReg_32RegClass);
164 addRegisterClass(VT: MVT::v2f16, RC: &AMDGPU::SReg_32RegClass);
165 addRegisterClass(VT: MVT::v2bf16, RC: &AMDGPU::SReg_32RegClass);
166 addRegisterClass(VT: MVT::v4i16, RC: &AMDGPU::SReg_64RegClass);
167 addRegisterClass(VT: MVT::v4f16, RC: &AMDGPU::SReg_64RegClass);
168 addRegisterClass(VT: MVT::v4bf16, RC: &AMDGPU::SReg_64RegClass);
169 addRegisterClass(VT: MVT::v8i16, RC: &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(VT: MVT::v8f16, RC: &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(VT: MVT::v8bf16, RC: &AMDGPU::SGPR_128RegClass);
172 addRegisterClass(VT: MVT::v16i16, RC: &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(VT: MVT::v16f16, RC: &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(VT: MVT::v16bf16, RC: &AMDGPU::SGPR_256RegClass);
175 addRegisterClass(VT: MVT::v32i16, RC: &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(VT: MVT::v32f16, RC: &AMDGPU::SGPR_512RegClass);
177 addRegisterClass(VT: MVT::v32bf16, RC: &AMDGPU::SGPR_512RegClass);
178 }
179
180 addRegisterClass(VT: MVT::v32i32, RC: &AMDGPU::VReg_1024RegClass);
181 addRegisterClass(VT: MVT::v32f32, RC: TRI->getVGPRClassForBitWidth(BitWidth: 1024));
182
183 computeRegisterProperties(TRI: Subtarget->getRegisterInfo());
184
185 // The boolean content concept here is too inflexible. Compares only ever
186 // really produce a 1-bit result. Any copy/extend from these will turn into a
187 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
188 // it's what most targets use.
189 setBooleanContents(ZeroOrOneBooleanContent);
190 setBooleanVectorContents(ZeroOrOneBooleanContent);
191
192 // We need to custom lower vector stores from local memory
193 setOperationAction(Ops: ISD::LOAD,
194 VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
195 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
196 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
197 MVT::i1, MVT::v32i32},
198 Action: Custom);
199
200 setOperationAction(Ops: ISD::STORE,
201 VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
202 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
203 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
204 MVT::i1, MVT::v32i32},
205 Action: Custom);
206
207 if (isTypeLegal(VT: MVT::bf16)) {
208 for (unsigned Opc :
209 {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
210 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
211 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
212 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
213 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
214 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
215 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
216 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
217 ISD::SETCC}) {
218 // FIXME: The promoted to type shouldn't need to be explicit
219 setOperationAction(Op: Opc, VT: MVT::bf16, Action: Promote);
220 AddPromotedToType(Opc, OrigVT: MVT::bf16, DestVT: MVT::f32);
221 }
222
223 setOperationAction(Op: ISD::FP_ROUND, VT: MVT::bf16, Action: Expand);
224
225 setOperationAction(Op: ISD::SELECT, VT: MVT::bf16, Action: Promote);
226 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::bf16, DestVT: MVT::i16);
227
228 setOperationAction(Op: ISD::FABS, VT: MVT::bf16, Action: Legal);
229 setOperationAction(Op: ISD::FNEG, VT: MVT::bf16, Action: Legal);
230 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::bf16, Action: Legal);
231
232 // We only need to custom lower because we can't specify an action for bf16
233 // sources.
234 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i32, Action: Custom);
235 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i32, Action: Custom);
236 }
237
238 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i16, Action: Expand);
239 setTruncStoreAction(ValVT: MVT::v3i32, MemVT: MVT::v3i16, Action: Expand);
240 setTruncStoreAction(ValVT: MVT::v4i32, MemVT: MVT::v4i16, Action: Expand);
241 setTruncStoreAction(ValVT: MVT::v8i32, MemVT: MVT::v8i16, Action: Expand);
242 setTruncStoreAction(ValVT: MVT::v16i32, MemVT: MVT::v16i16, Action: Expand);
243 setTruncStoreAction(ValVT: MVT::v32i32, MemVT: MVT::v32i16, Action: Expand);
244 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i8, Action: Expand);
245 setTruncStoreAction(ValVT: MVT::v4i32, MemVT: MVT::v4i8, Action: Expand);
246 setTruncStoreAction(ValVT: MVT::v8i32, MemVT: MVT::v8i8, Action: Expand);
247 setTruncStoreAction(ValVT: MVT::v16i32, MemVT: MVT::v16i8, Action: Expand);
248 setTruncStoreAction(ValVT: MVT::v32i32, MemVT: MVT::v32i8, Action: Expand);
249 setTruncStoreAction(ValVT: MVT::v2i16, MemVT: MVT::v2i8, Action: Expand);
250 setTruncStoreAction(ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Expand);
251 setTruncStoreAction(ValVT: MVT::v8i16, MemVT: MVT::v8i8, Action: Expand);
252 setTruncStoreAction(ValVT: MVT::v16i16, MemVT: MVT::v16i8, Action: Expand);
253 setTruncStoreAction(ValVT: MVT::v32i16, MemVT: MVT::v32i8, Action: Expand);
254
255 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i16, Action: Expand);
256 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i32, Action: Expand);
257 setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i8, Action: Expand);
258 setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i8, Action: Expand);
259 setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i16, Action: Expand);
260 setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i32, Action: Expand);
261 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i32, Action: Expand);
262
263 setOperationAction(Ops: ISD::GlobalAddress, VTs: {MVT::i32, MVT::i64}, Action: Custom);
264
265 setOperationAction(Op: ISD::SELECT, VT: MVT::i1, Action: Promote);
266 setOperationAction(Op: ISD::SELECT, VT: MVT::i64, Action: Custom);
267 setOperationAction(Op: ISD::SELECT, VT: MVT::f64, Action: Promote);
268 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::f64, DestVT: MVT::i64);
269
270 setOperationAction(Ops: ISD::FSQRT, VTs: {MVT::f32, MVT::f64}, Action: Custom);
271
272 setOperationAction(Ops: ISD::SELECT_CC,
273 VTs: {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Action: Expand);
274
275 setOperationAction(Op: ISD::SETCC, VT: MVT::i1, Action: Promote);
276 setOperationAction(Ops: ISD::SETCC, VTs: {MVT::v2i1, MVT::v4i1}, Action: Expand);
277 AddPromotedToType(Opc: ISD::SETCC, OrigVT: MVT::i1, DestVT: MVT::i32);
278
279 setOperationAction(Ops: ISD::TRUNCATE,
280 VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
281 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
282 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
283 Action: Expand);
284 setOperationAction(Ops: ISD::FP_ROUND,
285 VTs: {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
286 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
287 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
288 Action: Expand);
289
290 setOperationAction(Ops: ISD::SIGN_EXTEND_INREG,
291 VTs: {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
292 MVT::v3i16, MVT::v4i16, MVT::Other},
293 Action: Custom);
294
295 setOperationAction(Op: ISD::BRCOND, VT: MVT::Other, Action: Custom);
296 setOperationAction(Ops: ISD::BR_CC,
297 VTs: {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Action: Expand);
298
299 setOperationAction(Ops: {ISD::UADDO, ISD::USUBO}, VT: MVT::i32, Action: Legal);
300
301 setOperationAction(Ops: {ISD::UADDO_CARRY, ISD::USUBO_CARRY}, VT: MVT::i32, Action: Legal);
302
303 setOperationAction(Ops: {ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, VT: MVT::i64,
304 Action: Expand);
305
306#if 0
307 setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal);
308#endif
309
310 // We only support LOAD/STORE and vector manipulation ops for vectors
311 // with > 4 elements.
312 for (MVT VT :
313 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
314 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
315 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
316 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
317 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
318 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
319 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
320 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
321 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
322 switch (Op) {
323 case ISD::LOAD:
324 case ISD::STORE:
325 case ISD::BUILD_VECTOR:
326 case ISD::BITCAST:
327 case ISD::UNDEF:
328 case ISD::EXTRACT_VECTOR_ELT:
329 case ISD::INSERT_VECTOR_ELT:
330 case ISD::SCALAR_TO_VECTOR:
331 case ISD::IS_FPCLASS:
332 break;
333 case ISD::EXTRACT_SUBVECTOR:
334 case ISD::INSERT_SUBVECTOR:
335 case ISD::CONCAT_VECTORS:
336 setOperationAction(Op, VT, Action: Custom);
337 break;
338 default:
339 setOperationAction(Op, VT, Action: Expand);
340 break;
341 }
342 }
343 }
344
345 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::v4f32, Action: Expand);
346
347 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
348 // is expanded to avoid having two separate loops in case the index is a VGPR.
349
350 // Most operations are naturally 32-bit vector operations. We only support
351 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
352 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
353 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
354 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v4i32);
355
356 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
357 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v4i32);
358
359 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
360 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v4i32);
361
362 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
363 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v4i32);
364 }
365
366 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
367 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
368 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v6i32);
369
370 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
371 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v6i32);
372
373 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
374 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v6i32);
375
376 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
377 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v6i32);
378 }
379
380 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
381 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
382 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v8i32);
383
384 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
385 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v8i32);
386
387 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
388 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v8i32);
389
390 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
391 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v8i32);
392 }
393
394 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
395 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
396 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v16i32);
397
398 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
399 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v16i32);
400
401 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
402 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v16i32);
403
404 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
405 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v16i32);
406 }
407
408 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
409 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
410 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v32i32);
411
412 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
413 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v32i32);
414
415 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
416 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v32i32);
417
418 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
419 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v32i32);
420 }
421
422 setOperationAction(Ops: ISD::VECTOR_SHUFFLE,
423 VTs: {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
424 Action: Expand);
425
426 setOperationAction(Ops: ISD::BUILD_VECTOR, VTs: {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
427 Action: Custom);
428
429 // Avoid stack access for these.
430 // TODO: Generalize to more vector types.
431 setOperationAction(Ops: {ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
432 VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
433 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
434 Action: Custom);
435
436 // Deal with vec3 vector operations when widened to vec4.
437 setOperationAction(Ops: ISD::INSERT_SUBVECTOR,
438 VTs: {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Action: Custom);
439
440 // Deal with vec5/6/7 vector operations when widened to vec8.
441 setOperationAction(Ops: ISD::INSERT_SUBVECTOR,
442 VTs: {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
443 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
444 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
445 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
446 Action: Custom);
447
448 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
449 // and output demarshalling
450 setOperationAction(Ops: ISD::ATOMIC_CMP_SWAP, VTs: {MVT::i32, MVT::i64}, Action: Custom);
451
452 // We can't return success/failure, only the old value,
453 // let LLVM add the comparison
454 setOperationAction(Ops: ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VTs: {MVT::i32, MVT::i64},
455 Action: Expand);
456
457 setOperationAction(Ops: ISD::ADDRSPACECAST, VTs: {MVT::i32, MVT::i64}, Action: Custom);
458
459 setOperationAction(Ops: ISD::BITREVERSE, VTs: {MVT::i32, MVT::i64}, Action: Legal);
460
461 // FIXME: This should be narrowed to i32, but that only happens if i64 is
462 // illegal.
463 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
464 setOperationAction(Ops: ISD::BSWAP, VTs: {MVT::i64, MVT::i32}, Action: Legal);
465
466 // On SI this is s_memtime and s_memrealtime on VI.
467 setOperationAction(Op: ISD::READCYCLECOUNTER, VT: MVT::i64, Action: Legal);
468
469 if (Subtarget->hasSMemRealTime() ||
470 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
471 setOperationAction(Op: ISD::READSTEADYCOUNTER, VT: MVT::i64, Action: Legal);
472 setOperationAction(Ops: {ISD::TRAP, ISD::DEBUGTRAP}, VT: MVT::Other, Action: Custom);
473
474 if (Subtarget->has16BitInsts()) {
475 setOperationAction(Ops: {ISD::FPOW, ISD::FPOWI}, VT: MVT::f16, Action: Promote);
476 setOperationAction(Ops: {ISD::FLOG, ISD::FEXP, ISD::FLOG10}, VT: MVT::f16, Action: Custom);
477 } else {
478 setOperationAction(Op: ISD::FSQRT, VT: MVT::f16, Action: Custom);
479 }
480
481 if (Subtarget->hasMadMacF32Insts())
482 setOperationAction(Op: ISD::FMAD, VT: MVT::f32, Action: Legal);
483
484 if (!Subtarget->hasBFI())
485 // fcopysign can be done in a single instruction with BFI.
486 setOperationAction(Ops: ISD::FCOPYSIGN, VTs: {MVT::f32, MVT::f64}, Action: Expand);
487
488 if (!Subtarget->hasBCNT(Size: 32))
489 setOperationAction(Op: ISD::CTPOP, VT: MVT::i32, Action: Expand);
490
491 if (!Subtarget->hasBCNT(Size: 64))
492 setOperationAction(Op: ISD::CTPOP, VT: MVT::i64, Action: Expand);
493
494 if (Subtarget->hasFFBH())
495 setOperationAction(Ops: {ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT: MVT::i32, Action: Custom);
496
497 if (Subtarget->hasFFBL())
498 setOperationAction(Ops: {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, VT: MVT::i32, Action: Custom);
499
500 // We only really have 32-bit BFE instructions (and 16-bit on VI).
501 //
502 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
503 // effort to match them now. We want this to be false for i64 cases when the
504 // extraction isn't restricted to the upper or lower half. Ideally we would
505 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
506 // span the midpoint are probably relatively rare, so don't worry about them
507 // for now.
508 if (Subtarget->hasBFE())
509 setHasExtractBitsInsn(true);
510
511 // Clamp modifier on add/sub
512 if (Subtarget->hasIntClamp())
513 setOperationAction(Ops: {ISD::UADDSAT, ISD::USUBSAT}, VT: MVT::i32, Action: Legal);
514
515 if (Subtarget->hasAddNoCarry())
516 setOperationAction(Ops: {ISD::SADDSAT, ISD::SSUBSAT}, VTs: {MVT::i16, MVT::i32},
517 Action: Legal);
518
519 setOperationAction(Ops: {ISD::FMINNUM, ISD::FMAXNUM}, VTs: {MVT::f32, MVT::f64},
520 Action: Custom);
521
522 // These are really only legal for ieee_mode functions. We should be avoiding
523 // them for functions that don't have ieee_mode enabled, so just say they are
524 // legal.
525 setOperationAction(Ops: {ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
526 VTs: {MVT::f32, MVT::f64}, Action: Legal);
527
528 if (Subtarget->haveRoundOpsF64())
529 setOperationAction(Ops: {ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, VT: MVT::f64,
530 Action: Legal);
531 else
532 setOperationAction(Ops: {ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
533 VT: MVT::f64, Action: Custom);
534
535 setOperationAction(Op: ISD::FFLOOR, VT: MVT::f64, Action: Legal);
536 setOperationAction(Ops: {ISD::FLDEXP, ISD::STRICT_FLDEXP}, VTs: {MVT::f32, MVT::f64},
537 Action: Legal);
538 setOperationAction(Ops: ISD::FFREXP, VTs: {MVT::f32, MVT::f64}, Action: Custom);
539
540 setOperationAction(Ops: {ISD::FSIN, ISD::FCOS, ISD::FDIV}, VT: MVT::f32, Action: Custom);
541 setOperationAction(Op: ISD::FDIV, VT: MVT::f64, Action: Custom);
542
543 setOperationAction(Ops: ISD::BF16_TO_FP, VTs: {MVT::i16, MVT::f32, MVT::f64}, Action: Expand);
544 setOperationAction(Ops: ISD::FP_TO_BF16, VTs: {MVT::i16, MVT::f32, MVT::f64}, Action: Expand);
545
546 // Custom lower these because we can't specify a rule based on an illegal
547 // source bf16.
548 setOperationAction(Ops: {ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, VT: MVT::f32, Action: Custom);
549 setOperationAction(Ops: {ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, VT: MVT::f64, Action: Custom);
550
551 if (Subtarget->has16BitInsts()) {
552 setOperationAction(Ops: {ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
553 ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
554 VT: MVT::i16, Action: Legal);
555
556 AddPromotedToType(Opc: ISD::SIGN_EXTEND, OrigVT: MVT::i16, DestVT: MVT::i32);
557
558 setOperationAction(Ops: {ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC},
559 VT: MVT::i16, Action: Expand);
560
561 setOperationAction(Ops: {ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
562 ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
563 ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
564 ISD::CTPOP},
565 VT: MVT::i16, Action: Promote);
566
567 setOperationAction(Op: ISD::LOAD, VT: MVT::i16, Action: Custom);
568
569 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i16, Action: Expand);
570
571 setOperationAction(Op: ISD::FP16_TO_FP, VT: MVT::i16, Action: Promote);
572 AddPromotedToType(Opc: ISD::FP16_TO_FP, OrigVT: MVT::i16, DestVT: MVT::i32);
573 setOperationAction(Op: ISD::FP_TO_FP16, VT: MVT::i16, Action: Promote);
574 AddPromotedToType(Opc: ISD::FP_TO_FP16, OrigVT: MVT::i16, DestVT: MVT::i32);
575
576 setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT: MVT::i16, Action: Custom);
577 setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i16, Action: Custom);
578 setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i16, Action: Custom);
579
580 setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i32, Action: Custom);
581
582 // F16 - Constant Actions.
583 setOperationAction(Op: ISD::ConstantFP, VT: MVT::f16, Action: Legal);
584 setOperationAction(Op: ISD::ConstantFP, VT: MVT::bf16, Action: Legal);
585
586 // F16 - Load/Store Actions.
587 setOperationAction(Op: ISD::LOAD, VT: MVT::f16, Action: Promote);
588 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f16, DestVT: MVT::i16);
589 setOperationAction(Op: ISD::STORE, VT: MVT::f16, Action: Promote);
590 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f16, DestVT: MVT::i16);
591
592 // BF16 - Load/Store Actions.
593 setOperationAction(Op: ISD::LOAD, VT: MVT::bf16, Action: Promote);
594 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::bf16, DestVT: MVT::i16);
595 setOperationAction(Op: ISD::STORE, VT: MVT::bf16, Action: Promote);
596 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::bf16, DestVT: MVT::i16);
597
598 // F16 - VOP1 Actions.
599 setOperationAction(Ops: {ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
600 ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND},
601 VT: MVT::f16, Action: Custom);
602
603 setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT: MVT::f16, Action: Promote);
604 setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT: MVT::bf16, Action: Promote);
605
606 // F16 - VOP2 Actions.
607 setOperationAction(Ops: {ISD::BR_CC, ISD::SELECT_CC}, VTs: {MVT::f16, MVT::bf16},
608 Action: Expand);
609 setOperationAction(Ops: {ISD::FLDEXP, ISD::STRICT_FLDEXP}, VT: MVT::f16, Action: Custom);
610 setOperationAction(Op: ISD::FFREXP, VT: MVT::f16, Action: Custom);
611 setOperationAction(Op: ISD::FDIV, VT: MVT::f16, Action: Custom);
612
613 // F16 - VOP3 Actions.
614 setOperationAction(Op: ISD::FMA, VT: MVT::f16, Action: Legal);
615 if (STI.hasMadF16())
616 setOperationAction(Op: ISD::FMAD, VT: MVT::f16, Action: Legal);
617
618 for (MVT VT :
619 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
620 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
621 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
622 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
623 switch (Op) {
624 case ISD::LOAD:
625 case ISD::STORE:
626 case ISD::BUILD_VECTOR:
627 case ISD::BITCAST:
628 case ISD::UNDEF:
629 case ISD::EXTRACT_VECTOR_ELT:
630 case ISD::INSERT_VECTOR_ELT:
631 case ISD::INSERT_SUBVECTOR:
632 case ISD::EXTRACT_SUBVECTOR:
633 case ISD::SCALAR_TO_VECTOR:
634 case ISD::IS_FPCLASS:
635 break;
636 case ISD::CONCAT_VECTORS:
637 setOperationAction(Op, VT, Action: Custom);
638 break;
639 default:
640 setOperationAction(Op, VT, Action: Expand);
641 break;
642 }
643 }
644 }
645
646 // v_perm_b32 can handle either of these.
647 setOperationAction(Ops: ISD::BSWAP, VTs: {MVT::i16, MVT::v2i16}, Action: Legal);
648 setOperationAction(Op: ISD::BSWAP, VT: MVT::v4i16, Action: Custom);
649
650 // XXX - Do these do anything? Vector constants turn into build_vector.
651 setOperationAction(Ops: ISD::Constant, VTs: {MVT::v2i16, MVT::v2f16}, Action: Legal);
652
653 setOperationAction(Ops: ISD::UNDEF, VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
654 Action: Legal);
655
656 setOperationAction(Op: ISD::STORE, VT: MVT::v2i16, Action: Promote);
657 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2i16, DestVT: MVT::i32);
658 setOperationAction(Op: ISD::STORE, VT: MVT::v2f16, Action: Promote);
659 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f16, DestVT: MVT::i32);
660
661 setOperationAction(Op: ISD::LOAD, VT: MVT::v2i16, Action: Promote);
662 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2i16, DestVT: MVT::i32);
663 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f16, Action: Promote);
664 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f16, DestVT: MVT::i32);
665
666 setOperationAction(Op: ISD::AND, VT: MVT::v2i16, Action: Promote);
667 AddPromotedToType(Opc: ISD::AND, OrigVT: MVT::v2i16, DestVT: MVT::i32);
668 setOperationAction(Op: ISD::OR, VT: MVT::v2i16, Action: Promote);
669 AddPromotedToType(Opc: ISD::OR, OrigVT: MVT::v2i16, DestVT: MVT::i32);
670 setOperationAction(Op: ISD::XOR, VT: MVT::v2i16, Action: Promote);
671 AddPromotedToType(Opc: ISD::XOR, OrigVT: MVT::v2i16, DestVT: MVT::i32);
672
673 setOperationAction(Op: ISD::LOAD, VT: MVT::v4i16, Action: Promote);
674 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
675 setOperationAction(Op: ISD::LOAD, VT: MVT::v4f16, Action: Promote);
676 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
677 setOperationAction(Op: ISD::LOAD, VT: MVT::v4bf16, Action: Promote);
678 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4bf16, DestVT: MVT::v2i32);
679
680 setOperationAction(Op: ISD::STORE, VT: MVT::v4i16, Action: Promote);
681 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
682 setOperationAction(Op: ISD::STORE, VT: MVT::v4f16, Action: Promote);
683 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
684 setOperationAction(Op: ISD::STORE, VT: MVT::v4bf16, Action: Promote);
685 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4bf16, DestVT: MVT::v2i32);
686
687 setOperationAction(Op: ISD::LOAD, VT: MVT::v8i16, Action: Promote);
688 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8i16, DestVT: MVT::v4i32);
689 setOperationAction(Op: ISD::LOAD, VT: MVT::v8f16, Action: Promote);
690 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f16, DestVT: MVT::v4i32);
691 setOperationAction(Op: ISD::LOAD, VT: MVT::v8bf16, Action: Promote);
692 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8bf16, DestVT: MVT::v4i32);
693
694 setOperationAction(Op: ISD::STORE, VT: MVT::v4i16, Action: Promote);
695 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
696 setOperationAction(Op: ISD::STORE, VT: MVT::v4f16, Action: Promote);
697 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
698
699 setOperationAction(Op: ISD::STORE, VT: MVT::v8i16, Action: Promote);
700 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8i16, DestVT: MVT::v4i32);
701 setOperationAction(Op: ISD::STORE, VT: MVT::v8f16, Action: Promote);
702 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f16, DestVT: MVT::v4i32);
703 setOperationAction(Op: ISD::STORE, VT: MVT::v8bf16, Action: Promote);
704 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8bf16, DestVT: MVT::v4i32);
705
706 setOperationAction(Op: ISD::LOAD, VT: MVT::v16i16, Action: Promote);
707 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16i16, DestVT: MVT::v8i32);
708 setOperationAction(Op: ISD::LOAD, VT: MVT::v16f16, Action: Promote);
709 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f16, DestVT: MVT::v8i32);
710 setOperationAction(Op: ISD::LOAD, VT: MVT::v16bf16, Action: Promote);
711 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16bf16, DestVT: MVT::v8i32);
712
713 setOperationAction(Op: ISD::STORE, VT: MVT::v16i16, Action: Promote);
714 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16i16, DestVT: MVT::v8i32);
715 setOperationAction(Op: ISD::STORE, VT: MVT::v16f16, Action: Promote);
716 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f16, DestVT: MVT::v8i32);
717 setOperationAction(Op: ISD::STORE, VT: MVT::v16bf16, Action: Promote);
718 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16bf16, DestVT: MVT::v8i32);
719
720 setOperationAction(Op: ISD::LOAD, VT: MVT::v32i16, Action: Promote);
721 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32i16, DestVT: MVT::v16i32);
722 setOperationAction(Op: ISD::LOAD, VT: MVT::v32f16, Action: Promote);
723 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32f16, DestVT: MVT::v16i32);
724 setOperationAction(Op: ISD::LOAD, VT: MVT::v32bf16, Action: Promote);
725 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32bf16, DestVT: MVT::v16i32);
726
727 setOperationAction(Op: ISD::STORE, VT: MVT::v32i16, Action: Promote);
728 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32i16, DestVT: MVT::v16i32);
729 setOperationAction(Op: ISD::STORE, VT: MVT::v32f16, Action: Promote);
730 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32f16, DestVT: MVT::v16i32);
731 setOperationAction(Op: ISD::STORE, VT: MVT::v32bf16, Action: Promote);
732 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32bf16, DestVT: MVT::v16i32);
733
734 setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
735 VT: MVT::v2i32, Action: Expand);
736 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::v2f32, Action: Expand);
737
738 setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
739 VT: MVT::v4i32, Action: Expand);
740
741 setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
742 VT: MVT::v8i32, Action: Expand);
743
744 setOperationAction(Ops: ISD::BUILD_VECTOR, VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
745 Action: Subtarget->hasVOP3PInsts() ? Legal : Custom);
746
747 setOperationAction(Op: ISD::FNEG, VT: MVT::v2f16, Action: Legal);
748 // This isn't really legal, but this avoids the legalizer unrolling it (and
749 // allows matching fneg (fabs x) patterns)
750 setOperationAction(Op: ISD::FABS, VT: MVT::v2f16, Action: Legal);
751
752 setOperationAction(Ops: {ISD::FMAXNUM, ISD::FMINNUM}, VT: MVT::f16, Action: Custom);
753 setOperationAction(Ops: {ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, VT: MVT::f16, Action: Legal);
754
755 setOperationAction(Ops: {ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
756 VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
757 Action: Custom);
758
759 setOperationAction(Ops: {ISD::FMINNUM, ISD::FMAXNUM},
760 VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
761 Action: Expand);
762
763 for (MVT Vec16 :
764 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
765 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
766 setOperationAction(
767 Ops: {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
768 VT: Vec16, Action: Custom);
769 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec16, Action: Expand);
770 }
771 }
772
773 if (Subtarget->hasVOP3PInsts()) {
774 setOperationAction(Ops: {ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
775 ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
776 ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
777 VT: MVT::v2i16, Action: Legal);
778
779 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
780 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
781 VT: MVT::v2f16, Action: Legal);
782
783 setOperationAction(Ops: ISD::EXTRACT_VECTOR_ELT, VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
784 Action: Custom);
785
786 setOperationAction(Ops: ISD::VECTOR_SHUFFLE,
787 VTs: {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
788 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
789 Action: Custom);
790
791 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
792 // Split vector operations.
793 setOperationAction(Ops: {ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
794 ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN,
795 ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
796 ISD::SSUBSAT},
797 VT, Action: Custom);
798
799 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
800 // Split vector operations.
801 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
802 VT, Action: Custom);
803
804 setOperationAction(Ops: {ISD::FMAXNUM, ISD::FMINNUM}, VTs: {MVT::v2f16, MVT::v4f16},
805 Action: Custom);
806
807 setOperationAction(Op: ISD::FEXP, VT: MVT::v2f16, Action: Custom);
808 setOperationAction(Ops: ISD::SELECT, VTs: {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
809 Action: Custom);
810
811 if (Subtarget->hasPackedFP32Ops()) {
812 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
813 VT: MVT::v2f32, Action: Legal);
814 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA},
815 VTs: {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
816 Action: Custom);
817 }
818 }
819
820 setOperationAction(Ops: {ISD::FNEG, ISD::FABS}, VT: MVT::v4f16, Action: Custom);
821
822 if (Subtarget->has16BitInsts()) {
823 setOperationAction(Op: ISD::SELECT, VT: MVT::v2i16, Action: Promote);
824 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2i16, DestVT: MVT::i32);
825 setOperationAction(Op: ISD::SELECT, VT: MVT::v2f16, Action: Promote);
826 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2f16, DestVT: MVT::i32);
827 } else {
828 // Legalization hack.
829 setOperationAction(Ops: ISD::SELECT, VTs: {MVT::v2i16, MVT::v2f16}, Action: Custom);
830
831 setOperationAction(Ops: {ISD::FNEG, ISD::FABS}, VT: MVT::v2f16, Action: Custom);
832 }
833
834 setOperationAction(Ops: ISD::SELECT,
835 VTs: {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
836 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
837 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
838 MVT::v32f16, MVT::v32bf16},
839 Action: Custom);
840
841 setOperationAction(Ops: {ISD::SMULO, ISD::UMULO}, VT: MVT::i64, Action: Custom);
842
843 if (Subtarget->hasScalarSMulU64())
844 setOperationAction(Op: ISD::MUL, VT: MVT::i64, Action: Custom);
845
846 if (Subtarget->hasMad64_32())
847 setOperationAction(Ops: {ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT: MVT::i32, Action: Custom);
848
849 if (Subtarget->hasPrefetch())
850 setOperationAction(Op: ISD::PREFETCH, VT: MVT::Other, Action: Custom);
851
852 if (Subtarget->hasIEEEMinMax()) {
853 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM},
854 VTs: {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Action: Legal);
855 setOperationAction(Ops: {ISD::FMINIMUM, ISD::FMAXIMUM},
856 VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
857 Action: Custom);
858 }
859
860 setOperationAction(Ops: ISD::INTRINSIC_WO_CHAIN,
861 VTs: {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
862 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
863 MVT::i8},
864 Action: Custom);
865
866 setOperationAction(Ops: ISD::INTRINSIC_W_CHAIN,
867 VTs: {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
868 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
869 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
870 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
871 Action: Custom);
872
873 setOperationAction(Ops: ISD::INTRINSIC_VOID,
874 VTs: {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
875 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
876 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
877 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
878 Action: Custom);
879
880 setOperationAction(Op: ISD::STACKSAVE, VT: MVT::Other, Action: Custom);
881 setOperationAction(Op: ISD::GET_ROUNDING, VT: MVT::i32, Action: Custom);
882 setOperationAction(Op: ISD::SET_ROUNDING, VT: MVT::Other, Action: Custom);
883 setOperationAction(Op: ISD::GET_FPENV, VT: MVT::i64, Action: Custom);
884 setOperationAction(Op: ISD::SET_FPENV, VT: MVT::i64, Action: Custom);
885
886 // TODO: Could move this to custom lowering, could benefit from combines on
887 // extract of relevant bits.
888 setOperationAction(Op: ISD::GET_FPMODE, VT: MVT::i32, Action: Legal);
889
890 setOperationAction(Op: ISD::MUL, VT: MVT::i1, Action: Promote);
891
892 setTargetDAGCombine({ISD::ADD,
893 ISD::UADDO_CARRY,
894 ISD::SUB,
895 ISD::USUBO_CARRY,
896 ISD::FADD,
897 ISD::FSUB,
898 ISD::FDIV,
899 ISD::FMINNUM,
900 ISD::FMAXNUM,
901 ISD::FMINNUM_IEEE,
902 ISD::FMAXNUM_IEEE,
903 ISD::FMINIMUM,
904 ISD::FMAXIMUM,
905 ISD::FMA,
906 ISD::SMIN,
907 ISD::SMAX,
908 ISD::UMIN,
909 ISD::UMAX,
910 ISD::SETCC,
911 ISD::AND,
912 ISD::OR,
913 ISD::XOR,
914 ISD::FSHR,
915 ISD::SINT_TO_FP,
916 ISD::UINT_TO_FP,
917 ISD::FCANONICALIZE,
918 ISD::SCALAR_TO_VECTOR,
919 ISD::ZERO_EXTEND,
920 ISD::SIGN_EXTEND_INREG,
921 ISD::EXTRACT_VECTOR_ELT,
922 ISD::INSERT_VECTOR_ELT,
923 ISD::FCOPYSIGN});
924
925 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
926 setTargetDAGCombine(ISD::FP_ROUND);
927
928 // All memory operations. Some folding on the pointer operand is done to help
929 // matching the constant offsets in the addressing modes.
930 setTargetDAGCombine({ISD::LOAD,
931 ISD::STORE,
932 ISD::ATOMIC_LOAD,
933 ISD::ATOMIC_STORE,
934 ISD::ATOMIC_CMP_SWAP,
935 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
936 ISD::ATOMIC_SWAP,
937 ISD::ATOMIC_LOAD_ADD,
938 ISD::ATOMIC_LOAD_SUB,
939 ISD::ATOMIC_LOAD_AND,
940 ISD::ATOMIC_LOAD_OR,
941 ISD::ATOMIC_LOAD_XOR,
942 ISD::ATOMIC_LOAD_NAND,
943 ISD::ATOMIC_LOAD_MIN,
944 ISD::ATOMIC_LOAD_MAX,
945 ISD::ATOMIC_LOAD_UMIN,
946 ISD::ATOMIC_LOAD_UMAX,
947 ISD::ATOMIC_LOAD_FADD,
948 ISD::ATOMIC_LOAD_FMIN,
949 ISD::ATOMIC_LOAD_FMAX,
950 ISD::ATOMIC_LOAD_UINC_WRAP,
951 ISD::ATOMIC_LOAD_UDEC_WRAP,
952 ISD::INTRINSIC_VOID,
953 ISD::INTRINSIC_W_CHAIN});
954
955 // FIXME: In other contexts we pretend this is a per-function property.
956 setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
957
958 setSchedulingPreference(Sched::RegPressure);
959}
960
961const GCNSubtarget *SITargetLowering::getSubtarget() const {
962 return Subtarget;
963}
964
965ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
966 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
967 return RCRegs;
968}
969
970//===----------------------------------------------------------------------===//
971// TargetLowering queries
972//===----------------------------------------------------------------------===//
973
974// v_mad_mix* support a conversion from f16 to f32.
975//
976// There is only one special case when denormals are enabled we don't currently,
977// where this is OK to use.
978bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
979 EVT DestVT, EVT SrcVT) const {
980 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
981 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
982 DestVT.getScalarType() == MVT::f32 &&
983 SrcVT.getScalarType() == MVT::f16 &&
984 // TODO: This probably only requires no input flushing?
985 denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
986}
987
988bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
989 LLT DestTy, LLT SrcTy) const {
990 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
991 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
992 DestTy.getScalarSizeInBits() == 32 &&
993 SrcTy.getScalarSizeInBits() == 16 &&
994 // TODO: This probably only requires no input flushing?
995 denormalModeIsFlushAllF32(MF: *MI.getMF());
996}
997
998bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
999 // SI has some legal vector types, but no legal vector operations. Say no
1000 // shuffles are legal in order to prefer scalarizing some vector operations.
1001 return false;
1002}
1003
1004MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1005 CallingConv::ID CC,
1006 EVT VT) const {
1007 if (CC == CallingConv::AMDGPU_KERNEL)
1008 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1009
1010 if (VT.isVector()) {
1011 EVT ScalarVT = VT.getScalarType();
1012 unsigned Size = ScalarVT.getSizeInBits();
1013 if (Size == 16) {
1014 if (Subtarget->has16BitInsts()) {
1015 if (VT.isInteger())
1016 return MVT::v2i16;
1017 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1018 }
1019 return VT.isInteger() ? MVT::i32 : MVT::f32;
1020 }
1021
1022 if (Size < 16)
1023 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1024 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1025 }
1026
1027 if (VT.getSizeInBits() > 32)
1028 return MVT::i32;
1029
1030 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1031}
1032
1033unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1034 CallingConv::ID CC,
1035 EVT VT) const {
1036 if (CC == CallingConv::AMDGPU_KERNEL)
1037 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1038
1039 if (VT.isVector()) {
1040 unsigned NumElts = VT.getVectorNumElements();
1041 EVT ScalarVT = VT.getScalarType();
1042 unsigned Size = ScalarVT.getSizeInBits();
1043
1044 // FIXME: Should probably promote 8-bit vectors to i16.
1045 if (Size == 16 && Subtarget->has16BitInsts())
1046 return (NumElts + 1) / 2;
1047
1048 if (Size <= 32)
1049 return NumElts;
1050
1051 if (Size > 32)
1052 return NumElts * ((Size + 31) / 32);
1053 } else if (VT.getSizeInBits() > 32)
1054 return (VT.getSizeInBits() + 31) / 32;
1055
1056 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1057}
1058
1059unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
1060 LLVMContext &Context, CallingConv::ID CC,
1061 EVT VT, EVT &IntermediateVT,
1062 unsigned &NumIntermediates, MVT &RegisterVT) const {
1063 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1064 unsigned NumElts = VT.getVectorNumElements();
1065 EVT ScalarVT = VT.getScalarType();
1066 unsigned Size = ScalarVT.getSizeInBits();
1067 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1068 // support, but unless we can properly handle 3-vectors, it will be still be
1069 // inconsistent.
1070 if (Size == 16 && Subtarget->has16BitInsts()) {
1071 if (ScalarVT == MVT::bf16) {
1072 RegisterVT = MVT::i32;
1073 IntermediateVT = MVT::v2bf16;
1074 } else {
1075 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1076 IntermediateVT = RegisterVT;
1077 }
1078 NumIntermediates = (NumElts + 1) / 2;
1079 return NumIntermediates;
1080 }
1081
1082 if (Size == 32) {
1083 RegisterVT = ScalarVT.getSimpleVT();
1084 IntermediateVT = RegisterVT;
1085 NumIntermediates = NumElts;
1086 return NumIntermediates;
1087 }
1088
1089 if (Size < 16 && Subtarget->has16BitInsts()) {
1090 // FIXME: Should probably form v2i16 pieces
1091 RegisterVT = MVT::i16;
1092 IntermediateVT = ScalarVT;
1093 NumIntermediates = NumElts;
1094 return NumIntermediates;
1095 }
1096
1097
1098 if (Size != 16 && Size <= 32) {
1099 RegisterVT = MVT::i32;
1100 IntermediateVT = ScalarVT;
1101 NumIntermediates = NumElts;
1102 return NumIntermediates;
1103 }
1104
1105 if (Size > 32) {
1106 RegisterVT = MVT::i32;
1107 IntermediateVT = RegisterVT;
1108 NumIntermediates = NumElts * ((Size + 31) / 32);
1109 return NumIntermediates;
1110 }
1111 }
1112
1113 return TargetLowering::getVectorTypeBreakdownForCallingConv(
1114 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1115}
1116
1117static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1118 const DataLayout &DL, Type *Ty,
1119 unsigned MaxNumLanes) {
1120 assert(MaxNumLanes != 0);
1121
1122 LLVMContext &Ctx = Ty->getContext();
1123 if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
1124 unsigned NumElts = std::min(a: MaxNumLanes, b: VT->getNumElements());
1125 return EVT::getVectorVT(Context&: Ctx, VT: TLI.getValueType(DL, Ty: VT->getElementType()),
1126 NumElements: NumElts);
1127 }
1128
1129 return TLI.getValueType(DL, Ty);
1130}
1131
1132// Peek through TFE struct returns to only use the data size.
1133static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1134 const DataLayout &DL, Type *Ty,
1135 unsigned MaxNumLanes) {
1136 auto *ST = dyn_cast<StructType>(Val: Ty);
1137 if (!ST)
1138 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1139
1140 // TFE intrinsics return an aggregate type.
1141 assert(ST->getNumContainedTypes() == 2 &&
1142 ST->getContainedType(1)->isIntegerTy(32));
1143 return memVTFromLoadIntrData(TLI, DL, Ty: ST->getContainedType(i: 0), MaxNumLanes);
1144}
1145
1146/// Map address space 7 to MVT::v5i32 because that's its in-memory
1147/// representation. This return value is vector-typed because there is no
1148/// MVT::i160 and it is not clear if one can be added. While this could
1149/// cause issues during codegen, these address space 7 pointers will be
1150/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1151/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1152/// modeling, to work.
1153MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1154 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1155 return MVT::v5i32;
1156 if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1157 DL.getPointerSizeInBits(AS) == 192)
1158 return MVT::v6i32;
1159 return AMDGPUTargetLowering::getPointerTy(DL, AS);
1160}
1161/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1162/// v8i32 when padding is added.
1163/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1164/// also v8i32 with padding.
1165MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1166 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1167 DL.getPointerSizeInBits(AS) == 160) ||
1168 (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1169 DL.getPointerSizeInBits(AS) == 192))
1170 return MVT::v8i32;
1171 return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1172}
1173
1174bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
1175 const CallInst &CI,
1176 MachineFunction &MF,
1177 unsigned IntrID) const {
1178 Info.flags = MachineMemOperand::MONone;
1179 if (CI.hasMetadata(KindID: LLVMContext::MD_invariant_load))
1180 Info.flags |= MachineMemOperand::MOInvariant;
1181
1182 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1183 AMDGPU::lookupRsrcIntrinsic(Intr: IntrID)) {
1184 AttributeList Attr = Intrinsic::getAttributes(C&: CI.getContext(),
1185 id: (Intrinsic::ID)IntrID);
1186 MemoryEffects ME = Attr.getMemoryEffects();
1187 if (ME.doesNotAccessMemory())
1188 return false;
1189
1190 // TODO: Should images get their own address space?
1191 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1192
1193 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1194 if (RsrcIntr->IsImage) {
1195 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1196 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID);
1197 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
1198 Info.align.reset();
1199 }
1200
1201 Value *RsrcArg = CI.getArgOperand(i: RsrcIntr->RsrcArg);
1202 if (auto *RsrcPtrTy = dyn_cast<PointerType>(Val: RsrcArg->getType())) {
1203 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1204 // We conservatively set the memory operand of a buffer intrinsic to the
1205 // base resource pointer, so that we can access alias information about
1206 // those pointers. Cases like "this points at the same value
1207 // but with a different offset" are handled in
1208 // areMemAccessesTriviallyDisjoint.
1209 Info.ptrVal = RsrcArg;
1210 }
1211
1212 auto *Aux = cast<ConstantInt>(Val: CI.getArgOperand(i: CI.arg_size() - 1));
1213 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1214 Info.flags |= MachineMemOperand::MOVolatile;
1215 Info.flags |= MachineMemOperand::MODereferenceable;
1216 if (ME.onlyReadsMemory()) {
1217 if (RsrcIntr->IsImage) {
1218 unsigned MaxNumLanes = 4;
1219
1220 if (!BaseOpcode->Gather4) {
1221 // If this isn't a gather, we may have excess loaded elements in the
1222 // IR type. Check the dmask for the real number of elements loaded.
1223 unsigned DMask
1224 = cast<ConstantInt>(Val: CI.getArgOperand(i: 0))->getZExtValue();
1225 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(Value: DMask);
1226 }
1227
1228 Info.memVT = memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(),
1229 Ty: CI.getType(), MaxNumLanes);
1230 } else {
1231 Info.memVT =
1232 memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(), Ty: CI.getType(),
1233 MaxNumLanes: std::numeric_limits<unsigned>::max());
1234 }
1235
1236 // FIXME: What does alignment mean for an image?
1237 Info.opc = ISD::INTRINSIC_W_CHAIN;
1238 Info.flags |= MachineMemOperand::MOLoad;
1239 } else if (ME.onlyWritesMemory()) {
1240 Info.opc = ISD::INTRINSIC_VOID;
1241
1242 Type *DataTy = CI.getArgOperand(i: 0)->getType();
1243 if (RsrcIntr->IsImage) {
1244 unsigned DMask = cast<ConstantInt>(Val: CI.getArgOperand(i: 1))->getZExtValue();
1245 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(Value: DMask);
1246 Info.memVT = memVTFromLoadIntrData(TLI: *this, DL: MF.getDataLayout(), Ty: DataTy,
1247 MaxNumLanes: DMaskLanes);
1248 } else
1249 Info.memVT = getValueType(DL: MF.getDataLayout(), Ty: DataTy);
1250
1251 Info.flags |= MachineMemOperand::MOStore;
1252 } else {
1253 // Atomic or NoReturn Sampler
1254 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1255 ISD::INTRINSIC_W_CHAIN;
1256 Info.flags |= MachineMemOperand::MOLoad |
1257 MachineMemOperand::MOStore |
1258 MachineMemOperand::MODereferenceable;
1259
1260 switch (IntrID) {
1261 default:
1262 if (RsrcIntr->IsImage && BaseOpcode->NoReturn) {
1263 // Fake memory access type for no return sampler intrinsics
1264 Info.memVT = MVT::i32;
1265 } else {
1266 // XXX - Should this be volatile without known ordering?
1267 Info.flags |= MachineMemOperand::MOVolatile;
1268 Info.memVT = MVT::getVT(Ty: CI.getArgOperand(i: 0)->getType());
1269 }
1270 break;
1271 case Intrinsic::amdgcn_raw_buffer_load_lds:
1272 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1273 case Intrinsic::amdgcn_struct_buffer_load_lds:
1274 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1275 unsigned Width = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getZExtValue();
1276 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: Width * 8);
1277 Info.ptrVal = CI.getArgOperand(i: 1);
1278 return true;
1279 }
1280 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1281 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: {
1282 Info.memVT =
1283 memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(), Ty: CI.getType(),
1284 MaxNumLanes: std::numeric_limits<unsigned>::max());
1285 Info.flags &= ~MachineMemOperand::MOStore;
1286 return true;
1287 }
1288 }
1289 }
1290 return true;
1291 }
1292
1293 switch (IntrID) {
1294 case Intrinsic::amdgcn_ds_ordered_add:
1295 case Intrinsic::amdgcn_ds_ordered_swap: {
1296 Info.opc = ISD::INTRINSIC_W_CHAIN;
1297 Info.memVT = MVT::getVT(Ty: CI.getType());
1298 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1299 Info.align.reset();
1300 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1301
1302 const ConstantInt *Vol = cast<ConstantInt>(Val: CI.getOperand(i_nocapture: 4));
1303 if (!Vol->isZero())
1304 Info.flags |= MachineMemOperand::MOVolatile;
1305
1306 return true;
1307 }
1308 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1309 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1310 Info.opc = ISD::INTRINSIC_W_CHAIN;
1311 Info.memVT = MVT::getVT(Ty: CI.getOperand(i_nocapture: 0)->getType());
1312 Info.ptrVal = nullptr;
1313 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1314 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1315 return true;
1316 }
1317 case Intrinsic::amdgcn_ds_append:
1318 case Intrinsic::amdgcn_ds_consume: {
1319 Info.opc = ISD::INTRINSIC_W_CHAIN;
1320 Info.memVT = MVT::getVT(Ty: CI.getType());
1321 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1322 Info.align.reset();
1323 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1324
1325 const ConstantInt *Vol = cast<ConstantInt>(Val: CI.getOperand(i_nocapture: 1));
1326 if (!Vol->isZero())
1327 Info.flags |= MachineMemOperand::MOVolatile;
1328
1329 return true;
1330 }
1331 case Intrinsic::amdgcn_global_atomic_csub: {
1332 Info.opc = ISD::INTRINSIC_W_CHAIN;
1333 Info.memVT = MVT::getVT(Ty: CI.getType());
1334 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1335 Info.align.reset();
1336 Info.flags |= MachineMemOperand::MOLoad |
1337 MachineMemOperand::MOStore |
1338 MachineMemOperand::MOVolatile;
1339 return true;
1340 }
1341 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1342 Info.opc = ISD::INTRINSIC_W_CHAIN;
1343 Info.memVT = MVT::getVT(Ty: CI.getType()); // XXX: what is correct VT?
1344
1345 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1346 Info.align.reset();
1347 Info.flags |= MachineMemOperand::MOLoad |
1348 MachineMemOperand::MODereferenceable;
1349 return true;
1350 }
1351 case Intrinsic::amdgcn_global_atomic_fadd:
1352 case Intrinsic::amdgcn_global_atomic_fmin:
1353 case Intrinsic::amdgcn_global_atomic_fmax:
1354 case Intrinsic::amdgcn_global_atomic_fmin_num:
1355 case Intrinsic::amdgcn_global_atomic_fmax_num:
1356 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1357 case Intrinsic::amdgcn_flat_atomic_fadd:
1358 case Intrinsic::amdgcn_flat_atomic_fmin:
1359 case Intrinsic::amdgcn_flat_atomic_fmax:
1360 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1361 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1362 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1363 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1364 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1365 Info.opc = ISD::INTRINSIC_W_CHAIN;
1366 Info.memVT = MVT::getVT(Ty: CI.getType());
1367 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1368 Info.align.reset();
1369 Info.flags |= MachineMemOperand::MOLoad |
1370 MachineMemOperand::MOStore |
1371 MachineMemOperand::MODereferenceable |
1372 MachineMemOperand::MOVolatile;
1373 return true;
1374 }
1375 case Intrinsic::amdgcn_global_load_tr_b64:
1376 case Intrinsic::amdgcn_global_load_tr_b128: {
1377 Info.opc = ISD::INTRINSIC_W_CHAIN;
1378 Info.memVT = MVT::getVT(Ty: CI.getType());
1379 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1380 Info.align.reset();
1381 Info.flags |= MachineMemOperand::MOLoad;
1382 return true;
1383 }
1384 case Intrinsic::amdgcn_ds_gws_init:
1385 case Intrinsic::amdgcn_ds_gws_barrier:
1386 case Intrinsic::amdgcn_ds_gws_sema_v:
1387 case Intrinsic::amdgcn_ds_gws_sema_br:
1388 case Intrinsic::amdgcn_ds_gws_sema_p:
1389 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1390 Info.opc = ISD::INTRINSIC_VOID;
1391
1392 const GCNTargetMachine &TM =
1393 static_cast<const GCNTargetMachine &>(getTargetMachine());
1394
1395 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1396 Info.ptrVal = MFI->getGWSPSV(TM);
1397
1398 // This is an abstract access, but we need to specify a type and size.
1399 Info.memVT = MVT::i32;
1400 Info.size = 4;
1401 Info.align = Align(4);
1402
1403 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1404 Info.flags |= MachineMemOperand::MOLoad;
1405 else
1406 Info.flags |= MachineMemOperand::MOStore;
1407 return true;
1408 }
1409 case Intrinsic::amdgcn_global_load_lds: {
1410 Info.opc = ISD::INTRINSIC_VOID;
1411 unsigned Width = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getZExtValue();
1412 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: Width * 8);
1413 Info.ptrVal = CI.getArgOperand(i: 1);
1414 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1415 return true;
1416 }
1417 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1418 Info.opc = ISD::INTRINSIC_W_CHAIN;
1419
1420 const GCNTargetMachine &TM =
1421 static_cast<const GCNTargetMachine &>(getTargetMachine());
1422
1423 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1424 Info.ptrVal = MFI->getGWSPSV(TM);
1425
1426 // This is an abstract access, but we need to specify a type and size.
1427 Info.memVT = MVT::i32;
1428 Info.size = 4;
1429 Info.align = Align(4);
1430
1431 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1432 return true;
1433 }
1434 default:
1435 return false;
1436 }
1437}
1438
1439void SITargetLowering::CollectTargetIntrinsicOperands(
1440 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1441 switch (cast<IntrinsicInst>(Val: I).getIntrinsicID()) {
1442 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1443 // The DAG's ValueType loses the addrspaces.
1444 // Add them as 2 extra Constant operands "from" and "to".
1445 unsigned SrcAS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
1446 unsigned DstAS = I.getType()->getPointerAddressSpace();
1447 Ops.push_back(Elt: DAG.getTargetConstant(Val: SrcAS, DL: SDLoc(), VT: MVT::i32));
1448 Ops.push_back(Elt: DAG.getTargetConstant(Val: DstAS, DL: SDLoc(), VT: MVT::i32));
1449 break;
1450 }
1451 default:
1452 break;
1453 }
1454}
1455
1456bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
1457 SmallVectorImpl<Value*> &Ops,
1458 Type *&AccessTy) const {
1459 Value *Ptr = nullptr;
1460 switch (II->getIntrinsicID()) {
1461 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1462 case Intrinsic::amdgcn_ds_append:
1463 case Intrinsic::amdgcn_ds_consume:
1464 case Intrinsic::amdgcn_ds_ordered_add:
1465 case Intrinsic::amdgcn_ds_ordered_swap:
1466 case Intrinsic::amdgcn_flat_atomic_fadd:
1467 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1468 case Intrinsic::amdgcn_flat_atomic_fmax:
1469 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1470 case Intrinsic::amdgcn_flat_atomic_fmin:
1471 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1472 case Intrinsic::amdgcn_global_atomic_csub:
1473 case Intrinsic::amdgcn_global_atomic_fadd:
1474 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1475 case Intrinsic::amdgcn_global_atomic_fmax:
1476 case Intrinsic::amdgcn_global_atomic_fmax_num:
1477 case Intrinsic::amdgcn_global_atomic_fmin:
1478 case Intrinsic::amdgcn_global_atomic_fmin_num:
1479 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1480 case Intrinsic::amdgcn_global_load_tr_b64:
1481 case Intrinsic::amdgcn_global_load_tr_b128:
1482 Ptr = II->getArgOperand(i: 0);
1483 break;
1484 case Intrinsic::amdgcn_global_load_lds:
1485 Ptr = II->getArgOperand(i: 1);
1486 break;
1487 default:
1488 return false;
1489 }
1490 AccessTy = II->getType();
1491 Ops.push_back(Elt: Ptr);
1492 return true;
1493}
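// Illustrative sketch (editor's addition, not part of the original source):
// how an address-sinking pass such as CodeGenPrepare might consume this hook.
// 'TLI', 'II', 'PtrOps' and 'AccessTy' are hypothetical local names.
//
//   SmallVector<Value *, 2> PtrOps;
//   Type *AccessTy = nullptr;
//   if (TLI.getAddrModeArguments(II, PtrOps, AccessTy)) {
//     // PtrOps[0] is the pointer operand of the memory intrinsic; any offset
//     // folded into it should still satisfy isLegalAddressingMode() for
//     // AccessTy in that pointer's address space.
//   }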
1494
1495bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1496 unsigned AddrSpace) const {
1497 if (!Subtarget->hasFlatInstOffsets()) {
1498 // Flat instructions do not have offsets, and only have the register
1499 // address.
1500 return AM.BaseOffs == 0 && AM.Scale == 0;
1501 }
1502
1503 decltype(SIInstrFlags::FLAT) FlatVariant =
1504 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ? SIInstrFlags::FlatGlobal
1505 : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch
1506 : SIInstrFlags::FLAT;
1507
1508 return AM.Scale == 0 &&
1509 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1510 Offset: AM.BaseOffs, AddrSpace, FlatVariant));
1511}
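// Worked example (editor's addition; a sketch, not a normative statement of
// the offset ranges): with flat instruction offsets available, "reg + imm"
// for a global pointer is accepted iff the immediate is a legal FLAT-global
// offset, while any scaled index register (AM.Scale != 0) is rejected since
// FLAT addressing cannot encode register scaling.
//
//   TargetLoweringBase::AddrMode AM;  // hypothetical query
//   AM.BaseOffs = 2048;
//   AM.Scale = 0;
//   bool Legal = isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS);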
1512
1513bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1514 if (Subtarget->hasFlatGlobalInsts())
1515 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS);
1516
1517 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1518 // Assume that we will use FLAT for all global memory accesses
1519 // on VI.
1520 // FIXME: This assumption is currently wrong. On VI we still use
1521 // MUBUF instructions for the r + i addressing mode. As currently
1522 // implemented, the MUBUF instructions only work on buffer < 4GB.
1523 // It may be possible to support > 4GB buffers with MUBUF instructions,
1524 // by setting the stride value in the resource descriptor which would
1525 // increase the size limit to (stride * 4GB). However, this is risky,
1526 // because it has never been validated.
1527 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS);
1528 }
1529
1530 return isLegalMUBUFAddressingMode(AM);
1531}
1532
1533bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1534 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1535 // additionally can do r + r + i with addr64. 32-bit has more addressing
1536 // mode options. Depending on the resource constant, it can also do
1537 // (i64 r0) + (i32 r1) * (i14 i).
1538 //
1539 // Private arrays end up using a scratch buffer most of the time, so also
1540 // assume those use MUBUF instructions. Scratch loads / stores are currently
1541 // implemented as mubuf instructions with offen bit set, so slightly
1542 // different than the normal addr64.
1543 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1544 if (!TII->isLegalMUBUFImmOffset(Imm: AM.BaseOffs))
1545 return false;
1546
1547 // FIXME: Since we can split immediate into soffset and immediate offset,
1548 // would it make sense to allow any immediate?
1549
1550 switch (AM.Scale) {
1551 case 0: // r + i or just i, depending on HasBaseReg.
1552 return true;
1553 case 1:
1554 return true; // We have r + r or r + i.
1555 case 2:
1556 if (AM.HasBaseReg) {
1557 // Reject 2 * r + r.
1558 return false;
1559 }
1560
1561 // Allow 2 * r as r + r
1562 // Or 2 * r + i is allowed as r + r + i.
1563 return true;
1564 default: // Don't allow n * r
1565 return false;
1566 }
1567}
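// Examples of the scale handling above (editor's addition):
//   { Scale = 1, HasBaseReg = true  } -> true   (r + r)
//   { Scale = 2, HasBaseReg = false } -> true   (2 * r, reassociated as r + r)
//   { Scale = 2, HasBaseReg = true  } -> false  (2 * r + r is not encodable)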
1568
1569bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1570 const AddrMode &AM, Type *Ty,
1571 unsigned AS, Instruction *I) const {
1572 // No global is ever allowed as a base.
1573 if (AM.BaseGV)
1574 return false;
1575
1576 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1577 return isLegalGlobalAddressingMode(AM);
1578
1579 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1580 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1581 AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
1582 AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
1583 // If the offset isn't a multiple of 4, it probably isn't going to be
1584 // correctly aligned.
1585 // FIXME: Can we get the real alignment here?
1586 if (AM.BaseOffs % 4 != 0)
1587 return isLegalMUBUFAddressingMode(AM);
1588
1589 if (!Subtarget->hasScalarSubwordLoads()) {
1590 // There are no SMRD extloads, so if we have to do a small type access we
1591 // will use a MUBUF load.
1592 // FIXME?: We also need to do this if unaligned, but we don't know the
1593 // alignment here.
1594 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1595 return isLegalGlobalAddressingMode(AM);
1596 }
1597
1598 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1599 // SMRD instructions have an 8-bit, dword offset on SI.
1600 if (!isUInt<8>(x: AM.BaseOffs / 4))
1601 return false;
1602 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1603 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1604 // in 8-bits, it can use a smaller encoding.
1605 if (!isUInt<32>(x: AM.BaseOffs / 4))
1606 return false;
1607 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1608 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1609 if (!isUInt<20>(x: AM.BaseOffs))
1610 return false;
1611 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1612 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1613 // for S_BUFFER_* instructions).
1614 if (!isInt<21>(x: AM.BaseOffs))
1615 return false;
1616 } else {
1617 // On GFX12, all offsets are signed 24-bit in bytes.
1618 if (!isInt<24>(x: AM.BaseOffs))
1619 return false;
1620 }
1621
1622 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1623 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1624 AM.BaseOffs < 0) {
1625 // Scalar (non-buffer) loads can only use a negative offset if
1626 // soffset+offset is non-negative. Since the compiler can only prove that
1627 // in a few special cases, it is safer to claim that negative offsets are
1628 // not supported.
1629 return false;
1630 }
1631
1632 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1633 return true;
1634
1635 if (AM.Scale == 1 && AM.HasBaseReg)
1636 return true;
1637
1638 return false;
1639 }
1640
1641 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1642 return Subtarget->enableFlatScratch()
1643 ? isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS)
1644 : isLegalMUBUFAddressingMode(AM);
1645
1646 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1647 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1648 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1649 // field.
1650 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1651 // an 8-bit dword offset but we don't know the alignment here.
1652 if (!isUInt<16>(x: AM.BaseOffs))
1653 return false;
1654
1655 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1656 return true;
1657
1658 if (AM.Scale == 1 && AM.HasBaseReg)
1659 return true;
1660
1661 return false;
1662 }
1663
1664 if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1665 // For an unknown address space, this usually means that this is for some
1666 // reason being used for pure arithmetic, and not based on some addressing
1667 // computation. We don't have instructions that compute pointers with any
1668 // addressing modes, so treat them as having no offset like flat
1669 // instructions.
1670 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS);
1671 }
1672
1673 // Assume a user alias of global for unknown address spaces.
1674 return isLegalGlobalAddressingMode(AM);
1675}
1676
1677bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1678 const MachineFunction &MF) const {
1679 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1680 return (MemVT.getSizeInBits() <= 4 * 32);
1681 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1682 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1683 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1684 }
1685 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
1686 return (MemVT.getSizeInBits() <= 2 * 32);
1687 return true;
1688}
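// Worked example (editor's addition, assuming the limits above are unchanged):
// merging four i32 stores into a single v4i32 (128-bit) store is allowed for
// global/flat address spaces (128 <= 4 * 32) but rejected for LDS, where the
// widest mergeable access is 2 * 32 bits (ds_write_b64 / ds_write2_b32).
//
//   canMergeStoresTo(AMDGPUAS::GLOBAL_ADDRESS, MVT::v4i32, MF); // -> true
//   canMergeStoresTo(AMDGPUAS::LOCAL_ADDRESS,  MVT::v4i32, MF); // -> false
//   canMergeStoresTo(AMDGPUAS::LOCAL_ADDRESS,  MVT::v2i32, MF); // -> true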
1689
1690bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1691 unsigned Size, unsigned AddrSpace, Align Alignment,
1692 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1693 if (IsFast)
1694 *IsFast = 0;
1695
1696 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1697 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1698 // Check if alignment requirements for ds_read/write instructions are
1699 // disabled.
1700 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1701 return false;
1702
1703 Align RequiredAlignment(PowerOf2Ceil(A: Size/8)); // Natural alignment.
1704 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1705 Alignment < RequiredAlignment)
1706 return false;
1707
1708 // Either the alignment requirements are "enabled", or there is a
1709 // hardware bug affecting unaligned LDS accesses even though the alignment
1710 // requirements are "disabled". In either case, we need to check for the
1711 // proper alignment requirements.
1712 //
1713 switch (Size) {
1714 case 64:
1715 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1716 // address is negative, then the instruction is incorrectly treated as
1717 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1718 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1719 // load later in the SILoadStoreOptimizer.
1720 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1721 return false;
1722
1723 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1724 // can do a 4-byte aligned, 8-byte access in a single operation using
1725 // ds_read2/write2_b32 with adjacent offsets.
1726 RequiredAlignment = Align(4);
1727
1728 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1729 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1730 // ds_write2_b32 depending on the alignment. In either case with either
1731 // alignment there is no faster way of doing this.
1732
1733 // The numbers returned here and below are not additive; they are a 'speed
1734 // rank'. They are only meant to be compared to decide whether a certain
1735 // way of lowering an operation is faster than another. For that purpose a
1736 // naturally aligned operation gets its bitsize to indicate that "it
1737 // operates with a speed comparable to an N-bit wide load". With full
1738 // alignment ds128 is slower than ds96, for example. If underaligned, it
1739 // is comparable to the speed of a single dword access, which would then
1740 // mean 32 < 128 and it is faster to issue a wide load regardless.
1741 // 1 simply means "slow, don't do it". I.e. when comparing an aligned load
1742 // to a wider load that would no longer be aligned, the latter is slower.
1743 if (IsFast)
1744 *IsFast = (Alignment >= RequiredAlignment) ? 64
1745 : (Alignment < Align(4)) ? 32
1746 : 1;
1747 return true;
1748 }
1749
1750 break;
1751 case 96:
1752 if (!Subtarget->hasDS96AndDS128())
1753 return false;
1754
1755 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1756 // gfx8 and older.
1757
1758 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1759 // Naturally aligned access is fastest. However, also report it as Fast
1760 // if memory is aligned to less than a DWORD. A narrow load or store will
1761 // be as slow as a single ds_read_b96/ds_write_b96, but there will
1762 // be more of them, so overall we will pay less penalty by issuing a
1763 // single instruction.
1764
1765 // See comment on the values above.
1766 if (IsFast)
1767 *IsFast = (Alignment >= RequiredAlignment) ? 96
1768 : (Alignment < Align(4)) ? 32
1769 : 1;
1770 return true;
1771 }
1772
1773 break;
1774 case 128:
1775 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1776 return false;
1777
1778 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1779 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1780 // single operation using ds_read2/write2_b64.
1781 RequiredAlignment = Align(8);
1782
1783 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1784 // Naturally aligned access is fastest. However, also report it as Fast
1785 // if memory is aligned to less than a DWORD. A narrow load or store will
1786 // be as slow as a single ds_read_b128/ds_write_b128, but there
1787 // will be more of them, so overall we will pay less penalty by issuing a
1788 // single instruction.
1789
1790 // See comment on the values above.
1791 if (IsFast)
1792 *IsFast = (Alignment >= RequiredAlignment) ? 128
1793 : (Alignment < Align(4)) ? 32
1794 : 1;
1795 return true;
1796 }
1797
1798 break;
1799 default:
1800 if (Size > 32)
1801 return false;
1802
1803 break;
1804 }
1805
1806 // See comment on the values above.
1807 // Note that we have a single-dword or sub-dword access here, so if it is
1808 // underaligned it is the slowest possible access, hence the returned value is 0.
1809 if (IsFast)
1810 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1811
1812 return Alignment >= RequiredAlignment ||
1813 Subtarget->hasUnalignedDSAccessEnabled();
1814 }
1815
1816 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1817 bool AlignedBy4 = Alignment >= Align(4);
1818 if (IsFast)
1819 *IsFast = AlignedBy4;
1820
1821 return AlignedBy4 ||
1822 Subtarget->enableFlatScratch() ||
1823 Subtarget->hasUnalignedScratchAccess();
1824 }
1825
1826 // FIXME: We have to be conservative here and assume that flat operations
1827 // will access scratch. If we had access to the IR function, then we
1828 // could determine if any private memory was used in the function.
1829 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1830 !Subtarget->hasUnalignedScratchAccess()) {
1831 bool AlignedBy4 = Alignment >= Align(4);
1832 if (IsFast)
1833 *IsFast = AlignedBy4;
1834
1835 return AlignedBy4;
1836 }
1837
1838 // So long as they are correct, wide global memory operations perform better
1839 // than multiple smaller memory ops -- even when misaligned.
1840 if (AMDGPU::isExtendedGlobalAddrSpace(AS: AddrSpace)) {
1841 if (IsFast)
1842 *IsFast = Size;
1843
1844 return Alignment >= Align(4) ||
1845 Subtarget->hasUnalignedBufferAccessEnabled();
1846 }
1847
1848 // Smaller than dword value must be aligned.
1849 if (Size < 32)
1850 return false;
1851
1852 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1853 // byte-address are ignored, thus forcing Dword alignment.
1854 // This applies to private, global, and constant memory.
1855 if (IsFast)
1856 *IsFast = 1;
1857
1858 return Size >= 32 && Alignment >= Align(4);
1859}
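// Illustrative example of the speed-rank contract (editor's addition): for a
// 64-bit LDS access on a subtarget with unaligned DS access enabled, the
// value reported through *IsFast is 64 when the access is at least 4-byte
// aligned (selectable as ds_read_b64 or ds_read2_b32) and 32 when aligned
// below 4 bytes; callers only compare these ranks, they are not additive
// throughput numbers.
//
//   unsigned Fast = 0;
//   allowsMisalignedMemoryAccessesImpl(/*Size=*/64, AMDGPUAS::LOCAL_ADDRESS,
//                                      Align(8), MachineMemOperand::MONone,
//                                      &Fast);  // Fast == 64 on such targets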
1860
1861bool SITargetLowering::allowsMisalignedMemoryAccesses(
1862 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1863 unsigned *IsFast) const {
1864 return allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace,
1865 Alignment, Flags, IsFast);
1866}
1867
1868EVT SITargetLowering::getOptimalMemOpType(
1869 const MemOp &Op, const AttributeList &FuncAttributes) const {
1870 // FIXME: Should account for address space here.
1871
1872 // The default fallback uses the private pointer size as a guess for a type to
1873 // use. Make sure we switch these to 64-bit accesses.
1874
1875 if (Op.size() >= 16 &&
1876 Op.isDstAligned(AlignCheck: Align(4))) // XXX: Should only do for global
1877 return MVT::v4i32;
1878
1879 if (Op.size() >= 8 && Op.isDstAligned(AlignCheck: Align(4)))
1880 return MVT::v2i32;
1881
1882 // Use the default.
1883 return MVT::Other;
1884}
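// Worked example (editor's addition): a 32-byte memcpy with a destination
// known to be at least 4-byte aligned reports MVT::v4i32, so the expansion
// uses two 16-byte (dwordx4) accesses rather than eight 4-byte ones; an
// 8-byte, 4-byte-aligned copy falls into the v2i32 case, and anything else
// defers to the generic heuristic via MVT::Other.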
1885
1886bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
1887 const MemSDNode *MemNode = cast<MemSDNode>(Val: N);
1888 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1889}
1890
1891bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
1892 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1893 AS == AMDGPUAS::PRIVATE_ADDRESS;
1894}
1895
1896bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
1897 unsigned DestAS) const {
1898 // Flat -> private/local is a simple truncate.
1899 // Flat -> global is no-op
1900 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1901 return true;
1902
1903 const GCNTargetMachine &TM =
1904 static_cast<const GCNTargetMachine &>(getTargetMachine());
1905 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1906}
1907
1908bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
1909 const MemSDNode *MemNode = cast<MemSDNode>(Val: N);
1910
1911 return AMDGPUInstrInfo::isUniformMMO(MMO: MemNode->getMemOperand());
1912}
1913
1914TargetLoweringBase::LegalizeTypeAction
1915SITargetLowering::getPreferredVectorAction(MVT VT) const {
1916 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1917 VT.getScalarType().bitsLE(VT: MVT::i16))
1918 return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
1919 return TargetLoweringBase::getPreferredVectorAction(VT);
1920}
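// Examples of the preference above (editor's addition): v4i16 and v8i16 are
// power-of-two small-element vectors and get split, while v3i16 (not a power
// of two) is widened to v4i16; vectors with elements wider than 16 bits fall
// through to the default policy.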
1921
1922bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
1923 Type *Ty) const {
1924 // FIXME: Could be smarter if called for vector constants.
1925 return true;
1926}
1927
1928bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
1929 unsigned Index) const {
1930 if (!isOperationLegalOrCustom(Op: ISD::EXTRACT_SUBVECTOR, VT: ResVT))
1931 return false;
1932
1933 // TODO: Add more cases that are cheap.
1934 return Index == 0;
1935}
1936
1937bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
1938 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1939 switch (Op) {
1940 case ISD::LOAD:
1941 case ISD::STORE:
1942
1943 // These operations are done with 32-bit instructions anyway.
1944 case ISD::AND:
1945 case ISD::OR:
1946 case ISD::XOR:
1947 case ISD::SELECT:
1948 // TODO: Extensions?
1949 return true;
1950 default:
1951 return false;
1952 }
1953 }
1954
1955 // SimplifySetCC uses this function to determine whether or not it should
1956 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1957 if (VT == MVT::i1 && Op == ISD::SETCC)
1958 return false;
1959
1960 return TargetLowering::isTypeDesirableForOp(Op, VT);
1961}
1962
1963SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1964 const SDLoc &SL,
1965 SDValue Chain,
1966 uint64_t Offset) const {
1967 const DataLayout &DL = DAG.getDataLayout();
1968 MachineFunction &MF = DAG.getMachineFunction();
1969 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1970
1971 const ArgDescriptor *InputPtrReg;
1972 const TargetRegisterClass *RC;
1973 LLT ArgTy;
1974 MVT PtrVT = getPointerTy(DL, AS: AMDGPUAS::CONSTANT_ADDRESS);
1975
1976 std::tie(args&: InputPtrReg, args&: RC, args&: ArgTy) =
1977 Info->getPreloadedValue(Value: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1978
1979 // We may not have the kernarg segment argument if we have no kernel
1980 // arguments.
1981 if (!InputPtrReg)
1982 return DAG.getConstant(Val: Offset, DL: SL, VT: PtrVT);
1983
1984 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1985 SDValue BasePtr = DAG.getCopyFromReg(Chain, dl: SL,
1986 Reg: MRI.getLiveInVirtReg(PReg: InputPtrReg->getRegister()), VT: PtrVT);
1987
1988 return DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Offset));
1989}
1990
1991SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1992 const SDLoc &SL) const {
1993 uint64_t Offset = getImplicitParameterOffset(MF: DAG.getMachineFunction(),
1994 Param: FIRST_IMPLICIT);
1995 return lowerKernArgParameterPtr(DAG, SL, Chain: DAG.getEntryNode(), Offset);
1996}
1997
1998SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
1999 const SDLoc &SL) const {
2000
2001 Function &F = DAG.getMachineFunction().getFunction();
2002 std::optional<uint32_t> KnownSize =
2003 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
2004 if (KnownSize.has_value())
2005 return DAG.getConstant(Val: *KnownSize, DL: SL, VT: MVT::i32);
2006 return SDValue();
2007}
2008
2009SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2010 const SDLoc &SL, SDValue Val,
2011 bool Signed,
2012 const ISD::InputArg *Arg) const {
2013 // First, if it is a widened vector, narrow it.
2014 if (VT.isVector() &&
2015 VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
2016 EVT NarrowedVT =
2017 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(),
2018 NumElements: VT.getVectorNumElements());
2019 Val = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: NarrowedVT, N1: Val,
2020 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
2021 }
2022
2023 // Then convert the vector elements or scalar value.
2024 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
2025 VT.bitsLT(VT: MemVT)) {
2026 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2027 Val = DAG.getNode(Opcode: Opc, DL: SL, VT: MemVT, N1: Val, N2: DAG.getValueType(VT));
2028 }
2029
2030 if (MemVT.isFloatingPoint())
2031 Val = getFPExtOrFPRound(DAG, Op: Val, DL: SL, VT);
2032 else if (Signed)
2033 Val = DAG.getSExtOrTrunc(Op: Val, DL: SL, VT);
2034 else
2035 Val = DAG.getZExtOrTrunc(Op: Val, DL: SL, VT);
2036
2037 return Val;
2038}
2039
2040SDValue SITargetLowering::lowerKernargMemParameter(
2041 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2042 uint64_t Offset, Align Alignment, bool Signed,
2043 const ISD::InputArg *Arg) const {
2044 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2045
2046 // Try to avoid using an extload by loading earlier than the argument address,
2047 // and extracting the relevant bits. The load should hopefully be merged with
2048 // the load for the previous argument.
2049 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2050 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2051 int64_t AlignDownOffset = alignDown(Value: Offset, Align: 4);
2052 int64_t OffsetDiff = Offset - AlignDownOffset;
2053
2054 EVT IntVT = MemVT.changeTypeToInteger();
2055
2056 // TODO: If we passed in the base kernel offset we could have a better
2057 // alignment than 4, but we don't really need it.
2058 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset: AlignDownOffset);
2059 SDValue Load = DAG.getLoad(VT: MVT::i32, dl: SL, Chain, Ptr, PtrInfo, Alignment: Align(4),
2060 MMOFlags: MachineMemOperand::MODereferenceable |
2061 MachineMemOperand::MOInvariant);
2062
2063 SDValue ShiftAmt = DAG.getConstant(Val: OffsetDiff * 8, DL: SL, VT: MVT::i32);
2064 SDValue Extract = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: Load, N2: ShiftAmt);
2065
2066 SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: IntVT, Operand: Extract);
2067 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MemVT, Operand: ArgVal);
2068 ArgVal = convertArgType(DAG, VT, MemVT, SL, Val: ArgVal, Signed, Arg);
2069
2070
2071 return DAG.getMergeValues(Ops: { ArgVal, Load.getValue(R: 1) }, dl: SL);
2072 }
2073
2074 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2075 SDValue Load = DAG.getLoad(VT: MemVT, dl: SL, Chain, Ptr, PtrInfo, Alignment,
2076 MMOFlags: MachineMemOperand::MODereferenceable |
2077 MachineMemOperand::MOInvariant);
2078
2079 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Val: Load, Signed, Arg);
2080 return DAG.getMergeValues(Ops: { Val, Load.getValue(R: 1) }, dl: SL);
2081}
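// Worked example of the sub-dword path above (editor's addition): an i16
// kernel argument at byte offset 2 with 2-byte alignment is loaded as the
// dword at offset 0 (AlignDownOffset), shifted right by OffsetDiff * 8 == 16
// bits, truncated to i16, and then converted back to the declared VT, so no
// extending scalar load is ever emitted for the kernarg segment.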
2082
2083SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
2084 const SDLoc &SL, SDValue Chain,
2085 const ISD::InputArg &Arg) const {
2086 MachineFunction &MF = DAG.getMachineFunction();
2087 MachineFrameInfo &MFI = MF.getFrameInfo();
2088
2089 if (Arg.Flags.isByVal()) {
2090 unsigned Size = Arg.Flags.getByValSize();
2091 int FrameIdx = MFI.CreateFixedObject(Size, SPOffset: VA.getLocMemOffset(), IsImmutable: false);
2092 return DAG.getFrameIndex(FI: FrameIdx, VT: MVT::i32);
2093 }
2094
2095 unsigned ArgOffset = VA.getLocMemOffset();
2096 unsigned ArgSize = VA.getValVT().getStoreSize();
2097
2098 int FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset, IsImmutable: true);
2099
2100 // Create load nodes to retrieve arguments from the stack.
2101 SDValue FIN = DAG.getFrameIndex(FI, VT: MVT::i32);
2102 SDValue ArgValue;
2103
2104 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2105 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2106 MVT MemVT = VA.getValVT();
2107
2108 switch (VA.getLocInfo()) {
2109 default:
2110 break;
2111 case CCValAssign::BCvt:
2112 MemVT = VA.getLocVT();
2113 break;
2114 case CCValAssign::SExt:
2115 ExtType = ISD::SEXTLOAD;
2116 break;
2117 case CCValAssign::ZExt:
2118 ExtType = ISD::ZEXTLOAD;
2119 break;
2120 case CCValAssign::AExt:
2121 ExtType = ISD::EXTLOAD;
2122 break;
2123 }
2124
2125 ArgValue = DAG.getExtLoad(
2126 ExtType, dl: SL, VT: VA.getLocVT(), Chain, Ptr: FIN,
2127 PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI),
2128 MemVT);
2129 return ArgValue;
2130}
2131
2132SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
2133 const SIMachineFunctionInfo &MFI,
2134 EVT VT,
2135 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
2136 const ArgDescriptor *Reg = nullptr;
2137 const TargetRegisterClass *RC;
2138 LLT Ty;
2139
2140 CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
2141 const ArgDescriptor WorkGroupIDX =
2142 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP9);
2143 // If GridZ is not programmed in an entry function then the hardware will set
2144 // it to all zeros, so there is no need to mask the GridY value in the low
2145 // order bits.
2146 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2147 Reg: AMDGPU::TTMP7,
2148 Mask: AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2149 const ArgDescriptor WorkGroupIDZ =
2150 ArgDescriptor::createRegister(Reg: AMDGPU::TTMP7, Mask: 0xFFFF0000u);
2151 if (Subtarget->hasArchitectedSGPRs() &&
2152 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
2153 switch (PVID) {
2154 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
2155 Reg = &WorkGroupIDX;
2156 RC = &AMDGPU::SReg_32RegClass;
2157 Ty = LLT::scalar(SizeInBits: 32);
2158 break;
2159 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
2160 Reg = &WorkGroupIDY;
2161 RC = &AMDGPU::SReg_32RegClass;
2162 Ty = LLT::scalar(SizeInBits: 32);
2163 break;
2164 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
2165 Reg = &WorkGroupIDZ;
2166 RC = &AMDGPU::SReg_32RegClass;
2167 Ty = LLT::scalar(SizeInBits: 32);
2168 break;
2169 default:
2170 break;
2171 }
2172 }
2173
2174 if (!Reg)
2175 std::tie(args&: Reg, args&: RC, args&: Ty) = MFI.getPreloadedValue(Value: PVID);
2176 if (!Reg) {
2177 if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
2178 // It's possible for a kernarg intrinsic call to appear in a kernel with
2179 // no allocated segment, in which case we do not add the user sgpr
2180 // argument, so just return null.
2181 return DAG.getConstant(Val: 0, DL: SDLoc(), VT);
2182 }
2183
2184 // It's undefined behavior if a function marked with the amdgpu-no-*
2185 // attributes uses the corresponding intrinsic.
2186 return DAG.getUNDEF(VT);
2187 }
2188
2189 return loadInputValue(DAG, RC, VT, SL: SDLoc(DAG.getEntryNode()), Arg: *Reg);
2190}
2191
2192static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
2193 CallingConv::ID CallConv,
2194 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2195 FunctionType *FType,
2196 SIMachineFunctionInfo *Info) {
2197 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2198 const ISD::InputArg *Arg = &Ins[I];
2199
2200 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2201 "vector type argument should have been split");
2202
2203 // First check if it's a PS input addr.
2204 if (CallConv == CallingConv::AMDGPU_PS &&
2205 !Arg->Flags.isInReg() && PSInputNum <= 15) {
2206 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(Index: PSInputNum);
2207
2208 // Inconveniently only the first part of the split is marked as isSplit,
2209 // so skip to the end. We only want to increment PSInputNum once for the
2210 // entire split argument.
2211 if (Arg->Flags.isSplit()) {
2212 while (!Arg->Flags.isSplitEnd()) {
2213 assert((!Arg->VT.isVector() ||
2214 Arg->VT.getScalarSizeInBits() == 16) &&
2215 "unexpected vector split in ps argument type");
2216 if (!SkipArg)
2217 Splits.push_back(Elt: *Arg);
2218 Arg = &Ins[++I];
2219 }
2220 }
2221
2222 if (SkipArg) {
2223 // We can safely skip PS inputs.
2224 Skipped.set(Arg->getOrigArgIndex());
2225 ++PSInputNum;
2226 continue;
2227 }
2228
2229 Info->markPSInputAllocated(Index: PSInputNum);
2230 if (Arg->Used)
2231 Info->markPSInputEnabled(Index: PSInputNum);
2232
2233 ++PSInputNum;
2234 }
2235
2236 Splits.push_back(Elt: *Arg);
2237 }
2238}
2239
2240// Allocate special inputs passed in VGPRs.
2241void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
2242 MachineFunction &MF,
2243 const SIRegisterInfo &TRI,
2244 SIMachineFunctionInfo &Info) const {
2245 const LLT S32 = LLT::scalar(SizeInBits: 32);
2246 MachineRegisterInfo &MRI = MF.getRegInfo();
2247
2248 if (Info.hasWorkItemIDX()) {
2249 Register Reg = AMDGPU::VGPR0;
2250 MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);
2251
2252 CCInfo.AllocateReg(Reg);
2253 unsigned Mask = (Subtarget->hasPackedTID() &&
2254 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2255 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2256 }
2257
2258 if (Info.hasWorkItemIDY()) {
2259 assert(Info.hasWorkItemIDX());
2260 if (Subtarget->hasPackedTID()) {
2261 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg: AMDGPU::VGPR0,
2262 Mask: 0x3ff << 10));
2263 } else {
2264 unsigned Reg = AMDGPU::VGPR1;
2265 MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);
2266
2267 CCInfo.AllocateReg(Reg);
2268 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2269 }
2270 }
2271
2272 if (Info.hasWorkItemIDZ()) {
2273 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2274 if (Subtarget->hasPackedTID()) {
2275 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg: AMDGPU::VGPR0,
2276 Mask: 0x3ff << 20));
2277 } else {
2278 unsigned Reg = AMDGPU::VGPR2;
2279 MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);
2280
2281 CCInfo.AllocateReg(Reg);
2282 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2283 }
2284 }
2285}
2286
2287// Try to allocate a VGPR at the end of the argument list, or if no argument
2288 // VGPRs are left, allocate a stack slot.
2289 // If \p Mask is given, it indicates the bitfield position in the register.
2290 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2291static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2292 ArgDescriptor Arg = ArgDescriptor()) {
2293 if (Arg.isSet())
2294 return ArgDescriptor::createArg(Arg, Mask);
2295
2296 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2297 unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgVGPRs);
2298 if (RegIdx == ArgVGPRs.size()) {
2299 // Spill to stack required.
2300 int64_t Offset = CCInfo.AllocateStack(Size: 4, Alignment: Align(4));
2301
2302 return ArgDescriptor::createStack(Offset, Mask);
2303 }
2304
2305 unsigned Reg = ArgVGPRs[RegIdx];
2306 Reg = CCInfo.AllocateReg(Reg);
2307 assert(Reg != AMDGPU::NoRegister);
2308
2309 MachineFunction &MF = CCInfo.getMachineFunction();
2310 Register LiveInVReg = MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass);
2311 MF.getRegInfo().setType(VReg: LiveInVReg, Ty: LLT::scalar(SizeInBits: 32));
2312 return ArgDescriptor::createRegister(Reg, Mask);
2313}
2314
2315static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2316 const TargetRegisterClass *RC,
2317 unsigned NumArgRegs) {
2318 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2319 unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgSGPRs);
2320 if (RegIdx == ArgSGPRs.size())
2321 report_fatal_error(reason: "ran out of SGPRs for arguments");
2322
2323 unsigned Reg = ArgSGPRs[RegIdx];
2324 Reg = CCInfo.AllocateReg(Reg);
2325 assert(Reg != AMDGPU::NoRegister);
2326
2327 MachineFunction &MF = CCInfo.getMachineFunction();
2328 MF.addLiveIn(PReg: Reg, RC);
2329 return ArgDescriptor::createRegister(Reg);
2330}
2331
2332// If this has a fixed position, we still should allocate the register in the
2333// CCInfo state. Technically we could get away with this for values passed
2334// outside of the normal argument range.
2335static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2336 const TargetRegisterClass *RC,
2337 MCRegister Reg) {
2338 Reg = CCInfo.AllocateReg(Reg);
2339 assert(Reg != AMDGPU::NoRegister);
2340 MachineFunction &MF = CCInfo.getMachineFunction();
2341 MF.addLiveIn(PReg: Reg, RC);
2342}
2343
2344static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2345 if (Arg) {
2346 allocateFixedSGPRInputImpl(CCInfo, RC: &AMDGPU::SGPR_32RegClass,
2347 Reg: Arg.getRegister());
2348 } else
2349 Arg = allocateSGPR32InputImpl(CCInfo, RC: &AMDGPU::SGPR_32RegClass, NumArgRegs: 32);
2350}
2351
2352static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2353 if (Arg) {
2354 allocateFixedSGPRInputImpl(CCInfo, RC: &AMDGPU::SGPR_64RegClass,
2355 Reg: Arg.getRegister());
2356 } else
2357 Arg = allocateSGPR32InputImpl(CCInfo, RC: &AMDGPU::SGPR_64RegClass, NumArgRegs: 16);
2358}
2359
2360/// Allocate implicit function VGPR arguments at the end of allocated user
2361/// arguments.
2362void SITargetLowering::allocateSpecialInputVGPRs(
2363 CCState &CCInfo, MachineFunction &MF,
2364 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2365 const unsigned Mask = 0x3ff;
2366 ArgDescriptor Arg;
2367
2368 if (Info.hasWorkItemIDX()) {
2369 Arg = allocateVGPR32Input(CCInfo, Mask);
2370 Info.setWorkItemIDX(Arg);
2371 }
2372
2373 if (Info.hasWorkItemIDY()) {
2374 Arg = allocateVGPR32Input(CCInfo, Mask: Mask << 10, Arg);
2375 Info.setWorkItemIDY(Arg);
2376 }
2377
2378 if (Info.hasWorkItemIDZ())
2379 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask: Mask << 20, Arg));
2380}
2381
2382/// Allocate implicit function VGPR arguments in fixed registers.
2383void SITargetLowering::allocateSpecialInputVGPRsFixed(
2384 CCState &CCInfo, MachineFunction &MF,
2385 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2386 Register Reg = CCInfo.AllocateReg(Reg: AMDGPU::VGPR31);
2387 if (!Reg)
2388 report_fatal_error(reason: "failed to allocate VGPR for implicit arguments");
2389
2390 const unsigned Mask = 0x3ff;
2391 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2392 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask: Mask << 10));
2393 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask: Mask << 20));
2394}
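// Layout illustration (editor's addition): with the fixed-register convention
// above, all three workitem IDs are packed into VGPR31:
//
//   bits [9:0]   workitem ID X   (mask 0x3ff)
//   bits [19:10] workitem ID Y   (mask 0x3ff << 10)
//   bits [29:20] workitem ID Z   (mask 0x3ff << 20)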
2395
2396void SITargetLowering::allocateSpecialInputSGPRs(
2397 CCState &CCInfo,
2398 MachineFunction &MF,
2399 const SIRegisterInfo &TRI,
2400 SIMachineFunctionInfo &Info) const {
2401 auto &ArgInfo = Info.getArgInfo();
2402 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2403
2404 // TODO: Unify handling with private memory pointers.
2405 if (UserSGPRInfo.hasDispatchPtr())
2406 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchPtr);
2407
2408 const Module *M = MF.getFunction().getParent();
2409 if (UserSGPRInfo.hasQueuePtr() &&
2410 AMDGPU::getAMDHSACodeObjectVersion(M: *M) < AMDGPU::AMDHSA_COV5)
2411 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.QueuePtr);
2412
2413 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2414 // constant offset from the kernarg segment.
2415 if (Info.hasImplicitArgPtr())
2416 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.ImplicitArgPtr);
2417
2418 if (UserSGPRInfo.hasDispatchID())
2419 allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchID);
2420
2421 // flat_scratch_init is not applicable for non-kernel functions.
2422
2423 if (Info.hasWorkGroupIDX())
2424 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDX);
2425
2426 if (Info.hasWorkGroupIDY())
2427 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDY);
2428
2429 if (Info.hasWorkGroupIDZ())
2430 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDZ);
2431
2432 if (Info.hasLDSKernelId())
2433 allocateSGPR32Input(CCInfo, Arg&: ArgInfo.LDSKernelId);
2434}
2435
2436// Allocate special inputs passed in user SGPRs.
2437void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2438 MachineFunction &MF,
2439 const SIRegisterInfo &TRI,
2440 SIMachineFunctionInfo &Info) const {
2441 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2442 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2443 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2444 MF.addLiveIn(PReg: ImplicitBufferPtrReg, RC: &AMDGPU::SGPR_64RegClass);
2445 CCInfo.AllocateReg(Reg: ImplicitBufferPtrReg);
2446 }
2447
2448 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2449 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2450 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2451 MF.addLiveIn(PReg: PrivateSegmentBufferReg, RC: &AMDGPU::SGPR_128RegClass);
2452 CCInfo.AllocateReg(Reg: PrivateSegmentBufferReg);
2453 }
2454
2455 if (UserSGPRInfo.hasDispatchPtr()) {
2456 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2457 MF.addLiveIn(PReg: DispatchPtrReg, RC: &AMDGPU::SGPR_64RegClass);
2458 CCInfo.AllocateReg(Reg: DispatchPtrReg);
2459 }
2460
2461 const Module *M = MF.getFunction().getParent();
2462 if (UserSGPRInfo.hasQueuePtr() &&
2463 AMDGPU::getAMDHSACodeObjectVersion(M: *M) < AMDGPU::AMDHSA_COV5) {
2464 Register QueuePtrReg = Info.addQueuePtr(TRI);
2465 MF.addLiveIn(PReg: QueuePtrReg, RC: &AMDGPU::SGPR_64RegClass);
2466 CCInfo.AllocateReg(Reg: QueuePtrReg);
2467 }
2468
2469 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2470 MachineRegisterInfo &MRI = MF.getRegInfo();
2471 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2472 CCInfo.AllocateReg(Reg: InputPtrReg);
2473
2474 Register VReg = MF.addLiveIn(PReg: InputPtrReg, RC: &AMDGPU::SGPR_64RegClass);
2475 MRI.setType(VReg, Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
2476 }
2477
2478 if (UserSGPRInfo.hasDispatchID()) {
2479 Register DispatchIDReg = Info.addDispatchID(TRI);
2480 MF.addLiveIn(PReg: DispatchIDReg, RC: &AMDGPU::SGPR_64RegClass);
2481 CCInfo.AllocateReg(Reg: DispatchIDReg);
2482 }
2483
2484 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2485 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2486 MF.addLiveIn(PReg: FlatScratchInitReg, RC: &AMDGPU::SGPR_64RegClass);
2487 CCInfo.AllocateReg(Reg: FlatScratchInitReg);
2488 }
2489
2490 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2491 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2492 MF.addLiveIn(PReg: PrivateSegmentSizeReg, RC: &AMDGPU::SGPR_32RegClass);
2493 CCInfo.AllocateReg(Reg: PrivateSegmentSizeReg);
2494 }
2495
2496 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2497 // these from the dispatch pointer.
2498}
2499
2500 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2501 // sequential, starting from the first argument.
2502void SITargetLowering::allocatePreloadKernArgSGPRs(
2503 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2504 const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
2505 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2506 Function &F = MF.getFunction();
2507 unsigned LastExplicitArgOffset =
2508 MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2509 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2510 bool InPreloadSequence = true;
2511 unsigned InIdx = 0;
2512 for (auto &Arg : F.args()) {
2513 if (!InPreloadSequence || !Arg.hasInRegAttr())
2514 break;
2515
2516 int ArgIdx = Arg.getArgNo();
2517 // Don't preload non-original args or parts not in the current preload
2518 // sequence.
2519 if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2520 (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2521 break;
2522
2523 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2524 (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2525 InIdx++) {
2526 assert(ArgLocs[ArgIdx].isMemLoc());
2527 auto &ArgLoc = ArgLocs[InIdx];
2528 const Align KernelArgBaseAlign = Align(16);
2529 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2530 Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset: ArgOffset);
2531 unsigned NumAllocSGPRs =
2532 alignTo(Value: ArgLoc.getLocVT().getFixedSizeInBits(), Align: 32) / 32;
2533
2534 // Arg is preloaded into the previous SGPR.
2535 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2536 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2537 Elt: Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2538 continue;
2539 }
2540
2541 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2542 unsigned PaddingSGPRs = alignTo(Value: Padding, Align: 4) / 4;
2543 // Check for free user SGPRs for preloading.
2544 if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2545 SGPRInfo.getNumFreeUserSGPRs()) {
2546 InPreloadSequence = false;
2547 break;
2548 }
2549
2550 // Preload this argument.
2551 const TargetRegisterClass *RC =
2552 TRI.getSGPRClassForBitWidth(BitWidth: NumAllocSGPRs * 32);
2553 SmallVectorImpl<MCRegister> *PreloadRegs =
2554 Info.addPreloadedKernArg(TRI, RC, AllocSizeDWord: NumAllocSGPRs, KernArgIdx: InIdx, PaddingSGPRs);
2555
2556 if (PreloadRegs->size() > 1)
2557 RC = &AMDGPU::SGPR_32RegClass;
2558 for (auto &Reg : *PreloadRegs) {
2559 assert(Reg);
2560 MF.addLiveIn(PReg: Reg, RC);
2561 CCInfo.AllocateReg(Reg);
2562 }
2563
2564 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2565 }
2566 }
2567}
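// Worked example (editor's addition; a sketch assuming an HSA-style layout
// where the first explicit argument starts at offset 0): for a kernel taking
// (i32 inreg %a, i64 inreg %b), %a occupies one user SGPR; %b is 8-byte
// aligned and so starts at offset 8, giving Padding = 4 and one padding SGPR
// before the two SGPRs that hold %b. If at any point the free user-SGPR
// budget is exceeded, preloading stops and the remaining arguments are loaded
// from the kernarg segment as usual.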
2568
2569void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
2570 const SIRegisterInfo &TRI,
2571 SIMachineFunctionInfo &Info) const {
2572 // Always allocate this last since it is a synthetic preload.
2573 if (Info.hasLDSKernelId()) {
2574 Register Reg = Info.addLDSKernelId();
2575 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
2576 CCInfo.AllocateReg(Reg);
2577 }
2578}
2579
2580// Allocate special input registers that are initialized per-wave.
2581void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
2582 MachineFunction &MF,
2583 SIMachineFunctionInfo &Info,
2584 CallingConv::ID CallConv,
2585 bool IsShader) const {
2586 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2587 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2588 // Note: user SGPRs are handled by the front-end for graphics shaders
2589 // Pad up the used user SGPRs with dead inputs.
2590
2591 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2592 // before enabling architected SGPRs for workgroup IDs.
2593 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2594
2595 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2596 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2597 // rely on it to reach 16 since if we end up having no stack usage, it will
2598 // not really be added.
2599 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2600 Info.hasWorkGroupIDY() +
2601 Info.hasWorkGroupIDZ() +
2602 Info.hasWorkGroupInfo();
2603 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2604 Register Reg = Info.addReservedUserSGPR();
2605 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
2606 CCInfo.AllocateReg(Reg);
2607 }
2608 }
2609
2610 if (!HasArchitectedSGPRs) {
2611 if (Info.hasWorkGroupIDX()) {
2612 Register Reg = Info.addWorkGroupIDX();
2613 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
2614 CCInfo.AllocateReg(Reg);
2615 }
2616
2617 if (Info.hasWorkGroupIDY()) {
2618 Register Reg = Info.addWorkGroupIDY();
2619 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
2620 CCInfo.AllocateReg(Reg);
2621 }
2622
2623 if (Info.hasWorkGroupIDZ()) {
2624 Register Reg = Info.addWorkGroupIDZ();
2625 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
2626 CCInfo.AllocateReg(Reg);
2627 }
2628 }
2629
2630 if (Info.hasWorkGroupInfo()) {
2631 Register Reg = Info.addWorkGroupInfo();
2632 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
2633 CCInfo.AllocateReg(Reg);
2634 }
2635
2636 if (Info.hasPrivateSegmentWaveByteOffset()) {
2637 // Scratch wave offset passed in system SGPR.
2638 unsigned PrivateSegmentWaveByteOffsetReg;
2639
2640 if (IsShader) {
2641 PrivateSegmentWaveByteOffsetReg =
2642 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2643
2644 // This is true if the scratch wave byte offset doesn't have a fixed
2645 // location.
2646 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2647 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2648 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2649 }
2650 } else
2651 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2652
2653 MF.addLiveIn(PReg: PrivateSegmentWaveByteOffsetReg, RC: &AMDGPU::SGPR_32RegClass);
2654 CCInfo.AllocateReg(Reg: PrivateSegmentWaveByteOffsetReg);
2655 }
2656
2657 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2658 Info.getNumPreloadedSGPRs() >= 16);
2659}
2660
2661static void reservePrivateMemoryRegs(const TargetMachine &TM,
2662 MachineFunction &MF,
2663 const SIRegisterInfo &TRI,
2664 SIMachineFunctionInfo &Info) {
2665 // Now that we've figured out where the scratch register inputs are, see if
2666 // we should reserve the arguments and use them directly.
2667 MachineFrameInfo &MFI = MF.getFrameInfo();
2668 bool HasStackObjects = MFI.hasStackObjects();
2669 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2670
2671 // Record that we know we have non-spill stack objects so we don't need to
2672 // check all stack objects later.
2673 if (HasStackObjects)
2674 Info.setHasNonSpillStackObjects(true);
2675
2676 // Everything live out of a block is spilled with fast regalloc, so it's
2677 // almost certain that spilling will be required.
2678 if (TM.getOptLevel() == CodeGenOptLevel::None)
2679 HasStackObjects = true;
2680
2681 // For now assume stack access is needed in any callee functions, so we need
2682 // the scratch registers to pass in.
2683 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2684
2685 if (!ST.enableFlatScratch()) {
2686 if (RequiresStackAccess && ST.isAmdHsaOrMesa(F: MF.getFunction())) {
2687 // If we have stack objects, we unquestionably need the private buffer
2688 // resource. For the Code Object V2 ABI, this will be the first 4 user
2689 // SGPR inputs. We can reserve those and use them directly.
2690
2691 Register PrivateSegmentBufferReg =
2692 Info.getPreloadedReg(Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
2693 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2694 } else {
2695 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2696 // We tentatively reserve the last registers (skipping the very last ones,
2697 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2698 // we'll replace these with the ones immediately after those which were
2699 // really allocated. In the prologue copies will be inserted from the
2700 // argument to these reserved registers.
2701
2702 // Without HSA, relocations are used for the scratch pointer and the
2703 // buffer resource setup is always inserted in the prologue. Scratch wave
2704 // offset is still in an input SGPR.
2705 Info.setScratchRSrcReg(ReservedBufferReg);
2706 }
2707 }
2708
2709 MachineRegisterInfo &MRI = MF.getRegInfo();
2710
2711 // For entry functions we have to set up the stack pointer if we use it,
2712 // whereas non-entry functions get this "for free". This means there is no
2713 // intrinsic advantage to using S32 over S34 in cases where we do not have
2714 // calls but do need a frame pointer (i.e. if we are requested to have one
2715 // because frame pointer elimination is disabled). To keep things simple we
2716 // only ever use S32 as the call ABI stack pointer, and so using it does not
2717 // imply we need a separate frame pointer.
2718 //
2719 // Try to use s32 as the SP, but move it if it would interfere with input
2720 // arguments. This won't work with calls though.
2721 //
2722 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2723 // registers.
2724 if (!MRI.isLiveIn(Reg: AMDGPU::SGPR32)) {
2725 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2726 } else {
2727 assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
2728
2729 if (MFI.hasCalls())
2730 report_fatal_error(reason: "call in graphics shader with too many input SGPRs");
2731
2732 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2733 if (!MRI.isLiveIn(Reg)) {
2734 Info.setStackPtrOffsetReg(Reg);
2735 break;
2736 }
2737 }
2738
2739 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2740 report_fatal_error(reason: "failed to find register for SP");
2741 }
2742
2743 // hasFP should be accurate for entry functions even before the frame is
2744 // finalized, because it does not rely on the known stack size, only
2745 // properties like whether variable sized objects are present.
2746 if (ST.getFrameLowering()->hasFP(MF)) {
2747 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2748 }
2749}
2750
2751bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
2752 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2753 return !Info->isEntryFunction();
2754}
2755
2756void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
2757
2758}
2759
2760void SITargetLowering::insertCopiesSplitCSR(
2761 MachineBasicBlock *Entry,
2762 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2763 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2764
2765 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(MF: Entry->getParent());
2766 if (!IStart)
2767 return;
2768
2769 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2770 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2771 MachineBasicBlock::iterator MBBI = Entry->begin();
2772 for (const MCPhysReg *I = IStart; *I; ++I) {
2773 const TargetRegisterClass *RC = nullptr;
2774 if (AMDGPU::SReg_64RegClass.contains(Reg: *I))
2775 RC = &AMDGPU::SGPR_64RegClass;
2776 else if (AMDGPU::SReg_32RegClass.contains(Reg: *I))
2777 RC = &AMDGPU::SGPR_32RegClass;
2778 else
2779 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2780
2781 Register NewVR = MRI->createVirtualRegister(RegClass: RC);
2782 // Create copy from CSR to a virtual register.
2783 Entry->addLiveIn(PhysReg: *I);
2784 BuildMI(BB&: *Entry, I: MBBI, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NewVR)
2785 .addReg(RegNo: *I);
2786
2787 // Insert the copy-back instructions right before the terminator.
2788 for (auto *Exit : Exits)
2789 BuildMI(BB&: *Exit, I: Exit->getFirstTerminator(), MIMD: DebugLoc(),
2790 MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: *I)
2791 .addReg(RegNo: NewVR);
2792 }
2793}
2794
2795SDValue SITargetLowering::LowerFormalArguments(
2796 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2797 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2798 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2799 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2800
2801 MachineFunction &MF = DAG.getMachineFunction();
2802 const Function &Fn = MF.getFunction();
2803 FunctionType *FType = MF.getFunction().getFunctionType();
2804 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2805
2806 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CC: CallConv)) {
2807 DiagnosticInfoUnsupported NoGraphicsHSA(
2808 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2809 DAG.getContext()->diagnose(DI: NoGraphicsHSA);
2810 return DAG.getEntryNode();
2811 }
2812
2813 SmallVector<ISD::InputArg, 16> Splits;
2814 SmallVector<CCValAssign, 16> ArgLocs;
2815 BitVector Skipped(Ins.size());
2816 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2817 *DAG.getContext());
2818
2819 bool IsGraphics = AMDGPU::isGraphics(CC: CallConv);
2820 bool IsKernel = AMDGPU::isKernel(CC: CallConv);
2821 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC: CallConv);
2822
2823 if (IsGraphics) {
2824 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2825 assert(!UserSGPRInfo.hasDispatchPtr() &&
2826 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2827 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2828 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2829 (void)UserSGPRInfo;
2830 if (!Subtarget->enableFlatScratch())
2831 assert(!UserSGPRInfo.hasFlatScratchInit());
2832 if ((CallConv != CallingConv::AMDGPU_CS &&
2833 CallConv != CallingConv::AMDGPU_Gfx) ||
2834 !Subtarget->hasArchitectedSGPRs())
2835 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2836 !Info->hasWorkGroupIDZ());
2837 }
2838
2839 if (CallConv == CallingConv::AMDGPU_PS) {
2840 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2841
2842 // At least one interpolation mode must be enabled or else the GPU will
2843 // hang.
2844 //
2845 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2846 // set PSInputAddr, the user wants to enable some bits after the compilation
2847 // based on run-time states. Since we can't know what the final PSInputEna
2848 // will look like, so we shouldn't do anything here and the user should take
2849    // will look like, we shouldn't do anything here and the user should take
2850 //
2851 // Otherwise, the following restrictions apply:
2852 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2853 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2854 // enabled too.
2855 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2856 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(Index: 11))) {
2857 CCInfo.AllocateReg(Reg: AMDGPU::VGPR0);
2858 CCInfo.AllocateReg(Reg: AMDGPU::VGPR1);
2859 Info->markPSInputAllocated(Index: 0);
2860 Info->markPSInputEnabled(Index: 0);
2861 }
2862 if (Subtarget->isAmdPalOS()) {
2863 // For isAmdPalOS, the user does not enable some bits after compilation
2864 // based on run-time states; the register values being generated here are
2865 // the final ones set in hardware. Therefore we need to apply the
2866 // workaround to PSInputAddr and PSInputEnable together. (The case where
2867 // a bit is set in PSInputAddr but not PSInputEnable is where the
2868 // frontend set up an input arg for a particular interpolation mode, but
2869 // nothing uses that input arg. Really we should have an earlier pass
2870 // that removes such an arg.)
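      // As an illustration: if the frontend set PSInputAddr = 0x10 but nothing
      // enabled that input, PsInputBits is 0 and we enable input 4 (the lowest
      // set bit of PSInputAddr) so that at least one mode stays enabled.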
2871 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2872 if ((PsInputBits & 0x7F) == 0 ||
2873 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2874 Info->markPSInputEnabled(Index: llvm::countr_zero(Val: Info->getPSInputAddr()));
2875 }
2876 } else if (IsKernel) {
2877 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2878 } else {
2879 Splits.append(in_start: Ins.begin(), in_end: Ins.end());
2880 }
2881
2882 if (IsKernel)
2883 analyzeFormalArgumentsCompute(State&: CCInfo, Ins);
2884
2885 if (IsEntryFunc) {
2886 allocateSpecialEntryInputVGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
2887 allocateHSAUserSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
2888 if (IsKernel && Subtarget->hasKernargPreload())
2889 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, TRI: *TRI, Info&: *Info);
2890
2891 allocateLDSKernelId(CCInfo, MF, TRI: *TRI, Info&: *Info);
2892 } else if (!IsGraphics) {
2893 // For the fixed ABI, pass workitem IDs in the last argument register.
2894 allocateSpecialInputVGPRsFixed(CCInfo, MF, TRI: *TRI, Info&: *Info);
2895
2896 // FIXME: Sink this into allocateSpecialInputSGPRs
2897 if (!Subtarget->enableFlatScratch())
2898 CCInfo.AllocateReg(Reg: Info->getScratchRSrcReg());
2899
2900 allocateSpecialInputSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
2901 }
2902
2903 if (!IsKernel) {
2904 CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg: isVarArg);
2905 CCInfo.AnalyzeFormalArguments(Ins: Splits, Fn: AssignFn);
2906 }
2907
2908 SmallVector<SDValue, 16> Chains;
2909
2910 // FIXME: This is the minimum kernel argument alignment. We should improve
2911 // this to the maximum alignment of the arguments.
2912 //
2913 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2914 // kern arg offset.
2915 const Align KernelArgBaseAlign = Align(16);
2916
2917 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2918 const ISD::InputArg &Arg = Ins[i];
2919 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2920 InVals.push_back(Elt: DAG.getUNDEF(VT: Arg.VT));
2921 continue;
2922 }
2923
2924 CCValAssign &VA = ArgLocs[ArgIdx++];
2925 MVT VT = VA.getLocVT();
2926
2927 if (IsEntryFunc && VA.isMemLoc()) {
2928 VT = Ins[i].VT;
2929 EVT MemVT = VA.getLocVT();
2930
2931 const uint64_t Offset = VA.getLocMemOffset();
2932 Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset);
2933
2934 if (Arg.Flags.isByRef()) {
2935 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain, Offset);
2936
2937 const GCNTargetMachine &TM =
2938 static_cast<const GCNTargetMachine &>(getTargetMachine());
2939 if (!TM.isNoopAddrSpaceCast(SrcAS: AMDGPUAS::CONSTANT_ADDRESS,
2940 DestAS: Arg.Flags.getPointerAddrSpace())) {
2941 Ptr = DAG.getAddrSpaceCast(dl: DL, VT, Ptr, SrcAS: AMDGPUAS::CONSTANT_ADDRESS,
2942 DestAS: Arg.Flags.getPointerAddrSpace());
2943 }
2944
2945 InVals.push_back(Elt: Ptr);
2946 continue;
2947 }
2948
2949 SDValue NewArg;
2950 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(Val: i)) {
2951 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2952 // In this case the argument is packed into the previous preload SGPR.
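          // For example (hypothetical layout): an i16 argument at kernarg
          // offset 2 shares the dword preloaded for offset 0, so AlignDownOffset
          // is 0, OffsetDiff is 2, and the value is recovered by shifting the
          // preloaded SGPR right by 16 bits and truncating.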
2953 int64_t AlignDownOffset = alignDown(Value: Offset, Align: 4);
2954 int64_t OffsetDiff = Offset - AlignDownOffset;
2955 EVT IntVT = MemVT.changeTypeToInteger();
2956
2957 const SIMachineFunctionInfo *Info =
2958 MF.getInfo<SIMachineFunctionInfo>();
2959 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
2960 Register Reg =
2961 Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs[0];
2962
2963 assert(Reg);
2964 Register VReg = MRI.getLiveInVirtReg(PReg: Reg);
2965 SDValue Copy = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i32);
2966
2967 SDValue ShiftAmt = DAG.getConstant(Val: OffsetDiff * 8, DL, VT: MVT::i32);
2968 SDValue Extract = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: Copy, N2: ShiftAmt);
2969
2970 SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: Extract);
2971 ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MemVT, Operand: ArgVal);
2972 NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: ArgVal,
2973 Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
2974
2975 NewArg = DAG.getMergeValues(Ops: {NewArg, Copy.getValue(R: 1)}, dl: DL);
2976 } else {
2977 const SIMachineFunctionInfo *Info =
2978 MF.getInfo<SIMachineFunctionInfo>();
2979 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
2980 const SmallVectorImpl<MCRegister> &PreloadRegs =
2981 Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs;
2982
2983 SDValue Copy;
2984 if (PreloadRegs.size() == 1) {
2985 Register VReg = MRI.getLiveInVirtReg(PReg: PreloadRegs[0]);
2986 const TargetRegisterClass *RC = MRI.getRegClass(Reg: VReg);
2987 NewArg = DAG.getCopyFromReg(
2988 Chain, dl: DL, Reg: VReg,
2989 VT: EVT::getIntegerVT(Context&: *DAG.getContext(),
2990 BitWidth: TRI->getRegSizeInBits(RC: *RC)));
2991
2992 } else {
2993 // If the kernarg alignment does not match the alignment of the SGPR
2994 // tuple RC that can accommodate this argument, it will be built up
2995            // via copies from the individual SGPRs that the argument was
2996 // preloaded to.
2997 SmallVector<SDValue, 4> Elts;
2998 for (auto Reg : PreloadRegs) {
2999 Register VReg = MRI.getLiveInVirtReg(PReg: Reg);
3000 Copy = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i32);
3001 Elts.push_back(Elt: Copy);
3002 }
3003 NewArg =
3004 DAG.getBuildVector(VT: EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
3005 NumElements: PreloadRegs.size()),
3006 DL, Ops: Elts);
3007 }
3008
3009          // If the argument was preloaded to multiple consecutive 32-bit
3010          // registers because of misalignment between addressable SGPR tuples
3011          // and the argument size, we can still assume that, because of kernarg
3012          // segment alignment restrictions, NewArg's size is the same as MemVT
3013          // and just do a bitcast. If MemVT is less than 32 bits we add a
3014          // truncate, since we cannot preload to less than a single SGPR and
3015          // the MemVT may be smaller.
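          // For example, an i64 argument preloaded into two SGPRs is assembled
          // as a v2i32 build_vector here and then bitcast to i64 below.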
3016 EVT MemVTInt =
3017 EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
3018 if (MemVT.bitsLT(VT: NewArg.getSimpleValueType()))
3019 NewArg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVTInt, Operand: NewArg);
3020
3021 NewArg = DAG.getBitcast(VT: MemVT, V: NewArg);
3022 NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: NewArg,
3023 Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
3024 NewArg = DAG.getMergeValues(Ops: {NewArg, Chain}, dl: DL);
3025 }
3026 } else {
3027 NewArg =
3028 lowerKernargMemParameter(DAG, VT, MemVT, SL: DL, Chain, Offset,
3029 Alignment, Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
3030 }
3031 Chains.push_back(Elt: NewArg.getValue(R: 1));
3032
3033 auto *ParamTy =
3034 dyn_cast<PointerType>(Val: FType->getParamType(i: Ins[i].getOrigArgIndex()));
3035 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3036 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3037 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3038      // On SI, local pointers are just offsets into LDS, so they always fit in
3039      // 16 bits. On CI and newer they could potentially be real pointers, so we
3040      // can't guarantee their size.
3041 NewArg = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: NewArg.getValueType(), N1: NewArg,
3042 N2: DAG.getValueType(MVT::i16));
3043 }
3044
3045 InVals.push_back(Elt: NewArg);
3046 continue;
3047 }
3048 if (!IsEntryFunc && VA.isMemLoc()) {
3049 SDValue Val = lowerStackParameter(DAG, VA, SL: DL, Chain, Arg);
3050 InVals.push_back(Elt: Val);
3051 if (!Arg.Flags.isByVal())
3052 Chains.push_back(Elt: Val.getValue(R: 1));
3053 continue;
3054 }
3055
3056 assert(VA.isRegLoc() && "Parameter must be in a register!");
3057
3058 Register Reg = VA.getLocReg();
3059 const TargetRegisterClass *RC = nullptr;
3060 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3061 RC = &AMDGPU::VGPR_32RegClass;
3062 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3063 RC = &AMDGPU::SGPR_32RegClass;
3064 else
3065 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3066 EVT ValVT = VA.getValVT();
3067
3068 Reg = MF.addLiveIn(PReg: Reg, RC);
3069 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT);
3070
3071 if (Arg.Flags.isSRet()) {
3072 // The return object should be reasonably addressable.
3073
3074      // FIXME: This helps when the return is a real sret. If it is an
3075      // automatically inserted sret (i.e. CanLowerReturn returns false), an
3076 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3077 unsigned NumBits
3078 = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3079 Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT, N1: Val,
3080 N2: DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: NumBits)));
3081 }
3082
3083 // If this is an 8 or 16-bit value, it is really passed promoted
3084 // to 32 bits. Insert an assert[sz]ext to capture this, then
3085 // truncate to the right size.
3086 switch (VA.getLocInfo()) {
3087 case CCValAssign::Full:
3088 break;
3089 case CCValAssign::BCvt:
3090 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ValVT, Operand: Val);
3091 break;
3092 case CCValAssign::SExt:
3093 Val = DAG.getNode(Opcode: ISD::AssertSext, DL, VT, N1: Val,
3094 N2: DAG.getValueType(ValVT));
3095 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ValVT, Operand: Val);
3096 break;
3097 case CCValAssign::ZExt:
3098 Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT, N1: Val,
3099 N2: DAG.getValueType(ValVT));
3100 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ValVT, Operand: Val);
3101 break;
3102 case CCValAssign::AExt:
3103 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ValVT, Operand: Val);
3104 break;
3105 default:
3106 llvm_unreachable("Unknown loc info!");
3107 }
3108
3109 InVals.push_back(Elt: Val);
3110 }
3111
3112 // Start adding system SGPRs.
3113 if (IsEntryFunc)
3114 allocateSystemSGPRs(CCInfo, MF, Info&: *Info, CallConv, IsShader: IsGraphics);
3115
3116  // DAG.getPass() returns nullptr when using the new pass manager.
3117 // TODO: Use DAG.getMFAM() to access analysis result.
3118 if (DAG.getPass()) {
3119 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3120 ArgUsageInfo.setFuncArgInfo(F: Fn, ArgInfo: Info->getArgInfo());
3121 }
3122
3123 unsigned StackArgSize = CCInfo.getStackSize();
3124 Info->setBytesInStackArgArea(StackArgSize);
3125
3126 return Chains.empty() ? Chain :
3127 DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: Chains);
3128}
3129
3130// TODO: If return values can't fit in registers, we should return as many as
3131// possible in registers before passing on stack.
3132bool SITargetLowering::CanLowerReturn(
3133 CallingConv::ID CallConv,
3134 MachineFunction &MF, bool IsVarArg,
3135 const SmallVectorImpl<ISD::OutputArg> &Outs,
3136 LLVMContext &Context) const {
3137 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3138 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3139 // for shaders. Vector types should be explicitly handled by CC.
3140 if (AMDGPU::isEntryFunctionCC(CC: CallConv))
3141 return true;
3142
3143 SmallVector<CCValAssign, 16> RVLocs;
3144 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3145 if (!CCInfo.CheckReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg)))
3146 return false;
3147
3148  // We must use the stack if the return would require unavailable registers.
3149 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3150 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3151 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3152 if (CCInfo.isAllocated(Reg: AMDGPU::VGPR_32RegClass.getRegister(i)))
3153 return false;
3154
3155 return true;
3156}
3157
3158SDValue
3159SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3160 bool isVarArg,
3161 const SmallVectorImpl<ISD::OutputArg> &Outs,
3162 const SmallVectorImpl<SDValue> &OutVals,
3163 const SDLoc &DL, SelectionDAG &DAG) const {
3164 MachineFunction &MF = DAG.getMachineFunction();
3165 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3166
3167 if (AMDGPU::isKernel(CC: CallConv)) {
3168 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3169 OutVals, DL, DAG);
3170 }
3171
3172 bool IsShader = AMDGPU::isShader(CC: CallConv);
3173
3174 Info->setIfReturnsVoid(Outs.empty());
3175 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3176
3177  // CCValAssign - represents the assignment of the return value to a location.
3178 SmallVector<CCValAssign, 48> RVLocs;
3179 SmallVector<ISD::OutputArg, 48> Splits;
3180
3181 // CCState - Info about the registers and stack slots.
3182 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3183 *DAG.getContext());
3184
3185 // Analyze outgoing return values.
3186 CCInfo.AnalyzeReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg: isVarArg));
3187
3188 SDValue Glue;
3189 SmallVector<SDValue, 48> RetOps;
3190 RetOps.push_back(Elt: Chain); // Operand #0 = Chain (updated below)
3191
3192 // Copy the result values into the output registers.
3193 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3194 ++I, ++RealRVLocIdx) {
3195 CCValAssign &VA = RVLocs[I];
3196 assert(VA.isRegLoc() && "Can only return in registers!");
3197 // TODO: Partially return in registers if return values don't fit.
3198 SDValue Arg = OutVals[RealRVLocIdx];
3199
3200 // Copied from other backends.
3201 switch (VA.getLocInfo()) {
3202 case CCValAssign::Full:
3203 break;
3204 case CCValAssign::BCvt:
3205 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
3206 break;
3207 case CCValAssign::SExt:
3208 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3209 break;
3210 case CCValAssign::ZExt:
3211 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3212 break;
3213 case CCValAssign::AExt:
3214 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3215 break;
3216 default:
3217 llvm_unreachable("Unknown loc info!");
3218 }
3219
3220 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: VA.getLocReg(), N: Arg, Glue);
3221 Glue = Chain.getValue(R: 1);
3222 RetOps.push_back(Elt: DAG.getRegister(Reg: VA.getLocReg(), VT: VA.getLocVT()));
3223 }
3224
3225 // FIXME: Does sret work properly?
3226 if (!Info->isEntryFunction()) {
3227 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3228 const MCPhysReg *I =
3229 TRI->getCalleeSavedRegsViaCopy(MF: &DAG.getMachineFunction());
3230 if (I) {
3231 for (; *I; ++I) {
3232 if (AMDGPU::SReg_64RegClass.contains(Reg: *I))
3233 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i64));
3234 else if (AMDGPU::SReg_32RegClass.contains(Reg: *I))
3235 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i32));
3236 else
3237 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3238 }
3239 }
3240 }
3241
3242 // Update chain and glue.
3243 RetOps[0] = Chain;
3244 if (Glue.getNode())
3245 RetOps.push_back(Elt: Glue);
3246
3247 unsigned Opc = AMDGPUISD::ENDPGM;
3248 if (!IsWaveEnd)
3249 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE;
3250 return DAG.getNode(Opcode: Opc, DL, VT: MVT::Other, Ops: RetOps);
3251}
3252
3253SDValue SITargetLowering::LowerCallResult(
3254 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3255 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3256 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3257 SDValue ThisVal) const {
3258 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv, IsVarArg);
3259
3260 // Assign locations to each value returned by this call.
3261 SmallVector<CCValAssign, 16> RVLocs;
3262 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3263 *DAG.getContext());
3264 CCInfo.AnalyzeCallResult(Ins, Fn: RetCC);
3265
3266 // Copy all of the result registers out of their specified physreg.
3267 for (CCValAssign VA : RVLocs) {
3268 SDValue Val;
3269
3270 if (VA.isRegLoc()) {
3271 Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: VA.getLocReg(), VT: VA.getLocVT(), Glue: InGlue);
3272 Chain = Val.getValue(R: 1);
3273 InGlue = Val.getValue(R: 2);
3274 } else if (VA.isMemLoc()) {
3275 report_fatal_error(reason: "TODO: return values in memory");
3276 } else
3277 llvm_unreachable("unknown argument location type");
3278
3279 switch (VA.getLocInfo()) {
3280 case CCValAssign::Full:
3281 break;
3282 case CCValAssign::BCvt:
3283 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: Val);
3284 break;
3285 case CCValAssign::ZExt:
3286 Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: VA.getLocVT(), N1: Val,
3287 N2: DAG.getValueType(VA.getValVT()));
3288 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3289 break;
3290 case CCValAssign::SExt:
3291 Val = DAG.getNode(Opcode: ISD::AssertSext, DL, VT: VA.getLocVT(), N1: Val,
3292 N2: DAG.getValueType(VA.getValVT()));
3293 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3294 break;
3295 case CCValAssign::AExt:
3296 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
3297 break;
3298 default:
3299 llvm_unreachable("Unknown loc info!");
3300 }
3301
3302 InVals.push_back(Elt: Val);
3303 }
3304
3305 return Chain;
3306}
3307
3308// Add code to pass the special inputs required by the features in use, separate
3309// from the explicit user arguments present in the IR.
3310void SITargetLowering::passSpecialInputs(
3311 CallLoweringInfo &CLI,
3312 CCState &CCInfo,
3313 const SIMachineFunctionInfo &Info,
3314 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3315 SmallVectorImpl<SDValue> &MemOpChains,
3316 SDValue Chain) const {
3317 // If we don't have a call site, this was a call inserted by
3318 // legalization. These can never use special inputs.
3319 if (!CLI.CB)
3320 return;
3321
3322 SelectionDAG &DAG = CLI.DAG;
3323 const SDLoc &DL = CLI.DL;
3324 const Function &F = DAG.getMachineFunction().getFunction();
3325
3326 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3327 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3328
3329 const AMDGPUFunctionArgInfo *CalleeArgInfo
3330 = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
3331 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3332    // DAG.getPass() returns nullptr when using the new pass manager.
3333 // TODO: Use DAG.getMFAM() to access analysis result.
3334 if (DAG.getPass()) {
3335 auto &ArgUsageInfo =
3336 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3337 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(F: *CalleeFunc);
3338 }
3339 }
3340
3341 // TODO: Unify with private memory register handling. This is complicated by
3342 // the fact that at least in kernels, the input argument is not necessarily
3343 // in the same location as the input.
3344 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3345 StringLiteral> ImplicitAttrs[] = {
3346 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
3347 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
3348 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
3349 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
3350 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
3351 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
3352 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
3353 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
3354 };
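  // Each entry pairs an implicit ABI input with the attribute that marks it as
  // unused by the callee. When the attribute is absent we must forward the
  // caller's incoming value (or a computed/undef placeholder) to the callee.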
3355
3356 for (auto Attr : ImplicitAttrs) {
3357 const ArgDescriptor *OutgoingArg;
3358 const TargetRegisterClass *ArgRC;
3359 LLT ArgTy;
3360
3361 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
3362
3363 // If the callee does not use the attribute value, skip copying the value.
3364 if (CLI.CB->hasFnAttr(Kind: Attr.second))
3365 continue;
3366
3367 std::tie(args&: OutgoingArg, args&: ArgRC, args&: ArgTy) =
3368 CalleeArgInfo->getPreloadedValue(Value: InputID);
3369 if (!OutgoingArg)
3370 continue;
3371
3372 const ArgDescriptor *IncomingArg;
3373 const TargetRegisterClass *IncomingArgRC;
3374 LLT Ty;
3375 std::tie(args&: IncomingArg, args&: IncomingArgRC, args&: Ty) =
3376 CallerArgInfo.getPreloadedValue(Value: InputID);
3377 assert(IncomingArgRC == ArgRC);
3378
3379 // All special arguments are ints for now.
3380 EVT ArgVT = TRI->getSpillSize(RC: *ArgRC) == 8 ? MVT::i64 : MVT::i32;
3381 SDValue InputReg;
3382
3383 if (IncomingArg) {
3384 InputReg = loadInputValue(DAG, RC: ArgRC, VT: ArgVT, SL: DL, Arg: *IncomingArg);
3385 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3386 // The implicit arg ptr is special because it doesn't have a corresponding
3387 // input for kernels, and is computed from the kernarg segment pointer.
3388 InputReg = getImplicitArgPtr(DAG, SL: DL);
3389 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3390 std::optional<uint32_t> Id =
3391 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3392 if (Id.has_value()) {
3393 InputReg = DAG.getConstant(Val: *Id, DL, VT: ArgVT);
3394 } else {
3395 InputReg = DAG.getUNDEF(VT: ArgVT);
3396 }
3397 } else {
3398      // We may have proven the input wasn't needed, even though the ABI
3399      // requires it. We just need to allocate the register appropriately.
3400 InputReg = DAG.getUNDEF(VT: ArgVT);
3401 }
3402
3403 if (OutgoingArg->isRegister()) {
3404 RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
3405 if (!CCInfo.AllocateReg(Reg: OutgoingArg->getRegister()))
3406 report_fatal_error(reason: "failed to allocate implicit input argument");
3407 } else {
3408 unsigned SpecialArgOffset =
3409 CCInfo.AllocateStack(Size: ArgVT.getStoreSize(), Alignment: Align(4));
3410 SDValue ArgStore = storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg,
3411 Offset: SpecialArgOffset);
3412 MemOpChains.push_back(Elt: ArgStore);
3413 }
3414 }
3415
3416  // Pack workitem IDs into a single register, or pass them as-is if they are
3417  // already packed.
3418 const ArgDescriptor *OutgoingArg;
3419 const TargetRegisterClass *ArgRC;
3420 LLT Ty;
3421
3422 std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
3423 CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3424 if (!OutgoingArg)
3425 std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
3426 CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3427 if (!OutgoingArg)
3428 std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
3429 CalleeArgInfo->getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3430 if (!OutgoingArg)
3431 return;
3432
3433 const ArgDescriptor *IncomingArgX = std::get<0>(
3434 t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X));
3435 const ArgDescriptor *IncomingArgY = std::get<0>(
3436 t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
3437 const ArgDescriptor *IncomingArgZ = std::get<0>(
3438 t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3439
3440 SDValue InputReg;
3441 SDLoc SL;
3442
3443 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-x");
3444 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-y");
3445 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-z");
3446
3447 // If incoming ids are not packed we need to pack them.
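  // The packed layout used below is X in bits [9:0], Y in bits [19:10] and Z in
  // bits [29:20], matching the shift amounts of 10 and 20.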
3448 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3449 NeedWorkItemIDX) {
3450 if (Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 0) != 0) {
3451 InputReg = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgX);
3452 } else {
3453 InputReg = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
3454 }
3455 }
3456
3457 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3458 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 1) != 0) {
3459 SDValue Y = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgY);
3460 Y = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Y,
3461 N2: DAG.getShiftAmountConstant(Val: 10, VT: MVT::i32, DL: SL));
3462 InputReg = InputReg.getNode() ?
3463 DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: InputReg, N2: Y) : Y;
3464 }
3465
3466 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3467 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 2) != 0) {
3468 SDValue Z = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgZ);
3469 Z = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Z,
3470 N2: DAG.getShiftAmountConstant(Val: 20, VT: MVT::i32, DL: SL));
3471 InputReg = InputReg.getNode() ?
3472 DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: InputReg, N2: Z) : Z;
3473 }
3474
3475 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3476 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3477 // We're in a situation where the outgoing function requires the workitem
3478      // ID, but the calling function does not have it (e.g. a graphics function
3479 // calling a C calling convention function). This is illegal, but we need
3480 // to produce something.
3481 InputReg = DAG.getUNDEF(VT: MVT::i32);
3482 } else {
3483      // Workitem IDs are already packed; any present incoming argument will
3484      // carry all required fields.
3485 ArgDescriptor IncomingArg = ArgDescriptor::createArg(
3486 Arg: IncomingArgX ? *IncomingArgX :
3487 IncomingArgY ? *IncomingArgY :
3488 *IncomingArgZ, Mask: ~0u);
3489 InputReg = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: IncomingArg);
3490 }
3491 }
3492
3493 if (OutgoingArg->isRegister()) {
3494 if (InputReg)
3495 RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
3496
3497 CCInfo.AllocateReg(Reg: OutgoingArg->getRegister());
3498 } else {
3499 unsigned SpecialArgOffset = CCInfo.AllocateStack(Size: 4, Alignment: Align(4));
3500 if (InputReg) {
3501 SDValue ArgStore = storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg,
3502 Offset: SpecialArgOffset);
3503 MemOpChains.push_back(Elt: ArgStore);
3504 }
3505 }
3506}
3507
3508static bool canGuaranteeTCO(CallingConv::ID CC) {
3509 return CC == CallingConv::Fast;
3510}
3511
3512/// Return true if we might ever do TCO for calls with this calling convention.
3513static bool mayTailCallThisCC(CallingConv::ID CC) {
3514 switch (CC) {
3515 case CallingConv::C:
3516 case CallingConv::AMDGPU_Gfx:
3517 return true;
3518 default:
3519 return canGuaranteeTCO(CC);
3520 }
3521}
3522
3523bool SITargetLowering::isEligibleForTailCallOptimization(
3524 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3525 const SmallVectorImpl<ISD::OutputArg> &Outs,
3526 const SmallVectorImpl<SDValue> &OutVals,
3527 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3528 if (AMDGPU::isChainCC(CC: CalleeCC))
3529 return true;
3530
3531 if (!mayTailCallThisCC(CC: CalleeCC))
3532 return false;
3533
3534  // For a divergent call target, we need to do a waterfall loop over the
3535  // possible callees, which precludes us from using a simple jump.
3536 if (Callee->isDivergent())
3537 return false;
3538
3539 MachineFunction &MF = DAG.getMachineFunction();
3540 const Function &CallerF = MF.getFunction();
3541 CallingConv::ID CallerCC = CallerF.getCallingConv();
3542 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3543 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3544
3545  // Kernels aren't callable and don't have a live-in return address, so it
3546  // doesn't make sense to do a tail call with entry functions.
3547 if (!CallerPreserved)
3548 return false;
3549
3550 bool CCMatch = CallerCC == CalleeCC;
3551
3552 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3553 if (canGuaranteeTCO(CC: CalleeCC) && CCMatch)
3554 return true;
3555 return false;
3556 }
3557
3558 // TODO: Can we handle var args?
3559 if (IsVarArg)
3560 return false;
3561
3562 for (const Argument &Arg : CallerF.args()) {
3563 if (Arg.hasByValAttr())
3564 return false;
3565 }
3566
3567 LLVMContext &Ctx = *DAG.getContext();
3568
3569 // Check that the call results are passed in the same way.
3570 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C&: Ctx, Ins,
3571 CalleeFn: CCAssignFnForCall(CC: CalleeCC, IsVarArg),
3572 CallerFn: CCAssignFnForCall(CC: CallerCC, IsVarArg)))
3573 return false;
3574
3575 // The callee has to preserve all registers the caller needs to preserve.
3576 if (!CCMatch) {
3577 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3578 if (!TRI->regmaskSubsetEqual(mask0: CallerPreserved, mask1: CalleePreserved))
3579 return false;
3580 }
3581
3582 // Nothing more to check if the callee is taking no arguments.
3583 if (Outs.empty())
3584 return true;
3585
3586 SmallVector<CCValAssign, 16> ArgLocs;
3587 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3588
3589 CCInfo.AnalyzeCallOperands(Outs, Fn: CCAssignFnForCall(CC: CalleeCC, IsVarArg));
3590
3591 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3592  // If the stack arguments for this call do not fit into our own save area,
3593  // then the call cannot be made a tail call.
3594  // TODO: Is this really necessary?
3595 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3596 return false;
3597
3598 const MachineRegisterInfo &MRI = MF.getRegInfo();
3599 return parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals);
3600}
3601
3602bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3603 if (!CI->isTailCall())
3604 return false;
3605
3606 const Function *ParentFn = CI->getParent()->getParent();
3607 if (AMDGPU::isEntryFunctionCC(CC: ParentFn->getCallingConv()))
3608 return false;
3609 return true;
3610}
3611
3612// The wave scratch offset register is used as the global base pointer.
3613SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3614 SmallVectorImpl<SDValue> &InVals) const {
3615 CallingConv::ID CallConv = CLI.CallConv;
3616 bool IsChainCallConv = AMDGPU::isChainCC(CC: CallConv);
3617
3618 SelectionDAG &DAG = CLI.DAG;
3619
3620 TargetLowering::ArgListEntry RequestedExec;
3621 if (IsChainCallConv) {
3622 // The last argument should be the value that we need to put in EXEC.
3623 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3624 // don't treat it like the rest of the arguments.
3625 RequestedExec = CLI.Args.back();
3626 assert(RequestedExec.Node && "No node for EXEC");
3627
3628 if (!RequestedExec.Ty->isIntegerTy(Bitwidth: Subtarget->getWavefrontSize()))
3629 return lowerUnhandledCall(CLI, InVals, Reason: "Invalid value for EXEC");
3630
3631 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3632 CLI.Outs.pop_back();
3633 CLI.OutVals.pop_back();
3634
3635 if (RequestedExec.Ty->isIntegerTy(Bitwidth: 64)) {
3636 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3637 CLI.Outs.pop_back();
3638 CLI.OutVals.pop_back();
3639 }
3640
3641 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3642 "Haven't popped all the pieces of the EXEC mask");
3643 }
3644
3645 const SDLoc &DL = CLI.DL;
3646 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3647 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3648 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3649 SDValue Chain = CLI.Chain;
3650 SDValue Callee = CLI.Callee;
3651 bool &IsTailCall = CLI.IsTailCall;
3652 bool IsVarArg = CLI.IsVarArg;
3653 bool IsSibCall = false;
3654 MachineFunction &MF = DAG.getMachineFunction();
3655
3656 if (Callee.isUndef() || isNullConstant(V: Callee)) {
3657 if (!CLI.IsTailCall) {
3658 for (ISD::InputArg &Arg : CLI.Ins)
3659 InVals.push_back(Elt: DAG.getUNDEF(VT: Arg.VT));
3660 }
3661
3662 return Chain;
3663 }
3664
3665 if (IsVarArg) {
3666 return lowerUnhandledCall(CLI, InVals,
3667 Reason: "unsupported call to variadic function ");
3668 }
3669
3670 if (!CLI.CB)
3671 report_fatal_error(reason: "unsupported libcall legalization");
3672
3673 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3674 return lowerUnhandledCall(CLI, InVals,
3675 Reason: "unsupported required tail call to function ");
3676 }
3677
3678 if (IsTailCall) {
3679 IsTailCall = isEligibleForTailCallOptimization(
3680 Callee, CalleeCC: CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3681 if (!IsTailCall &&
3682 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3683 report_fatal_error(reason: "failed to perform tail call elimination on a call "
3684 "site marked musttail or on llvm.amdgcn.cs.chain");
3685 }
3686
3687 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3688
3689 // A sibling call is one where we're under the usual C ABI and not planning
3690 // to change that but can still do a tail call:
3691 if (!TailCallOpt && IsTailCall)
3692 IsSibCall = true;
3693
3694 if (IsTailCall)
3695 ++NumTailCalls;
3696 }
3697
3698 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3699 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3700 SmallVector<SDValue, 8> MemOpChains;
3701
3702 // Analyze operands of the call, assigning locations to each operand.
3703 SmallVector<CCValAssign, 16> ArgLocs;
3704 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3705 CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg);
3706
3707 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CC: CallConv)) {
3708 // With a fixed ABI, allocate fixed registers before user arguments.
3709 passSpecialInputs(CLI, CCInfo, Info: *Info, RegsToPass, MemOpChains, Chain);
3710 }
3711
3712 CCInfo.AnalyzeCallOperands(Outs, Fn: AssignFn);
3713
3714 // Get a count of how many bytes are to be pushed on the stack.
3715 unsigned NumBytes = CCInfo.getStackSize();
3716
3717 if (IsSibCall) {
3718 // Since we're not changing the ABI to make this a tail call, the memory
3719 // operands are already available in the caller's incoming argument space.
3720 NumBytes = 0;
3721 }
3722
3723 // FPDiff is the byte offset of the call's argument area from the callee's.
3724 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3725 // by this amount for a tail call. In a sibling call it must be 0 because the
3726 // caller will deallocate the entire stack and the callee still expects its
3727 // arguments to begin at SP+0. Completely unused for non-tail calls.
3728 int32_t FPDiff = 0;
3729 MachineFrameInfo &MFI = MF.getFrameInfo();
3730
3731 // Adjust the stack pointer for the new arguments...
3732 // These operations are automatically eliminated by the prolog/epilog pass
3733 if (!IsSibCall)
3734 Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL);
3735
3736 if (!IsSibCall || IsChainCallConv) {
3737 if (!Subtarget->enableFlatScratch()) {
3738 SmallVector<SDValue, 4> CopyFromChains;
3739
3740 // In the HSA case, this should be an identity copy.
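      // The resource descriptor goes in SGPR0-3 for the normal calling
      // conventions and in SGPR48-51 for the llvm.amdgcn.cs.chain conventions.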
3741 SDValue ScratchRSrcReg
3742 = DAG.getCopyFromReg(Chain, dl: DL, Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);
3743 RegsToPass.emplace_back(Args: IsChainCallConv
3744 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3745 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3746 Args&: ScratchRSrcReg);
3747 CopyFromChains.push_back(Elt: ScratchRSrcReg.getValue(R: 1));
3748 Chain = DAG.getTokenFactor(DL, Vals&: CopyFromChains);
3749 }
3750 }
3751
3752 MVT PtrVT = MVT::i32;
3753
3754 // Walk the register/memloc assignments, inserting copies/loads.
3755 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3756 CCValAssign &VA = ArgLocs[i];
3757 SDValue Arg = OutVals[i];
3758
3759 // Promote the value if needed.
3760 switch (VA.getLocInfo()) {
3761 case CCValAssign::Full:
3762 break;
3763 case CCValAssign::BCvt:
3764 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
3765 break;
3766 case CCValAssign::ZExt:
3767 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3768 break;
3769 case CCValAssign::SExt:
3770 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3771 break;
3772 case CCValAssign::AExt:
3773 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3774 break;
3775 case CCValAssign::FPExt:
3776 Arg = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
3777 break;
3778 default:
3779 llvm_unreachable("Unknown loc info!");
3780 }
3781
3782 if (VA.isRegLoc()) {
3783 RegsToPass.push_back(Elt: std::pair(VA.getLocReg(), Arg));
3784 } else {
3785 assert(VA.isMemLoc());
3786
3787 SDValue DstAddr;
3788 MachinePointerInfo DstInfo;
3789
3790 unsigned LocMemOffset = VA.getLocMemOffset();
3791 int32_t Offset = LocMemOffset;
3792
3793 SDValue PtrOff = DAG.getConstant(Val: Offset, DL, VT: PtrVT);
3794 MaybeAlign Alignment;
3795
3796 if (IsTailCall) {
3797 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3798 unsigned OpSize = Flags.isByVal() ?
3799 Flags.getByValSize() : VA.getValVT().getStoreSize();
3800
3801 // FIXME: We can have better than the minimum byval required alignment.
3802 Alignment =
3803 Flags.isByVal()
3804 ? Flags.getNonZeroByValAlign()
3805 : commonAlignment(A: Subtarget->getStackAlignment(), Offset);
3806
3807 Offset = Offset + FPDiff;
3808 int FI = MFI.CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true);
3809
3810 DstAddr = DAG.getFrameIndex(FI, VT: PtrVT);
3811 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3812
3813 // Make sure any stack arguments overlapping with where we're storing
3814 // are loaded before this eventual operation. Otherwise they'll be
3815 // clobbered.
3816
3817 // FIXME: Why is this really necessary? This seems to just result in a
3818 // lot of code to copy the stack and write them back to the same
3819 // locations, which are supposed to be immutable?
3820 Chain = addTokenForArgument(Chain, DAG, MFI, ClobberedFI: FI);
3821 } else {
3822 // Stores to the argument stack area are relative to the stack pointer.
3823 SDValue SP = DAG.getCopyFromReg(Chain, dl: DL, Reg: Info->getStackPtrOffsetReg(),
3824 VT: MVT::i32);
3825 DstAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SP, N2: PtrOff);
3826 DstInfo = MachinePointerInfo::getStack(MF, Offset: LocMemOffset);
3827 Alignment =
3828 commonAlignment(A: Subtarget->getStackAlignment(), Offset: LocMemOffset);
3829 }
3830
3831 if (Outs[i].Flags.isByVal()) {
3832 SDValue SizeNode =
3833 DAG.getConstant(Val: Outs[i].Flags.getByValSize(), DL, VT: MVT::i32);
3834 SDValue Cpy =
3835 DAG.getMemcpy(Chain, dl: DL, Dst: DstAddr, Src: Arg, Size: SizeNode,
3836 Alignment: Outs[i].Flags.getNonZeroByValAlign(),
3837 /*isVol = */ false, /*AlwaysInline = */ true,
3838 /*CI=*/nullptr, OverrideTailCall: std::nullopt, DstPtrInfo: DstInfo,
3839 SrcPtrInfo: MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
3840
3841 MemOpChains.push_back(Elt: Cpy);
3842 } else {
3843 SDValue Store =
3844 DAG.getStore(Chain, dl: DL, Val: Arg, Ptr: DstAddr, PtrInfo: DstInfo, Alignment);
3845 MemOpChains.push_back(Elt: Store);
3846 }
3847 }
3848 }
3849
3850 if (!MemOpChains.empty())
3851 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOpChains);
3852
3853 // Build a sequence of copy-to-reg nodes chained together with token chain
3854 // and flag operands which copy the outgoing args into the appropriate regs.
3855 SDValue InGlue;
3856 for (auto &RegToPass : RegsToPass) {
3857 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RegToPass.first,
3858 N: RegToPass.second, Glue: InGlue);
3859 InGlue = Chain.getValue(R: 1);
3860 }
3861
3862
3863 // We don't usually want to end the call-sequence here because we would tidy
3864 // the frame up *after* the call, however in the ABI-changing tail-call case
3865 // we've carefully laid out the parameters so that when sp is reset they'll be
3866 // in the correct location.
3867 if (IsTailCall && !IsSibCall) {
3868 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: 0, Glue: InGlue, DL);
3869 InGlue = Chain.getValue(R: 1);
3870 }
3871
3872 std::vector<SDValue> Ops;
3873 Ops.push_back(x: Chain);
3874 Ops.push_back(x: Callee);
3875 // Add a redundant copy of the callee global which will not be legalized, as
3876 // we need direct access to the callee later.
3877 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
3878 const GlobalValue *GV = GSD->getGlobal();
3879 Ops.push_back(x: DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i64));
3880 } else {
3881 Ops.push_back(x: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i64));
3882 }
3883
3884 if (IsTailCall) {
3885 // Each tail call may have to adjust the stack by a different amount, so
3886 // this information must travel along with the operation for eventual
3887 // consumption by emitEpilogue.
3888 Ops.push_back(x: DAG.getTargetConstant(Val: FPDiff, DL, VT: MVT::i32));
3889 }
3890
3891 if (IsChainCallConv)
3892 Ops.push_back(x: RequestedExec.Node);
3893
3894 // Add argument registers to the end of the list so that they are known live
3895 // into the call.
3896 for (auto &RegToPass : RegsToPass) {
3897 Ops.push_back(x: DAG.getRegister(Reg: RegToPass.first,
3898 VT: RegToPass.second.getValueType()));
3899 }
3900
3901 // Add a register mask operand representing the call-preserved registers.
3902 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3903 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3904 assert(Mask && "Missing call preserved mask for calling convention");
3905 Ops.push_back(x: DAG.getRegisterMask(RegMask: Mask));
3906
3907 if (SDValue Token = CLI.ConvergenceControlToken) {
3908 SmallVector<SDValue, 2> GlueOps;
3909 GlueOps.push_back(Elt: Token);
3910 if (InGlue)
3911 GlueOps.push_back(Elt: InGlue);
3912
3913 InGlue = SDValue(DAG.getMachineNode(Opcode: TargetOpcode::CONVERGENCECTRL_GLUE, dl: DL,
3914 VT: MVT::Glue, Ops: GlueOps),
3915 0);
3916 }
3917
3918 if (InGlue)
3919 Ops.push_back(x: InGlue);
3920
3921 SDVTList NodeTys = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
3922
3923  // If we're doing a tail call, use a TC_RETURN here rather than an
3924 // actual call instruction.
3925 if (IsTailCall) {
3926 MFI.setHasTailCall();
3927 unsigned OPC = AMDGPUISD::TC_RETURN;
3928 switch (CallConv) {
3929 case CallingConv::AMDGPU_Gfx:
3930 OPC = AMDGPUISD::TC_RETURN_GFX;
3931 break;
3932 case CallingConv::AMDGPU_CS_Chain:
3933 case CallingConv::AMDGPU_CS_ChainPreserve:
3934 OPC = AMDGPUISD::TC_RETURN_CHAIN;
3935 break;
3936 }
3937
3938 return DAG.getNode(Opcode: OPC, DL, VTList: NodeTys, Ops);
3939 }
3940
3941 // Returns a chain and a flag for retval copy to use.
3942 SDValue Call = DAG.getNode(Opcode: AMDGPUISD::CALL, DL, VTList: NodeTys, Ops);
3943 Chain = Call.getValue(R: 0);
3944 InGlue = Call.getValue(R: 1);
3945
3946 uint64_t CalleePopBytes = NumBytes;
3947 Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: CalleePopBytes, Glue: InGlue, DL);
3948 if (!Ins.empty())
3949 InGlue = Chain.getValue(R: 1);
3950
3951 // Handle result values, copying them out of physregs into vregs that we
3952 // return.
3953 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
3954 InVals, /*IsThisReturn=*/false, ThisVal: SDValue());
3955}
3956
3957// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3958// except for applying the wave size scale to the increment amount.
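// The SGPR stack pointer counts scratch bytes for the whole wave, so an
// allocation of N bytes per lane must adjust SP by N << log2(wave size); the
// same scaling is applied to the alignment mask below.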
3959SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
3960 SDValue Op, SelectionDAG &DAG) const {
3961 const MachineFunction &MF = DAG.getMachineFunction();
3962 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3963
3964 SDLoc dl(Op);
3965 EVT VT = Op.getValueType();
3966 SDValue Tmp1 = Op;
3967 SDValue Tmp2 = Op.getValue(R: 1);
3968 SDValue Tmp3 = Op.getOperand(i: 2);
3969 SDValue Chain = Tmp1.getOperand(i: 0);
3970
3971 Register SPReg = Info->getStackPtrOffsetReg();
3972
3973 // Chain the dynamic stack allocation so that it doesn't modify the stack
3974 // pointer when other instructions are using the stack.
3975 Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL: dl);
3976
3977 SDValue Size = Tmp2.getOperand(i: 1);
3978 SDValue SP = DAG.getCopyFromReg(Chain, dl, Reg: SPReg, VT);
3979 Chain = SP.getValue(R: 1);
3980 MaybeAlign Alignment = cast<ConstantSDNode>(Val&: Tmp3)->getMaybeAlignValue();
3981 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3982 unsigned Opc =
3983 TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
3984 ISD::ADD : ISD::SUB;
3985
3986 SDValue ScaledSize = DAG.getNode(
3987 Opcode: ISD::SHL, DL: dl, VT, N1: Size,
3988 N2: DAG.getConstant(Val: Subtarget->getWavefrontSizeLog2(), DL: dl, VT: MVT::i32));
3989
3990 Align StackAlign = TFL->getStackAlign();
3991 Tmp1 = DAG.getNode(Opcode: Opc, DL: dl, VT, N1: SP, N2: ScaledSize); // Value
3992 if (Alignment && *Alignment > StackAlign) {
3993 Tmp1 = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: Tmp1,
3994 N2: DAG.getConstant(Val: -(uint64_t)Alignment->value()
3995 << Subtarget->getWavefrontSizeLog2(),
3996 DL: dl, VT));
3997 }
3998
3999 Chain = DAG.getCopyToReg(Chain, dl, Reg: SPReg, N: Tmp1); // Output chain
4000 Tmp2 = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: SDValue(), DL: dl);
4001
4002 return DAG.getMergeValues(Ops: {Tmp1, Tmp2}, dl);
4003}
4004
4005SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
4006 SelectionDAG &DAG) const {
4007 // We only handle constant sizes here to allow non-entry block, static sized
4008 // allocas. A truly dynamic value is more difficult to support because we
4009 // don't know if the size value is uniform or not. If the size isn't uniform,
4010 // we would need to do a wave reduction to get the maximum size to know how
4011 // much to increment the uniform stack pointer.
4012 SDValue Size = Op.getOperand(i: 1);
4013 if (isa<ConstantSDNode>(Val: Size))
4014 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
4015
4016 return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
4017}
4018
4019SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4020 if (Op.getValueType() != MVT::i32)
4021 return Op; // Defer to cannot select error.
4022
4023 Register SP = getStackPointerRegisterToSaveRestore();
4024 SDLoc SL(Op);
4025
4026 SDValue CopyFromSP = DAG.getCopyFromReg(Chain: Op->getOperand(Num: 0), dl: SL, Reg: SP, VT: MVT::i32);
4027
4028 // Convert from wave uniform to swizzled vector address. This should protect
4029 // from any edge cases where the stacksave result isn't directly used with
4030 // stackrestore.
4031 SDValue VectorAddress =
4032 DAG.getNode(Opcode: AMDGPUISD::WAVE_ADDRESS, DL: SL, VT: MVT::i32, Operand: CopyFromSP);
4033 return DAG.getMergeValues(Ops: {VectorAddress, CopyFromSP.getValue(R: 1)}, dl: SL);
4034}
4035
4036SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
4037 SelectionDAG &DAG) const {
4038 SDLoc SL(Op);
4039 assert(Op.getValueType() == MVT::i32);
4040
4041 uint32_t BothRoundHwReg =
4042 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 4);
4043 SDValue GetRoundBothImm = DAG.getTargetConstant(Val: BothRoundHwReg, DL: SL, VT: MVT::i32);
4044
4045 SDValue IntrinID =
4046 DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_getreg, DL: SL, VT: MVT::i32);
4047 SDValue GetReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList: Op->getVTList(),
4048 N1: Op.getOperand(i: 0), N2: IntrinID, N3: GetRoundBothImm);
4049
4050 // There are two rounding modes, one for f32 and one for f64/f16. We only
4051 // report in the standard value range if both are the same.
4052 //
4053 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4054 // ties away from zero is not supported, and the other values are rotated by
4055 // 1.
4056 //
4057 // If the two rounding modes are not the same, report a target defined value.
4058
4059 // Mode register rounding mode fields:
4060 //
4061 // [1:0] Single-precision round mode.
4062 // [3:2] Double/Half-precision round mode.
4063 //
4064 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4065 //
4066 // Hardware Spec
4067 // Toward-0 3 0
4068 // Nearest Even 0 1
4069 // +Inf 1 2
4070 // -Inf 2 3
4071 // NearestAway0 N/A 4
4072 //
4073  // We have to handle 16 possible values of the 4-bit field, so we create a
4074  // 64-bit table we can index by the raw hardware mode.
4075 //
4076 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
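  // For example, if both fields are round-to-nearest-even the raw mode is 0 and
  // the table yields 1, the standard "to nearest" FLT_ROUNDS value, which is
  // returned unchanged since it is below 4.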
4077
4078 SDValue BitTable =
4079 DAG.getConstant(Val: AMDGPU::FltRoundConversionTable, DL: SL, VT: MVT::i64);
4080
4081 SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
4082 SDValue RoundModeTimesNumBits =
4083 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: GetReg, N2: Two);
4084
4085 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4086 // knew only one mode was demanded.
4087 SDValue TableValue =
4088 DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i64, N1: BitTable, N2: RoundModeTimesNumBits);
4089 SDValue TruncTable = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: TableValue);
4090
4091 SDValue EntryMask = DAG.getConstant(Val: 0xf, DL: SL, VT: MVT::i32);
4092 SDValue TableEntry =
4093 DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: TruncTable, N2: EntryMask);
4094
4095  // There's a gap between the 4-bit encoded table entries and the actual enum
4096  // values, so offset the result if it's an extended value.
4097 SDValue Four = DAG.getConstant(Val: 4, DL: SL, VT: MVT::i32);
4098 SDValue IsStandardValue =
4099 DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: TableEntry, RHS: Four, Cond: ISD::SETULT);
4100 SDValue EnumOffset = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: TableEntry, N2: Four);
4101 SDValue Result = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i32, N1: IsStandardValue,
4102 N2: TableEntry, N3: EnumOffset);
4103
4104 return DAG.getMergeValues(Ops: {Result, GetReg.getValue(R: 1)}, dl: SL);
4105}
4106
4107SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
4108 SelectionDAG &DAG) const {
4109 SDLoc SL(Op);
4110
4111 SDValue NewMode = Op.getOperand(i: 1);
4112 assert(NewMode.getValueType() == MVT::i32);
4113
4114 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4115 // hardware MODE.fp_round values.
4116 if (auto *ConstMode = dyn_cast<ConstantSDNode>(Val&: NewMode)) {
4117 uint32_t ClampedVal = std::min(
4118 a: static_cast<uint32_t>(ConstMode->getZExtValue()),
4119 b: static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
4120 NewMode = DAG.getConstant(
4121 Val: AMDGPU::decodeFltRoundToHWConversionTable(FltRounds: ClampedVal), DL: SL, VT: MVT::i32);
4122 } else {
4123 // If we know the input can only be one of the supported standard modes in
4124 // the range 0-3, we can use a simplified mapping to hardware values.
4125 KnownBits KB = DAG.computeKnownBits(Op: NewMode);
4126 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4127 // The supported standard values are 0-3. The extended values start at 8. We
4128 // need to offset by 4 if the value is in the extended range.
4129
4130 if (UseReducedTable) {
4131      // Modes 0-3 only need the low 16 bits of the table (a 32-bit constant).
4132 SDValue BitTable = DAG.getConstant(
4133 Val: AMDGPU::FltRoundToHWConversionTable & 0xffff, DL: SL, VT: MVT::i32);
4134
4135 SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
4136 SDValue RoundModeTimesNumBits =
4137 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: NewMode, N2: Two);
4138
4139 NewMode =
4140 DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: BitTable, N2: RoundModeTimesNumBits);
4141
4142 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4143 // the table extracted bits into inline immediates.
4144 } else {
4145 // table_index = umin(value, value - 4)
4146 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
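      // For example: a standard value 2 gives umin(2, 2 - 4) = 2 (the
      // subtraction wraps), while the first extended value 8 gives
      // umin(8, 4) = 4, so extended modes occupy table entries 4 and up.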
4147 SDValue BitTable =
4148 DAG.getConstant(Val: AMDGPU::FltRoundToHWConversionTable, DL: SL, VT: MVT::i64);
4149
4150 SDValue Four = DAG.getConstant(Val: 4, DL: SL, VT: MVT::i32);
4151 SDValue OffsetEnum = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: NewMode, N2: Four);
4152 SDValue IndexVal =
4153 DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewMode, N2: OffsetEnum);
4154
4155 SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
4156 SDValue RoundModeTimesNumBits =
4157 DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: IndexVal, N2: Two);
4158
4159 SDValue TableValue =
4160 DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i64, N1: BitTable, N2: RoundModeTimesNumBits);
4161 SDValue TruncTable = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: TableValue);
4162
4163 // No need to mask out the high bits since the setreg will ignore them
4164 // anyway.
4165 NewMode = TruncTable;
4166 }
4167
4168 // Insert a readfirstlane in case the value is a VGPR. We could do this
4169 // earlier and keep more operations scalar, but that interferes with
4170 // combining the source.
4171 SDValue ReadFirstLaneID =
4172 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: SL, VT: MVT::i32);
4173 NewMode = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
4174 N1: ReadFirstLaneID, N2: NewMode);
4175 }
4176
4177 // N.B. The setreg will be later folded into s_round_mode on supported
4178 // targets.
4179 SDValue IntrinID =
4180 DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_setreg, DL: SL, VT: MVT::i32);
4181 uint32_t BothRoundHwReg =
4182 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 4);
4183 SDValue RoundBothImm = DAG.getTargetConstant(Val: BothRoundHwReg, DL: SL, VT: MVT::i32);
4184
4185 SDValue SetReg =
4186 DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VTList: Op->getVTList(), N1: Op.getOperand(i: 0),
4187 N2: IntrinID, N3: RoundBothImm, N4: NewMode);
4188
4189 return SetReg;
4190}
4191
4192SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4193 if (Op->isDivergent())
4194 return SDValue();
4195
4196 switch (cast<MemSDNode>(Val&: Op)->getAddressSpace()) {
4197 case AMDGPUAS::FLAT_ADDRESS:
4198 case AMDGPUAS::GLOBAL_ADDRESS:
4199 case AMDGPUAS::CONSTANT_ADDRESS:
4200 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4201 break;
4202 default:
4203 return SDValue();
4204 }
4205
4206 return Op;
4207}
4208
4209 // Work around DAG legality rules that are based only on the result type.
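// For example, (f32 (fp_extend (bf16 x))) is emitted here as
// (f32 (bf16_to_fp (i16 (bitcast x)))).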
4210SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4211 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4212 SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0);
4213 EVT SrcVT = Src.getValueType();
4214
4215 if (SrcVT.getScalarType() != MVT::bf16)
4216 return Op;
4217
4218 SDLoc SL(Op);
4219 SDValue BitCast =
4220 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: SrcVT.changeTypeToInteger(), Operand: Src);
4221
4222 EVT DstVT = Op.getValueType();
4223 if (IsStrict)
4224 llvm_unreachable("Need STRICT_BF16_TO_FP");
4225
4226 return DAG.getNode(Opcode: ISD::BF16_TO_FP, DL: SL, VT: DstVT, Operand: BitCast);
4227}
4228
4229SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4230 SDLoc SL(Op);
4231 if (Op.getValueType() != MVT::i64)
4232 return Op;
4233
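  // The i64 result packs both status registers: with the BUILD_VECTOR/BITCAST
  // below, bits [31:0] hold MODE[22:0] and bits [63:32] hold TRAPSTS[4:0].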
4234 uint32_t ModeHwReg =
4235 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);
4236 SDValue ModeHwRegImm = DAG.getTargetConstant(Val: ModeHwReg, DL: SL, VT: MVT::i32);
4237 uint32_t TrapHwReg =
4238 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
4239 SDValue TrapHwRegImm = DAG.getTargetConstant(Val: TrapHwReg, DL: SL, VT: MVT::i32);
4240
4241 SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other);
4242 SDValue IntrinID =
4243 DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_getreg, DL: SL, VT: MVT::i32);
4244 SDValue GetModeReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList,
4245 N1: Op.getOperand(i: 0), N2: IntrinID, N3: ModeHwRegImm);
4246 SDValue GetTrapReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList,
4247 N1: Op.getOperand(i: 0), N2: IntrinID, N3: TrapHwRegImm);
4248 SDValue TokenReg =
4249 DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: GetModeReg.getValue(R: 1),
4250 N2: GetTrapReg.getValue(R: 1));
4251
4252 SDValue CvtPtr =
4253 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: GetModeReg, N2: GetTrapReg);
4254 SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);
4255
4256 return DAG.getMergeValues(Ops: {Result, TokenReg}, dl: SL);
4257}
4258
4259SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
4260 SDLoc SL(Op);
4261 if (Op.getOperand(i: 1).getValueType() != MVT::i64)
4262 return Op;
4263
4264 SDValue Input = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1));
4265 SDValue NewModeReg = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Input,
4266 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
4267 SDValue NewTrapReg = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Input,
4268 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
4269
4270 SDValue ReadFirstLaneID =
4271 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: SL, VT: MVT::i32);
4272 NewModeReg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
4273 N1: ReadFirstLaneID, N2: NewModeReg);
4274 NewTrapReg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
4275 N1: ReadFirstLaneID, N2: NewTrapReg);
4276
4277 unsigned ModeHwReg =
4278 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);
4279 SDValue ModeHwRegImm = DAG.getTargetConstant(Val: ModeHwReg, DL: SL, VT: MVT::i32);
4280 unsigned TrapHwReg =
4281 AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
4282 SDValue TrapHwRegImm = DAG.getTargetConstant(Val: TrapHwReg, DL: SL, VT: MVT::i32);
4283
4284 SDValue IntrinID =
4285 DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_setreg, DL: SL, VT: MVT::i32);
4286 SDValue SetModeReg =
4287 DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VT: MVT::Other, N1: Op.getOperand(i: 0),
4288 N2: IntrinID, N3: ModeHwRegImm, N4: NewModeReg);
4289 SDValue SetTrapReg =
4290 DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VT: MVT::Other, N1: Op.getOperand(i: 0),
4291 N2: IntrinID, N3: TrapHwRegImm, N4: NewTrapReg);
4292 return DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: SetTrapReg, N2: SetModeReg);
4293}
4294
4295Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
4296 const MachineFunction &MF) const {
4297 Register Reg = StringSwitch<Register>(RegName)
4298 .Case(S: "m0", Value: AMDGPU::M0)
4299 .Case(S: "exec", Value: AMDGPU::EXEC)
4300 .Case(S: "exec_lo", Value: AMDGPU::EXEC_LO)
4301 .Case(S: "exec_hi", Value: AMDGPU::EXEC_HI)
4302 .Case(S: "flat_scratch", Value: AMDGPU::FLAT_SCR)
4303 .Case(S: "flat_scratch_lo", Value: AMDGPU::FLAT_SCR_LO)
4304 .Case(S: "flat_scratch_hi", Value: AMDGPU::FLAT_SCR_HI)
4305 .Default(Value: Register());
4306
4307 if (Reg == AMDGPU::NoRegister) {
4308 report_fatal_error(reason: Twine("invalid register name \""
4309 + StringRef(RegName) + "\"."));
4310
4311 }
4312
4313 if (!Subtarget->hasFlatScrRegister() &&
4314 Subtarget->getRegisterInfo()->regsOverlap(RegA: Reg, RegB: AMDGPU::FLAT_SCR)) {
4315 report_fatal_error(reason: Twine("invalid register \""
4316 + StringRef(RegName) + "\" for subtarget."));
4317 }
4318
4319 switch (Reg) {
4320 case AMDGPU::M0:
4321 case AMDGPU::EXEC_LO:
4322 case AMDGPU::EXEC_HI:
4323 case AMDGPU::FLAT_SCR_LO:
4324 case AMDGPU::FLAT_SCR_HI:
4325 if (VT.getSizeInBits() == 32)
4326 return Reg;
4327 break;
4328 case AMDGPU::EXEC:
4329 case AMDGPU::FLAT_SCR:
4330 if (VT.getSizeInBits() == 64)
4331 return Reg;
4332 break;
4333 default:
4334 llvm_unreachable("missing register type checking");
4335 }
4336
4337 report_fatal_error(reason: Twine("invalid type for register \""
4338 + StringRef(RegName) + "\"."));
4339}
4340
4341// If kill is not the last instruction, split the block so kill is always a
4342// proper terminator.
4343MachineBasicBlock *
4344SITargetLowering::splitKillBlock(MachineInstr &MI,
4345 MachineBasicBlock *BB) const {
4346 MachineBasicBlock *SplitBB = BB->splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
4347 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4348 MI.setDesc(TII->getKillTerminatorFromPseudo(Opcode: MI.getOpcode()));
4349 return SplitBB;
4350}
4351
4352 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4353// \p MI will be the only instruction in the loop body block. Otherwise, it will
4354// be the first instruction in the remainder block.
4355//
4356/// \returns { LoopBody, Remainder }
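//
// The resulting CFG is (sketch):
//
//   MBB --> LoopBB --> RemainderBB --> <original successors of MBB>
//           (LoopBB is also listed as its own successor)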
4357static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4358splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
4359 MachineFunction *MF = MBB.getParent();
4360 MachineBasicBlock::iterator I(&MI);
4361
4362 // To insert the loop we need to split the block. Move everything after this
4363 // point to a new block, and insert a new empty block between the two.
4364 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4365 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4366 MachineFunction::iterator MBBI(MBB);
4367 ++MBBI;
4368
4369 MF->insert(MBBI, MBB: LoopBB);
4370 MF->insert(MBBI, MBB: RemainderBB);
4371
4372 LoopBB->addSuccessor(Succ: LoopBB);
4373 LoopBB->addSuccessor(Succ: RemainderBB);
4374
4375 // Move the rest of the block into a new block.
4376 RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);
4377
4378 if (InstInLoop) {
4379 auto Next = std::next(x: I);
4380
4381 // Move instruction to loop body.
4382 LoopBB->splice(Where: LoopBB->begin(), Other: &MBB, From: I, To: Next);
4383
4384 // Move the rest of the block.
4385 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: Next, To: MBB.end());
4386 } else {
4387 RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: I, To: MBB.end());
4388 }
4389
4390 MBB.addSuccessor(Succ: LoopBB);
4391
4392 return std::pair(LoopBB, RemainderBB);
4393}
4394
4395/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4396void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
4397 MachineBasicBlock *MBB = MI.getParent();
4398 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4399 auto I = MI.getIterator();
4400 auto E = std::next(x: I);
4401
4402 BuildMI(BB&: *MBB, I: E, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_WAITCNT))
4403 .addImm(Val: 0);
4404
4405 MIBundleBuilder Bundler(*MBB, I, E);
4406 finalizeBundle(MBB&: *MBB, FirstMI: Bundler.begin());
4407}
4408
4409MachineBasicBlock *
4410SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
4411 MachineBasicBlock *BB) const {
4412 const DebugLoc &DL = MI.getDebugLoc();
4413
4414 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4415
4416 MachineBasicBlock *LoopBB;
4417 MachineBasicBlock *RemainderBB;
4418 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4419
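  // Sketch of the loop built below (the GWS op itself is bundled with an
  // s_waitcnt 0):
  //   loop:
  //     s_setreg  hwreg(HW_REG_TRAPSTS, MEM_VIOL, 1), 0   ; clear the flag
  //     <GWS op> ; s_waitcnt 0
  //     tmp = s_getreg hwreg(HW_REG_TRAPSTS, MEM_VIOL, 1)
  //     s_cmp_lg_u32 tmp, 0
  //     s_cbranch_scc1 loop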
4420 // Apparently kill flags are only valid if the def is in the same block?
4421 if (MachineOperand *Src = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::data0))
4422 Src->setIsKill(false);
4423
4424 std::tie(args&: LoopBB, args&: RemainderBB) = splitBlockForLoop(MI, MBB&: *BB, InstInLoop: true);
4425
4426 MachineBasicBlock::iterator I = LoopBB->end();
4427
4428 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4429 Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: AMDGPU::Hwreg::OFFSET_MEM_VIOL, Values: 1);
4430
4431 // Clear TRAP_STS.MEM_VIOL
4432 BuildMI(BB&: *LoopBB, I: LoopBB->begin(), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_IMM32_B32))
4433 .addImm(Val: 0)
4434 .addImm(Val: EncodedReg);
4435
4436 bundleInstWithWaitcnt(MI);
4437
4438 Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
4439
4440 // Load and check TRAP_STS.MEM_VIOL
4441 BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: Reg)
4442 .addImm(Val: EncodedReg);
4443
4444 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4445 BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
4446 .addReg(RegNo: Reg, flags: RegState::Kill)
4447 .addImm(Val: 0);
4448 BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
4449 .addMBB(MBB: LoopBB);
4450
4451 return RemainderBB;
4452}
4453
4454// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4455// wavefront. If the value is uniform and just happens to be in a VGPR, this
4456// will only do one iteration. In the worst case, this will loop 64 times.
4457//
4458// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
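// Sketch of the emitted waterfall loop (illustrative pseudo-assembly; the
// indirect move itself is inserted by the caller at the returned iterator):
//   loop:
//     cur   = v_readfirstlane_b32 idx
//     cond  = v_cmp_eq_u32 cur, idx       ; all lanes sharing this index
//     saved = s_and_saveexec cond         ; run only those lanes this pass
//     m0 (or SGPRIdxReg) = cur + offset
//     <indirect move inserted here>
//     exec  = exec ^ saved                ; remove the lanes just handled
//     s_cbranch_execnz loop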
4459static MachineBasicBlock::iterator
4460emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
4461 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4462 const DebugLoc &DL, const MachineOperand &Idx,
4463 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4464 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4465 Register &SGPRIdxReg) {
4466
4467 MachineFunction *MF = OrigBB.getParent();
4468 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4469 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4470 MachineBasicBlock::iterator I = LoopBB.begin();
4471
4472 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4473 Register PhiExec = MRI.createVirtualRegister(RegClass: BoolRC);
4474 Register NewExec = MRI.createVirtualRegister(RegClass: BoolRC);
4475 Register CurrentIdxReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
4476 Register CondReg = MRI.createVirtualRegister(RegClass: BoolRC);
4477
4478 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiReg)
4479 .addReg(RegNo: InitReg)
4480 .addMBB(MBB: &OrigBB)
4481 .addReg(RegNo: ResultReg)
4482 .addMBB(MBB: &LoopBB);
4483
4484 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiExec)
4485 .addReg(RegNo: InitSaveExecReg)
4486 .addMBB(MBB: &OrigBB)
4487 .addReg(RegNo: NewExec)
4488 .addMBB(MBB: &LoopBB);
4489
4490 // Read the next variant <- also loop target.
4491 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurrentIdxReg)
4492 .addReg(RegNo: Idx.getReg(), flags: getUndefRegState(B: Idx.isUndef()));
4493
4494 // Compare the just read M0 value to all possible Idx values.
4495 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CMP_EQ_U32_e64), DestReg: CondReg)
4496 .addReg(RegNo: CurrentIdxReg)
4497 .addReg(RegNo: Idx.getReg(), flags: 0, SubReg: Idx.getSubReg());
4498
4499   // Update EXEC, saving the original EXEC value to NewExec.
4500 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4501 : AMDGPU::S_AND_SAVEEXEC_B64),
4502 DestReg: NewExec)
4503 .addReg(RegNo: CondReg, flags: RegState::Kill);
4504
4505 MRI.setSimpleHint(VReg: NewExec, PrefReg: CondReg);
4506
4507 if (UseGPRIdxMode) {
4508 if (Offset == 0) {
4509 SGPRIdxReg = CurrentIdxReg;
4510 } else {
4511 SGPRIdxReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
4512 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: SGPRIdxReg)
4513 .addReg(RegNo: CurrentIdxReg, flags: RegState::Kill)
4514 .addImm(Val: Offset);
4515 }
4516 } else {
4517     // Move the index from CurrentIdxReg into M0.
4518 if (Offset == 0) {
4519 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
4520 .addReg(RegNo: CurrentIdxReg, flags: RegState::Kill);
4521 } else {
4522 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: AMDGPU::M0)
4523 .addReg(RegNo: CurrentIdxReg, flags: RegState::Kill)
4524 .addImm(Val: Offset);
4525 }
4526 }
4527
4528 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4529 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4530 MachineInstr *InsertPt =
4531 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: ST.isWave32() ? AMDGPU::S_XOR_B32_term
4532 : AMDGPU::S_XOR_B64_term), DestReg: Exec)
4533 .addReg(RegNo: Exec)
4534 .addReg(RegNo: NewExec);
4535
4536 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4537 // s_cbranch_scc0?
4538
4539 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4540 BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
4541 .addMBB(MBB: &LoopBB);
4542
4543 return InsertPt->getIterator();
4544}
4545
4546 // This has slightly sub-optimal register allocation when the source vector is
4547 // killed by the read. The register allocator does not understand that the kill
4548 // is per-workitem, so the source is kept alive for the whole loop and we end
4549 // up not reusing a subregister from it, using one more VGPR than necessary.
4550 // That VGPR was saved when this was expanded after register allocation.
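// The overall structure built here is (sketch):
//   entry block:  save EXEC, IMPLICIT_DEF feeding the loop phi
//   LoopBB:       waterfall loop from emitLoadM0FromVGPRLoop
//   LandingPad:   restore EXEC
//   RemainderBB:  the rest of the original block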
4551static MachineBasicBlock::iterator
4552loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
4553 unsigned InitResultReg, unsigned PhiReg, int Offset,
4554 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4555 MachineFunction *MF = MBB.getParent();
4556 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4557 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4558 MachineRegisterInfo &MRI = MF->getRegInfo();
4559 const DebugLoc &DL = MI.getDebugLoc();
4560 MachineBasicBlock::iterator I(&MI);
4561
4562 const auto *BoolXExecRC = TRI->getRegClass(RCID: AMDGPU::SReg_1_XEXECRegClassID);
4563 Register DstReg = MI.getOperand(i: 0).getReg();
4564 Register SaveExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
4565 Register TmpExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
4566 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4567 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4568
4569 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: TmpExec);
4570
4571 // Save the EXEC mask
4572 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: MovExecOpc), DestReg: SaveExec)
4573 .addReg(RegNo: Exec);
4574
4575 MachineBasicBlock *LoopBB;
4576 MachineBasicBlock *RemainderBB;
4577 std::tie(args&: LoopBB, args&: RemainderBB) = splitBlockForLoop(MI, MBB, InstInLoop: false);
4578
4579 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
4580
4581 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, OrigBB&: MBB, LoopBB&: *LoopBB, DL, Idx: *Idx,
4582 InitReg: InitResultReg, ResultReg: DstReg, PhiReg, InitSaveExecReg: TmpExec,
4583 Offset, UseGPRIdxMode, SGPRIdxReg);
4584
4585 MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
4586 MachineFunction::iterator MBBI(LoopBB);
4587 ++MBBI;
4588 MF->insert(MBBI, MBB: LandingPad);
4589 LoopBB->removeSuccessor(Succ: RemainderBB);
4590 LandingPad->addSuccessor(Succ: RemainderBB);
4591 LoopBB->addSuccessor(Succ: LandingPad);
4592 MachineBasicBlock::iterator First = LandingPad->begin();
4593 BuildMI(BB&: *LandingPad, I: First, MIMD: DL, MCID: TII->get(Opcode: MovExecOpc), DestReg: Exec)
4594 .addReg(RegNo: SaveExec);
4595
4596 return InsPt;
4597}
4598
4599// Returns subreg index, offset
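// For example, with a 128-bit (4 x 32-bit element) super-register class:
//   Offset = 2  ->  {sub2, 0}  (constant part folded into the subregister)
//   Offset = 5  ->  {sub0, 5}  (out of bounds, keep the dynamic offset)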
4600static std::pair<unsigned, int>
4601computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
4602 const TargetRegisterClass *SuperRC,
4603 unsigned VecReg,
4604 int Offset) {
4605 int NumElts = TRI.getRegSizeInBits(RC: *SuperRC) / 32;
4606
4607 // Skip out of bounds offsets, or else we would end up using an undefined
4608 // register.
4609 if (Offset >= NumElts || Offset < 0)
4610 return std::pair(AMDGPU::sub0, Offset);
4611
4612 return std::pair(SIRegisterInfo::getSubRegFromChannel(Channel: Offset), 0);
4613}
4614
4615static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
4616 MachineRegisterInfo &MRI, MachineInstr &MI,
4617 int Offset) {
4618 MachineBasicBlock *MBB = MI.getParent();
4619 const DebugLoc &DL = MI.getDebugLoc();
4620 MachineBasicBlock::iterator I(&MI);
4621
4622 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
4623
4624 assert(Idx->getReg() != AMDGPU::NoRegister);
4625
4626 if (Offset == 0) {
4627 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0).add(MO: *Idx);
4628 } else {
4629 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: AMDGPU::M0)
4630 .add(MO: *Idx)
4631 .addImm(Val: Offset);
4632 }
4633}
4634
4635static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
4636 MachineRegisterInfo &MRI, MachineInstr &MI,
4637 int Offset) {
4638 MachineBasicBlock *MBB = MI.getParent();
4639 const DebugLoc &DL = MI.getDebugLoc();
4640 MachineBasicBlock::iterator I(&MI);
4641
4642 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
4643
4644 if (Offset == 0)
4645 return Idx->getReg();
4646
4647 Register Tmp = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
4648 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: Tmp)
4649 .add(MO: *Idx)
4650 .addImm(Val: Offset);
4651 return Tmp;
4652}
4653
4654static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
4655 MachineBasicBlock &MBB,
4656 const GCNSubtarget &ST) {
4657 const SIInstrInfo *TII = ST.getInstrInfo();
4658 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4659 MachineFunction *MF = MBB.getParent();
4660 MachineRegisterInfo &MRI = MF->getRegInfo();
4661
4662 Register Dst = MI.getOperand(i: 0).getReg();
4663 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
4664 Register SrcReg = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src)->getReg();
4665 int Offset = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm();
4666
4667 const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcReg);
4668 const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg());
4669
4670 unsigned SubReg;
4671 std::tie(args&: SubReg, args&: Offset)
4672 = computeIndirectRegAndOffset(TRI, SuperRC: VecRC, VecReg: SrcReg, Offset);
4673
4674 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4675
4676 // Check for a SGPR index.
4677 if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) {
4678 MachineBasicBlock::iterator I(&MI);
4679 const DebugLoc &DL = MI.getDebugLoc();
4680
4681 if (UseGPRIdxMode) {
4682 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4683 // to avoid interfering with other uses, so probably requires a new
4684 // optimization pass.
4685 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
4686
4687 const MCInstrDesc &GPRIDXDesc =
4688 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: true);
4689 BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
4690 .addReg(RegNo: SrcReg)
4691 .addReg(RegNo: Idx)
4692 .addImm(Val: SubReg);
4693 } else {
4694 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
4695
4696 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: Dst)
4697 .addReg(RegNo: SrcReg, flags: 0, SubReg)
4698 .addReg(RegNo: SrcReg, flags: RegState::Implicit);
4699 }
4700
4701 MI.eraseFromParent();
4702
4703 return &MBB;
4704 }
4705
4706 // Control flow needs to be inserted if indexing with a VGPR.
4707 const DebugLoc &DL = MI.getDebugLoc();
4708 MachineBasicBlock::iterator I(&MI);
4709
4710 Register PhiReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
4711 Register InitReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
4712
4713 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: InitReg);
4714
4715 Register SGPRIdxReg;
4716 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: InitReg, PhiReg, Offset,
4717 UseGPRIdxMode, SGPRIdxReg);
4718
4719 MachineBasicBlock *LoopBB = InsPt->getParent();
4720
4721 if (UseGPRIdxMode) {
4722 const MCInstrDesc &GPRIDXDesc =
4723 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: true);
4724
4725 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
4726 .addReg(RegNo: SrcReg)
4727 .addReg(RegNo: SGPRIdxReg)
4728 .addImm(Val: SubReg);
4729 } else {
4730 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: Dst)
4731 .addReg(RegNo: SrcReg, flags: 0, SubReg)
4732 .addReg(RegNo: SrcReg, flags: RegState::Implicit);
4733 }
4734
4735 MI.eraseFromParent();
4736
4737 return LoopBB;
4738}
4739
4740static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
4741 MachineBasicBlock &MBB,
4742 const GCNSubtarget &ST) {
4743 const SIInstrInfo *TII = ST.getInstrInfo();
4744 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4745 MachineFunction *MF = MBB.getParent();
4746 MachineRegisterInfo &MRI = MF->getRegInfo();
4747
4748 Register Dst = MI.getOperand(i: 0).getReg();
4749 const MachineOperand *SrcVec = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src);
4750 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
4751 const MachineOperand *Val = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::val);
4752 int Offset = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm();
4753 const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcVec->getReg());
4754 const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg());
4755
4756 // This can be an immediate, but will be folded later.
4757 assert(Val->getReg());
4758
4759 unsigned SubReg;
4760 std::tie(args&: SubReg, args&: Offset) = computeIndirectRegAndOffset(TRI, SuperRC: VecRC,
4761 VecReg: SrcVec->getReg(),
4762 Offset);
4763 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4764
4765 if (Idx->getReg() == AMDGPU::NoRegister) {
4766 MachineBasicBlock::iterator I(&MI);
4767 const DebugLoc &DL = MI.getDebugLoc();
4768
4769 assert(Offset == 0);
4770
4771 BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: Dst)
4772 .add(MO: *SrcVec)
4773 .add(MO: *Val)
4774 .addImm(Val: SubReg);
4775
4776 MI.eraseFromParent();
4777 return &MBB;
4778 }
4779
4780 // Check for a SGPR index.
4781 if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) {
4782 MachineBasicBlock::iterator I(&MI);
4783 const DebugLoc &DL = MI.getDebugLoc();
4784
4785 if (UseGPRIdxMode) {
4786 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
4787
4788 const MCInstrDesc &GPRIDXDesc =
4789 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false);
4790 BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
4791 .addReg(RegNo: SrcVec->getReg())
4792 .add(MO: *Val)
4793 .addReg(RegNo: Idx)
4794 .addImm(Val: SubReg);
4795 } else {
4796 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
4797
4798 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4799 VecSize: TRI.getRegSizeInBits(RC: *VecRC), EltSize: 32, IsSGPR: false);
4800 BuildMI(BB&: MBB, I, MIMD: DL, MCID: MovRelDesc, DestReg: Dst)
4801 .addReg(RegNo: SrcVec->getReg())
4802 .add(MO: *Val)
4803 .addImm(Val: SubReg);
4804 }
4805 MI.eraseFromParent();
4806 return &MBB;
4807 }
4808
4809 // Control flow needs to be inserted if indexing with a VGPR.
4810 if (Val->isReg())
4811 MRI.clearKillFlags(Reg: Val->getReg());
4812
4813 const DebugLoc &DL = MI.getDebugLoc();
4814
4815 Register PhiReg = MRI.createVirtualRegister(RegClass: VecRC);
4816
4817 Register SGPRIdxReg;
4818 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: SrcVec->getReg(), PhiReg, Offset,
4819 UseGPRIdxMode, SGPRIdxReg);
4820 MachineBasicBlock *LoopBB = InsPt->getParent();
4821
4822 if (UseGPRIdxMode) {
4823 const MCInstrDesc &GPRIDXDesc =
4824 TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false);
4825
4826 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
4827 .addReg(RegNo: PhiReg)
4828 .add(MO: *Val)
4829 .addReg(RegNo: SGPRIdxReg)
4830 .addImm(Val: SubReg);
4831 } else {
4832 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4833 VecSize: TRI.getRegSizeInBits(RC: *VecRC), EltSize: 32, IsSGPR: false);
4834 BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: MovRelDesc, DestReg: Dst)
4835 .addReg(RegNo: PhiReg)
4836 .add(MO: *Val)
4837 .addImm(Val: SubReg);
4838 }
4839
4840 MI.eraseFromParent();
4841 return LoopBB;
4842}
4843
4844static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
4845 MachineBasicBlock &BB,
4846 const GCNSubtarget &ST,
4847 unsigned Opc) {
4848 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
4849 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4850 const DebugLoc &DL = MI.getDebugLoc();
4851 const SIInstrInfo *TII = ST.getInstrInfo();
4852
4853 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4854 Register SrcReg = MI.getOperand(i: 1).getReg();
4855 bool isSGPR = TRI->isSGPRClass(RC: MRI.getRegClass(Reg: SrcReg));
4856 Register DstReg = MI.getOperand(i: 0).getReg();
4857 MachineBasicBlock *RetBB = nullptr;
4858 if (isSGPR) {
4859     // With a uniform (SGPR) input these operations are idempotent: the
4860     // reduced value is the same as the given SGPR.
4861 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstReg).addReg(RegNo: SrcReg);
4862 RetBB = &BB;
4863 } else {
4864     // TODO: Implement the DPP strategy and switch based on the immediate
4865     // strategy operand. For now, all the cases (default, Iterative and DPP)
4866     // use the iterative approach.
4867 
4868     // To reduce a VGPR with the iterative approach, we need to iterate over
4869     // all the active lanes. The lowering consists of a ComputeLoop block that
4870     // iterates over only the active lanes. We use a copy of the EXEC register
4871     // as the induction variable; each iteration clears the processed lane's
4872     // bit with bitset0 so the next iteration finds the next active lane.
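    // Illustrative scalar sketch of the loop built below:
    //   active = EXEC;
    //   acc    = identity;                  // UINT32_MAX for umin, 0 for umax
    //   do {
    //     lane   = find_first_set(active);  // s_ff1_i32
    //     acc    = op(acc, readlane(src, lane));
    //     active = clear_bit(active, lane); // s_bitset0
    //   } while (active != 0);
    //   dst = acc;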
4873 MachineBasicBlock::iterator I = BB.end();
4874 Register SrcReg = MI.getOperand(i: 1).getReg();
4875
4876     // Create the control flow for the loop: split MI's basic block into the
4877     // ComputeLoop body and the ComputeEnd remainder block.
4878 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, MBB&: BB, InstInLoop: true);
4879
4880 // Create virtual registers required for lowering.
4881 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4882 const TargetRegisterClass *DstRegClass = MRI.getRegClass(Reg: DstReg);
4883 Register LoopIterator = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
4884 Register InitalValReg = MRI.createVirtualRegister(RegClass: DstRegClass);
4885
4886 Register AccumulatorReg = MRI.createVirtualRegister(RegClass: DstRegClass);
4887 Register ActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
4888 Register NewActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
4889
4890 Register FF1Reg = MRI.createVirtualRegister(RegClass: DstRegClass);
4891 Register LaneValueReg = MRI.createVirtualRegister(RegClass: DstRegClass);
4892
4893 bool IsWave32 = ST.isWave32();
4894 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4895 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4896
4897     // Create the initial values of the induction variable (from EXEC) and the
4898     // accumulator, and insert a branch to the newly created ComputeLoop block.
4899 uint32_t InitalValue =
4900 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4901 auto TmpSReg =
4902 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: LoopIterator).addReg(RegNo: ExecReg);
4903 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: InitalValReg)
4904 .addImm(Val: InitalValue);
4905 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_BRANCH)).addMBB(MBB: ComputeLoop);
4906
4907 // Start constructing ComputeLoop
4908 I = ComputeLoop->end();
4909 auto Accumulator =
4910 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::PHI), DestReg: AccumulatorReg)
4911 .addReg(RegNo: InitalValReg)
4912 .addMBB(MBB: &BB);
4913 auto ActiveBits =
4914 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::PHI), DestReg: ActiveBitsReg)
4915 .addReg(RegNo: TmpSReg->getOperand(i: 0).getReg())
4916 .addMBB(MBB: &BB);
4917
4918 // Perform the computations
4919 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
4920 auto FF1 = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: SFFOpc), DestReg: FF1Reg)
4921 .addReg(RegNo: ActiveBits->getOperand(i: 0).getReg());
4922 auto LaneValue = BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
4923 MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32), DestReg: LaneValueReg)
4924 .addReg(RegNo: SrcReg)
4925 .addReg(RegNo: FF1->getOperand(i: 0).getReg());
4926 auto NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstReg)
4927 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg())
4928 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg());
4929
4930 // Manipulate the iterator to get the next active lane
4931 unsigned BITSETOpc =
4932 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
4933 auto NewActiveBits =
4934 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: BITSETOpc), DestReg: NewActiveBitsReg)
4935 .addReg(RegNo: FF1->getOperand(i: 0).getReg())
4936 .addReg(RegNo: ActiveBits->getOperand(i: 0).getReg());
4937
4938 // Add phi nodes
4939 Accumulator.addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg())
4940 .addMBB(MBB: ComputeLoop);
4941 ActiveBits.addReg(RegNo: NewActiveBits->getOperand(i: 0).getReg())
4942 .addMBB(MBB: ComputeLoop);
4943
4944     // Create the compare and conditional branch back to ComputeLoop.
4945 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
4946 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: CMPOpc))
4947 .addReg(RegNo: NewActiveBits->getOperand(i: 0).getReg())
4948 .addImm(Val: 0);
4949 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
4950 .addMBB(MBB: ComputeLoop);
4951
4952 RetBB = ComputeEnd;
4953 }
4954 MI.eraseFromParent();
4955 return RetBB;
4956}
4957
4958MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
4959 MachineInstr &MI, MachineBasicBlock *BB) const {
4960
4961 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4962 MachineFunction *MF = BB->getParent();
4963 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
4964
4965 switch (MI.getOpcode()) {
4966 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
4967 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MIN_U32);
4968 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
4969 return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MAX_U32);
4970 case AMDGPU::S_UADDO_PSEUDO:
4971 case AMDGPU::S_USUBO_PSEUDO: {
4972 const DebugLoc &DL = MI.getDebugLoc();
4973 MachineOperand &Dest0 = MI.getOperand(i: 0);
4974 MachineOperand &Dest1 = MI.getOperand(i: 1);
4975 MachineOperand &Src0 = MI.getOperand(i: 2);
4976 MachineOperand &Src1 = MI.getOperand(i: 3);
4977
4978 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
4979 ? AMDGPU::S_ADD_I32
4980 : AMDGPU::S_SUB_I32;
4981 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest0.getReg()).add(MO: Src0).add(MO: Src1);
4982
4983 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B64), DestReg: Dest1.getReg())
4984 .addImm(Val: 1)
4985 .addImm(Val: 0);
4986
4987 MI.eraseFromParent();
4988 return BB;
4989 }
4990 case AMDGPU::S_ADD_U64_PSEUDO:
4991 case AMDGPU::S_SUB_U64_PSEUDO: {
4992 // For targets older than GFX12, we emit a sequence of 32-bit operations.
4993 // For GFX12, we emit s_add_u64 and s_sub_u64.
4994 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4995 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4996 const DebugLoc &DL = MI.getDebugLoc();
4997 MachineOperand &Dest = MI.getOperand(i: 0);
4998 MachineOperand &Src0 = MI.getOperand(i: 1);
4999 MachineOperand &Src1 = MI.getOperand(i: 2);
5000 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5001 if (Subtarget->hasScalarAddSub64()) {
5002 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5003 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest.getReg())
5004 .add(MO: Src0)
5005 .add(MO: Src1);
5006 } else {
5007 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5008 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5009
5010 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5011 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5012
5013 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5014 MI, MRI, SuperReg: Src0, SuperRC: BoolRC, SubIdx: AMDGPU::sub0, SubRC: &AMDGPU::SReg_32RegClass);
5015 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5016 MI, MRI, SuperReg: Src0, SuperRC: BoolRC, SubIdx: AMDGPU::sub1, SubRC: &AMDGPU::SReg_32RegClass);
5017
5018 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5019 MI, MRI, SuperReg: Src1, SuperRC: BoolRC, SubIdx: AMDGPU::sub0, SubRC: &AMDGPU::SReg_32RegClass);
5020 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5021 MI, MRI, SuperReg: Src1, SuperRC: BoolRC, SubIdx: AMDGPU::sub1, SubRC: &AMDGPU::SReg_32RegClass);
5022
5023 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5024 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5025 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: LoOpc), DestReg: DestSub0)
5026 .add(MO: Src0Sub0)
5027 .add(MO: Src1Sub0);
5028 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: HiOpc), DestReg: DestSub1)
5029 .add(MO: Src0Sub1)
5030 .add(MO: Src1Sub1);
5031 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: Dest.getReg())
5032 .addReg(RegNo: DestSub0)
5033 .addImm(Val: AMDGPU::sub0)
5034 .addReg(RegNo: DestSub1)
5035 .addImm(Val: AMDGPU::sub1);
5036 }
5037 MI.eraseFromParent();
5038 return BB;
5039 }
5040 case AMDGPU::V_ADD_U64_PSEUDO:
5041 case AMDGPU::V_SUB_U64_PSEUDO: {
5042 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5043 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5044 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5045 const DebugLoc &DL = MI.getDebugLoc();
5046
5047 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5048
5049 MachineOperand &Dest = MI.getOperand(i: 0);
5050 MachineOperand &Src0 = MI.getOperand(i: 1);
5051 MachineOperand &Src1 = MI.getOperand(i: 2);
5052
5053 if (IsAdd && ST.hasLshlAddB64()) {
5054 auto Add = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_LSHL_ADD_U64_e64),
5055 DestReg: Dest.getReg())
5056 .add(MO: Src0)
5057 .addImm(Val: 0)
5058 .add(MO: Src1);
5059 TII->legalizeOperands(MI&: *Add);
5060 MI.eraseFromParent();
5061 return BB;
5062 }
5063
5064 const auto *CarryRC = TRI->getRegClass(RCID: AMDGPU::SReg_1_XEXECRegClassID);
5065
5066 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5067 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5068
5069 Register CarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
5070 Register DeadCarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
5071
5072 const TargetRegisterClass *Src0RC = Src0.isReg()
5073 ? MRI.getRegClass(Reg: Src0.getReg())
5074 : &AMDGPU::VReg_64RegClass;
5075 const TargetRegisterClass *Src1RC = Src1.isReg()
5076 ? MRI.getRegClass(Reg: Src1.getReg())
5077 : &AMDGPU::VReg_64RegClass;
5078
5079 const TargetRegisterClass *Src0SubRC =
5080 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5081 const TargetRegisterClass *Src1SubRC =
5082 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5083
5084 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5085 MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
5086 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5087 MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
5088
5089 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5090 MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
5091 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5092 MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
5093
5094 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5095 MachineInstr *LoHalf = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: LoOpc), DestReg: DestSub0)
5096 .addReg(RegNo: CarryReg, flags: RegState::Define)
5097 .add(MO: SrcReg0Sub0)
5098 .add(MO: SrcReg1Sub0)
5099 .addImm(Val: 0); // clamp bit
5100
5101 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5102 MachineInstr *HiHalf =
5103 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: HiOpc), DestReg: DestSub1)
5104 .addReg(RegNo: DeadCarryReg, flags: RegState::Define | RegState::Dead)
5105 .add(MO: SrcReg0Sub1)
5106 .add(MO: SrcReg1Sub1)
5107 .addReg(RegNo: CarryReg, flags: RegState::Kill)
5108 .addImm(Val: 0); // clamp bit
5109
5110 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: Dest.getReg())
5111 .addReg(RegNo: DestSub0)
5112 .addImm(Val: AMDGPU::sub0)
5113 .addReg(RegNo: DestSub1)
5114 .addImm(Val: AMDGPU::sub1);
5115 TII->legalizeOperands(MI&: *LoHalf);
5116 TII->legalizeOperands(MI&: *HiHalf);
5117 MI.eraseFromParent();
5118 return BB;
5119 }
5120 case AMDGPU::S_ADD_CO_PSEUDO:
5121 case AMDGPU::S_SUB_CO_PSEUDO: {
5122     // This pseudo can only be selected from a uniform add/subcarry
5123     // node. All the VGPR operands are therefore assumed to be splat
5124     // vectors.
5125 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5126 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5127 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5128 MachineBasicBlock::iterator MII = MI;
5129 const DebugLoc &DL = MI.getDebugLoc();
5130 MachineOperand &Dest = MI.getOperand(i: 0);
5131 MachineOperand &CarryDest = MI.getOperand(i: 1);
5132 MachineOperand &Src0 = MI.getOperand(i: 2);
5133 MachineOperand &Src1 = MI.getOperand(i: 3);
5134 MachineOperand &Src2 = MI.getOperand(i: 4);
5135 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5136 ? AMDGPU::S_ADDC_U32
5137 : AMDGPU::S_SUBB_U32;
5138 if (Src0.isReg() && TRI->isVectorRegister(MRI, Reg: Src0.getReg())) {
5139 Register RegOp0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5140 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp0)
5141 .addReg(RegNo: Src0.getReg());
5142 Src0.setReg(RegOp0);
5143 }
5144 if (Src1.isReg() && TRI->isVectorRegister(MRI, Reg: Src1.getReg())) {
5145 Register RegOp1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5146 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp1)
5147 .addReg(RegNo: Src1.getReg());
5148 Src1.setReg(RegOp1);
5149 }
5150 Register RegOp2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5151 if (TRI->isVectorRegister(MRI, Reg: Src2.getReg())) {
5152 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp2)
5153 .addReg(RegNo: Src2.getReg());
5154 Src2.setReg(RegOp2);
5155 }
5156
5157 const TargetRegisterClass *Src2RC = MRI.getRegClass(Reg: Src2.getReg());
5158 unsigned WaveSize = TRI->getRegSizeInBits(RC: *Src2RC);
5159 assert(WaveSize == 64 || WaveSize == 32);
5160
5161 if (WaveSize == 64) {
5162 if (ST.hasScalarCompareEq64()) {
5163 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U64))
5164 .addReg(RegNo: Src2.getReg())
5165 .addImm(Val: 0);
5166 } else {
5167 const TargetRegisterClass *SubRC =
5168 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5169 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5170 MI: MII, MRI, SuperReg: Src2, SuperRC: Src2RC, SubIdx: AMDGPU::sub0, SubRC);
5171 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5172 MI: MII, MRI, SuperReg: Src2, SuperRC: Src2RC, SubIdx: AMDGPU::sub1, SubRC);
5173 Register Src2_32 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5174
5175 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_OR_B32), DestReg: Src2_32)
5176 .add(MO: Src2Sub0)
5177 .add(MO: Src2Sub1);
5178
5179 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
5180 .addReg(RegNo: Src2_32, flags: RegState::Kill)
5181 .addImm(Val: 0);
5182 }
5183 } else {
5184 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
5185 .addReg(RegNo: Src2.getReg())
5186 .addImm(Val: 0);
5187 }
5188
5189 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest.getReg()).add(MO: Src0).add(MO: Src1);
5190
5191 unsigned SelOpc =
5192 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5193
5194 BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: SelOpc), DestReg: CarryDest.getReg())
5195 .addImm(Val: -1)
5196 .addImm(Val: 0);
5197
5198 MI.eraseFromParent();
5199 return BB;
5200 }
5201 case AMDGPU::SI_INIT_M0: {
5202 BuildMI(BB&: *BB, I: MI.getIterator(), MIMD: MI.getDebugLoc(),
5203 MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: AMDGPU::M0)
5204 .add(MO: MI.getOperand(i: 0));
5205 MI.eraseFromParent();
5206 return BB;
5207 }
5208 case AMDGPU::GET_GROUPSTATICSIZE: {
5209 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5210 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5211 DebugLoc DL = MI.getDebugLoc();
5212 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32))
5213 .add(MO: MI.getOperand(i: 0))
5214 .addImm(Val: MFI->getLDSSize());
5215 MI.eraseFromParent();
5216 return BB;
5217 }
5218 case AMDGPU::GET_SHADERCYCLESHILO: {
5219 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
5220 MachineRegisterInfo &MRI = MF->getRegInfo();
5221 const DebugLoc &DL = MI.getDebugLoc();
5222 // The algorithm is:
5223 //
5224 // hi1 = getreg(SHADER_CYCLES_HI)
5225 // lo1 = getreg(SHADER_CYCLES_LO)
5226 // hi2 = getreg(SHADER_CYCLES_HI)
5227 //
5228 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5229 // Otherwise there was overflow and the result is hi2:0. In both cases the
5230 // result should represent the actual time at some point during the sequence
5231 // of three getregs.
5232 using namespace AMDGPU::Hwreg;
5233 Register RegHi1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5234 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegHi1)
5235 .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES_HI, Values: 0, Values: 32));
5236 Register RegLo1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5237 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegLo1)
5238 .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES, Values: 0, Values: 32));
5239 Register RegHi2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5240 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegHi2)
5241 .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES_HI, Values: 0, Values: 32));
5242 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_EQ_U32))
5243 .addReg(RegNo: RegHi1)
5244 .addReg(RegNo: RegHi2);
5245 Register RegLo = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5246 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: RegLo)
5247 .addReg(RegNo: RegLo1)
5248 .addImm(Val: 0);
5249 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE))
5250 .add(MO: MI.getOperand(i: 0))
5251 .addReg(RegNo: RegLo)
5252 .addImm(Val: AMDGPU::sub0)
5253 .addReg(RegNo: RegHi2)
5254 .addImm(Val: AMDGPU::sub1);
5255 MI.eraseFromParent();
5256 return BB;
5257 }
5258 case AMDGPU::SI_INDIRECT_SRC_V1:
5259 case AMDGPU::SI_INDIRECT_SRC_V2:
5260 case AMDGPU::SI_INDIRECT_SRC_V4:
5261 case AMDGPU::SI_INDIRECT_SRC_V8:
5262 case AMDGPU::SI_INDIRECT_SRC_V9:
5263 case AMDGPU::SI_INDIRECT_SRC_V10:
5264 case AMDGPU::SI_INDIRECT_SRC_V11:
5265 case AMDGPU::SI_INDIRECT_SRC_V12:
5266 case AMDGPU::SI_INDIRECT_SRC_V16:
5267 case AMDGPU::SI_INDIRECT_SRC_V32:
5268 return emitIndirectSrc(MI, MBB&: *BB, ST: *getSubtarget());
5269 case AMDGPU::SI_INDIRECT_DST_V1:
5270 case AMDGPU::SI_INDIRECT_DST_V2:
5271 case AMDGPU::SI_INDIRECT_DST_V4:
5272 case AMDGPU::SI_INDIRECT_DST_V8:
5273 case AMDGPU::SI_INDIRECT_DST_V9:
5274 case AMDGPU::SI_INDIRECT_DST_V10:
5275 case AMDGPU::SI_INDIRECT_DST_V11:
5276 case AMDGPU::SI_INDIRECT_DST_V12:
5277 case AMDGPU::SI_INDIRECT_DST_V16:
5278 case AMDGPU::SI_INDIRECT_DST_V32:
5279 return emitIndirectDst(MI, MBB&: *BB, ST: *getSubtarget());
5280 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5281 case AMDGPU::SI_KILL_I1_PSEUDO:
5282 return splitKillBlock(MI, BB);
5283 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5284 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5285 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5286 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5287
5288 Register Dst = MI.getOperand(i: 0).getReg();
5289 const MachineOperand &Src0 = MI.getOperand(i: 1);
5290 const MachineOperand &Src1 = MI.getOperand(i: 2);
5291 const DebugLoc &DL = MI.getDebugLoc();
5292 Register SrcCond = MI.getOperand(i: 3).getReg();
5293
5294 Register DstLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5295 Register DstHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
5296 const auto *CondRC = TRI->getRegClass(RCID: AMDGPU::SReg_1_XEXECRegClassID);
5297 Register SrcCondCopy = MRI.createVirtualRegister(RegClass: CondRC);
5298
5299 const TargetRegisterClass *Src0RC = Src0.isReg()
5300 ? MRI.getRegClass(Reg: Src0.getReg())
5301 : &AMDGPU::VReg_64RegClass;
5302 const TargetRegisterClass *Src1RC = Src1.isReg()
5303 ? MRI.getRegClass(Reg: Src1.getReg())
5304 : &AMDGPU::VReg_64RegClass;
5305
5306 const TargetRegisterClass *Src0SubRC =
5307 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5308 const TargetRegisterClass *Src1SubRC =
5309 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5310
5311 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5312 MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
5313 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5314 MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
5315
5316 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5317 MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
5318 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5319 MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
5320
5321 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: SrcCondCopy)
5322 .addReg(RegNo: SrcCond);
5323 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstLo)
5324 .addImm(Val: 0)
5325 .add(MO: Src0Sub0)
5326 .addImm(Val: 0)
5327 .add(MO: Src1Sub0)
5328 .addReg(RegNo: SrcCondCopy);
5329 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstHi)
5330 .addImm(Val: 0)
5331 .add(MO: Src0Sub1)
5332 .addImm(Val: 0)
5333 .add(MO: Src1Sub1)
5334 .addReg(RegNo: SrcCondCopy);
5335
5336 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
5337 .addReg(RegNo: DstLo)
5338 .addImm(Val: AMDGPU::sub0)
5339 .addReg(RegNo: DstHi)
5340 .addImm(Val: AMDGPU::sub1);
5341 MI.eraseFromParent();
5342 return BB;
5343 }
5344 case AMDGPU::SI_BR_UNDEF: {
5345 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5346 const DebugLoc &DL = MI.getDebugLoc();
5347 MachineInstr *Br = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
5348 .add(MO: MI.getOperand(i: 0));
5349 Br->getOperand(i: 1).setIsUndef(); // read undef SCC
5350 MI.eraseFromParent();
5351 return BB;
5352 }
5353 case AMDGPU::ADJCALLSTACKUP:
5354 case AMDGPU::ADJCALLSTACKDOWN: {
5355 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
5356 MachineInstrBuilder MIB(*MF, &MI);
5357 MIB.addReg(RegNo: Info->getStackPtrOffsetReg(), flags: RegState::ImplicitDefine)
5358 .addReg(RegNo: Info->getStackPtrOffsetReg(), flags: RegState::Implicit);
5359 return BB;
5360 }
5361 case AMDGPU::SI_CALL_ISEL: {
5362 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5363 const DebugLoc &DL = MI.getDebugLoc();
5364
5365 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(MF: *MF);
5366
5367 MachineInstrBuilder MIB;
5368 MIB = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_CALL), DestReg: ReturnAddrReg);
5369
5370 for (const MachineOperand &MO : MI.operands())
5371 MIB.add(MO);
5372
5373 MIB.cloneMemRefs(OtherMI: MI);
5374 MI.eraseFromParent();
5375 return BB;
5376 }
5377 case AMDGPU::V_ADD_CO_U32_e32:
5378 case AMDGPU::V_SUB_CO_U32_e32:
5379 case AMDGPU::V_SUBREV_CO_U32_e32: {
5380 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5381 const DebugLoc &DL = MI.getDebugLoc();
5382 unsigned Opc = MI.getOpcode();
5383
5384 bool NeedClampOperand = false;
5385 if (TII->pseudoToMCOpcode(Opcode: Opc) == -1) {
5386 Opc = AMDGPU::getVOPe64(Opcode: Opc);
5387 NeedClampOperand = true;
5388 }
5389
5390 auto I = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: MI.getOperand(i: 0).getReg());
5391 if (TII->isVOP3(MI: *I)) {
5392 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5393 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5394 I.addReg(RegNo: TRI->getVCC(), flags: RegState::Define);
5395 }
5396 I.add(MO: MI.getOperand(i: 1))
5397 .add(MO: MI.getOperand(i: 2));
5398 if (NeedClampOperand)
5399 I.addImm(Val: 0); // clamp bit for e64 encoding
5400
5401 TII->legalizeOperands(MI&: *I);
5402
5403 MI.eraseFromParent();
5404 return BB;
5405 }
5406 case AMDGPU::V_ADDC_U32_e32:
5407 case AMDGPU::V_SUBB_U32_e32:
5408 case AMDGPU::V_SUBBREV_U32_e32:
5409 // These instructions have an implicit use of vcc which counts towards the
5410 // constant bus limit.
5411 TII->legalizeOperands(MI);
5412 return BB;
5413 case AMDGPU::DS_GWS_INIT:
5414 case AMDGPU::DS_GWS_SEMA_BR:
5415 case AMDGPU::DS_GWS_BARRIER:
5416 TII->enforceOperandRCAlignment(MI, OpName: AMDGPU::OpName::data0);
5417 [[fallthrough]];
5418 case AMDGPU::DS_GWS_SEMA_V:
5419 case AMDGPU::DS_GWS_SEMA_P:
5420 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5421     // An s_waitcnt 0 is required to be the instruction immediately following.
5422 if (getSubtarget()->hasGWSAutoReplay()) {
5423 bundleInstWithWaitcnt(MI);
5424 return BB;
5425 }
5426
5427 return emitGWSMemViolTestLoop(MI, BB);
5428 case AMDGPU::S_SETREG_B32: {
5429 // Try to optimize cases that only set the denormal mode or rounding mode.
5430 //
5431 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5432 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5433 // instead.
5434 //
5435     // FIXME: This could be predicated on the immediate, but tablegen doesn't
5436     // allow a no-side-effect instruction in the output of a side-effecting
5437     // pattern.
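    // For example (roughly): s_setreg_b32 hwreg(HW_REG_MODE, 0, 8), s0, where
    // s0 was materialized by s_mov_b32 s0, 0x73, becomes:
    //   s_round_mode  0x3     ; ImmVal & 0xf
    //   s_denorm_mode 0x7     ; ImmVal >> 4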
5438 auto [ID, Offset, Width] =
5439 AMDGPU::Hwreg::HwregEncoding::decode(Encoded: MI.getOperand(i: 1).getImm());
5440 if (ID != AMDGPU::Hwreg::ID_MODE)
5441 return BB;
5442
5443 const unsigned WidthMask = maskTrailingOnes<unsigned>(N: Width);
5444 const unsigned SetMask = WidthMask << Offset;
5445
5446 if (getSubtarget()->hasDenormModeInst()) {
5447 unsigned SetDenormOp = 0;
5448 unsigned SetRoundOp = 0;
5449
5450 // The dedicated instructions can only set the whole denorm or round mode
5451 // at once, not a subset of bits in either.
5452 if (SetMask ==
5453 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5454 // If this fully sets both the round and denorm mode, emit the two
5455 // dedicated instructions for these.
5456 SetRoundOp = AMDGPU::S_ROUND_MODE;
5457 SetDenormOp = AMDGPU::S_DENORM_MODE;
5458 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5459 SetRoundOp = AMDGPU::S_ROUND_MODE;
5460 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5461 SetDenormOp = AMDGPU::S_DENORM_MODE;
5462 }
5463
5464 if (SetRoundOp || SetDenormOp) {
5465 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5466 MachineInstr *Def = MRI.getVRegDef(Reg: MI.getOperand(i: 0).getReg());
5467 if (Def && Def->isMoveImmediate() && Def->getOperand(i: 1).isImm()) {
5468 unsigned ImmVal = Def->getOperand(i: 1).getImm();
5469 if (SetRoundOp) {
5470 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: SetRoundOp))
5471 .addImm(Val: ImmVal & 0xf);
5472
5473 // If we also have the denorm mode, get just the denorm mode bits.
5474 ImmVal >>= 4;
5475 }
5476
5477 if (SetDenormOp) {
5478 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: SetDenormOp))
5479 .addImm(Val: ImmVal & 0xf);
5480 }
5481
5482 MI.eraseFromParent();
5483 return BB;
5484 }
5485 }
5486 }
5487
    // If only FP bits are touched, use the no-side-effects pseudo.
5489 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5490 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5491 MI.setDesc(TII->get(Opcode: AMDGPU::S_SETREG_B32_mode));
5492
5493 return BB;
5494 }
5495 case AMDGPU::S_INVERSE_BALLOT_U32:
5496 case AMDGPU::S_INVERSE_BALLOT_U64:
5497 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5498 // necessary. After that they are equivalent to a COPY.
5499 MI.setDesc(TII->get(Opcode: AMDGPU::COPY));
5500 return BB;
5501 case AMDGPU::ENDPGM_TRAP: {
5502 const DebugLoc &DL = MI.getDebugLoc();
5503 if (BB->succ_empty() && std::next(x: MI.getIterator()) == BB->end()) {
5504 MI.setDesc(TII->get(Opcode: AMDGPU::S_ENDPGM));
5505 MI.addOperand(Op: MachineOperand::CreateImm(Val: 0));
5506 return BB;
5507 }
5508
    // We need a block split to make the real endpgm a terminator. We also don't
    // want to break phis in successor blocks, so we can't just delete everything
    // up to the end of the block.
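    // After this lowering, the original block ends with an s_cbranch_execnz to
    // a new TrapBB containing only "s_endpgm 0", while SplitBB holds the
    // instructions that followed the ENDPGM_TRAP.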
5512
5513 MachineBasicBlock *SplitBB = BB->splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
5514 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
5515 MF->push_back(MBB: TrapBB);
5516 BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ENDPGM))
5517 .addImm(Val: 0);
5518 BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
5519 .addMBB(MBB: TrapBB);
5520
5521 BB->addSuccessor(Succ: TrapBB);
5522 MI.eraseFromParent();
5523 return SplitBB;
5524 }
5525 case AMDGPU::SIMULATED_TRAP: {
5526 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5527 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5528 MachineBasicBlock *SplitBB =
5529 TII->insertSimulatedTrap(MRI, MBB&: *BB, MI, DL: MI.getDebugLoc());
5530 MI.eraseFromParent();
5531 return SplitBB;
5532 }
5533 default:
5534 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5535 if (!MI.mayStore())
5536 AddMemOpInit(MI);
5537 return BB;
5538 }
5539 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, MBB: BB);
5540 }
5541}
5542
5543bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
5544 // This currently forces unfolding various combinations of fsub into fma with
5545 // free fneg'd operands. As long as we have fast FMA (controlled by
5546 // isFMAFasterThanFMulAndFAdd), we should perform these.
5547
5548 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5549 // most of these combines appear to be cycle neutral but save on instruction
5550 // count / code size.
5551 return true;
5552}
5553
5554bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
5555
5556EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
5557 EVT VT) const {
5558 if (!VT.isVector()) {
5559 return MVT::i1;
5560 }
5561 return EVT::getVectorVT(Context&: Ctx, VT: MVT::i1, NumElements: VT.getVectorNumElements());
5562}
5563
5564MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
5565 // TODO: Should i16 be used always if legal? For now it would force VALU
5566 // shifts.
5567 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5568}
5569
5570LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
5571 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5572 ? Ty.changeElementSize(NewEltSize: 16)
5573 : Ty.changeElementSize(NewEltSize: 32);
5574}
5575
// Answering this is somewhat tricky and depends on the specific device, since
// different devices have different rates for fma and for f64 operations.
5578//
5579// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5580// regardless of which device (although the number of cycles differs between
5581// devices), so it is always profitable for f64.
5582//
5583// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5584// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5585// which we can always do even without fused FP ops since it returns the same
5586// result as the separate operations and since it is always full
// rate. Therefore, we lie and report that fma is not faster for f32. However,
// v_mad_f32 does not support denormals, so we do report fma as faster if we
// have a fast-fma device and denormals are required.
5590//
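// Roughly speaking, returning true here is what lets llvm.fmuladd (and
// contractable fmul + fadd pairs) be formed into fma/fmac nodes for the given
// type; returning false keeps the separate multiply and add.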
5591bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5592 EVT VT) const {
5593 VT = VT.getScalarType();
5594
5595 switch (VT.getSimpleVT().SimpleTy) {
5596 case MVT::f32: {
5597 // If mad is not available this depends only on if f32 fma is full rate.
5598 if (!Subtarget->hasMadMacF32Insts())
5599 return Subtarget->hasFastFMAF32();
5600
5601 // Otherwise f32 mad is always full rate and returns the same result as
5602 // the separate operations so should be preferred over fma.
    // However, it does not support denormals.
5604 if (!denormalModeIsFlushAllF32(MF))
5605 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5606
5607 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5608 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5609 }
5610 case MVT::f64:
5611 return true;
5612 case MVT::f16:
5613 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5614 default:
5615 break;
5616 }
5617
5618 return false;
5619}
5620
5621bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
5622 LLT Ty) const {
5623 switch (Ty.getScalarSizeInBits()) {
5624 case 16:
5625 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f16);
5626 case 32:
5627 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f32);
5628 case 64:
5629 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f64);
5630 default:
5631 break;
5632 }
5633
5634 return false;
5635}
5636
5637bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
5638 if (!Ty.isScalar())
5639 return false;
5640
5641 if (Ty.getScalarSizeInBits() == 16)
5642 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(MF: *MI.getMF());
5643 if (Ty.getScalarSizeInBits() == 32)
5644 return Subtarget->hasMadMacF32Insts() &&
5645 denormalModeIsFlushAllF32(MF: *MI.getMF());
5646
5647 return false;
5648}
5649
5650bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
5651 const SDNode *N) const {
5652 // TODO: Check future ftz flag
5653 // v_mad_f32/v_mac_f32 do not support denormals.
5654 EVT VT = N->getValueType(ResNo: 0);
5655 if (VT == MVT::f32)
5656 return Subtarget->hasMadMacF32Insts() &&
5657 denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
5658 if (VT == MVT::f16) {
5659 return Subtarget->hasMadF16() &&
5660 denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction());
5661 }
5662
5663 return false;
5664}
5665
5666//===----------------------------------------------------------------------===//
5667// Custom DAG Lowering Operations
5668//===----------------------------------------------------------------------===//
5669
5670// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5671// wider vector type is legal.
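// For example, a v4f16 operation is split into two v2f16 operations on the low
// and high halves, which are recombined with CONCAT_VECTORS, rather than being
// scalarized into four f16 operations.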
5672SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
5673 SelectionDAG &DAG) const {
5674 unsigned Opc = Op.getOpcode();
5675 EVT VT = Op.getValueType();
5676 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5677 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5678 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5679 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5680
5681 SDValue Lo, Hi;
5682 std::tie(args&: Lo, args&: Hi) = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
5683
5684 SDLoc SL(Op);
5685 SDValue OpLo = DAG.getNode(Opcode: Opc, DL: SL, VT: Lo.getValueType(), Operand: Lo,
5686 Flags: Op->getFlags());
5687 SDValue OpHi = DAG.getNode(Opcode: Opc, DL: SL, VT: Hi.getValueType(), Operand: Hi,
5688 Flags: Op->getFlags());
5689
5690 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
5691}
5692
5693// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5694// wider vector type is legal.
5695SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
5696 SelectionDAG &DAG) const {
5697 unsigned Opc = Op.getOpcode();
5698 EVT VT = Op.getValueType();
5699 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5700 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5701 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5702 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5703
5704 SDValue Lo0, Hi0;
5705 std::tie(args&: Lo0, args&: Hi0) = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
5706 SDValue Lo1, Hi1;
5707 std::tie(args&: Lo1, args&: Hi1) = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 1);
5708
5709 SDLoc SL(Op);
5710
5711 SDValue OpLo = DAG.getNode(Opcode: Opc, DL: SL, VT: Lo0.getValueType(), N1: Lo0, N2: Lo1,
5712 Flags: Op->getFlags());
5713 SDValue OpHi = DAG.getNode(Opcode: Opc, DL: SL, VT: Hi0.getValueType(), N1: Hi0, N2: Hi1,
5714 Flags: Op->getFlags());
5715
5716 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
5717}
5718
5719SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
5720 SelectionDAG &DAG) const {
5721 unsigned Opc = Op.getOpcode();
5722 EVT VT = Op.getValueType();
5723 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5724 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5725 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5726 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5727 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5728 VT == MVT::v32bf16);
5729
5730 SDValue Lo0, Hi0;
5731 SDValue Op0 = Op.getOperand(i: 0);
5732 std::tie(args&: Lo0, args&: Hi0) = Op0.getValueType().isVector()
5733 ? DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0)
5734 : std::pair(Op0, Op0);
5735 SDValue Lo1, Hi1;
5736 std::tie(args&: Lo1, args&: Hi1) = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 1);
5737 SDValue Lo2, Hi2;
5738 std::tie(args&: Lo2, args&: Hi2) = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 2);
5739
5740 SDLoc SL(Op);
5741 auto ResVT = DAG.GetSplitDestVTs(VT);
5742
5743 SDValue OpLo = DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.first, N1: Lo0, N2: Lo1, N3: Lo2,
5744 Flags: Op->getFlags());
5745 SDValue OpHi = DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.second, N1: Hi0, N2: Hi1, N3: Hi2,
5746 Flags: Op->getFlags());
5747
5748 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
5749}
5752SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
5753 switch (Op.getOpcode()) {
5754 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
5755 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
5756 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
5757 case ISD::LOAD: {
5758 SDValue Result = LowerLOAD(Op, DAG);
5759 assert((!Result.getNode() ||
5760 Result.getNode()->getNumValues() == 2) &&
5761 "Load should return a value and a chain");
5762 return Result;
5763 }
5764 case ISD::FSQRT: {
5765 EVT VT = Op.getValueType();
5766 if (VT == MVT::f32)
5767 return lowerFSQRTF32(Op, DAG);
5768 if (VT == MVT::f64)
5769 return lowerFSQRTF64(Op, DAG);
5770 return SDValue();
5771 }
5772 case ISD::FSIN:
5773 case ISD::FCOS:
5774 return LowerTrig(Op, DAG);
5775 case ISD::SELECT: return LowerSELECT(Op, DAG);
5776 case ISD::FDIV: return LowerFDIV(Op, DAG);
5777 case ISD::FFREXP: return LowerFFREXP(Op, DAG);
5778 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
5779 case ISD::STORE: return LowerSTORE(Op, DAG);
5780 case ISD::GlobalAddress: {
5781 MachineFunction &MF = DAG.getMachineFunction();
5782 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
5783 return LowerGlobalAddress(MFI, Op, DAG);
5784 }
5785 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5786 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
5787 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
5788 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
5789 case ISD::INSERT_SUBVECTOR:
5790 return lowerINSERT_SUBVECTOR(Op, DAG);
5791 case ISD::INSERT_VECTOR_ELT:
5792 return lowerINSERT_VECTOR_ELT(Op, DAG);
5793 case ISD::EXTRACT_VECTOR_ELT:
5794 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5795 case ISD::VECTOR_SHUFFLE:
5796 return lowerVECTOR_SHUFFLE(Op, DAG);
5797 case ISD::SCALAR_TO_VECTOR:
5798 return lowerSCALAR_TO_VECTOR(Op, DAG);
5799 case ISD::BUILD_VECTOR:
5800 return lowerBUILD_VECTOR(Op, DAG);
5801 case ISD::FP_ROUND:
5802 case ISD::STRICT_FP_ROUND:
5803 return lowerFP_ROUND(Op, DAG);
5804 case ISD::FPTRUNC_ROUND: {
5805 unsigned Opc;
5806 SDLoc DL(Op);
5807
5808 if (Op.getOperand(i: 0)->getValueType(ResNo: 0) != MVT::f32)
5809 return SDValue();
5810
5811 // Get the rounding mode from the last operand
5812 int RoundMode = Op.getConstantOperandVal(i: 1);
5813 if (RoundMode == (int)RoundingMode::TowardPositive)
5814 Opc = AMDGPUISD::FPTRUNC_ROUND_UPWARD;
5815 else if (RoundMode == (int)RoundingMode::TowardNegative)
5816 Opc = AMDGPUISD::FPTRUNC_ROUND_DOWNWARD;
5817 else
5818 return SDValue();
5819
5820 return DAG.getNode(Opcode: Opc, DL, VTList: Op.getNode()->getVTList(), N: Op->getOperand(Num: 0));
5821 }
5822 case ISD::TRAP:
5823 return lowerTRAP(Op, DAG);
5824 case ISD::DEBUGTRAP:
5825 return lowerDEBUGTRAP(Op, DAG);
5826 case ISD::ABS:
5827 case ISD::FABS:
5828 case ISD::FNEG:
5829 case ISD::FCANONICALIZE:
5830 case ISD::BSWAP:
5831 return splitUnaryVectorOp(Op, DAG);
5832 case ISD::FMINNUM:
5833 case ISD::FMAXNUM:
5834 return lowerFMINNUM_FMAXNUM(Op, DAG);
5835 case ISD::FLDEXP:
5836 case ISD::STRICT_FLDEXP:
5837 return lowerFLDEXP(Op, DAG);
5838 case ISD::FMA:
5839 return splitTernaryVectorOp(Op, DAG);
5840 case ISD::FP_TO_SINT:
5841 case ISD::FP_TO_UINT:
5842 return LowerFP_TO_INT(Op, DAG);
5843 case ISD::SHL:
5844 case ISD::SRA:
5845 case ISD::SRL:
5846 case ISD::ADD:
5847 case ISD::SUB:
5848 case ISD::SMIN:
5849 case ISD::SMAX:
5850 case ISD::UMIN:
5851 case ISD::UMAX:
5852 case ISD::FADD:
5853 case ISD::FMUL:
5854 case ISD::FMINNUM_IEEE:
5855 case ISD::FMAXNUM_IEEE:
5856 case ISD::FMINIMUM:
5857 case ISD::FMAXIMUM:
5858 case ISD::UADDSAT:
5859 case ISD::USUBSAT:
5860 case ISD::SADDSAT:
5861 case ISD::SSUBSAT:
5862 return splitBinaryVectorOp(Op, DAG);
5863 case ISD::MUL:
5864 return lowerMUL(Op, DAG);
5865 case ISD::SMULO:
5866 case ISD::UMULO:
5867 return lowerXMULO(Op, DAG);
5868 case ISD::SMUL_LOHI:
5869 case ISD::UMUL_LOHI:
5870 return lowerXMUL_LOHI(Op, DAG);
5871 case ISD::DYNAMIC_STACKALLOC:
5872 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5873 case ISD::STACKSAVE:
5874 return LowerSTACKSAVE(Op, DAG);
5875 case ISD::GET_ROUNDING:
5876 return lowerGET_ROUNDING(Op, DAG);
5877 case ISD::SET_ROUNDING:
5878 return lowerSET_ROUNDING(Op, DAG);
5879 case ISD::PREFETCH:
5880 return lowerPREFETCH(Op, DAG);
5881 case ISD::FP_EXTEND:
5882 case ISD::STRICT_FP_EXTEND:
5883 return lowerFP_EXTEND(Op, DAG);
5884 case ISD::GET_FPENV:
5885 return lowerGET_FPENV(Op, DAG);
5886 case ISD::SET_FPENV:
5887 return lowerSET_FPENV(Op, DAG);
5888 }
5889 return SDValue();
5890}
5891
5892// Used for D16: Casts the result of an instruction into the right vector,
5893// packs values if loads return unpacked values.
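// For example, with unpacked D16 a v4f16 load is returned as v4i32; each
// element is truncated to i16, the pieces are rebuilt as v4i16, and the result
// is bitcast back to v4f16.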
5894static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
5895 const SDLoc &DL,
5896 SelectionDAG &DAG, bool Unpacked) {
5897 if (!LoadVT.isVector())
5898 return Result;
5899
5900 // Cast back to the original packed type or to a larger type that is a
  // multiple of 32 bits for D16. Widening the return type is required for
  // legalization.
5903 EVT FittingLoadVT = LoadVT;
5904 if ((LoadVT.getVectorNumElements() % 2) == 1) {
5905 FittingLoadVT =
5906 EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(),
5907 NumElements: LoadVT.getVectorNumElements() + 1);
5908 }
5909
5910 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
5911 // Truncate to v2i16/v4i16.
5912 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
5913
    // Work around the legalizer not scalarizing the truncate after vector op
    // legalization but also not creating an intermediate vector trunc.
5916 SmallVector<SDValue, 4> Elts;
5917 DAG.ExtractVectorElements(Op: Result, Args&: Elts);
5918 for (SDValue &Elt : Elts)
5919 Elt = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Elt);
5920
    // Pad illegal v1i16/v3f16 to v4i16
5922 if ((LoadVT.getVectorNumElements() % 2) == 1)
5923 Elts.push_back(Elt: DAG.getUNDEF(VT: MVT::i16));
5924
5925 Result = DAG.getBuildVector(VT: IntLoadVT, DL, Ops: Elts);
5926
5927 // Bitcast to original type (v2f16/v4f16).
5928 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result);
5929 }
5930
5931 // Cast back to the original packed type.
5932 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result);
5933}
5934
5935SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
5936 MemSDNode *M,
5937 SelectionDAG &DAG,
5938 ArrayRef<SDValue> Ops,
5939 bool IsIntrinsic) const {
5940 SDLoc DL(M);
5941
5942 bool Unpacked = Subtarget->hasUnpackedD16VMem();
5943 EVT LoadVT = M->getValueType(ResNo: 0);
5944
5945 EVT EquivLoadVT = LoadVT;
5946 if (LoadVT.isVector()) {
5947 if (Unpacked) {
5948 EquivLoadVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
5949 NumElements: LoadVT.getVectorNumElements());
5950 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
5951 // Widen v3f16 to legal type
5952 EquivLoadVT =
5953 EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(),
5954 NumElements: LoadVT.getVectorNumElements() + 1);
5955 }
5956 }
5957
5958 // Change from v4f16/v2f16 to EquivLoadVT.
5959 SDVTList VTList = DAG.getVTList(VT1: EquivLoadVT, VT2: MVT::Other);
5960
5961 SDValue Load
5962 = DAG.getMemIntrinsicNode(
5963 Opcode: IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, dl: DL,
5964 VTList, Ops, MemVT: M->getMemoryVT(),
5965 MMO: M->getMemOperand());
5966
5967 SDValue Adjusted = adjustLoadValueTypeImpl(Result: Load, LoadVT, DL, DAG, Unpacked);
5968
5969 return DAG.getMergeValues(Ops: { Adjusted, Load.getValue(R: 1) }, dl: DL);
5970}
5971
5972SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
5973 SelectionDAG &DAG,
5974 ArrayRef<SDValue> Ops) const {
5975 SDLoc DL(M);
5976 EVT LoadVT = M->getValueType(ResNo: 0);
5977 EVT EltType = LoadVT.getScalarType();
5978 EVT IntVT = LoadVT.changeTypeToInteger();
5979
5980 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
5981
5982 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
5983 bool IsTFE = M->getNumValues() == 3;
5984
5985 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
5986 : AMDGPUISD::BUFFER_LOAD_FORMAT)
5987 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
5988 : AMDGPUISD::BUFFER_LOAD;
5989
5990 if (IsD16) {
5991 return adjustLoadValueType(Opcode: AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
5992 }
5993
5994 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5995 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
5996 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, MMO: M->getMemOperand(),
5997 IsTFE);
5998
5999 if (isTypeLegal(VT: LoadVT)) {
6000 return getMemIntrinsicNode(Opcode: Opc, DL, VTList: M->getVTList(), Ops, MemVT: IntVT,
6001 MMO: M->getMemOperand(), DAG);
6002 }
6003
6004 EVT CastVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: LoadVT);
6005 SDVTList VTList = DAG.getVTList(VT1: CastVT, VT2: MVT::Other);
6006 SDValue MemNode = getMemIntrinsicNode(Opcode: Opc, DL, VTList, Ops, MemVT: CastVT,
6007 MMO: M->getMemOperand(), DAG);
6008 return DAG.getMergeValues(
6009 Ops: {DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: MemNode), MemNode.getValue(R: 1)},
6010 dl: DL);
6011}
6012
6013static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
6014 SDNode *N, SelectionDAG &DAG) {
6015 EVT VT = N->getValueType(ResNo: 0);
6016 unsigned CondCode = N->getConstantOperandVal(Num: 3);
6017 if (!ICmpInst::isIntPredicate(P: static_cast<ICmpInst::Predicate>(CondCode)))
6018 return DAG.getUNDEF(VT);
6019
6020 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6021
6022 SDValue LHS = N->getOperand(Num: 1);
6023 SDValue RHS = N->getOperand(Num: 2);
6024
6025 SDLoc DL(N);
6026
6027 EVT CmpVT = LHS.getValueType();
6028 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(VT: MVT::i16)) {
6029 unsigned PromoteOp = ICmpInst::isSigned(predicate: IcInput) ?
6030 ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6031 LHS = DAG.getNode(Opcode: PromoteOp, DL, VT: MVT::i32, Operand: LHS);
6032 RHS = DAG.getNode(Opcode: PromoteOp, DL, VT: MVT::i32, Operand: RHS);
6033 }
6034
6035 ISD::CondCode CCOpcode = getICmpCondCode(Pred: IcInput);
6036
6037 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6038 EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize);
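  // AMDGPUISD::SETCC produces a lane mask as wide as the wavefront (i32 for
  // wave32, i64 for wave64); zero-extend or truncate below if the requested
  // result type differs.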
6039
6040 SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL, VT: CCVT, N1: LHS, N2: RHS,
6041 N3: DAG.getCondCode(Cond: CCOpcode));
6042 if (VT.bitsEq(VT: CCVT))
6043 return SetCC;
6044 return DAG.getZExtOrTrunc(Op: SetCC, DL, VT);
6045}
6046
6047static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
6048 SDNode *N, SelectionDAG &DAG) {
6049 EVT VT = N->getValueType(ResNo: 0);
6050
6051 unsigned CondCode = N->getConstantOperandVal(Num: 3);
6052 if (!FCmpInst::isFPPredicate(P: static_cast<FCmpInst::Predicate>(CondCode)))
6053 return DAG.getUNDEF(VT);
6054
6055 SDValue Src0 = N->getOperand(Num: 1);
6056 SDValue Src1 = N->getOperand(Num: 2);
6057 EVT CmpVT = Src0.getValueType();
6058 SDLoc SL(N);
6059
6060 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(VT: CmpVT)) {
6061 Src0 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src0);
6062 Src1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src1);
6063 }
6064
6065 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6066 ISD::CondCode CCOpcode = getFCmpCondCode(Pred: IcInput);
6067 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6068 EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize);
6069 SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT: CCVT, N1: Src0,
6070 N2: Src1, N3: DAG.getCondCode(Cond: CCOpcode));
6071 if (VT.bitsEq(VT: CCVT))
6072 return SetCC;
6073 return DAG.getZExtOrTrunc(Op: SetCC, DL: SL, VT);
6074}
6075
6076static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
6077 SelectionDAG &DAG) {
6078 EVT VT = N->getValueType(ResNo: 0);
6079 SDValue Src = N->getOperand(Num: 1);
6080 SDLoc SL(N);
6081
6082 if (Src.getOpcode() == ISD::SETCC) {
6083 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6084 return DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT, N1: Src.getOperand(i: 0),
6085 N2: Src.getOperand(i: 1), N3: Src.getOperand(i: 2));
6086 }
6087 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Val&: Src)) {
6088 // (ballot 0) -> 0
6089 if (Arg->isZero())
6090 return DAG.getConstant(Val: 0, DL: SL, VT);
6091
6092 // (ballot 1) -> EXEC/EXEC_LO
6093 if (Arg->isOne()) {
6094 Register Exec;
6095 if (VT.getScalarSizeInBits() == 32)
6096 Exec = AMDGPU::EXEC_LO;
6097 else if (VT.getScalarSizeInBits() == 64)
6098 Exec = AMDGPU::EXEC;
6099 else
6100 return SDValue();
6101
6102 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: Exec, VT);
6103 }
6104 }
6105
6106 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6107 // ISD::SETNE)
6108 return DAG.getNode(
6109 Opcode: AMDGPUISD::SETCC, DL: SL, VT, N1: DAG.getZExtOrTrunc(Op: Src, DL: SL, VT: MVT::i32),
6110 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), N3: DAG.getCondCode(Cond: ISD::SETNE));
6111}
6112
6113static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
6114 SelectionDAG &DAG) {
6115 EVT VT = N->getValueType(ResNo: 0);
6116 unsigned ValSize = VT.getSizeInBits();
6117 unsigned IID = N->getConstantOperandVal(Num: 0);
6118 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6119 IID == Intrinsic::amdgcn_permlanex16;
6120 SDLoc SL(N);
6121 MVT IntVT = MVT::getIntegerVT(BitWidth: ValSize);
6122
6123 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6124 SDValue Src2, MVT ValT) -> SDValue {
6125 SmallVector<SDValue, 8> Operands;
6126 switch (IID) {
6127 case Intrinsic::amdgcn_permlane16:
6128 case Intrinsic::amdgcn_permlanex16:
6129 Operands.push_back(Elt: N->getOperand(Num: 6));
6130 Operands.push_back(Elt: N->getOperand(Num: 5));
6131 Operands.push_back(Elt: N->getOperand(Num: 4));
6132 [[fallthrough]];
6133 case Intrinsic::amdgcn_writelane:
6134 Operands.push_back(Elt: Src2);
6135 [[fallthrough]];
6136 case Intrinsic::amdgcn_readlane:
6137 Operands.push_back(Elt: Src1);
6138 [[fallthrough]];
6139 case Intrinsic::amdgcn_readfirstlane:
6140 case Intrinsic::amdgcn_permlane64:
6141 Operands.push_back(Elt: Src0);
6142 break;
6143 default:
6144 llvm_unreachable("unhandled lane op");
6145 }
6146
6147 Operands.push_back(Elt: DAG.getTargetConstant(Val: IID, DL: SL, VT: MVT::i32));
6148 std::reverse(first: Operands.begin(), last: Operands.end());
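    // The switch above pushes operands from last to first and the intrinsic ID
    // after them, so once reversed the list reads [IID, Src0, Src1, ...],
    // matching the operand order INTRINSIC_WO_CHAIN expects.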
6149
6150 if (SDNode *GL = N->getGluedNode()) {
6151 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6152 GL = GL->getOperand(Num: 0).getNode();
6153 Operands.push_back(Elt: DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL: SL, VT: MVT::Glue,
6154 Operand: SDValue(GL, 0)));
6155 }
6156
6157 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: ValT, Ops: Operands);
6158 };
6159
6160 SDValue Src0 = N->getOperand(Num: 1);
6161 SDValue Src1, Src2;
6162 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6163 IsPermLane16) {
6164 Src1 = N->getOperand(Num: 2);
6165 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
6166 Src2 = N->getOperand(Num: 3);
6167 }
6168
6169 if (ValSize == 32) {
6170 // Already legal
6171 return SDValue();
6172 }
6173
6174 if (ValSize < 32) {
6175 bool IsFloat = VT.isFloatingPoint();
6176 Src0 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src0) : Src0,
6177 DL: SL, VT: MVT::i32);
6178
6179 if (IsPermLane16) {
6180 Src1 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src1) : Src1,
6181 DL: SL, VT: MVT::i32);
6182 }
6183
6184 if (IID == Intrinsic::amdgcn_writelane) {
6185 Src2 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src2) : Src2,
6186 DL: SL, VT: MVT::i32);
6187 }
6188
6189 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6190 SDValue Trunc = DAG.getAnyExtOrTrunc(Op: LaneOp, DL: SL, VT: IntVT);
6191 return IsFloat ? DAG.getBitcast(VT, V: Trunc) : Trunc;
6192 }
6193
6194 if (ValSize % 32 != 0)
6195 return SDValue();
6196
6197 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6198 EVT VT = N->getValueType(ResNo: 0);
6199 unsigned NE = VT.getVectorNumElements();
6200 EVT EltVT = VT.getVectorElementType();
6201 SmallVector<SDValue, 8> Scalars;
6202 unsigned NumOperands = N->getNumOperands();
6203 SmallVector<SDValue, 4> Operands(NumOperands);
6204 SDNode *GL = N->getGluedNode();
6205
6206 // only handle convergencectrl_glue
6207 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6208
6209 for (unsigned i = 0; i != NE; ++i) {
6210 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6211 ++j) {
6212 SDValue Operand = N->getOperand(Num: j);
6213 EVT OperandVT = Operand.getValueType();
6214 if (OperandVT.isVector()) {
6215 // A vector operand; extract a single element.
6216 EVT OperandEltVT = OperandVT.getVectorElementType();
6217 Operands[j] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: OperandEltVT,
6218 N1: Operand, N2: DAG.getVectorIdxConstant(Val: i, DL: SL));
6219 } else {
6220 // A scalar operand; just use it as is.
6221 Operands[j] = Operand;
6222 }
6223 }
6224
6225 if (GL)
6226 Operands[NumOperands - 1] =
6227 DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL: SL, VT: MVT::Glue,
6228 Operand: SDValue(GL->getOperand(Num: 0).getNode(), 0));
6229
6230 Scalars.push_back(Elt: DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: EltVT, Ops: Operands));
6231 }
6232
6233 EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NE);
6234 return DAG.getBuildVector(VT: VecVT, DL: SL, Ops: Scalars);
6235 };
6236
6237 if (VT.isVector()) {
6238 switch (MVT::SimpleValueType EltTy =
6239 VT.getVectorElementType().getSimpleVT().SimpleTy) {
6240 case MVT::i32:
6241 case MVT::f32: {
6242 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6243 return unrollLaneOp(LaneOp.getNode());
6244 }
6245 case MVT::i16:
6246 case MVT::f16:
6247 case MVT::bf16: {
6248 MVT SubVecVT = MVT::getVectorVT(VT: EltTy, NumElements: 2);
6249 SmallVector<SDValue, 4> Pieces;
6250 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6251 for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
6252 Src0SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src0,
6253 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
6254
6255 if (IsPermLane16)
6256 Src1SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src1,
6257 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
6258
6259 if (IID == Intrinsic::amdgcn_writelane)
6260 Src2SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src2,
6261 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
6262
6263 Pieces.push_back(
6264 Elt: IsPermLane16
6265 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6266 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6267 EltIdx += 2;
6268 }
6269 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT, Ops: Pieces);
6270 }
6271 default:
6272 // Handle all other cases by bitcasting to i32 vectors
6273 break;
6274 }
6275 }
6276
6277 MVT VecVT = MVT::getVectorVT(VT: MVT::i32, NumElements: ValSize / 32);
6278 Src0 = DAG.getBitcast(VT: VecVT, V: Src0);
6279
6280 if (IsPermLane16)
6281 Src1 = DAG.getBitcast(VT: VecVT, V: Src1);
6282
6283 if (IID == Intrinsic::amdgcn_writelane)
6284 Src2 = DAG.getBitcast(VT: VecVT, V: Src2);
6285
6286 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6287 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6288 return DAG.getBitcast(VT, V: UnrolledLaneOp);
6289}
6290
6291void SITargetLowering::ReplaceNodeResults(SDNode *N,
6292 SmallVectorImpl<SDValue> &Results,
6293 SelectionDAG &DAG) const {
6294 switch (N->getOpcode()) {
6295 case ISD::INSERT_VECTOR_ELT: {
6296 if (SDValue Res = lowerINSERT_VECTOR_ELT(Op: SDValue(N, 0), DAG))
6297 Results.push_back(Elt: Res);
6298 return;
6299 }
6300 case ISD::EXTRACT_VECTOR_ELT: {
6301 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(Op: SDValue(N, 0), DAG))
6302 Results.push_back(Elt: Res);
6303 return;
6304 }
6305 case ISD::INTRINSIC_WO_CHAIN: {
6306 unsigned IID = N->getConstantOperandVal(Num: 0);
6307 switch (IID) {
6308 case Intrinsic::amdgcn_make_buffer_rsrc:
6309 Results.push_back(Elt: lowerPointerAsRsrcIntrin(Op: N, DAG));
6310 return;
6311 case Intrinsic::amdgcn_cvt_pkrtz: {
6312 SDValue Src0 = N->getOperand(Num: 1);
6313 SDValue Src1 = N->getOperand(Num: 2);
6314 SDLoc SL(N);
6315 SDValue Cvt = DAG.getNode(Opcode: AMDGPUISD::CVT_PKRTZ_F16_F32, DL: SL, VT: MVT::i32,
6316 N1: Src0, N2: Src1);
6317 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Cvt));
6318 return;
6319 }
6320 case Intrinsic::amdgcn_cvt_pknorm_i16:
6321 case Intrinsic::amdgcn_cvt_pknorm_u16:
6322 case Intrinsic::amdgcn_cvt_pk_i16:
6323 case Intrinsic::amdgcn_cvt_pk_u16: {
6324 SDValue Src0 = N->getOperand(Num: 1);
6325 SDValue Src1 = N->getOperand(Num: 2);
6326 SDLoc SL(N);
6327 unsigned Opcode;
6328
6329 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6330 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
6331 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6332 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
6333 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6334 Opcode = AMDGPUISD::CVT_PK_I16_I32;
6335 else
6336 Opcode = AMDGPUISD::CVT_PK_U16_U32;
6337
6338 EVT VT = N->getValueType(ResNo: 0);
6339 if (isTypeLegal(VT))
6340 Results.push_back(Elt: DAG.getNode(Opcode, DL: SL, VT, N1: Src0, N2: Src1));
6341 else {
6342 SDValue Cvt = DAG.getNode(Opcode, DL: SL, VT: MVT::i32, N1: Src0, N2: Src1);
6343 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: Cvt));
6344 }
6345 return;
6346 }
6347 case Intrinsic::amdgcn_s_buffer_load: {
      // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
      // s_buffer_load_u8 for both signed and unsigned loads. Next, the DAG
      // combiner tries to merge the s_buffer_load_u8 with a sext instruction
      // (performSignExtendInRegCombine()), replacing s_buffer_load_u8 with
      // s_buffer_load_i8.
6353 if (!Subtarget->hasScalarSubwordLoads())
6354 return;
6355 SDValue Op = SDValue(N, 0);
6356 SDValue Rsrc = Op.getOperand(i: 1);
6357 SDValue Offset = Op.getOperand(i: 2);
6358 SDValue CachePolicy = Op.getOperand(i: 3);
6359 EVT VT = Op.getValueType();
6360 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6361 SDLoc DL(Op);
6362 MachineFunction &MF = DAG.getMachineFunction();
6363 const DataLayout &DataLayout = DAG.getDataLayout();
6364 Align Alignment =
6365 DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext()));
6366 MachineMemOperand *MMO = MF.getMachineMemOperand(
6367 PtrInfo: MachinePointerInfo(),
6368 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6369 MachineMemOperand::MOInvariant,
6370 Size: VT.getStoreSize(), BaseAlignment: Alignment);
6371 SDValue LoadVal;
6372 if (!Offset->isDivergent()) {
6373 SDValue Ops[] = {Rsrc, // source register
6374 Offset, CachePolicy};
6375 SDValue BufferLoad =
6376 DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD_UBYTE, dl: DL,
6377 VTList: DAG.getVTList(VT: MVT::i32), Ops, MemVT: VT, MMO);
6378 LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
6379 } else {
6380 SDValue Ops[] = {
6381 DAG.getEntryNode(), // Chain
6382 Rsrc, // rsrc
6383 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
6384 {}, // voffset
6385 {}, // soffset
6386 {}, // offset
6387 CachePolicy, // cachepolicy
6388 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
6389 };
6390 setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], Alignment: Align(4));
6391 LoadVal = handleByteShortBufferLoads(DAG, LoadVT: VT, DL, Ops, MMO);
6392 }
6393 Results.push_back(Elt: LoadVal);
6394 return;
6395 }
6396 }
6397 break;
6398 }
6399 case ISD::INTRINSIC_W_CHAIN: {
6400 if (SDValue Res = LowerINTRINSIC_W_CHAIN(Op: SDValue(N, 0), DAG)) {
6401 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6402 // FIXME: Hacky
6403 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6404 Results.push_back(Elt: Res.getOperand(i: I));
6405 }
6406 } else {
6407 Results.push_back(Elt: Res);
6408 Results.push_back(Elt: Res.getValue(R: 1));
6409 }
6410 return;
6411 }
6412
6413 break;
6414 }
6415 case ISD::SELECT: {
6416 SDLoc SL(N);
6417 EVT VT = N->getValueType(ResNo: 0);
6418 EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT);
6419 SDValue LHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: 1));
6420 SDValue RHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: 2));
6421
6422 EVT SelectVT = NewVT;
6423 if (NewVT.bitsLT(VT: MVT::i32)) {
6424 LHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: LHS);
6425 RHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: RHS);
6426 SelectVT = MVT::i32;
6427 }
6428
6429 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: SelectVT,
6430 N1: N->getOperand(Num: 0), N2: LHS, N3: RHS);
6431
6432 if (NewVT != SelectVT)
6433 NewSelect = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: NewVT, Operand: NewSelect);
6434 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewSelect));
6435 return;
6436 }
6437 case ISD::FNEG: {
6438 if (N->getValueType(ResNo: 0) != MVT::v2f16)
6439 break;
6440
6441 SDLoc SL(N);
6442 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: N->getOperand(Num: 0));
6443
6444 SDValue Op = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32,
6445 N1: BC,
6446 N2: DAG.getConstant(Val: 0x80008000, DL: SL, VT: MVT::i32));
6447 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Op));
6448 return;
6449 }
6450 case ISD::FABS: {
6451 if (N->getValueType(ResNo: 0) != MVT::v2f16)
6452 break;
6453
6454 SDLoc SL(N);
6455 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: N->getOperand(Num: 0));
6456
6457 SDValue Op = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32,
6458 N1: BC,
6459 N2: DAG.getConstant(Val: 0x7fff7fff, DL: SL, VT: MVT::i32));
6460 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Op));
6461 return;
6462 }
6463 case ISD::FSQRT: {
6464 if (N->getValueType(ResNo: 0) != MVT::f16)
6465 break;
6466 Results.push_back(Elt: lowerFSQRTF16(Op: SDValue(N, 0), DAG));
6467 break;
6468 }
6469 default:
6470 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
6471 break;
6472 }
6473}
6474
6475/// Helper function for LowerBRCOND
6476static SDNode *findUser(SDValue Value, unsigned Opcode) {
6477
6478 SDNode *Parent = Value.getNode();
6479 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
6480 I != E; ++I) {
6481
6482 if (I.getUse().get() != Value)
6483 continue;
6484
6485 if (I->getOpcode() == Opcode)
6486 return *I;
6487 }
6488 return nullptr;
6489}
6490
6491unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6492 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6493 switch (Intr->getConstantOperandVal(Num: 1)) {
6494 case Intrinsic::amdgcn_if:
6495 return AMDGPUISD::IF;
6496 case Intrinsic::amdgcn_else:
6497 return AMDGPUISD::ELSE;
6498 case Intrinsic::amdgcn_loop:
6499 return AMDGPUISD::LOOP;
6500 case Intrinsic::amdgcn_end_cf:
6501 llvm_unreachable("should not occur");
6502 default:
6503 return 0;
6504 }
6505 }
6506
6507 // break, if_break, else_break are all only used as inputs to loop, not
6508 // directly as branch conditions.
6509 return 0;
6510}
6511
6512bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
6513 const Triple &TT = getTargetMachine().getTargetTriple();
6514 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
6515 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
6516 AMDGPU::shouldEmitConstantsToTextSection(TT);
6517}
6518
6519bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
6520 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6521 return false;
6522
6523 // FIXME: Either avoid relying on address space here or change the default
6524 // address space for functions to avoid the explicit check.
6525 return (GV->getValueType()->isFunctionTy() ||
6526 !isNonGlobalAddrSpace(AS: GV->getAddressSpace())) &&
6527 !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
6528}
6529
6530bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
6531 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6532}
6533
6534bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
6535 if (!GV->hasExternalLinkage())
6536 return true;
6537
6538 const auto OS = getTargetMachine().getTargetTriple().getOS();
6539 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6540}
6541
/// This transforms the control flow intrinsics to get the branch destination as
/// the last parameter, and also switches the branch target with BR if the need
/// arises.
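///
/// For example, a BRCOND whose condition comes from llvm.amdgcn.if is rebuilt
/// as an AMDGPUISD::IF node whose final operand is the branch destination.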
6544SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
6545 SelectionDAG &DAG) const {
6546 SDLoc DL(BRCOND);
6547
6548 SDNode *Intr = BRCOND.getOperand(i: 1).getNode();
6549 SDValue Target = BRCOND.getOperand(i: 2);
6550 SDNode *BR = nullptr;
6551 SDNode *SetCC = nullptr;
6552
6553 if (Intr->getOpcode() == ISD::SETCC) {
6554 // As long as we negate the condition everything is fine
6555 SetCC = Intr;
6556 Intr = SetCC->getOperand(Num: 0).getNode();
6557
6558 } else {
6559 // Get the target from BR if we don't negate the condition
6560 BR = findUser(Value: BRCOND, Opcode: ISD::BR);
6561 assert(BR && "brcond missing unconditional branch user");
6562 Target = BR->getOperand(Num: 1);
6563 }
6564
6565 unsigned CFNode = isCFIntrinsic(Intr);
6566 if (CFNode == 0) {
6567 // This is a uniform branch so we don't need to legalize.
6568 return BRCOND;
6569 }
6570
6571 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6572 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6573
6574 assert(!SetCC ||
6575 (SetCC->getConstantOperandVal(1) == 1 &&
6576 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6577 ISD::SETNE));
6578
6579 // operands of the new intrinsic call
6580 SmallVector<SDValue, 4> Ops;
6581 if (HaveChain)
6582 Ops.push_back(Elt: BRCOND.getOperand(i: 0));
6583
6584 Ops.append(in_start: Intr->op_begin() + (HaveChain ? 2 : 1), in_end: Intr->op_end());
6585 Ops.push_back(Elt: Target);
6586
6587 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6588
6589 // build the new intrinsic call
6590 SDNode *Result = DAG.getNode(Opcode: CFNode, DL, VTList: DAG.getVTList(VTs: Res), Ops).getNode();
6591
6592 if (!HaveChain) {
6593 SDValue Ops[] = {
6594 SDValue(Result, 0),
6595 BRCOND.getOperand(i: 0)
6596 };
6597
6598 Result = DAG.getMergeValues(Ops, dl: DL).getNode();
6599 }
6600
6601 if (BR) {
6602 // Give the branch instruction our target
6603 SDValue Ops[] = {
6604 BR->getOperand(Num: 0),
6605 BRCOND.getOperand(i: 2)
6606 };
6607 SDValue NewBR = DAG.getNode(Opcode: ISD::BR, DL, VTList: BR->getVTList(), Ops);
6608 DAG.ReplaceAllUsesWith(From: BR, To: NewBR.getNode());
6609 }
6610
6611 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6612
6613 // Copy the intrinsic results to registers
6614 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6615 SDNode *CopyToReg = findUser(Value: SDValue(Intr, i), Opcode: ISD::CopyToReg);
6616 if (!CopyToReg)
6617 continue;
6618
6619 Chain = DAG.getCopyToReg(
6620 Chain, dl: DL,
6621 Reg: CopyToReg->getOperand(Num: 1),
6622 N: SDValue(Result, i - 1),
6623 Glue: SDValue());
6624
6625 DAG.ReplaceAllUsesWith(From: SDValue(CopyToReg, 0), To: CopyToReg->getOperand(Num: 0));
6626 }
6627
6628 // Remove the old intrinsic from the chain
6629 DAG.ReplaceAllUsesOfValueWith(
6630 From: SDValue(Intr, Intr->getNumValues() - 1),
6631 To: Intr->getOperand(Num: 0));
6632
6633 return Chain;
6634}
6635
6636SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
6637 SelectionDAG &DAG) const {
6638 MVT VT = Op.getSimpleValueType();
6639 SDLoc DL(Op);
  // Only a depth of 0 (the current function) is supported.
6641 if (Op.getConstantOperandVal(i: 0) != 0)
6642 return DAG.getConstant(Val: 0, DL, VT);
6643
6644 MachineFunction &MF = DAG.getMachineFunction();
6645 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6646 // Check for kernel and shader functions
6647 if (Info->isEntryFunction())
6648 return DAG.getConstant(Val: 0, DL, VT);
6649
6650 MachineFrameInfo &MFI = MF.getFrameInfo();
6651 // There is a call to @llvm.returnaddress in this function
6652 MFI.setReturnAddressIsTaken(true);
6653
6654 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
6655 // Get the return address reg and mark it as an implicit live-in
6656 Register Reg = MF.addLiveIn(PReg: TRI->getReturnAddressReg(MF), RC: getRegClassFor(VT, isDivergent: Op.getNode()->isDivergent()));
6657
6658 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg, VT);
6659}
6660
6661SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
6662 SDValue Op,
6663 const SDLoc &DL,
6664 EVT VT) const {
6665 return Op.getValueType().bitsLE(VT) ?
6666 DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT, Operand: Op) :
6667 DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Op,
6668 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
6669}
6670
6671SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6672 assert(Op.getValueType() == MVT::f16 &&
6673 "Do not know how to custom lower FP_ROUND for non-f16 type");
6674
6675 SDValue Src = Op.getOperand(i: 0);
6676 EVT SrcVT = Src.getValueType();
6677 if (SrcVT != MVT::f64)
6678 return Op;
6679
6680 // TODO: Handle strictfp
6681 if (Op.getOpcode() != ISD::FP_ROUND)
6682 return Op;
6683
6684 SDLoc DL(Op);
6685
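  // f64 -> f16 is lowered via FP_TO_FP16, which produces the f16 bits in an
  // i32; the result is then truncated to i16 and bitcast to f16.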
6686 SDValue FpToFp16 = DAG.getNode(Opcode: ISD::FP_TO_FP16, DL, VT: MVT::i32, Operand: Src);
6687 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToFp16);
6688 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f16, Operand: Trunc);
6689}
6690
6691SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6692 SelectionDAG &DAG) const {
6693 EVT VT = Op.getValueType();
6694 const MachineFunction &MF = DAG.getMachineFunction();
6695 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6696 bool IsIEEEMode = Info->getMode().IEEE;
6697
6698 // FIXME: Assert during selection that this is only selected for
6699 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6700 // mode functions, but this happens to be OK since it's only done in cases
6701 // where there is known no sNaN.
6702 if (IsIEEEMode)
6703 return expandFMINNUM_FMAXNUM(N: Op.getNode(), DAG);
6704
6705 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6706 VT == MVT::v16bf16)
6707 return splitBinaryVectorOp(Op, DAG);
6708 return Op;
6709}
6710
6711SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6712 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6713 EVT VT = Op.getValueType();
6714 assert(VT == MVT::f16);
6715
6716 SDValue Exp = Op.getOperand(i: IsStrict ? 2 : 1);
6717 EVT ExpVT = Exp.getValueType();
6718 if (ExpVT == MVT::i16)
6719 return Op;
6720
6721 SDLoc DL(Op);
6722
6723 // Correct the exponent type for f16 to i16.
6724 // Clamp the range of the exponent to the instruction's range.
6725
  // TODO: This should be a generic narrowing legalization, and can easily be
  // done for GlobalISel as well.
6728
6729 SDValue MinExp = DAG.getConstant(Val: minIntN(N: 16), DL, VT: ExpVT);
6730 SDValue ClampMin = DAG.getNode(Opcode: ISD::SMAX, DL, VT: ExpVT, N1: Exp, N2: MinExp);
6731
6732 SDValue MaxExp = DAG.getConstant(Val: maxIntN(N: 16), DL, VT: ExpVT);
6733 SDValue Clamp = DAG.getNode(Opcode: ISD::SMIN, DL, VT: ExpVT, N1: ClampMin, N2: MaxExp);
6734
6735 SDValue TruncExp = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Clamp);
6736
6737 if (IsStrict) {
6738 return DAG.getNode(Opcode: ISD::STRICT_FLDEXP, DL, ResultTys: {VT, MVT::Other},
6739 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1), TruncExp});
6740 }
6741
6742 return DAG.getNode(Opcode: ISD::FLDEXP, DL, VT, N1: Op.getOperand(i: 0), N2: TruncExp);
6743}
6744
6745// Custom lowering for vector multiplications and s_mul_u64.
6746SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6747 EVT VT = Op.getValueType();
6748
6749 // Split vector operands.
6750 if (VT.isVector())
6751 return splitBinaryVectorOp(Op, DAG);
6752
6753 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6754
6755 // There are four ways to lower s_mul_u64:
6756 //
6757 // 1. If all the operands are uniform, then we lower it as it is.
6758 //
6759 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
6760 // multiplications because there is not a vector equivalent of s_mul_u64.
6761 //
6762 // 3. If the cost model decides that it is more efficient to use vector
6763 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
6764 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
6765 //
6766 // 4. If the cost model decides to use vector registers and both of the
6767 // operands are zero-extended/sign-extended from 32-bits, then we split the
6768 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
6769 // possible to check if the operands are zero-extended or sign-extended in
6770 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6771 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6772 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6773 // If the cost model decides that we have to use vector registers, then
6774 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
6775 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
6776 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6777 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6778 // SIInstrInfo.cpp .
6779
6780 if (Op->isDivergent())
6781 return SDValue();
6782
6783 SDValue Op0 = Op.getOperand(i: 0);
6784 SDValue Op1 = Op.getOperand(i: 1);
  // If both operands are zero-extended to 32 bits, then we replace s_mul_u64
  // with s_mul_u64_u32_pseudo. If both operands are sign-extended to 32 bits,
  // then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6788 KnownBits Op0KnownBits = DAG.computeKnownBits(Op: Op0);
6789 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6790 KnownBits Op1KnownBits = DAG.computeKnownBits(Op: Op1);
6791 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
6792 SDLoc SL(Op);
6793 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6794 return SDValue(
6795 DAG.getMachineNode(Opcode: AMDGPU::S_MUL_U64_U32_PSEUDO, dl: SL, VT, Op1: Op0, Op2: Op1), 0);
6796 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op0);
6797 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op: Op1);
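  // At least 33 sign bits on a 64-bit value mean the value is the sign
  // extension of its low 32 bits, so the full product can be formed with a
  // signed 32 x 32 -> 64 multiply.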
6798 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6799 return SDValue(
6800 DAG.getMachineNode(Opcode: AMDGPU::S_MUL_I64_I32_PSEUDO, dl: SL, VT, Op1: Op0, Op2: Op1), 0);
6801 // If all the operands are uniform, then we lower s_mul_u64 as it is.
6802 return Op;
6803}
6804
6805SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
6806 EVT VT = Op.getValueType();
6807 SDLoc SL(Op);
6808 SDValue LHS = Op.getOperand(i: 0);
6809 SDValue RHS = Op.getOperand(i: 1);
6810 bool isSigned = Op.getOpcode() == ISD::SMULO;
6811
6812 if (ConstantSDNode *RHSC = isConstOrConstSplat(N: RHS)) {
6813 const APInt &C = RHSC->getAPIntValue();
6814 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
6815 if (C.isPowerOf2()) {
6816 // smulo(x, signed_min) is same as umulo(x, signed_min).
6817 bool UseArithShift = isSigned && !C.isMinSignedValue();
6818 SDValue ShiftAmt = DAG.getConstant(Val: C.logBase2(), DL: SL, VT: MVT::i32);
6819 SDValue Result = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: LHS, N2: ShiftAmt);
6820 SDValue Overflow = DAG.getSetCC(DL: SL, VT: MVT::i1,
6821 LHS: DAG.getNode(Opcode: UseArithShift ? ISD::SRA : ISD::SRL,
6822 DL: SL, VT, N1: Result, N2: ShiftAmt),
6823 RHS: LHS, Cond: ISD::SETNE);
6824 return DAG.getMergeValues(Ops: { Result, Overflow }, dl: SL);
6825 }
6826 }
6827
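  // Generic case: the multiply overflows iff the high half of the product is
  // not the broadcast sign bit of the low half (smulo) or zero (umulo).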
6828 SDValue Result = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT, N1: LHS, N2: RHS);
6829 SDValue Top = DAG.getNode(Opcode: isSigned ? ISD::MULHS : ISD::MULHU,
6830 DL: SL, VT, N1: LHS, N2: RHS);
6831
6832 SDValue Sign = isSigned
6833 ? DAG.getNode(Opcode: ISD::SRA, DL: SL, VT, N1: Result,
6834 N2: DAG.getConstant(Val: VT.getScalarSizeInBits() - 1, DL: SL, VT: MVT::i32))
6835 : DAG.getConstant(Val: 0, DL: SL, VT);
6836 SDValue Overflow = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Top, RHS: Sign, Cond: ISD::SETNE);
6837
6838 return DAG.getMergeValues(Ops: { Result, Overflow }, dl: SL);
6839}
6840
6841SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
6842 if (Op->isDivergent()) {
6843 // Select to V_MAD_[IU]64_[IU]32.
6844 return Op;
6845 }
6846 if (Subtarget->hasSMulHi()) {
6847 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
6848 return SDValue();
6849 }
6850 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
6851 // calculate the high part, so we might as well do the whole thing with
6852 // V_MAD_[IU]64_[IU]32.
6853 return Op;
6854}
6855
6856SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
6857 if (!Subtarget->isTrapHandlerEnabled() ||
6858 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6859 return lowerTrapEndpgm(Op, DAG);
6860
6861 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
6862 lowerTrapHsaQueuePtr(Op, DAG);
6863}
6864
6865SDValue SITargetLowering::lowerTrapEndpgm(
6866 SDValue Op, SelectionDAG &DAG) const {
6867 SDLoc SL(Op);
6868 SDValue Chain = Op.getOperand(i: 0);
6869 return DAG.getNode(Opcode: AMDGPUISD::ENDPGM_TRAP, DL: SL, VT: MVT::Other, Operand: Chain);
6870}
6871
6872SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
6873 const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
6874 MachineFunction &MF = DAG.getMachineFunction();
6875 uint64_t Offset = getImplicitParameterOffset(MF, Param);
6876 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain: DAG.getEntryNode(), Offset);
6877 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6878 return DAG.getLoad(VT, dl: DL, Chain: DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
6879 MMOFlags: MachineMemOperand::MODereferenceable |
6880 MachineMemOperand::MOInvariant);
6881}
6882
6883SDValue SITargetLowering::lowerTrapHsaQueuePtr(
6884 SDValue Op, SelectionDAG &DAG) const {
6885 SDLoc SL(Op);
6886 SDValue Chain = Op.getOperand(i: 0);
6887
6888 SDValue QueuePtr;
6889 // For code object version 5, QueuePtr is passed through implicit kernarg.
6890 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6891 if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) {
6892 QueuePtr =
6893 loadImplicitKernelArgument(DAG, VT: MVT::i64, DL: SL, Alignment: Align(8), Param: QUEUE_PTR);
6894 } else {
6895 MachineFunction &MF = DAG.getMachineFunction();
6896 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6897 Register UserSGPR = Info->getQueuePtrUserSGPR();
6898
6899 if (UserSGPR == AMDGPU::NoRegister) {
6900 // We probably are in a function incorrectly marked with
6901 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
6902 // trap, so just use a null pointer.
6903 QueuePtr = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i64);
6904 } else {
6905 QueuePtr = CreateLiveInRegister(DAG, RC: &AMDGPU::SReg_64RegClass, Reg: UserSGPR,
6906 VT: MVT::i64);
6907 }
6908 }
6909
6910 SDValue SGPR01 = DAG.getRegister(Reg: AMDGPU::SGPR0_SGPR1, VT: MVT::i64);
6911 SDValue ToReg = DAG.getCopyToReg(Chain, dl: SL, Reg: SGPR01,
6912 N: QueuePtr, Glue: SDValue());
6913
6914 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
6915 SDValue Ops[] = {
6916 ToReg,
6917 DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16),
6918 SGPR01,
6919 ToReg.getValue(R: 1)
6920 };
6921 return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
6922}
6923
6924SDValue SITargetLowering::lowerTrapHsa(
6925 SDValue Op, SelectionDAG &DAG) const {
6926 SDLoc SL(Op);
6927 SDValue Chain = Op.getOperand(i: 0);
6928
6929 // We need to simulate the 's_trap 2' instruction on targets that run in
6930 // PRIV=1 (where it is treated as a nop).
6931 if (Subtarget->hasPrivEnabledTrap2NopBug())
6932 return DAG.getNode(Opcode: AMDGPUISD::SIMULATED_TRAP, DL: SL, VT: MVT::Other, Operand: Chain);
6933
6934 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
6935 SDValue Ops[] = {
6936 Chain,
6937 DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16)
6938 };
6939 return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
6940}
6941
6942SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
6943 SDLoc SL(Op);
6944 SDValue Chain = Op.getOperand(i: 0);
6945 MachineFunction &MF = DAG.getMachineFunction();
6946
6947 if (!Subtarget->isTrapHandlerEnabled() ||
6948 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6949 DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
6950 "debugtrap handler not supported",
6951 Op.getDebugLoc(),
6952 DS_Warning);
6953 LLVMContext &Ctx = MF.getFunction().getContext();
6954 Ctx.diagnose(DI: NoTrap);
6955 return Chain;
6956 }
6957
6958 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
6959 SDValue Ops[] = {
6960 Chain,
6961 DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16)
6962 };
6963 return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
6964}
6965
6966SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
6967 SelectionDAG &DAG) const {
6968 if (Subtarget->hasApertureRegs()) {
6969 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
6970 ? AMDGPU::SRC_SHARED_BASE
6971 : AMDGPU::SRC_PRIVATE_BASE;
6972 // Note: this feature (register) is broken. When used as a 32-bit operand,
6973 // it returns a wrong value (all zeroes?). The real value is in the upper 32
6974 // bits.
6975 //
6976 // To work around the issue, directly emit a 64-bit mov from this register,
6977 // then extract the high bits. Note that this shouldn't even result in a
6978 // shift being emitted; it simply becomes a read of the pair's high register (e.g.):
6979 // s_mov_b64 s[6:7], src_shared_base
6980 // v_mov_b32_e32 v1, s7
6981 //
6982 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
6983 // coalescing would kick in and it would think it's okay to use the "HI"
6984 // subregister directly (instead of extracting the HI 32 bits) which is an
6985 // artificial (unusable) register.
6986 // Register TableGen definitions would need an overhaul to get rid of the
6987 // artificial "HI" aperture registers and prevent this kind of issue from
6988 // happening.
6989 SDNode *Mov = DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B64, dl: DL, VT: MVT::i64,
6990 Op1: DAG.getRegister(Reg: ApertureRegNo, VT: MVT::i64));
6991 return DAG.getNode(
6992 Opcode: ISD::TRUNCATE, DL, VT: MVT::i32,
6993 Operand: DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64,
6994 Ops: {SDValue(Mov, 0), DAG.getConstant(Val: 32, DL, VT: MVT::i64)}));
6995 }
6996
6997 // For code object version 5, private_base and shared_base are passed through
6998 // implicit kernargs.
6999 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7000 if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) {
7001 ImplicitParameter Param =
7002 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
7003 return loadImplicitKernelArgument(DAG, VT: MVT::i32, DL, Alignment: Align(4), Param);
7004 }
7005
7006 MachineFunction &MF = DAG.getMachineFunction();
7007 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7008 Register UserSGPR = Info->getQueuePtrUserSGPR();
7009 if (UserSGPR == AMDGPU::NoRegister) {
7010 // We probably are in a function incorrectly marked with
7011 // amdgpu-no-queue-ptr. This is undefined.
7012 return DAG.getUNDEF(VT: MVT::i32);
7013 }
7014
7015 SDValue QueuePtr = CreateLiveInRegister(
7016 DAG, RC: &AMDGPU::SReg_64RegClass, Reg: UserSGPR, VT: MVT::i64);
7017
7018 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7019 // private_segment_aperture_base_hi.
7020 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7021
7022 SDValue Ptr =
7023 DAG.getObjectPtrOffset(SL: DL, Ptr: QueuePtr, Offset: TypeSize::getFixed(ExactSize: StructOffset));
7024
7025 // TODO: Use custom target PseudoSourceValue.
7026 // TODO: We should use the value from the IR intrinsic call, but it might not
7027 // be available, and it is unclear how to get it here.
7028 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
7029 return DAG.getLoad(VT: MVT::i32, dl: DL, Chain: QueuePtr.getValue(R: 1), Ptr, PtrInfo,
7030 Alignment: commonAlignment(A: Align(64), Offset: StructOffset),
7031 MMOFlags: MachineMemOperand::MODereferenceable |
7032 MachineMemOperand::MOInvariant);
7033}
7034
7035/// Return true if the value is a known valid address, such that a null check is
7036/// not necessary.
7037static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
7038 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7039 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
7040 isa<BasicBlockSDNode>(Val))
7041 return true;
7042
7043 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7044 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7045
7046 // TODO: Search through arithmetic, handle arguments and loads
7047 // marked nonnull.
7048 return false;
7049}
7050
7051SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7052 SelectionDAG &DAG) const {
7053 SDLoc SL(Op);
7054
7055 const AMDGPUTargetMachine &TM =
7056 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7057
7058 unsigned DestAS, SrcAS;
7059 SDValue Src;
7060 bool IsNonNull = false;
7061 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Val&: Op)) {
7062 SrcAS = ASC->getSrcAddressSpace();
7063 Src = ASC->getOperand(Num: 0);
7064 DestAS = ASC->getDestAddressSpace();
7065 } else {
7066 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7067 Op.getConstantOperandVal(0) ==
7068 Intrinsic::amdgcn_addrspacecast_nonnull);
7069 Src = Op->getOperand(Num: 1);
7070 SrcAS = Op->getConstantOperandVal(Num: 2);
7071 DestAS = Op->getConstantOperandVal(Num: 3);
7072 IsNonNull = true;
7073 }
7074
7075 SDValue FlatNullPtr = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i64);
7076
7077 // flat -> local/private
7078 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7079 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7080 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7081 SDValue Ptr = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Src);
7082
7083 if (IsNonNull || isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS))
7084 return Ptr;
7085
7086 unsigned NullVal = TM.getNullPointerValue(AddrSpace: DestAS);
7087 SDValue SegmentNullPtr = DAG.getConstant(Val: NullVal, DL: SL, VT: MVT::i32);
7088 SDValue NonNull = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Src, RHS: FlatNullPtr, Cond: ISD::SETNE);
7089
7090 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i32, N1: NonNull, N2: Ptr,
7091 N3: SegmentNullPtr);
7092 }
7093 }
7094
7095 // local/private -> flat
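  // Roughly, the 64-bit flat pointer is formed by putting the 32-bit segment
  // offset in the low half and the aperture base in the high half, i.e.
  //   flat = ((uint64_t)aperture_hi << 32) | (uint32_t)segment_offset
  // (illustrative only; the code below builds this as a v2i32 and bitcasts it).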
7096 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7097 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7098 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7099
7100 SDValue Aperture = getSegmentAperture(AS: SrcAS, DL: SL, DAG);
7101 SDValue CvtPtr =
7102 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: Aperture);
7103 CvtPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);
7104
7105 if (IsNonNull || isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS))
7106 return CvtPtr;
7107
7108 unsigned NullVal = TM.getNullPointerValue(AddrSpace: SrcAS);
7109 SDValue SegmentNullPtr = DAG.getConstant(Val: NullVal, DL: SL, VT: MVT::i32);
7110
7111 SDValue NonNull
7112 = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Src, RHS: SegmentNullPtr, Cond: ISD::SETNE);
7113
7114 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: NonNull, N2: CvtPtr,
7115 N3: FlatNullPtr);
7116 }
7117 }
7118
7119 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7120 Op.getValueType() == MVT::i64) {
7121 const SIMachineFunctionInfo *Info =
7122 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
7123 SDValue Hi = DAG.getConstant(Val: Info->get32BitAddressHighBits(), DL: SL, VT: MVT::i32);
7124 SDValue Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: Hi);
7125 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
7126 }
7127
7128 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7129 Src.getValueType() == MVT::i64)
7130 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Src);
7131
7132 // global <-> flat are no-ops and never emitted.
7133
7134 const MachineFunction &MF = DAG.getMachineFunction();
7135 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
7136 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
7137 DAG.getContext()->diagnose(DI: InvalidAddrSpaceCast);
7138
7139 return DAG.getUNDEF(VT: Op->getValueType(ResNo: 0));
7140}
7141
7142// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7143// the small vector and inserting them into the big vector. That is better than
7144// the default expansion of doing it via a stack slot. Even though the use of
7145// the stack slot would be optimized away afterwards, the stack slot itself
7146// remains.
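// For example (an illustrative sketch, not an extra code path): inserting a
// v2i16 into a v4i16 at index 2 hits the 32-bit fast path below and becomes a
// single i32 element insert into the bitcast v2i32, instead of a store/load
// round trip through a stack slot.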
7147SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7148 SelectionDAG &DAG) const {
7149 SDValue Vec = Op.getOperand(i: 0);
7150 SDValue Ins = Op.getOperand(i: 1);
7151 SDValue Idx = Op.getOperand(i: 2);
7152 EVT VecVT = Vec.getValueType();
7153 EVT InsVT = Ins.getValueType();
7154 EVT EltVT = VecVT.getVectorElementType();
7155 unsigned InsNumElts = InsVT.getVectorNumElements();
7156 unsigned IdxVal = Idx->getAsZExtVal();
7157 SDLoc SL(Op);
7158
7159 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7160 // Insert one 32-bit register at a time.
7161 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7162
7163 unsigned VecNumElts = VecVT.getVectorNumElements();
7164 EVT NewVecVT =
7165 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: VecNumElts / 2);
7166 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7167 : EVT::getVectorVT(Context&: *DAG.getContext(),
7168 VT: MVT::i32, NumElements: InsNumElts / 2);
7169
7170 Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVecVT, Operand: Vec);
7171 Ins = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewInsVT, Operand: Ins);
7172
7173 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7174 SDValue Elt;
7175 if (InsNumElts == 2) {
7176 Elt = Ins;
7177 } else {
7178 Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Ins,
7179 N2: DAG.getConstant(Val: I, DL: SL, VT: MVT::i32));
7180 }
7181 Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: NewVecVT, N1: Vec, N2: Elt,
7182 N3: DAG.getConstant(Val: IdxVal / 2 + I, DL: SL, VT: MVT::i32));
7183 }
7184
7185 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Vec);
7186 }
7187
7188 for (unsigned I = 0; I != InsNumElts; ++I) {
7189 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Ins,
7190 N2: DAG.getConstant(Val: I, DL: SL, VT: MVT::i32));
7191 Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: VecVT, N1: Vec, N2: Elt,
7192 N3: DAG.getConstant(Val: IdxVal + I, DL: SL, VT: MVT::i32));
7193 }
7194 return Vec;
7195}
7196
7197SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7198 SelectionDAG &DAG) const {
7199 SDValue Vec = Op.getOperand(i: 0);
7200 SDValue InsVal = Op.getOperand(i: 1);
7201 SDValue Idx = Op.getOperand(i: 2);
7202 EVT VecVT = Vec.getValueType();
7203 EVT EltVT = VecVT.getVectorElementType();
7204 unsigned VecSize = VecVT.getSizeInBits();
7205 unsigned EltSize = EltVT.getSizeInBits();
7206 SDLoc SL(Op);
7207
7208 // Specially handle the case of v4i16 with static indexing.
7209 unsigned NumElts = VecVT.getVectorNumElements();
7210 auto KIdx = dyn_cast<ConstantSDNode>(Val&: Idx);
7211 if (NumElts == 4 && EltSize == 16 && KIdx) {
7212 SDValue BCVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Vec);
7213
7214 SDValue LoHalf = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: BCVec,
7215 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
7216 SDValue HiHalf = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: BCVec,
7217 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
7218
7219 SDValue LoVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: LoHalf);
7220 SDValue HiVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: HiHalf);
7221
7222 unsigned Idx = KIdx->getZExtValue();
7223 bool InsertLo = Idx < 2;
7224 SDValue InsHalf = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: MVT::v2i16,
7225 N1: InsertLo ? LoVec : HiVec,
7226 N2: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: InsVal),
7227 N3: DAG.getConstant(Val: InsertLo ? Idx : (Idx - 2), DL: SL, VT: MVT::i32));
7228
7229 InsHalf = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: InsHalf);
7230
7231 SDValue Concat = InsertLo ?
7232 DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: { InsHalf, HiHalf }) :
7233 DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: { LoHalf, InsHalf });
7234
7235 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Concat);
7236 }
7237
7238 // Static indexing does not lower to stack access, so there is no need for
7239 // special custom lowering to avoid it.
7240 if (isa<ConstantSDNode>(Val: Idx))
7241 return SDValue();
7242
7243 // Avoid stack access for dynamic indexing by custom lowering to
7244 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
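  // Worked example (illustrative): inserting into element 2 of a v4i16 gives
  // ScaledIdx = 2 * 16 = 32 and BFM = 0xffff << 32, and the result is
  //   (splat(val) & BFM) | (vec & ~BFM)
  // computed on the i64 bitcast of the vector.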
7245
7246 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7247
7248 MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize);
7249
7250 // Convert vector index to bit-index and get the required bit mask.
7251 assert(isPowerOf2_32(EltSize));
7252 const auto EltMask = maskTrailingOnes<uint64_t>(N: EltSize);
7253 SDValue ScaleFactor = DAG.getConstant(Val: Log2_32(Value: EltSize), DL: SL, VT: MVT::i32);
7254 SDValue ScaledIdx = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Idx, N2: ScaleFactor);
7255 SDValue BFM = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: IntVT,
7256 N1: DAG.getConstant(Val: EltMask, DL: SL, VT: IntVT), N2: ScaledIdx);
7257
7258 // 1. Create a congruent vector with the target value in each element.
7259 SDValue ExtVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT,
7260 Operand: DAG.getSplatBuildVector(VT: VecVT, DL: SL, Op: InsVal));
7261
7262 // 2. Mask off all other indices except the required index within (1).
7263 SDValue LHS = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT, N1: BFM, N2: ExtVal);
7264
7265 // 3. Clear the required index within the target vector.
7266 SDValue BCVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec);
7267 SDValue RHS = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT,
7268 N1: DAG.getNOT(DL: SL, Val: BFM, VT: IntVT), N2: BCVec);
7269
7270 // 4. OR (2) and (3) together to form the final vector.
7271 SDValue BFI = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: IntVT, N1: LHS, N2: RHS);
7272
7273 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: BFI);
7274}
7275
7276SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7277 SelectionDAG &DAG) const {
7278 SDLoc SL(Op);
7279
7280 EVT ResultVT = Op.getValueType();
7281 SDValue Vec = Op.getOperand(i: 0);
7282 SDValue Idx = Op.getOperand(i: 1);
7283 EVT VecVT = Vec.getValueType();
7284 unsigned VecSize = VecVT.getSizeInBits();
7285 EVT EltVT = VecVT.getVectorElementType();
7286
7287 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7288
7289 // Make sure we do any optimizations that will make it easier to fold
7290 // source modifiers before obscuring it with bit operations.
7291
7292 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7293 if (SDValue Combined = performExtractVectorEltCombine(N: Op.getNode(), DCI))
7294 return Combined;
7295
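  // For wide vectors, split the source in half and select the half with a
  // scalar compare; e.g. (illustrative) a dynamic extract from v8i32 becomes
  //   half = idx > 3 ? hi : lo   (each half is v4i32)
  //   elt = extractelement half, (idx & 3)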
7296 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7297 SDValue Lo, Hi;
7298 EVT LoVT, HiVT;
7299 std::tie(args&: LoVT, args&: HiVT) = DAG.GetSplitDestVTs(VT: VecVT);
7300
7301 if (VecSize == 128) {
7302 SDValue V2 = DAG.getBitcast(VT: MVT::v2i64, V: Vec);
7303 Lo = DAG.getBitcast(VT: LoVT,
7304 V: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
7305 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32)));
7306 Hi = DAG.getBitcast(VT: HiVT,
7307 V: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
7308 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32)));
7309 } else if (VecSize == 256) {
7310 SDValue V2 = DAG.getBitcast(VT: MVT::v4i64, V: Vec);
7311 SDValue Parts[4];
7312 for (unsigned P = 0; P < 4; ++P) {
7313 Parts[P] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
7314 N2: DAG.getConstant(Val: P, DL: SL, VT: MVT::i32));
7315 }
7316
7317 Lo = DAG.getBitcast(VT: LoVT, V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i64,
7318 N1: Parts[0], N2: Parts[1]));
7319 Hi = DAG.getBitcast(VT: HiVT, V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i64,
7320 N1: Parts[2], N2: Parts[3]));
7321 } else {
7322 assert(VecSize == 512);
7323
7324 SDValue V2 = DAG.getBitcast(VT: MVT::v8i64, V: Vec);
7325 SDValue Parts[8];
7326 for (unsigned P = 0; P < 8; ++P) {
7327 Parts[P] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
7328 N2: DAG.getConstant(Val: P, DL: SL, VT: MVT::i32));
7329 }
7330
7331 Lo = DAG.getBitcast(VT: LoVT,
7332 V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v4i64,
7333 N1: Parts[0], N2: Parts[1], N3: Parts[2], N4: Parts[3]));
7334 Hi = DAG.getBitcast(VT: HiVT,
7335 V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v4i64,
7336 N1: Parts[4], N2: Parts[5], N3: Parts[6], N4: Parts[7]));
7337 }
7338
7339 EVT IdxVT = Idx.getValueType();
7340 unsigned NElem = VecVT.getVectorNumElements();
7341 assert(isPowerOf2_32(NElem));
7342 SDValue IdxMask = DAG.getConstant(Val: NElem / 2 - 1, DL: SL, VT: IdxVT);
7343 SDValue NewIdx = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IdxVT, N1: Idx, N2: IdxMask);
7344 SDValue Half = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IdxMask, True: Hi, False: Lo, Cond: ISD::SETUGT);
7345 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Half, N2: NewIdx);
7346 }
7347
7348 assert(VecSize <= 64);
7349
7350 MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize);
7351
7352 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7353 SDValue VecBC = peekThroughBitcasts(V: Vec);
7354 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7355 SDValue Src = VecBC.getOperand(i: 0);
7356 Src = DAG.getBitcast(VT: Src.getValueType().changeTypeToInteger(), V: Src);
7357 Vec = DAG.getAnyExtOrTrunc(Op: Src, DL: SL, VT: IntVT);
7358 }
7359
7360 unsigned EltSize = EltVT.getSizeInBits();
7361 assert(isPowerOf2_32(EltSize));
7362
7363 SDValue ScaleFactor = DAG.getConstant(Val: Log2_32(Value: EltSize), DL: SL, VT: MVT::i32);
7364
7365 // Convert vector index to bit-index (* EltSize)
7366 SDValue ScaledIdx = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Idx, N2: ScaleFactor);
7367
7368 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec);
7369 SDValue Elt = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: IntVT, N1: BC, N2: ScaledIdx);
7370
7371 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7372 SDValue Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i16, Operand: Elt);
7373 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: ResultVT, Operand: Result);
7374 }
7375
7376 return DAG.getAnyExtOrTrunc(Op: Elt, DL: SL, VT: ResultVT);
7377}
7378
7379static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7380 assert(Elt % 2 == 0);
7381 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7382}
7383
7384SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7385 SelectionDAG &DAG) const {
7386 SDLoc SL(Op);
7387 EVT ResultVT = Op.getValueType();
7388 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val&: Op);
7389
7390 EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
7391 EVT EltVT = PackVT.getVectorElementType();
7392 int SrcNumElts = Op.getOperand(i: 0).getValueType().getVectorNumElements();
7393
7394 // vector_shuffle <0,1,6,7> lhs, rhs
7395 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7396 //
7397 // vector_shuffle <6,7,2,3> lhs, rhs
7398 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7399 //
7400 // vector_shuffle <6,7,0,1> lhs, rhs
7401 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7402
7403 // Avoid scalarizing when both halves are reading from consecutive elements.
7404 SmallVector<SDValue, 4> Pieces;
7405 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7406 if (elementPairIsContiguous(Mask: SVN->getMask(), Elt: I)) {
7407 const int Idx = SVN->getMaskElt(Idx: I);
7408 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7409 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7410 SDValue SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL,
7411 VT: PackVT, N1: SVN->getOperand(Num: VecIdx),
7412 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
7413 Pieces.push_back(Elt: SubVec);
7414 } else {
7415 const int Idx0 = SVN->getMaskElt(Idx: I);
7416 const int Idx1 = SVN->getMaskElt(Idx: I + 1);
7417 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7418 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7419 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7420 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7421
7422 SDValue Vec0 = SVN->getOperand(Num: VecIdx0);
7423 SDValue Elt0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT,
7424 N1: Vec0, N2: DAG.getConstant(Val: EltIdx0, DL: SL, VT: MVT::i32));
7425
7426 SDValue Vec1 = SVN->getOperand(Num: VecIdx1);
7427 SDValue Elt1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT,
7428 N1: Vec1, N2: DAG.getConstant(Val: EltIdx1, DL: SL, VT: MVT::i32));
7429 Pieces.push_back(Elt: DAG.getBuildVector(VT: PackVT, DL: SL, Ops: { Elt0, Elt1 }));
7430 }
7431 }
7432
7433 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT: ResultVT, Ops: Pieces);
7434}
7435
7436SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7437 SelectionDAG &DAG) const {
7438 SDValue SVal = Op.getOperand(i: 0);
7439 EVT ResultVT = Op.getValueType();
7440 EVT SValVT = SVal.getValueType();
7441 SDValue UndefVal = DAG.getUNDEF(VT: SValVT);
7442 SDLoc SL(Op);
7443
7444 SmallVector<SDValue, 8> VElts;
7445 VElts.push_back(Elt: SVal);
7446 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7447 VElts.push_back(Elt: UndefVal);
7448
7449 return DAG.getBuildVector(VT: ResultVT, DL: SL, Ops: VElts);
7450}
7451
7452SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7453 SelectionDAG &DAG) const {
7454 SDLoc SL(Op);
7455 EVT VT = Op.getValueType();
7456
7457 if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7458 VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
7459 EVT HalfVT = MVT::getVectorVT(VT: VT.getVectorElementType().getSimpleVT(),
7460 NumElements: VT.getVectorNumElements() / 2);
7461 MVT HalfIntVT = MVT::getIntegerVT(BitWidth: HalfVT.getSizeInBits());
7462
7463 // Turn into pair of packed build_vectors.
7464 // TODO: Special case for constants that can be materialized with s_mov_b64.
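  // e.g. (illustrative) v4f16 (a, b, c, d) becomes
  //   v2i32 ((i32 bitcast of v2f16 (a, b)), (i32 bitcast of v2f16 (c, d)))
  // which is then bitcast back to v4f16.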
7465 SmallVector<SDValue, 4> LoOps, HiOps;
7466 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
7467 LoOps.push_back(Elt: Op.getOperand(i: I));
7468 HiOps.push_back(Elt: Op.getOperand(i: I + E));
7469 }
7470 SDValue Lo = DAG.getBuildVector(VT: HalfVT, DL: SL, Ops: LoOps);
7471 SDValue Hi = DAG.getBuildVector(VT: HalfVT, DL: SL, Ops: HiOps);
7472
7473 SDValue CastLo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: HalfIntVT, Operand: Lo);
7474 SDValue CastHi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: HalfIntVT, Operand: Hi);
7475
7476 SDValue Blend = DAG.getBuildVector(VT: MVT::getVectorVT(VT: HalfIntVT, NumElements: 2), DL: SL,
7477 Ops: { CastLo, CastHi });
7478 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Blend);
7479 }
7480
7481 if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {
7482 EVT QuarterVT = MVT::getVectorVT(VT: VT.getVectorElementType().getSimpleVT(),
7483 NumElements: VT.getVectorNumElements() / 4);
7484 MVT QuarterIntVT = MVT::getIntegerVT(BitWidth: QuarterVT.getSizeInBits());
7485
7486 SmallVector<SDValue, 4> Parts[4];
7487 for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) {
7488 for (unsigned P = 0; P < 4; ++P)
7489 Parts[P].push_back(Elt: Op.getOperand(i: I + P * E));
7490 }
7491 SDValue Casts[4];
7492 for (unsigned P = 0; P < 4; ++P) {
7493 SDValue Vec = DAG.getBuildVector(VT: QuarterVT, DL: SL, Ops: Parts[P]);
7494 Casts[P] = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: QuarterIntVT, Operand: Vec);
7495 }
7496
7497 SDValue Blend =
7498 DAG.getBuildVector(VT: MVT::getVectorVT(VT: QuarterIntVT, NumElements: 4), DL: SL, Ops: Casts);
7499 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Blend);
7500 }
7501
7502 if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {
7503 EVT QuarterVT = MVT::getVectorVT(VT: VT.getVectorElementType().getSimpleVT(),
7504 NumElements: VT.getVectorNumElements() / 8);
7505 MVT QuarterIntVT = MVT::getIntegerVT(BitWidth: QuarterVT.getSizeInBits());
7506
7507 SmallVector<SDValue, 8> Parts[8];
7508 for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) {
7509 for (unsigned P = 0; P < 8; ++P)
7510 Parts[P].push_back(Elt: Op.getOperand(i: I + P * E));
7511 }
7512 SDValue Casts[8];
7513 for (unsigned P = 0; P < 8; ++P) {
7514 SDValue Vec = DAG.getBuildVector(VT: QuarterVT, DL: SL, Ops: Parts[P]);
7515 Casts[P] = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: QuarterIntVT, Operand: Vec);
7516 }
7517
7518 SDValue Blend =
7519 DAG.getBuildVector(VT: MVT::getVectorVT(VT: QuarterIntVT, NumElements: 8), DL: SL, Ops: Casts);
7520 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Blend);
7521 }
7522
7523 assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);
7524 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7525
7526 SDValue Lo = Op.getOperand(i: 0);
7527 SDValue Hi = Op.getOperand(i: 1);
7528
7529 // If the high half is undef, any_extend the low half instead of zero_extending it, to avoid defining the high bits.
7530 if (Hi.isUndef()) {
7531 Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Lo);
7532 SDValue ExtLo = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Lo);
7533 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ExtLo);
7534 }
7535
7536 Hi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Hi);
7537 Hi = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Hi);
7538
7539 SDValue ShlHi = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Hi,
7540 N2: DAG.getConstant(Val: 16, DL: SL, VT: MVT::i32));
7541 if (Lo.isUndef())
7542 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ShlHi);
7543
7544 Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Lo);
7545 Lo = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Lo);
7546
7547 SDValue Or = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Lo, N2: ShlHi);
7548 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Or);
7549}
7550
7551bool
7552SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
7553 // OSes that use ELF REL relocations (instead of RELA) can only store a
7554 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7555 // which can create arbitrary 64-bit addends. (This is only a problem for
7556 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7557 // the high 32 bits of the addend.)
7558 //
7559 // This should be kept in sync with how HasRelocationAddend is initialized in
7560 // the constructor of ELFAMDGPUAsmBackend.
7561 if (!Subtarget->isAmdHsaOS())
7562 return false;
7563
7564 // We can fold offsets for anything that doesn't require a GOT relocation.
7565 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7566 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7567 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7568 !shouldEmitGOTReloc(GV: GA->getGlobal());
7569}
7570
7571static SDValue
7572buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
7573 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7574 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7575 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7576 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7577 // lowered to the following code sequence:
7578 //
7579 // For constant address space:
7580 // s_getpc_b64 s[0:1]
7581 // s_add_u32 s0, s0, $symbol
7582 // s_addc_u32 s1, s1, 0
7583 //
7584 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7585 // a fixup or relocation is emitted to replace $symbol with a literal
7586 // constant, which is a pc-relative offset from the encoding of the $symbol
7587 // operand to the global variable.
7588 //
7589 // For global address space:
7590 // s_getpc_b64 s[0:1]
7591 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7592 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7593 //
7594 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7595 // fixups or relocations are emitted to replace $symbol@*@lo and
7596 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7597 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7598 // operand to the global variable.
7599 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: Offset, TargetFlags: GAFlags);
7600 SDValue PtrHi;
7601 if (GAFlags == SIInstrInfo::MO_NONE)
7602 PtrHi = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32);
7603 else
7604 PtrHi = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: Offset, TargetFlags: GAFlags + 1);
7605 return DAG.getNode(Opcode: AMDGPUISD::PC_ADD_REL_OFFSET, DL, VT: PtrVT, N1: PtrLo, N2: PtrHi);
7606}
7607
7608SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7609 SDValue Op,
7610 SelectionDAG &DAG) const {
7611 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Val&: Op);
7612 SDLoc DL(GSD);
7613 EVT PtrVT = Op.getValueType();
7614
7615 const GlobalValue *GV = GSD->getGlobal();
7616 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7617 shouldUseLDSConstAddress(GV)) ||
7618 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
7619 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
7620 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
7621 GV->hasExternalLinkage()) {
7622 Type *Ty = GV->getValueType();
7623 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
7624 // zero-sized type in other languages to declare dynamic shared memory
7625 // whose size is not known at compile time. It is allocated by the
7626 // runtime and placed directly after the statically allocated ones, and
7627 // all such declarations share the same offset.
7628 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7629 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7630 // Adjust alignment for that dynamic shared memory array.
7631 Function &F = DAG.getMachineFunction().getFunction();
7632 MFI->setDynLDSAlign(F, GV: *cast<GlobalVariable>(Val: GV));
7633 MFI->setUsesDynamicLDS(true);
7634 return SDValue(
7635 DAG.getMachineNode(Opcode: AMDGPU::GET_GROUPSTATICSIZE, dl: DL, VT: PtrVT), 0);
7636 }
7637 }
7638 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
7639 }
7640
7641 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
7642 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: GSD->getOffset(),
7643 TargetFlags: SIInstrInfo::MO_ABS32_LO);
7644 return DAG.getNode(Opcode: AMDGPUISD::LDS, DL, VT: MVT::i32, Operand: GA);
7645 }
7646
7647 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7648 SDValue AddrLo = DAG.getTargetGlobalAddress(
7649 GV, DL, VT: MVT::i32, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS32_LO);
7650 AddrLo = {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: AddrLo), 0};
7651
7652 SDValue AddrHi = DAG.getTargetGlobalAddress(
7653 GV, DL, VT: MVT::i32, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS32_HI);
7654 AddrHi = {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: AddrHi), 0};
7655
7656 return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL, VT: MVT::i64, N1: AddrLo, N2: AddrHi);
7657 }
7658
7659 if (shouldEmitFixup(GV))
7660 return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT);
7661
7662 if (shouldEmitPCReloc(GV))
7663 return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT,
7664 GAFlags: SIInstrInfo::MO_REL32);
7665
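  // Otherwise go through the GOT. A sketch of the resulting sequence (for
  // illustration; the exact registers and encodings depend on the target):
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, sym@gotpcrel32@lo
  //   s_addc_u32 s1, s1, sym@gotpcrel32@hi
  //   s_load_dwordx2 s[0:1], s[0:1], 0x0 ; load the real address from the GOT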
7666 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, Offset: 0, PtrVT,
7667 GAFlags: SIInstrInfo::MO_GOTPCREL32);
7668
7669 Type *Ty = PtrVT.getTypeForEVT(Context&: *DAG.getContext());
7670 PointerType *PtrTy = PointerType::get(ElementType: Ty, AddressSpace: AMDGPUAS::CONSTANT_ADDRESS);
7671 const DataLayout &DataLayout = DAG.getDataLayout();
7672 Align Alignment = DataLayout.getABITypeAlign(Ty: PtrTy);
7673 MachinePointerInfo PtrInfo
7674 = MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction());
7675
7676 return DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: GOTAddr, PtrInfo, Alignment,
7677 MMOFlags: MachineMemOperand::MODereferenceable |
7678 MachineMemOperand::MOInvariant);
7679}
7680
7681SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
7682 const SDLoc &DL, SDValue V) const {
7683 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7684 // the destination register.
7685 //
7686 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7687 // so we will end up with redundant moves to m0.
7688 //
7689 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7690
7691 // A Null SDValue creates a glue result.
7692 SDNode *M0 = DAG.getMachineNode(Opcode: AMDGPU::SI_INIT_M0, dl: DL, VT1: MVT::Other, VT2: MVT::Glue,
7693 Op1: V, Op2: Chain);
7694 return SDValue(M0, 0);
7695}
7696
7697SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
7698 SDValue Op,
7699 MVT VT,
7700 unsigned Offset) const {
7701 SDLoc SL(Op);
7702 SDValue Param = lowerKernargMemParameter(
7703 DAG, VT: MVT::i32, MemVT: MVT::i32, SL, Chain: DAG.getEntryNode(), Offset, Alignment: Align(4), Signed: false);
7704 // The local size values will have the high 16 bits as zero.
7705 return DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: MVT::i32, N1: Param,
7706 N2: DAG.getValueType(VT));
7707}
7708
7709static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7710 EVT VT) {
7711 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7712 "non-hsa intrinsic with hsa target",
7713 DL.getDebugLoc());
7714 DAG.getContext()->diagnose(DI: BadIntrin);
7715 return DAG.getUNDEF(VT);
7716}
7717
7718static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
7719 EVT VT) {
7720 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
7721 "intrinsic not supported on subtarget",
7722 DL.getDebugLoc());
7723 DAG.getContext()->diagnose(DI: BadIntrin);
7724 return DAG.getUNDEF(VT);
7725}
7726
7727static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
7728 ArrayRef<SDValue> Elts) {
7729 assert(!Elts.empty());
7730 MVT Type;
7731 unsigned NumElts = Elts.size();
7732
7733 if (NumElts <= 12) {
7734 Type = MVT::getVectorVT(VT: MVT::f32, NumElements: NumElts);
7735 } else {
7736 assert(Elts.size() <= 16);
7737 Type = MVT::v16f32;
7738 NumElts = 16;
7739 }
7740
7741 SmallVector<SDValue, 16> VecElts(NumElts);
7742 for (unsigned i = 0; i < Elts.size(); ++i) {
7743 SDValue Elt = Elts[i];
7744 if (Elt.getValueType() != MVT::f32)
7745 Elt = DAG.getBitcast(VT: MVT::f32, V: Elt);
7746 VecElts[i] = Elt;
7747 }
7748 for (unsigned i = Elts.size(); i < NumElts; ++i)
7749 VecElts[i] = DAG.getUNDEF(VT: MVT::f32);
7750
7751 if (NumElts == 1)
7752 return VecElts[0];
7753 return DAG.getBuildVector(VT: Type, DL, Ops: VecElts);
7754}
7755
7756static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7757 SDValue Src, int ExtraElts) {
7758 EVT SrcVT = Src.getValueType();
7759
7760 SmallVector<SDValue, 8> Elts;
7761
7762 if (SrcVT.isVector())
7763 DAG.ExtractVectorElements(Op: Src, Args&: Elts);
7764 else
7765 Elts.push_back(Elt: Src);
7766
7767 SDValue Undef = DAG.getUNDEF(VT: SrcVT.getScalarType());
7768 while (ExtraElts--)
7769 Elts.push_back(Elt: Undef);
7770
7771 return DAG.getBuildVector(VT: CastVT, DL, Ops: Elts);
7772}
7773
7774 // Re-construct the required return value for an image load intrinsic.
7775 // This is more complicated due to the optional use of TexFailCtrl, which means
7776 // the required return type is an aggregate.
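// For example (illustrative, assuming packed d16 and TFE enabled): a d16 load
// with dmask 0xf returning {v4f16, i32, chain} occupies 2 data dwords plus one
// TFE status dword, so the machine node yields 3 dwords that are split back
// into the v4f16 data and the i32 texfail flag here.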
7777static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
7778 ArrayRef<EVT> ResultTypes, bool IsTexFail,
7779 bool Unpacked, bool IsD16, int DMaskPop,
7780 int NumVDataDwords, bool IsAtomicPacked16Bit,
7781 const SDLoc &DL) {
7782 // Determine the required return type. This is the same regardless of the IsTexFail flag.
7783 EVT ReqRetVT = ResultTypes[0];
7784 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
7785 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7786 ? (ReqRetNumElts + 1) / 2
7787 : ReqRetNumElts;
7788
7789 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
7790
7791 MVT DataDwordVT = NumDataDwords == 1 ?
7792 MVT::i32 : MVT::getVectorVT(VT: MVT::i32, NumElements: NumDataDwords);
7793
7794 MVT MaskPopVT = MaskPopDwords == 1 ?
7795 MVT::i32 : MVT::getVectorVT(VT: MVT::i32, NumElements: MaskPopDwords);
7796
7797 SDValue Data(Result, 0);
7798 SDValue TexFail;
7799
7800 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7801 SDValue ZeroIdx = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
7802 if (MaskPopVT.isVector()) {
7803 Data = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MaskPopVT,
7804 N1: SDValue(Result, 0), N2: ZeroIdx);
7805 } else {
7806 Data = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MaskPopVT,
7807 N1: SDValue(Result, 0), N2: ZeroIdx);
7808 }
7809 }
7810
7811 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7812 Data = padEltsToUndef(DAG, DL, CastVT: DataDwordVT, Src: Data,
7813 ExtraElts: NumDataDwords - MaskPopDwords);
7814
7815 if (IsD16)
7816 Data = adjustLoadValueTypeImpl(Result: Data, LoadVT: ReqRetVT, DL, DAG, Unpacked);
7817
7818 EVT LegalReqRetVT = ReqRetVT;
7819 if (!ReqRetVT.isVector()) {
7820 if (!Data.getValueType().isInteger())
7821 Data = DAG.getNode(Opcode: ISD::BITCAST, DL,
7822 VT: Data.getValueType().changeTypeToInteger(), Operand: Data);
7823 Data = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ReqRetVT.changeTypeToInteger(), Operand: Data);
7824 } else {
7825 // We need to widen the return vector to a legal type
7826 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7827 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7828 LegalReqRetVT =
7829 EVT::getVectorVT(Context&: *DAG.getContext(), VT: ReqRetVT.getVectorElementType(),
7830 NumElements: ReqRetVT.getVectorNumElements() + 1);
7831 }
7832 }
7833 Data = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LegalReqRetVT, Operand: Data);
7834
7835 if (IsTexFail) {
7836 TexFail =
7837 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: SDValue(Result, 0),
7838 N2: DAG.getConstant(Val: MaskPopDwords, DL, VT: MVT::i32));
7839
7840 return DAG.getMergeValues(Ops: {Data, TexFail, SDValue(Result, 1)}, dl: DL);
7841 }
7842
7843 if (Result->getNumValues() == 1)
7844 return Data;
7845
7846 return DAG.getMergeValues(Ops: {Data, SDValue(Result, 1)}, dl: DL);
7847}
7848
7849static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
7850 SDValue *LWE, bool &IsTexFail) {
7851 auto TexFailCtrlConst = cast<ConstantSDNode>(Val: TexFailCtrl.getNode());
7852
7853 uint64_t Value = TexFailCtrlConst->getZExtValue();
7854 if (Value) {
7855 IsTexFail = true;
7856 }
7857
7858 SDLoc DL(TexFailCtrlConst);
7859 *TFE = DAG.getTargetConstant(Val: (Value & 0x1) ? 1 : 0, DL, VT: MVT::i32);
7860 Value &= ~(uint64_t)0x1;
7861 *LWE = DAG.getTargetConstant(Val: (Value & 0x2) ? 1 : 0, DL, VT: MVT::i32);
7862 Value &= ~(uint64_t)0x2;
7863
7864 return Value == 0;
7865}
7866
7867static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
7868 MVT PackVectorVT,
7869 SmallVectorImpl<SDValue> &PackedAddrs,
7870 unsigned DimIdx, unsigned EndIdx,
7871 unsigned NumGradients) {
7872 SDLoc DL(Op);
7873 for (unsigned I = DimIdx; I < EndIdx; I++) {
7874 SDValue Addr = Op.getOperand(i: I);
7875
7876 // Gradients are packed with undef for each coordinate.
7877 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
7878 // 1D: undef,dx/dh; undef,dx/dv
7879 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
7880 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
7881 if (((I + 1) >= EndIdx) ||
7882 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
7883 I == DimIdx + NumGradients - 1))) {
7884 if (Addr.getValueType() != MVT::i16)
7885 Addr = DAG.getBitcast(VT: MVT::i16, V: Addr);
7886 Addr = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Addr);
7887 } else {
7888 Addr = DAG.getBuildVector(VT: PackVectorVT, DL, Ops: {Addr, Op.getOperand(i: I + 1)});
7889 I++;
7890 }
7891 Addr = DAG.getBitcast(VT: MVT::f32, V: Addr);
7892 PackedAddrs.push_back(Elt: Addr);
7893 }
7894}
7895
7896SDValue SITargetLowering::lowerImage(SDValue Op,
7897 const AMDGPU::ImageDimIntrinsicInfo *Intr,
7898 SelectionDAG &DAG, bool WithChain) const {
7899 SDLoc DL(Op);
7900 MachineFunction &MF = DAG.getMachineFunction();
7901 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
7902 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7903 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
7904 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(DimEnum: Intr->Dim);
7905 unsigned IntrOpcode = Intr->BaseOpcode;
7906 bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI: *Subtarget);
7907 bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: *Subtarget);
7908 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
7909
7910 SmallVector<EVT, 3> ResultTypes(Op->values());
7911 SmallVector<EVT, 3> OrigResultTypes(Op->values());
7912 bool IsD16 = false;
7913 bool IsG16 = false;
7914 bool IsA16 = false;
7915 SDValue VData;
7916 int NumVDataDwords = 0;
7917 bool AdjustRetType = false;
7918 bool IsAtomicPacked16Bit = false;
7919
7920 // Offset of intrinsic arguments
7921 const unsigned ArgOffset = WithChain ? 2 : 1;
7922
7923 unsigned DMask;
7924 unsigned DMaskLanes = 0;
7925
7926 if (BaseOpcode->Atomic) {
7927 VData = Op.getOperand(i: 2);
7928
7929 IsAtomicPacked16Bit =
7930 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7931 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7932
7933 bool Is64Bit = VData.getValueSizeInBits() == 64;
7934 if (BaseOpcode->AtomicX2) {
7935 SDValue VData2 = Op.getOperand(i: 3);
7936 VData = DAG.getBuildVector(VT: Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
7937 Ops: {VData, VData2});
7938 if (Is64Bit)
7939 VData = DAG.getBitcast(VT: MVT::v4i32, V: VData);
7940
7941 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
7942 DMask = Is64Bit ? 0xf : 0x3;
7943 NumVDataDwords = Is64Bit ? 4 : 2;
7944 } else {
7945 DMask = Is64Bit ? 0x3 : 0x1;
7946 NumVDataDwords = Is64Bit ? 2 : 1;
7947 }
7948 } else {
7949 DMask = Op->getConstantOperandVal(Num: ArgOffset + Intr->DMaskIndex);
7950 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(Value: DMask);
7951
7952 if (BaseOpcode->Store) {
7953 VData = Op.getOperand(i: 2);
7954
7955 MVT StoreVT = VData.getSimpleValueType();
7956 if (StoreVT.getScalarType() == MVT::f16) {
7957 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7958 return Op; // D16 is unsupported for this instruction
7959
7960 IsD16 = true;
7961 VData = handleD16VData(VData, DAG, ImageStore: true);
7962 }
7963
7964 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
7965 } else if (!BaseOpcode->NoReturn) {
7966 // Work out the number of dwords based on the dmask popcount, the underlying
7967 // type, and whether packing is supported.
7968 MVT LoadVT = ResultTypes[0].getSimpleVT();
7969 if (LoadVT.getScalarType() == MVT::f16) {
7970 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7971 return Op; // D16 is unsupported for this instruction
7972
7973 IsD16 = true;
7974 }
7975
7976 // Confirm that the return type is large enough for the dmask specified
7977 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
7978 (!LoadVT.isVector() && DMaskLanes > 1))
7979 return Op;
7980
7981 // The sq block of gfx8 and gfx9 does not estimate register use correctly
7982 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
7983 // instructions.
7984 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
7985 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
7986 NumVDataDwords = (DMaskLanes + 1) / 2;
7987 else
7988 NumVDataDwords = DMaskLanes;
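  // For example (illustrative): a dmask of 0xf needs (4 + 1) / 2 = 2 dwords
  // with packed d16, but 4 dwords when d16 values are unpacked.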
7989
7990 AdjustRetType = true;
7991 }
7992 }
7993
7994 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
7995 SmallVector<SDValue, 4> VAddrs;
7996
7997 // Check for 16-bit addresses or derivatives and pack them if so.
7998 MVT VAddrVT =
7999 Op.getOperand(i: ArgOffset + Intr->GradientStart).getSimpleValueType();
8000 MVT VAddrScalarVT = VAddrVT.getScalarType();
8001 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8002 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8003
8004 VAddrVT = Op.getOperand(i: ArgOffset + Intr->CoordStart).getSimpleValueType();
8005 VAddrScalarVT = VAddrVT.getScalarType();
8006 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
8007 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
8008
8009 // Push back extra arguments.
8010 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
8011 if (IsA16 && (Op.getOperand(i: ArgOffset + I).getValueType() == MVT::f16)) {
8012 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
8013 // Special handling of the bias when A16 is on: the bias is of type half
8014 // but occupies a full 32-bit slot.
8015 SDValue Bias = DAG.getBuildVector(
8016 VT: MVT::v2f16, DL,
8017 Ops: {Op.getOperand(i: ArgOffset + I), DAG.getUNDEF(VT: MVT::f16)});
8018 VAddrs.push_back(Elt: Bias);
8019 } else {
8020 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
8021 "Bias needs to be converted to 16 bit in A16 mode");
8022 VAddrs.push_back(Elt: Op.getOperand(i: ArgOffset + I));
8023 }
8024 }
8025
8026 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
8027 // 16-bit gradients are supported, but they are tied to the A16 control,
8028 // so both gradients and addresses must be 16 bit.
8029 LLVM_DEBUG(
8030 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
8031 "require 16 bit args for both gradients and addresses");
8032 return Op;
8033 }
8034
8035 if (IsA16) {
8036 if (!ST->hasA16()) {
8037 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
8038 "support 16 bit addresses\n");
8039 return Op;
8040 }
8041 }
8042
8043 // We have dealt with incorrect input, so we know that if IsA16 or IsG16
8044 // is set then we have to compress/pack the corresponding operands
8045 // (address, gradient, or both).
8046 // In the case where a16 and gradients are tied (no G16 support), we have
8047 // already verified that both IsA16 and IsG16 are true.
8048 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
8049 // Activate g16
8050 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
8051 AMDGPU::getMIMGG16MappingInfo(G: Intr->BaseOpcode);
8052 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
8053 }
8054
8055 // Add gradients (packed or unpacked)
8056 if (IsG16) {
8057 // Pack the gradients
8058 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
8059 packImage16bitOpsToDwords(DAG, Op, PackVectorVT: GradPackVectorVT, PackedAddrs&: VAddrs,
8060 DimIdx: ArgOffset + Intr->GradientStart,
8061 EndIdx: ArgOffset + Intr->CoordStart, NumGradients: Intr->NumGradients);
8062 } else {
8063 for (unsigned I = ArgOffset + Intr->GradientStart;
8064 I < ArgOffset + Intr->CoordStart; I++)
8065 VAddrs.push_back(Elt: Op.getOperand(i: I));
8066 }
8067
8068 // Add addresses (packed or unpacked)
8069 if (IsA16) {
8070 packImage16bitOpsToDwords(DAG, Op, PackVectorVT: AddrPackVectorVT, PackedAddrs&: VAddrs,
8071 DimIdx: ArgOffset + Intr->CoordStart, EndIdx: VAddrEnd,
8072 NumGradients: 0 /* No gradients */);
8073 } else {
8074 // Add uncompressed address
8075 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
8076 VAddrs.push_back(Elt: Op.getOperand(i: I));
8077 }
8078
8079 // If the register allocator cannot place the address registers contiguously
8080 // without introducing moves, then using the non-sequential address encoding
8081 // is always preferable, since it saves VALU instructions and is usually
8082 // neutral or even better in terms of code size.
8083 //
8084 // However, we currently have no way of hinting to the register allocator that
8085 // MIMG addresses should be placed contiguously when it is possible to do so,
8086 // so force non-NSA for the common 2-address case as a heuristic.
8087 //
8088 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
8089 // allocation when possible.
8090 //
8091 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
8092 // set of the remaining addresses.
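  // For example (illustrative), with an NSA limit of 5 and 7 address dwords,
  // partial NSA passes the first 4 addresses as individual registers and packs
  // the remaining 3 into one contiguous register tuple.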
8093 const unsigned NSAMaxSize = ST->getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
8094 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
8095 const bool UseNSA = ST->hasNSAEncoding() &&
8096 VAddrs.size() >= ST->getNSAThreshold(MF) &&
8097 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
8098 const bool UsePartialNSA =
8099 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
8100
8101 SDValue VAddr;
8102 if (UsePartialNSA) {
8103 VAddr = getBuildDwordsVector(DAG, DL,
8104 Elts: ArrayRef(VAddrs).drop_front(N: NSAMaxSize - 1));
8105 } else if (!UseNSA) {
8107 VAddr = getBuildDwordsVector(DAG, DL, Elts: VAddrs);
8108 }
8109
8110 SDValue True = DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1);
8111 SDValue False = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1);
8112 SDValue Unorm;
8113 if (!BaseOpcode->Sampler) {
8114 Unorm = True;
8115 } else {
8116 uint64_t UnormConst =
8117 Op.getConstantOperandVal(i: ArgOffset + Intr->UnormIndex);
8118
8119 Unorm = UnormConst ? True : False;
8120 }
8121
8122 SDValue TFE;
8123 SDValue LWE;
8124 SDValue TexFail = Op.getOperand(i: ArgOffset + Intr->TexFailCtrlIndex);
8125 bool IsTexFail = false;
8126 if (!parseTexFail(TexFailCtrl: TexFail, DAG, TFE: &TFE, LWE: &LWE, IsTexFail))
8127 return Op;
8128
8129 if (IsTexFail) {
8130 if (!DMaskLanes) {
8131 // Expecting to get an error flag since TFC is on and dmask is 0.
8132 // Force dmask to be at least 1, otherwise the instruction will fail.
8133 DMask = 0x1;
8134 DMaskLanes = 1;
8135 NumVDataDwords = 1;
8136 }
8137 NumVDataDwords += 1;
8138 AdjustRetType = true;
8139 }
8140
8141 // Something earlier may have tagged the return type as needing adjustment.
8142 // This happens if the instruction is a load or has set TexFailCtrl flags.
8143 if (AdjustRetType) {
8144 // NumVDataDwords reflects the true number of dwords required in the return type
8145 if (DMaskLanes == 0 && !BaseOpcode->Store) {
8146 // This is a no-op load. It can be eliminated.
8147 SDValue Undef = DAG.getUNDEF(VT: Op.getValueType());
8148 if (isa<MemSDNode>(Val: Op))
8149 return DAG.getMergeValues(Ops: {Undef, Op.getOperand(i: 0)}, dl: DL);
8150 return Undef;
8151 }
8152
8153 EVT NewVT = NumVDataDwords > 1 ?
8154 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: NumVDataDwords)
8155 : MVT::i32;
8156
8157 ResultTypes[0] = NewVT;
8158 if (ResultTypes.size() == 3) {
8159 // The original result was an aggregate type used for TexFailCtrl results.
8160 // The actual instruction returns a vector type, which has now been
8161 // created. Remove the aggregate result.
8162 ResultTypes.erase(CI: &ResultTypes[1]);
8163 }
8164 }
8165
8166 unsigned CPol = Op.getConstantOperandVal(i: ArgOffset + Intr->CachePolicyIndex);
8167 if (BaseOpcode->Atomic)
8168 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
8169 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
8170 AMDGPU::CPol::VOLATILE))
8171 return Op;
8172
8173 SmallVector<SDValue, 26> Ops;
8174 if (BaseOpcode->Store || BaseOpcode->Atomic)
8175 Ops.push_back(Elt: VData); // vdata
8176 if (UsePartialNSA) {
8177 append_range(C&: Ops, R: ArrayRef(VAddrs).take_front(N: NSAMaxSize - 1));
8178 Ops.push_back(Elt: VAddr);
8179 } else if (UseNSA)
8181 append_range(C&: Ops, R&: VAddrs);
8182 else
8183 Ops.push_back(Elt: VAddr);
8184 Ops.push_back(Elt: Op.getOperand(i: ArgOffset + Intr->RsrcIndex));
8185 if (BaseOpcode->Sampler)
8186 Ops.push_back(Elt: Op.getOperand(i: ArgOffset + Intr->SampIndex));
8187 Ops.push_back(Elt: DAG.getTargetConstant(Val: DMask, DL, VT: MVT::i32));
8188 if (IsGFX10Plus)
8189 Ops.push_back(Elt: DAG.getTargetConstant(Val: DimInfo->Encoding, DL, VT: MVT::i32));
8190 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8191 Ops.push_back(Elt: Unorm);
8192 Ops.push_back(Elt: DAG.getTargetConstant(Val: CPol, DL, VT: MVT::i32));
8193 Ops.push_back(Elt: IsA16 && // r128, a16 for gfx9
8194 ST->hasFeature(Feature: AMDGPU::FeatureR128A16) ? True : False);
8195 if (IsGFX10Plus)
8196 Ops.push_back(Elt: IsA16 ? True : False);
8197 if (!Subtarget->hasGFX90AInsts()) {
8198 Ops.push_back(Elt: TFE); // tfe
8199 } else if (TFE->getAsZExtVal()) {
8200 report_fatal_error(reason: "TFE is not supported on this GPU");
8201 }
8202 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8203 Ops.push_back(Elt: LWE); // lwe
8204 if (!IsGFX10Plus)
8205 Ops.push_back(Elt: DimInfo->DA ? True : False);
8206 if (BaseOpcode->HasD16)
8207 Ops.push_back(Elt: IsD16 ? True : False);
8208 if (isa<MemSDNode>(Val: Op))
8209 Ops.push_back(Elt: Op.getOperand(i: 0)); // chain
8210
8211 int NumVAddrDwords =
8212 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8213 int Opcode = -1;
8214
8215 if (IsGFX12Plus) {
8216 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx12,
8217 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
8218 } else if (IsGFX11Plus) {
8219 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
8220 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx11NSA
8221 : AMDGPU::MIMGEncGfx11Default,
8222 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
8223 } else if (IsGFX10Plus) {
8224 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
8225 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx10NSA
8226 : AMDGPU::MIMGEncGfx10Default,
8227 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
8228 } else {
8229 if (Subtarget->hasGFX90AInsts()) {
8230 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx90a,
8231 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
8232 if (Opcode == -1)
8233 report_fatal_error(
8234 reason: "requested image instruction is not supported on this GPU");
8235 }
8236 if (Opcode == -1 &&
8237 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8238 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx8,
8239 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
8240 if (Opcode == -1)
8241 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx6,
8242 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
8243 }
8244 if (Opcode == -1)
8245 return Op;
8246
8247 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, dl: DL, ResultTys: ResultTypes, Ops);
8248 if (auto MemOp = dyn_cast<MemSDNode>(Val&: Op)) {
8249 MachineMemOperand *MemRef = MemOp->getMemOperand();
8250 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
8251 }
8252
8253 if (BaseOpcode->AtomicX2) {
8254 SmallVector<SDValue, 1> Elt;
8255 DAG.ExtractVectorElements(Op: SDValue(NewNode, 0), Args&: Elt, Start: 0, Count: 1);
8256 return DAG.getMergeValues(Ops: {Elt[0], SDValue(NewNode, 1)}, dl: DL);
8257 }
8258 if (BaseOpcode->NoReturn)
8259 return SDValue(NewNode, 0);
8260 return constructRetValue(DAG, Result: NewNode, ResultTypes: OrigResultTypes, IsTexFail,
8261 Unpacked: Subtarget->hasUnpackedD16VMem(), IsD16, DMaskPop: DMaskLanes,
8262 NumVDataDwords, IsAtomicPacked16Bit, DL);
8263}
8264
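// Lower an s.buffer.load-style intrinsic. A uniform offset selects a scalar
// SBUFFER_LOAD node (widening vec3 results to vec4 when the subtarget lacks
// dwordx3 scalar loads); a divergent offset falls back to MUBUF BUFFER_LOAD
// nodes, splitting wide results into dwordx4 pieces. Roughly (illustrative
// IR, not from a test):
//   %v = call <8 x float> @llvm.amdgcn.s.buffer.load.v8f32(<4 x i32> %rsrc,
//                                                          i32 %divergent, i32 0)
// becomes two dwordx4 buffer loads at immediate offsets 0 and 16 whose
// results are joined with CONCAT_VECTORS.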
8265SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8266 SDValue Offset, SDValue CachePolicy,
8267 SelectionDAG &DAG) const {
8268 MachineFunction &MF = DAG.getMachineFunction();
8269
8270 const DataLayout &DataLayout = DAG.getDataLayout();
8271 Align Alignment =
8272 DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext()));
8273
8274 MachineMemOperand *MMO = MF.getMachineMemOperand(
8275 PtrInfo: MachinePointerInfo(),
8276 F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
8277 MachineMemOperand::MOInvariant,
8278 Size: VT.getStoreSize(), BaseAlignment: Alignment);
8279
8280 if (!Offset->isDivergent()) {
8281 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8282
8283 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8284 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8285 // loads. Later, the DAG combiner folds s_buffer_load_u16 with a following
8286 // sext into s_buffer_load_i16 (see performSignExtendInRegCombine).
8287 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8288 SDValue BufferLoad =
8289 DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD_USHORT, dl: DL,
8290 VTList: DAG.getVTList(VT: MVT::i32), Ops, MemVT: VT, MMO);
8291 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
8292 }
8293
8294 // Widen vec3 load to vec4.
8295 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8296 !Subtarget->hasScalarDwordx3Loads()) {
8297 EVT WidenedVT =
8298 EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: 4);
8299 auto WidenedOp = DAG.getMemIntrinsicNode(
8300 Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL, VTList: DAG.getVTList(VT: WidenedVT), Ops, MemVT: WidenedVT,
8301 MMO: MF.getMachineMemOperand(MMO, Offset: 0, Size: WidenedVT.getStoreSize()));
8302 auto Subvector = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: WidenedOp,
8303 N2: DAG.getVectorIdxConstant(Val: 0, DL));
8304 return Subvector;
8305 }
8306
8307 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL,
8308 VTList: DAG.getVTList(VT), Ops, MemVT: VT, MMO);
8309 }
8310
8311 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8312 // assume that the buffer is unswizzled.
8313 SDValue Ops[] = {
8314 DAG.getEntryNode(), // Chain
8315 Rsrc, // rsrc
8316 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
8317 {}, // voffset
8318 {}, // soffset
8319 {}, // offset
8320 CachePolicy, // cachepolicy
8321 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
8322 };
8323 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8324 setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], Alignment: Align(4));
8325 return handleByteShortBufferLoads(DAG, LoadVT: VT, DL, Ops, MMO);
8326 }
8327
8328 SmallVector<SDValue, 4> Loads;
8329 unsigned NumLoads = 1;
8330 MVT LoadVT = VT.getSimpleVT();
8331 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8332 assert((LoadVT.getScalarType() == MVT::i32 ||
8333 LoadVT.getScalarType() == MVT::f32));
8334
8335 if (NumElts == 8 || NumElts == 16) {
8336 NumLoads = NumElts / 4;
8337 LoadVT = MVT::getVectorVT(VT: LoadVT.getScalarType(), NumElements: 4);
8338 }
8339
8340 SDVTList VTList = DAG.getVTList(VTs: {LoadVT, MVT::Glue});
8341
8342 // Use the alignment to ensure that the required offsets will fit into the
8343 // instructions' immediate offset fields.
8344 setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3],
8345 Alignment: NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8346
8347 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8348 for (unsigned i = 0; i < NumLoads; ++i) {
8349 Ops[5] = DAG.getTargetConstant(Val: InstOffset + 16 * i, DL, VT: MVT::i32);
8350 Loads.push_back(Elt: getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8351 MemVT: LoadVT, MMO, DAG));
8352 }
8353
8354 if (NumElts == 8 || NumElts == 16)
8355 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, Ops: Loads);
8356
8357 return Loads[0];
8358}
8359
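// Lower llvm.amdgcn.wave.id. This is only implemented for subtargets with
// architected SGPRs, where the 5-bit wave ID field can be extracted from
// TTMP8 with a BFE_U32; otherwise there is no register to read and an empty
// SDValue is returned.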
8360SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8361 // With architected SGPRs, the wave ID within the workgroup is in TTMP8[29:25].
8362 if (!Subtarget->hasArchitectedSGPRs())
8363 return {};
8364 SDLoc SL(Op);
8365 MVT VT = MVT::i32;
8366 SDValue TTMP8 = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: AMDGPU::TTMP8, VT);
8367 return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT, N1: TTMP8,
8368 N2: DAG.getConstant(Val: 25, DL: SL, VT), N3: DAG.getConstant(Val: 5, DL: SL, VT));
8369}
8370
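// Lower a workitem-id intrinsic for one dimension. If the dimension is known
// to be degenerate (maximum workitem ID of 0), this folds to the constant 0.
// Otherwise the preloaded VGPR is read and, for unpacked IDs, an AssertZext
// records how many bits can be set; e.g. (illustrative) a dimension whose
// maximum ID is 63 is asserted to fit in i6.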
8371SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8372 unsigned Dim,
8373 const ArgDescriptor &Arg) const {
8374 SDLoc SL(Op);
8375 MachineFunction &MF = DAG.getMachineFunction();
8376 unsigned MaxID = Subtarget->getMaxWorkitemID(Kernel: MF.getFunction(), Dimension: Dim);
8377 if (MaxID == 0)
8378 return DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
8379
8380 SDValue Val = loadInputValue(DAG, RC: &AMDGPU::VGPR_32RegClass, VT: MVT::i32,
8381 SL: SDLoc(DAG.getEntryNode()), Arg);
8382
8383 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8384 // masking operations anyway.
8385 //
8386 // TODO: We could assert the top bit is 0 for the source copy.
8387 if (Arg.isMasked())
8388 return Val;
8389
8390 // Preserve the known bits after expansion to a copy.
8391 EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: llvm::bit_width(Value: MaxID));
8392 return DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: MVT::i32, N1: Val,
8393 N2: DAG.getValueType(SmallVT));
8394}
8395
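// Lower intrinsics that neither read nor write memory. Simple operations map
// directly onto AMDGPUISD nodes (e.g. amdgcn.rcp -> AMDGPUISD::RCP), argument
// intrinsics are serviced from preloaded SGPR/VGPR inputs, and image
// intrinsics without a chain are dispatched to lowerImage in the default case.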
8396SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8397 SelectionDAG &DAG) const {
8398 MachineFunction &MF = DAG.getMachineFunction();
8399 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
8400
8401 EVT VT = Op.getValueType();
8402 SDLoc DL(Op);
8403 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
8404
8405 // TODO: Should this propagate fast-math-flags?
8406
8407 switch (IntrinsicID) {
8408 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8409 if (getSubtarget()->isAmdHsaOrMesa(F: MF.getFunction()))
8410 return emitNonHSAIntrinsicError(DAG, DL, VT);
8411 return getPreloadedValue(DAG, MFI: *MFI, VT,
8412 PVID: AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
8413 }
8414 case Intrinsic::amdgcn_dispatch_ptr:
8415 case Intrinsic::amdgcn_queue_ptr: {
8416 if (!Subtarget->isAmdHsaOrMesa(F: MF.getFunction())) {
8417 DiagnosticInfoUnsupported BadIntrin(
8418 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8419 DL.getDebugLoc());
8420 DAG.getContext()->diagnose(DI: BadIntrin);
8421 return DAG.getUNDEF(VT);
8422 }
8423
8424 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
8425 AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
8426 return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: RegID);
8427 }
8428 case Intrinsic::amdgcn_implicitarg_ptr: {
8429 if (MFI->isEntryFunction())
8430 return getImplicitArgPtr(DAG, SL: DL);
8431 return getPreloadedValue(DAG, MFI: *MFI, VT,
8432 PVID: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
8433 }
8434 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8435 if (!AMDGPU::isKernel(CC: MF.getFunction().getCallingConv())) {
8436 // This only makes sense to call in a kernel, so just lower to null.
8437 return DAG.getConstant(Val: 0, DL, VT);
8438 }
8439
8440 return getPreloadedValue(DAG, MFI: *MFI, VT,
8441 PVID: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
8442 }
8443 case Intrinsic::amdgcn_dispatch_id: {
8444 return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: AMDGPUFunctionArgInfo::DISPATCH_ID);
8445 }
8446 case Intrinsic::amdgcn_rcp:
8447 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT, Operand: Op.getOperand(i: 1));
8448 case Intrinsic::amdgcn_rsq:
8449 return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: 1));
8450 case Intrinsic::amdgcn_rsq_legacy:
8451 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8452 return emitRemovedIntrinsicError(DAG, DL, VT);
8453 return SDValue();
8454 case Intrinsic::amdgcn_rcp_legacy:
8455 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
8456 return emitRemovedIntrinsicError(DAG, DL, VT);
8457 return DAG.getNode(Opcode: AMDGPUISD::RCP_LEGACY, DL, VT, Operand: Op.getOperand(i: 1));
8458 case Intrinsic::amdgcn_rsq_clamp: {
8459 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8460 return DAG.getNode(Opcode: AMDGPUISD::RSQ_CLAMP, DL, VT, Operand: Op.getOperand(i: 1));
8461
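    // VOLCANIC_ISLANDS and newer have no v_rsq_clamp instruction, so emulate
    // the clamping behavior by bounding the plain RSQ result to
    // [-FLT_MAX, +FLT_MAX] with fminnum/fmaxnum, turning an infinite result
    // into the largest finite value of the matching sign.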
8462 Type *Type = VT.getTypeForEVT(Context&: *DAG.getContext());
8463 APFloat Max = APFloat::getLargest(Sem: Type->getFltSemantics());
8464 APFloat Min = APFloat::getLargest(Sem: Type->getFltSemantics(), Negative: true);
8465
8466 SDValue Rsq = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: 1));
8467 SDValue Tmp = DAG.getNode(Opcode: ISD::FMINNUM, DL, VT, N1: Rsq,
8468 N2: DAG.getConstantFP(Val: Max, DL, VT));
8469 return DAG.getNode(Opcode: ISD::FMAXNUM, DL, VT, N1: Tmp,
8470 N2: DAG.getConstantFP(Val: Min, DL, VT));
8471 }
8472 case Intrinsic::r600_read_ngroups_x:
8473 if (Subtarget->isAmdHsaOS())
8474 return emitNonHSAIntrinsicError(DAG, DL, VT);
8475
8476 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
8477 Offset: SI::KernelInputOffsets::NGROUPS_X, Alignment: Align(4),
8478 Signed: false);
8479 case Intrinsic::r600_read_ngroups_y:
8480 if (Subtarget->isAmdHsaOS())
8481 return emitNonHSAIntrinsicError(DAG, DL, VT);
8482
8483 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
8484 Offset: SI::KernelInputOffsets::NGROUPS_Y, Alignment: Align(4),
8485 Signed: false);
8486 case Intrinsic::r600_read_ngroups_z:
8487 if (Subtarget->isAmdHsaOS())
8488 return emitNonHSAIntrinsicError(DAG, DL, VT);
8489
8490 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
8491 Offset: SI::KernelInputOffsets::NGROUPS_Z, Alignment: Align(4),
8492 Signed: false);
8493 case Intrinsic::r600_read_global_size_x:
8494 if (Subtarget->isAmdHsaOS())
8495 return emitNonHSAIntrinsicError(DAG, DL, VT);
8496
8497 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
8498 Offset: SI::KernelInputOffsets::GLOBAL_SIZE_X,
8499 Alignment: Align(4), Signed: false);
8500 case Intrinsic::r600_read_global_size_y:
8501 if (Subtarget->isAmdHsaOS())
8502 return emitNonHSAIntrinsicError(DAG, DL, VT);
8503
8504 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
8505 Offset: SI::KernelInputOffsets::GLOBAL_SIZE_Y,
8506 Alignment: Align(4), Signed: false);
8507 case Intrinsic::r600_read_global_size_z:
8508 if (Subtarget->isAmdHsaOS())
8509 return emitNonHSAIntrinsicError(DAG, DL, VT);
8510
8511 return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
8512 Offset: SI::KernelInputOffsets::GLOBAL_SIZE_Z,
8513 Alignment: Align(4), Signed: false);
8514 case Intrinsic::r600_read_local_size_x:
8515 if (Subtarget->isAmdHsaOS())
8516 return emitNonHSAIntrinsicError(DAG, DL, VT);
8517
8518 return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
8519 Offset: SI::KernelInputOffsets::LOCAL_SIZE_X);
8520 case Intrinsic::r600_read_local_size_y:
8521 if (Subtarget->isAmdHsaOS())
8522 return emitNonHSAIntrinsicError(DAG, DL, VT);
8523
8524 return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
8525 Offset: SI::KernelInputOffsets::LOCAL_SIZE_Y);
8526 case Intrinsic::r600_read_local_size_z:
8527 if (Subtarget->isAmdHsaOS())
8528 return emitNonHSAIntrinsicError(DAG, DL, VT);
8529
8530 return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
8531 Offset: SI::KernelInputOffsets::LOCAL_SIZE_Z);
8532 case Intrinsic::amdgcn_workgroup_id_x:
8533 return getPreloadedValue(DAG, MFI: *MFI, VT,
8534 PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
8535 case Intrinsic::amdgcn_workgroup_id_y:
8536 return getPreloadedValue(DAG, MFI: *MFI, VT,
8537 PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
8538 case Intrinsic::amdgcn_workgroup_id_z:
8539 return getPreloadedValue(DAG, MFI: *MFI, VT,
8540 PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
8541 case Intrinsic::amdgcn_wave_id:
8542 return lowerWaveID(DAG, Op);
8543 case Intrinsic::amdgcn_lds_kernel_id: {
8544 if (MFI->isEntryFunction())
8545 return getLDSKernelId(DAG, SL: DL);
8546 return getPreloadedValue(DAG, MFI: *MFI, VT,
8547 PVID: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
8548 }
8549 case Intrinsic::amdgcn_workitem_id_x:
8550 return lowerWorkitemID(DAG, Op, Dim: 0, Arg: MFI->getArgInfo().WorkItemIDX);
8551 case Intrinsic::amdgcn_workitem_id_y:
8552 return lowerWorkitemID(DAG, Op, Dim: 1, Arg: MFI->getArgInfo().WorkItemIDY);
8553 case Intrinsic::amdgcn_workitem_id_z:
8554 return lowerWorkitemID(DAG, Op, Dim: 2, Arg: MFI->getArgInfo().WorkItemIDZ);
8555 case Intrinsic::amdgcn_wavefrontsize:
8556 return DAG.getConstant(Val: MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
8557 DL: SDLoc(Op), VT: MVT::i32);
8558 case Intrinsic::amdgcn_s_buffer_load: {
8559 unsigned CPol = Op.getConstantOperandVal(i: 3);
8560 // s_buffer_load, because of how it's optimized, can't be volatile,
8561 // so reject loads that have the volatile bit set.
8562 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8563 ? AMDGPU::CPol::ALL
8564 : AMDGPU::CPol::ALL_pregfx12))
8565 return Op;
8566 return lowerSBuffer(VT, DL, Rsrc: Op.getOperand(i: 1), Offset: Op.getOperand(i: 2), CachePolicy: Op.getOperand(i: 3),
8567 DAG);
8568 }
8569 case Intrinsic::amdgcn_fdiv_fast:
8570 return lowerFDIV_FAST(Op, DAG);
8571 case Intrinsic::amdgcn_sin:
8572 return DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL, VT, Operand: Op.getOperand(i: 1));
8573
8574 case Intrinsic::amdgcn_cos:
8575 return DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL, VT, Operand: Op.getOperand(i: 1));
8576
8577 case Intrinsic::amdgcn_mul_u24:
8578 return DAG.getNode(Opcode: AMDGPUISD::MUL_U24, DL, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
8579 case Intrinsic::amdgcn_mul_i24:
8580 return DAG.getNode(Opcode: AMDGPUISD::MUL_I24, DL, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
8581
8582 case Intrinsic::amdgcn_log_clamp: {
8583 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
8584 return SDValue();
8585
8586 return emitRemovedIntrinsicError(DAG, DL, VT);
8587 }
8588 case Intrinsic::amdgcn_fract:
8589 return DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: Op.getOperand(i: 1));
8590
8591 case Intrinsic::amdgcn_class:
8592 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT,
8593 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
8594 case Intrinsic::amdgcn_div_fmas:
8595 return DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL, VT,
8596 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3),
8597 N4: Op.getOperand(i: 4));
8598
8599 case Intrinsic::amdgcn_div_fixup:
8600 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL, VT,
8601 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
8602
8603 case Intrinsic::amdgcn_div_scale: {
8604 const ConstantSDNode *Param = cast<ConstantSDNode>(Val: Op.getOperand(i: 3));
8605
8606 // Translate to the operands expected by the machine instruction: the
8607 // boolean third operand selects whether src0 is the numerator or denominator.
8608 SDValue Numerator = Op.getOperand(i: 1);
8609 SDValue Denominator = Op.getOperand(i: 2);
8610
8611 // Note this operand order is the reverse of the machine instruction's,
8612 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8613 // intrinsic takes the numerator as its first operand to match a normal
8614 // division operation.
8615
8616 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8617
8618 return DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL, VTList: Op->getVTList(), N1: Src0,
8619 N2: Denominator, N3: Numerator);
8620 }
8621 case Intrinsic::amdgcn_icmp: {
8622 // There is a Pat that handles this variant, so return it as-is.
8623 if (Op.getOperand(i: 1).getValueType() == MVT::i1 &&
8624 Op.getConstantOperandVal(i: 2) == 0 &&
8625 Op.getConstantOperandVal(i: 3) == ICmpInst::Predicate::ICMP_NE)
8626 return Op;
8627 return lowerICMPIntrinsic(TLI: *this, N: Op.getNode(), DAG);
8628 }
8629 case Intrinsic::amdgcn_fcmp: {
8630 return lowerFCMPIntrinsic(TLI: *this, N: Op.getNode(), DAG);
8631 }
8632 case Intrinsic::amdgcn_ballot:
8633 return lowerBALLOTIntrinsic(TLI: *this, N: Op.getNode(), DAG);
8634 case Intrinsic::amdgcn_fmed3:
8635 return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL, VT,
8636 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
8637 case Intrinsic::amdgcn_fdot2:
8638 return DAG.getNode(Opcode: AMDGPUISD::FDOT2, DL, VT,
8639 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3),
8640 N4: Op.getOperand(i: 4));
8641 case Intrinsic::amdgcn_fmul_legacy:
8642 return DAG.getNode(Opcode: AMDGPUISD::FMUL_LEGACY, DL, VT,
8643 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
8644 case Intrinsic::amdgcn_sffbh:
8645 return DAG.getNode(Opcode: AMDGPUISD::FFBH_I32, DL, VT, Operand: Op.getOperand(i: 1));
8646 case Intrinsic::amdgcn_sbfe:
8647 return DAG.getNode(Opcode: AMDGPUISD::BFE_I32, DL, VT,
8648 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
8649 case Intrinsic::amdgcn_ubfe:
8650 return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL, VT,
8651 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
8652 case Intrinsic::amdgcn_cvt_pkrtz:
8653 case Intrinsic::amdgcn_cvt_pknorm_i16:
8654 case Intrinsic::amdgcn_cvt_pknorm_u16:
8655 case Intrinsic::amdgcn_cvt_pk_i16:
8656 case Intrinsic::amdgcn_cvt_pk_u16: {
8657 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8658 EVT VT = Op.getValueType();
8659 unsigned Opcode;
8660
8661 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8662 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
8663 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8664 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8665 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8666 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8667 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8668 Opcode = AMDGPUISD::CVT_PK_I16_I32;
8669 else
8670 Opcode = AMDGPUISD::CVT_PK_U16_U32;
8671
8672 if (isTypeLegal(VT))
8673 return DAG.getNode(Opcode, DL, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
8674
8675 SDValue Node = DAG.getNode(Opcode, DL, VT: MVT::i32,
8676 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
8677 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Node);
8678 }
8679 case Intrinsic::amdgcn_fmad_ftz:
8680 return DAG.getNode(Opcode: AMDGPUISD::FMAD_FTZ, DL, VT, N1: Op.getOperand(i: 1),
8681 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
8682
8683 case Intrinsic::amdgcn_if_break:
8684 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::SI_IF_BREAK, dl: DL, VT,
8685 Op1: Op->getOperand(Num: 1), Op2: Op->getOperand(Num: 2)), 0);
8686
8687 case Intrinsic::amdgcn_groupstaticsize: {
8688 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
8689 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8690 return Op;
8691
8692 const Module *M = MF.getFunction().getParent();
8693 const GlobalValue *GV =
8694 M->getNamedValue(Name: Intrinsic::getName(id: Intrinsic::amdgcn_groupstaticsize));
8695 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: 0,
8696 TargetFlags: SIInstrInfo::MO_ABS32_LO);
8697 return {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: GA), 0};
8698 }
8699 case Intrinsic::amdgcn_is_shared:
8700 case Intrinsic::amdgcn_is_private: {
8701 SDLoc SL(Op);
8702 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
8703 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
8704 SDValue Aperture = getSegmentAperture(AS, DL: SL, DAG);
8705 SDValue SrcVec = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32,
8706 Operand: Op.getOperand(i: 1));
8707
8708 SDValue SrcHi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: SrcVec,
8709 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));
8710 return DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: SrcHi, RHS: Aperture, Cond: ISD::SETEQ);
8711 }
8712 case Intrinsic::amdgcn_perm:
8713 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: Op.getOperand(i: 1),
8714 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
8715 case Intrinsic::amdgcn_reloc_constant: {
8716 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8717 const MDNode *Metadata = cast<MDNodeSDNode>(Val: Op.getOperand(i: 1))->getMD();
8718 auto SymbolName = cast<MDString>(Val: Metadata->getOperand(I: 0))->getString();
8719 auto RelocSymbol = cast<GlobalVariable>(
8720 Val: M->getOrInsertGlobal(Name: SymbolName, Ty: Type::getInt32Ty(C&: M->getContext())));
8721 SDValue GA = DAG.getTargetGlobalAddress(GV: RelocSymbol, DL, VT: MVT::i32, offset: 0,
8722 TargetFlags: SIInstrInfo::MO_ABS32_LO);
8723 return {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: GA), 0};
8724 }
8725 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8726 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8727 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8728 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8729 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8730 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8731 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8732 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8733 if (Op.getOperand(i: 4).getValueType() == MVT::i32)
8734 return SDValue();
8735
8736 SDLoc SL(Op);
8737 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 4), DL: SL, VT: MVT::i32);
8738 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
8739 N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1), N3: Op.getOperand(i: 2),
8740 N4: Op.getOperand(i: 3), N5: IndexKeyi32);
8741 }
8742 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8743 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8744 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8745 if (Op.getOperand(i: 6).getValueType() == MVT::i32)
8746 return SDValue();
8747
8748 SDLoc SL(Op);
8749 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 6), DL: SL, VT: MVT::i32);
8750 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
8751 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1), Op.getOperand(i: 2),
8752 Op.getOperand(i: 3), Op.getOperand(i: 4), Op.getOperand(i: 5),
8753 IndexKeyi32, Op.getOperand(i: 7)});
8754 }
8755 case Intrinsic::amdgcn_addrspacecast_nonnull:
8756 return lowerADDRSPACECAST(Op, DAG);
8757 case Intrinsic::amdgcn_readlane:
8758 case Intrinsic::amdgcn_readfirstlane:
8759 case Intrinsic::amdgcn_writelane:
8760 case Intrinsic::amdgcn_permlane16:
8761 case Intrinsic::amdgcn_permlanex16:
8762 case Intrinsic::amdgcn_permlane64:
8763 return lowerLaneOp(TLI: *this, N: Op.getNode(), DAG);
8764 default:
8765 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8766 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID))
8767 return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: false);
8768
8769 return Op;
8770 }
8771}
8772
8773// On targets that do not support a constant in the soffset field, turn a
8774// zero soffset into SGPR_NULL to avoid generating an extra s_mov of zero.
8775static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
8776 const GCNSubtarget *Subtarget) {
8777 if (Subtarget->hasRestrictedSOffset() && isNullConstant(V: SOffset))
8778 return DAG.getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32);
8779 return SOffset;
8780}
8781
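// Lower a raw (unindexed) buffer atomic intrinsic to the corresponding
// AMDGPUISD::BUFFER_ATOMIC_* memory node: vindex is forced to 0, idxen to 0,
// and the single offset operand is split into voffset/soffset/immediate parts.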
8782SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8783 SelectionDAG &DAG,
8784 unsigned NewOpcode) const {
8785 SDLoc DL(Op);
8786
8787 SDValue VData = Op.getOperand(i: 2);
8788 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
8789 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
8790 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
8791 SDValue Ops[] = {
8792 Op.getOperand(i: 0), // Chain
8793 VData, // vdata
8794 Rsrc, // rsrc
8795 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
8796 Offsets.first, // voffset
8797 SOffset, // soffset
8798 Offsets.second, // offset
8799 Op.getOperand(i: 6), // cachepolicy
8800 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
8801 };
8802
8803 auto *M = cast<MemSDNode>(Val&: Op);
8804
8805 EVT MemVT = VData.getValueType();
8806 return DAG.getMemIntrinsicNode(Opcode: NewOpcode, dl: DL, VTList: Op->getVTList(), Ops, MemVT,
8807 MMO: M->getMemOperand());
8808}
8809
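// Struct-buffer counterpart of the raw atomic lowering above: the intrinsic
// carries an explicit vindex operand, which is passed through with idxen set
// to 1.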
8810SDValue
8811SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8812 unsigned NewOpcode) const {
8813 SDLoc DL(Op);
8814
8815 SDValue VData = Op.getOperand(i: 2);
8816 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
8817 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
8818 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
8819 SDValue Ops[] = {
8820 Op.getOperand(i: 0), // Chain
8821 VData, // vdata
8822 Rsrc, // rsrc
8823 Op.getOperand(i: 4), // vindex
8824 Offsets.first, // voffset
8825 SOffset, // soffset
8826 Offsets.second, // offset
8827 Op.getOperand(i: 7), // cachepolicy
8828 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
8829 };
8830
8831 auto *M = cast<MemSDNode>(Val&: Op);
8832
8833 EVT MemVT = VData.getValueType();
8834 return DAG.getMemIntrinsicNode(Opcode: NewOpcode, dl: DL, VTList: Op->getVTList(), Ops, MemVT,
8835 MMO: M->getMemOperand());
8836}
8837
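// Lower intrinsics that read memory and therefore carry a chain. Most cases
// rebuild the operand list into the canonical buffer/tbuffer form and emit a
// target memory node that keeps the original MachineMemOperand.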
8838SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
8839 SelectionDAG &DAG) const {
8840 unsigned IntrID = Op.getConstantOperandVal(i: 1);
8841 SDLoc DL(Op);
8842
8843 switch (IntrID) {
8844 case Intrinsic::amdgcn_ds_ordered_add:
8845 case Intrinsic::amdgcn_ds_ordered_swap: {
8846 MemSDNode *M = cast<MemSDNode>(Val&: Op);
8847 SDValue Chain = M->getOperand(Num: 0);
8848 SDValue M0 = M->getOperand(Num: 2);
8849 SDValue Value = M->getOperand(Num: 3);
8850 unsigned IndexOperand = M->getConstantOperandVal(Num: 7);
8851 unsigned WaveRelease = M->getConstantOperandVal(Num: 8);
8852 unsigned WaveDone = M->getConstantOperandVal(Num: 9);
8853
8854 unsigned OrderedCountIndex = IndexOperand & 0x3f;
8855 IndexOperand &= ~0x3f;
8856 unsigned CountDw = 0;
8857
8858 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
8859 CountDw = (IndexOperand >> 24) & 0xf;
8860 IndexOperand &= ~(0xf << 24);
8861
8862 if (CountDw < 1 || CountDw > 4) {
8863 report_fatal_error(
8864 reason: "ds_ordered_count: dword count must be between 1 and 4");
8865 }
8866 }
8867
8868 if (IndexOperand)
8869 report_fatal_error(reason: "ds_ordered_count: bad index operand");
8870
8871 if (WaveDone && !WaveRelease)
8872 report_fatal_error(reason: "ds_ordered_count: wave_done requires wave_release");
8873
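    // Pack the ds_ordered_count immediate: Offset0 holds the ordered-count
    // index scaled by 4, and Offset1 packs wave_release (bit 0), wave_done
    // (bit 1), the shader type (bits 2-3, pre-GFX11 only), the add/swap
    // selector (bit 4) and, on GFX10+, the dword count minus 1 (bits 6-7).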
8874 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
8875 unsigned ShaderType =
8876 SIInstrInfo::getDSShaderTypeValue(MF: DAG.getMachineFunction());
8877 unsigned Offset0 = OrderedCountIndex << 2;
8878 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
8879
8880 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
8881 Offset1 |= (CountDw - 1) << 6;
8882
8883 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
8884 Offset1 |= ShaderType << 2;
8885
8886 unsigned Offset = Offset0 | (Offset1 << 8);
8887
8888 SDValue Ops[] = {
8889 Chain,
8890 Value,
8891 DAG.getTargetConstant(Val: Offset, DL, VT: MVT::i16),
8892 copyToM0(DAG, Chain, DL, V: M0).getValue(R: 1), // Glue
8893 };
8894 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::DS_ORDERED_COUNT, dl: DL,
8895 VTList: M->getVTList(), Ops, MemVT: M->getMemoryVT(),
8896 MMO: M->getMemOperand());
8897 }
8898 case Intrinsic::amdgcn_raw_buffer_load:
8899 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8900 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8901 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8902 case Intrinsic::amdgcn_raw_buffer_load_format:
8903 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
8904 const bool IsFormat =
8905 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
8906 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
8907
8908 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
8909 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 3), DAG);
8910 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 4), DAG, Subtarget);
8911 SDValue Ops[] = {
8912 Op.getOperand(i: 0), // Chain
8913 Rsrc, // rsrc
8914 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
8915 Offsets.first, // voffset
8916 SOffset, // soffset
8917 Offsets.second, // offset
8918 Op.getOperand(i: 5), // cachepolicy, swizzled buffer
8919 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
8920 };
8921
8922 auto *M = cast<MemSDNode>(Val&: Op);
8923 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
8924 }
8925 case Intrinsic::amdgcn_struct_buffer_load:
8926 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8927 case Intrinsic::amdgcn_struct_buffer_load_format:
8928 case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
8929 const bool IsFormat =
8930 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
8931 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
8932
8933 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
8934 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
8935 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
8936 SDValue Ops[] = {
8937 Op.getOperand(i: 0), // Chain
8938 Rsrc, // rsrc
8939 Op.getOperand(i: 3), // vindex
8940 Offsets.first, // voffset
8941 SOffset, // soffset
8942 Offsets.second, // offset
8943 Op.getOperand(i: 6), // cachepolicy, swizzled buffer
8944 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
8945 };
8946
8947 return lowerIntrinsicLoad(M: cast<MemSDNode>(Val&: Op), IsFormat, DAG, Ops);
8948 }
8949 case Intrinsic::amdgcn_raw_tbuffer_load:
8950 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
8951 MemSDNode *M = cast<MemSDNode>(Val&: Op);
8952 EVT LoadVT = Op.getValueType();
8953 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
8954 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 3), DAG);
8955 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 4), DAG, Subtarget);
8956
8957 SDValue Ops[] = {
8958 Op.getOperand(i: 0), // Chain
8959 Rsrc, // rsrc
8960 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
8961 Offsets.first, // voffset
8962 SOffset, // soffset
8963 Offsets.second, // offset
8964 Op.getOperand(i: 5), // format
8965 Op.getOperand(i: 6), // cachepolicy, swizzled buffer
8966 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
8967 };
8968
8969 if (LoadVT.getScalarType() == MVT::f16)
8970 return adjustLoadValueType(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8971 M, DAG, Ops);
8972 return getMemIntrinsicNode(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8973 VTList: Op->getVTList(), Ops, MemVT: LoadVT, MMO: M->getMemOperand(),
8974 DAG);
8975 }
8976 case Intrinsic::amdgcn_struct_tbuffer_load:
8977 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
8978 MemSDNode *M = cast<MemSDNode>(Val&: Op);
8979 EVT LoadVT = Op.getValueType();
8980 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
8981 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
8982 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
8983
8984 SDValue Ops[] = {
8985 Op.getOperand(i: 0), // Chain
8986 Rsrc, // rsrc
8987 Op.getOperand(i: 3), // vindex
8988 Offsets.first, // voffset
8989 SOffset, // soffset
8990 Offsets.second, // offset
8991 Op.getOperand(i: 6), // format
8992 Op.getOperand(i: 7), // cachepolicy, swizzled buffer
8993 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
8994 };
8995
8996 if (LoadVT.getScalarType() == MVT::f16)
8997 return adjustLoadValueType(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8998 M, DAG, Ops);
8999 return getMemIntrinsicNode(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9000 VTList: Op->getVTList(), Ops, MemVT: LoadVT, MMO: M->getMemOperand(),
9001 DAG);
9002 }
9003 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9004 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9005 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD);
9006 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9007 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9008 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD);
9009 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9010 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9011 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN);
9012 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9013 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9014 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN);
9015 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9016 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9017 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX);
9018 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9019 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9020 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX);
9021 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9022 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9023 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP);
9024 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9025 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9026 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD);
9027 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9028 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9029 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB);
9030 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9031 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9032 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN);
9033 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9034 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9035 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN);
9036 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9037 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9038 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX);
9039 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9040 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9041 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX);
9042 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9043 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9044 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND);
9045 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9046 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9047 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR);
9048 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9049 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9050 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR);
9051 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9052 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9053 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC);
9054 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9055 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9056 return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC);
9057 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9058 return lowerRawBufferAtomicIntrin(Op, DAG,
9059 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9060 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9061 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9062 return lowerStructBufferAtomicIntrin(Op, DAG,
9063 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP);
9064 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9065 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9066 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD);
9067 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9068 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9069 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB);
9070 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9071 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9072 return lowerStructBufferAtomicIntrin(Op, DAG,
9073 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN);
9074 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9075 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9076 return lowerStructBufferAtomicIntrin(Op, DAG,
9077 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN);
9078 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9079 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9080 return lowerStructBufferAtomicIntrin(Op, DAG,
9081 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX);
9082 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9083 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9084 return lowerStructBufferAtomicIntrin(Op, DAG,
9085 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX);
9086 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9087 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9088 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND);
9089 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9090 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9091 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR);
9092 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9093 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9094 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR);
9095 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9096 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9097 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC);
9098 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9099 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9100 return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC);
9101 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9102 return lowerStructBufferAtomicIntrin(Op, DAG,
9103 NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9104
9105 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9106 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9107 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 4), DAG);
9108 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
9109 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
9110 SDValue Ops[] = {
9111 Op.getOperand(i: 0), // Chain
9112 Op.getOperand(i: 2), // src
9113 Op.getOperand(i: 3), // cmp
9114 Rsrc, // rsrc
9115 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
9116 Offsets.first, // voffset
9117 SOffset, // soffset
9118 Offsets.second, // offset
9119 Op.getOperand(i: 7), // cachepolicy
9120 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
9121 };
9122 EVT VT = Op.getValueType();
9123 auto *M = cast<MemSDNode>(Val&: Op);
9124
9125 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, dl: DL,
9126 VTList: Op->getVTList(), Ops, MemVT: VT, MMO: M->getMemOperand());
9127 }
9128 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9129 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9130 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op->getOperand(Num: 4), DAG);
9131 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 6), DAG);
9132 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 7), DAG, Subtarget);
9133 SDValue Ops[] = {
9134 Op.getOperand(i: 0), // Chain
9135 Op.getOperand(i: 2), // src
9136 Op.getOperand(i: 3), // cmp
9137 Rsrc, // rsrc
9138 Op.getOperand(i: 5), // vindex
9139 Offsets.first, // voffset
9140 SOffset, // soffset
9141 Offsets.second, // offset
9142 Op.getOperand(i: 8), // cachepolicy
9143 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
9144 };
9145 EVT VT = Op.getValueType();
9146 auto *M = cast<MemSDNode>(Val&: Op);
9147
9148 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, dl: DL,
9149 VTList: Op->getVTList(), Ops, MemVT: VT, MMO: M->getMemOperand());
9150 }
9151 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9152 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9153 SDValue NodePtr = M->getOperand(Num: 2);
9154 SDValue RayExtent = M->getOperand(Num: 3);
9155 SDValue RayOrigin = M->getOperand(Num: 4);
9156 SDValue RayDir = M->getOperand(Num: 5);
9157 SDValue RayInvDir = M->getOperand(Num: 6);
9158 SDValue TDescr = M->getOperand(Num: 7);
9159
9160 assert(NodePtr.getValueType() == MVT::i32 ||
9161 NodePtr.getValueType() == MVT::i64);
9162 assert(RayDir.getValueType() == MVT::v3f16 ||
9163 RayDir.getValueType() == MVT::v3f32);
9164
9165 if (!Subtarget->hasGFX10_AEncoding()) {
9166 emitRemovedIntrinsicError(DAG, DL, VT: Op.getValueType());
9167 return SDValue();
9168 }
9169
9170 const bool IsGFX11 = AMDGPU::isGFX11(STI: *Subtarget);
9171 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: *Subtarget);
9172 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
9173 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9174 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9175 const unsigned NumVDataDwords = 4;
9176 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9177 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9178 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9179 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9180 IsGFX12Plus;
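    // With the NSA (non-sequential address) encoding, each address component
    // may live in its own register; without it, all components must be packed
    // into one contiguous register tuple (see the !UseNSA merge below).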
9181 const unsigned BaseOpcodes[2][2] = {
9182 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9183 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9184 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9185 int Opcode;
9186 if (UseNSA) {
9187 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
9188 MIMGEncoding: IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9189 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9190 : AMDGPU::MIMGEncGfx10NSA,
9191 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9192 } else {
9193 assert(!IsGFX12Plus);
9194 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
9195 MIMGEncoding: IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9196 : AMDGPU::MIMGEncGfx10Default,
9197 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9198 }
9199 assert(Opcode != -1);
9200
9201 SmallVector<SDValue, 16> Ops;
9202
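    // packLanes appends one 3-component ray operand: f32 lanes are pushed as
    // three separate dwords, while f16 lanes are packed pairwise into v2f16
    // dwords. In the unaligned case, the previously pushed dword is popped and
    // refilled so the new operand's first lane shares it, keeping the packed
    // layout contiguous.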
9203 auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
9204 SmallVector<SDValue, 3> Lanes;
9205 DAG.ExtractVectorElements(Op, Args&: Lanes, Start: 0, Count: 3);
9206 if (Lanes[0].getValueSizeInBits() == 32) {
9207 for (unsigned I = 0; I < 3; ++I)
9208 Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: Lanes[I]));
9209 } else {
9210 if (IsAligned) {
9211 Ops.push_back(
9212 Elt: DAG.getBitcast(VT: MVT::i32,
9213 V: DAG.getBuildVector(VT: MVT::v2f16, DL,
9214 Ops: { Lanes[0], Lanes[1] })));
9215 Ops.push_back(Elt: Lanes[2]);
9216 } else {
9217 SDValue Elt0 = Ops.pop_back_val();
9218 Ops.push_back(
9219 Elt: DAG.getBitcast(VT: MVT::i32,
9220 V: DAG.getBuildVector(VT: MVT::v2f16, DL,
9221 Ops: { Elt0, Lanes[0] })));
9222 Ops.push_back(
9223 Elt: DAG.getBitcast(VT: MVT::i32,
9224 V: DAG.getBuildVector(VT: MVT::v2f16, DL,
9225 Ops: { Lanes[1], Lanes[2] })));
9226 }
9227 }
9228 };
9229
9230 if (UseNSA && IsGFX11Plus) {
9231 Ops.push_back(Elt: NodePtr);
9232 Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: RayExtent));
9233 Ops.push_back(Elt: RayOrigin);
9234 if (IsA16) {
9235 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9236 DAG.ExtractVectorElements(Op: RayDir, Args&: DirLanes, Start: 0, Count: 3);
9237 DAG.ExtractVectorElements(Op: RayInvDir, Args&: InvDirLanes, Start: 0, Count: 3);
9238 for (unsigned I = 0; I < 3; ++I) {
9239 MergedLanes.push_back(Elt: DAG.getBitcast(
9240 VT: MVT::i32, V: DAG.getBuildVector(VT: MVT::v2f16, DL,
9241 Ops: {DirLanes[I], InvDirLanes[I]})));
9242 }
9243 Ops.push_back(Elt: DAG.getBuildVector(VT: MVT::v3i32, DL, Ops: MergedLanes));
9244 } else {
9245 Ops.push_back(Elt: RayDir);
9246 Ops.push_back(Elt: RayInvDir);
9247 }
9248 } else {
9249 if (Is64)
9250 DAG.ExtractVectorElements(Op: DAG.getBitcast(VT: MVT::v2i32, V: NodePtr), Args&: Ops, Start: 0,
9251 Count: 2);
9252 else
9253 Ops.push_back(Elt: NodePtr);
9254
9255 Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: RayExtent));
9256 packLanes(RayOrigin, true);
9257 packLanes(RayDir, true);
9258 packLanes(RayInvDir, false);
9259 }
9260
9261 if (!UseNSA) {
9262 // Build a single vector containing all the operands prepared so far.
9263 if (NumVAddrDwords > 12) {
9264 SDValue Undef = DAG.getUNDEF(VT: MVT::i32);
9265 Ops.append(NumInputs: 16 - Ops.size(), Elt: Undef);
9266 }
9267 assert(Ops.size() >= 8 && Ops.size() <= 12);
9268 SDValue MergedOps = DAG.getBuildVector(
9269 VT: MVT::getVectorVT(VT: MVT::i32, NumElements: Ops.size()), DL, Ops);
9270 Ops.clear();
9271 Ops.push_back(Elt: MergedOps);
9272 }
9273
9274 Ops.push_back(Elt: TDescr);
9275 Ops.push_back(Elt: DAG.getTargetConstant(Val: IsA16, DL, VT: MVT::i1));
9276 Ops.push_back(Elt: M->getChain());
9277
9278 auto *NewNode = DAG.getMachineNode(Opcode, dl: DL, VTs: M->getVTList(), Ops);
9279 MachineMemOperand *MemRef = M->getMemOperand();
9280 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
9281 return SDValue(NewNode, 0);
9282 }
9283 case Intrinsic::amdgcn_global_atomic_fmin:
9284 case Intrinsic::amdgcn_global_atomic_fmax:
9285 case Intrinsic::amdgcn_global_atomic_fmin_num:
9286 case Intrinsic::amdgcn_global_atomic_fmax_num:
9287 case Intrinsic::amdgcn_flat_atomic_fmin:
9288 case Intrinsic::amdgcn_flat_atomic_fmax:
9289 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9290 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9291 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9292 SDValue Ops[] = {
9293 M->getOperand(Num: 0), // Chain
9294 M->getOperand(Num: 2), // Ptr
9295 M->getOperand(Num: 3) // Value
9296 };
9297 unsigned Opcode = 0;
9298 switch (IntrID) {
9299 case Intrinsic::amdgcn_global_atomic_fmin:
9300 case Intrinsic::amdgcn_global_atomic_fmin_num:
9301 case Intrinsic::amdgcn_flat_atomic_fmin:
9302 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9303 Opcode = ISD::ATOMIC_LOAD_FMIN;
9304 break;
9305 }
9306 case Intrinsic::amdgcn_global_atomic_fmax:
9307 case Intrinsic::amdgcn_global_atomic_fmax_num:
9308 case Intrinsic::amdgcn_flat_atomic_fmax:
9309 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9310 Opcode = ISD::ATOMIC_LOAD_FMAX;
9311 break;
9312 }
9313 default:
9314 llvm_unreachable("unhandled atomic opcode");
9315 }
9316 return DAG.getAtomic(Opcode, dl: SDLoc(Op), MemVT: M->getMemoryVT(), VTList: M->getVTList(),
9317 Ops, MMO: M->getMemOperand());
9318 }
9319 case Intrinsic::amdgcn_s_get_barrier_state: {
9320 SDValue Chain = Op->getOperand(Num: 0);
9321 SmallVector<SDValue, 2> Ops;
9322 unsigned Opc;
9323 bool IsInlinableBarID = false;
9324 int64_t BarID;
9325
9326 if (isa<ConstantSDNode>(Val: Op->getOperand(Num: 2))) {
9327 BarID = cast<ConstantSDNode>(Val: Op->getOperand(Num: 2))->getSExtValue();
9328 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(Literal: BarID);
9329 }
9330
9331 if (IsInlinableBarID) {
9332 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9333 SDValue K = DAG.getTargetConstant(Val: BarID, DL, VT: MVT::i32);
9334 Ops.push_back(Elt: K);
9335 } else {
9336 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9337 SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 2));
9338 Ops.push_back(Elt: M0Val.getValue(R: 0));
9339 }
9340
9341 auto NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
9342 return SDValue(NewMI, 0);
9343 }
9344 default:
9345
9346 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9347 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID))
9348 return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true);
9349
9350 return SDValue();
9351 }
9352}
9353
9354// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9355// dwordx4 if on SI and handle TFE loads.
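// For TFE, the value is loaded as NumValueDWords + 1 i32s: the extra dword is
// split out as the status result and the remaining dwords are bitcast back to
// the requested type before being merged with the chain.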
9356SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9357 SDVTList VTList,
9358 ArrayRef<SDValue> Ops, EVT MemVT,
9359 MachineMemOperand *MMO,
9360 SelectionDAG &DAG) const {
9361 LLVMContext &C = *DAG.getContext();
9362 MachineFunction &MF = DAG.getMachineFunction();
9363 EVT VT = VTList.VTs[0];
9364
9365 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9366 bool IsTFE = VTList.NumVTs == 3;
9367 if (IsTFE) {
9368 unsigned NumValueDWords = divideCeil(Numerator: VT.getSizeInBits(), Denominator: 32);
9369 unsigned NumOpDWords = NumValueDWords + 1;
9370 EVT OpDWordsVT = EVT::getVectorVT(Context&: C, VT: MVT::i32, NumElements: NumOpDWords);
9371 SDVTList OpDWordsVTList = DAG.getVTList(VT1: OpDWordsVT, VT2: VTList.VTs[2]);
9372 MachineMemOperand *OpDWordsMMO =
9373 MF.getMachineMemOperand(MMO, Offset: 0, Size: NumOpDWords * 4);
9374 SDValue Op = getMemIntrinsicNode(Opcode, DL, VTList: OpDWordsVTList, Ops,
9375 MemVT: OpDWordsVT, MMO: OpDWordsMMO, DAG);
9376 SDValue Status = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
9377 N2: DAG.getVectorIdxConstant(Val: NumValueDWords, DL));
9378 SDValue ZeroIdx = DAG.getVectorIdxConstant(Val: 0, DL);
9379 SDValue ValueDWords =
9380 NumValueDWords == 1
9381 ? DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op, N2: ZeroIdx)
9382 : DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL,
9383 VT: EVT::getVectorVT(Context&: C, VT: MVT::i32, NumElements: NumValueDWords), N1: Op,
9384 N2: ZeroIdx);
9385 SDValue Value = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: ValueDWords);
9386 return DAG.getMergeValues(Ops: {Value, Status, SDValue(Op.getNode(), 1)}, dl: DL);
9387 }
9388
9389 if (!Subtarget->hasDwordx3LoadStores() &&
9390 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9391 EVT WidenedVT = EVT::getVectorVT(Context&: C, VT: VT.getVectorElementType(), NumElements: 4);
9392 EVT WidenedMemVT = EVT::getVectorVT(Context&: C, VT: MemVT.getVectorElementType(), NumElements: 4);
9393 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, Offset: 0, Size: 16);
9394 SDVTList WidenedVTList = DAG.getVTList(VT1: WidenedVT, VT2: VTList.VTs[1]);
9395 SDValue Op = DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList: WidenedVTList, Ops,
9396 MemVT: WidenedMemVT, MMO: WidenedMMO);
9397 SDValue Value = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Op,
9398 N2: DAG.getVectorIdxConstant(Val: 0, DL));
9399 return DAG.getMergeValues(Ops: {Value, SDValue(Op.getNode(), 1)}, dl: DL);
9400 }
9401
9402 return DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList, Ops, MemVT, MMO);
9403}
9404
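// Prepare a D16 store payload. On subtargets with unpacked D16 VMEM, each
// 16-bit element is zero-extended into its own dword (e.g. a v4f16 payload
// becomes four i32 values); the gfx8.1 image-store workaround repacks the
// elements into whole i32 dwords; 3-element vectors are widened to 4; and
// already-legal packed types are returned unchanged.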
9405SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9406 bool ImageStore) const {
9407 EVT StoreVT = VData.getValueType();
9408
9409 // No change for f16 and legal vector D16 types.
9410 if (!StoreVT.isVector())
9411 return VData;
9412
9413 SDLoc DL(VData);
9414 unsigned NumElements = StoreVT.getVectorNumElements();
9415
9416 if (Subtarget->hasUnpackedD16VMem()) {
9417 // We need to unpack the packed data to store.
9418 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9419 SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
9420
9421 EVT EquivStoreVT =
9422 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements);
9423 SDValue ZExt = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: EquivStoreVT, Operand: IntVData);
9424 return DAG.UnrollVectorOp(N: ZExt.getNode());
9425 }
9426
9427 // The sq block of gfx8.1 does not estimate register use correctly for d16
9428 // image store instructions. The data operand is computed as if it were not a
9429 // d16 image instruction.
9430 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9431 // Bitcast to i16
9432 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9433 SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
9434
9435 // Decompose into scalars
9436 SmallVector<SDValue, 4> Elts;
9437 DAG.ExtractVectorElements(Op: IntVData, Args&: Elts);
9438
9439 // Group pairs of i16 into v2i16 and bitcast to i32
9440 SmallVector<SDValue, 4> PackedElts;
9441 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9442 SDValue Pair =
9443 DAG.getBuildVector(VT: MVT::v2i16, DL, Ops: {Elts[I * 2], Elts[I * 2 + 1]});
9444 SDValue IntPair = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Pair);
9445 PackedElts.push_back(Elt: IntPair);
9446 }
9447 if ((NumElements % 2) == 1) {
9448 // Handle v3i16
9449 unsigned I = Elts.size() / 2;
9450 SDValue Pair = DAG.getBuildVector(VT: MVT::v2i16, DL,
9451 Ops: {Elts[I * 2], DAG.getUNDEF(VT: MVT::i16)});
9452 SDValue IntPair = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Pair);
9453 PackedElts.push_back(Elt: IntPair);
9454 }
9455
9456 // Pad using UNDEF
9457 PackedElts.resize(N: Elts.size(), NV: DAG.getUNDEF(VT: MVT::i32));
9458
9459 // Build final vector
9460 EVT VecVT =
9461 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: PackedElts.size());
9462 return DAG.getBuildVector(VT: VecVT, DL, Ops: PackedElts);
9463 }
9464
9465 if (NumElements == 3) {
9466 EVT IntStoreVT =
9467 EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: StoreVT.getStoreSizeInBits());
9468 SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);
9469
9470 EVT WidenedStoreVT = EVT::getVectorVT(
9471 Context&: *DAG.getContext(), VT: StoreVT.getVectorElementType(), NumElements: NumElements + 1);
9472 EVT WidenedIntVT = EVT::getIntegerVT(Context&: *DAG.getContext(),
9473 BitWidth: WidenedStoreVT.getStoreSizeInBits());
9474 SDValue ZExt = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: WidenedIntVT, Operand: IntVData);
9475 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: WidenedStoreVT, Operand: ZExt);
9476 }
9477
9478 assert(isTypeLegal(StoreVT));
9479 return VData;
9480}
9481
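// Lower intrinsics that have side effects but no return value, such as
// exports, barriers, and buffer/tbuffer stores.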
9482SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9483 SelectionDAG &DAG) const {
9484 SDLoc DL(Op);
9485 SDValue Chain = Op.getOperand(i: 0);
9486 unsigned IntrinsicID = Op.getConstantOperandVal(i: 1);
9487 MachineFunction &MF = DAG.getMachineFunction();
9488
9489 switch (IntrinsicID) {
9490 case Intrinsic::amdgcn_exp_compr: {
9491 if (!Subtarget->hasCompressedExport()) {
9492 DiagnosticInfoUnsupported BadIntrin(
9493 DAG.getMachineFunction().getFunction(),
9494 "intrinsic not supported on subtarget", DL.getDebugLoc());
9495 DAG.getContext()->diagnose(DI: BadIntrin);
9496 }
9497 SDValue Src0 = Op.getOperand(i: 4);
9498 SDValue Src1 = Op.getOperand(i: 5);
9499 // Hack around illegal type on SI by directly selecting it.
9500 if (isTypeLegal(VT: Src0.getValueType()))
9501 return SDValue();
9502
9503 const ConstantSDNode *Done = cast<ConstantSDNode>(Val: Op.getOperand(i: 6));
9504 SDValue Undef = DAG.getUNDEF(VT: MVT::f32);
9505 const SDValue Ops[] = {
9506 Op.getOperand(i: 2), // tgt
9507 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: Src0), // src0
9508 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: Src1), // src1
9509 Undef, // src2
9510 Undef, // src3
9511 Op.getOperand(i: 7), // vm
9512 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // compr
9513 Op.getOperand(i: 3), // en
9514 Op.getOperand(i: 0) // Chain
9515 };
9516
9517 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9518 return SDValue(DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops), 0);
9519 }
9520 case Intrinsic::amdgcn_s_barrier: {
9521 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
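    // When optimizing, if the largest possible workgroup fits in a single
    // wave there is no cross-wave synchronization to do; keep only a
    // WAVE_BARRIER pseudo (which emits no instruction) so code motion is
    // still restricted, instead of a real s_barrier.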
9522 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9523 unsigned WGSize = ST.getFlatWorkGroupSizes(F: MF.getFunction()).second;
9524 if (WGSize <= ST.getWavefrontSize())
9525 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::WAVE_BARRIER, dl: DL, VT: MVT::Other,
9526 Op1: Op.getOperand(i: 0)), 0);
9527 }
9528
9529 // On GFX12, lower s_barrier into s_barrier_signal_imm and s_barrier_wait.
9530 if (ST.hasSplitBarriers()) {
9531 SDValue K =
9532 DAG.getTargetConstant(Val: AMDGPU::Barrier::WORKGROUP, DL, VT: MVT::i32);
9533 SDValue BarSignal =
9534 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_BARRIER_SIGNAL_IMM, dl: DL,
9535 VT: MVT::Other, Op1: K, Op2: Op.getOperand(i: 0)),
9536 0);
9537 SDValue BarWait =
9538 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_BARRIER_WAIT, dl: DL, VT: MVT::Other, Op1: K,
9539 Op2: BarSignal.getValue(R: 0)),
9540 0);
9541 return BarWait;
9542 }
9543
9544 return SDValue();
  }
9546
9547 case Intrinsic::amdgcn_struct_tbuffer_store:
9548 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9549 SDValue VData = Op.getOperand(i: 2);
9550 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9551 if (IsD16)
9552 VData = handleD16VData(VData, DAG);
9553 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
9554 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
9555 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
9556 SDValue Ops[] = {
9557 Chain,
9558 VData, // vdata
9559 Rsrc, // rsrc
9560 Op.getOperand(i: 4), // vindex
9561 Offsets.first, // voffset
9562 SOffset, // soffset
9563 Offsets.second, // offset
9564 Op.getOperand(i: 7), // format
9565 Op.getOperand(i: 8), // cachepolicy, swizzled buffer
9566 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
9567 };
9568 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9569 AMDGPUISD::TBUFFER_STORE_FORMAT;
9570 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9571 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
9572 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
9573 }
9574
9575 case Intrinsic::amdgcn_raw_tbuffer_store:
9576 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9577 SDValue VData = Op.getOperand(i: 2);
9578 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9579 if (IsD16)
9580 VData = handleD16VData(VData, DAG);
9581 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
9582 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
9583 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
9584 SDValue Ops[] = {
9585 Chain,
9586 VData, // vdata
9587 Rsrc, // rsrc
9588 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
9589 Offsets.first, // voffset
9590 SOffset, // soffset
9591 Offsets.second, // offset
9592 Op.getOperand(i: 6), // format
9593 Op.getOperand(i: 7), // cachepolicy, swizzled buffer
9594 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
9595 };
9596 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9597 AMDGPUISD::TBUFFER_STORE_FORMAT;
9598 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9599 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
9600 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
9601 }
9602
9603 case Intrinsic::amdgcn_raw_buffer_store:
9604 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9605 case Intrinsic::amdgcn_raw_buffer_store_format:
9606 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9607 const bool IsFormat =
9608 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9609 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9610
9611 SDValue VData = Op.getOperand(i: 2);
9612 EVT VDataVT = VData.getValueType();
9613 EVT EltType = VDataVT.getScalarType();
9614 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9615 if (IsD16) {
9616 VData = handleD16VData(VData, DAG);
9617 VDataVT = VData.getValueType();
9618 }
9619
9620 if (!isTypeLegal(VT: VDataVT)) {
9621 VData =
9622 DAG.getNode(Opcode: ISD::BITCAST, DL,
9623 VT: getEquivalentMemType(Context&: *DAG.getContext(), VT: VDataVT), Operand: VData);
9624 }
9625
9626 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
9627 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
9628 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
9629 SDValue Ops[] = {
9630 Chain,
9631 VData,
9632 Rsrc,
9633 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
9634 Offsets.first, // voffset
9635 SOffset, // soffset
9636 Offsets.second, // offset
9637 Op.getOperand(i: 6), // cachepolicy, swizzled buffer
9638 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
9639 };
9640 unsigned Opc =
9641 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9642 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9643 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9644
9645 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9646 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9647 return handleByteShortBufferStores(DAG, VDataType: VDataVT, DL, Ops, M);
9648
9649 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
9650 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
9651 }
9652
9653 case Intrinsic::amdgcn_struct_buffer_store:
9654 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9655 case Intrinsic::amdgcn_struct_buffer_store_format:
9656 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9657 const bool IsFormat =
9658 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9659 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9660
9661 SDValue VData = Op.getOperand(i: 2);
9662 EVT VDataVT = VData.getValueType();
9663 EVT EltType = VDataVT.getScalarType();
9664 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9665
9666 if (IsD16) {
9667 VData = handleD16VData(VData, DAG);
9668 VDataVT = VData.getValueType();
9669 }
9670
9671 if (!isTypeLegal(VT: VDataVT)) {
9672 VData =
9673 DAG.getNode(Opcode: ISD::BITCAST, DL,
9674 VT: getEquivalentMemType(Context&: *DAG.getContext(), VT: VDataVT), Operand: VData);
9675 }
9676
9677 auto Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
9678 auto Offsets = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
9679 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
9680 SDValue Ops[] = {
9681 Chain,
9682 VData,
9683 Rsrc,
9684 Op.getOperand(i: 4), // vindex
9685 Offsets.first, // voffset
9686 SOffset, // soffset
9687 Offsets.second, // offset
9688 Op.getOperand(i: 7), // cachepolicy, swizzled buffer
9689 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
9690 };
9691 unsigned Opc =
9692 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
9693 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9694 MemSDNode *M = cast<MemSDNode>(Val&: Op);
9695
9696 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9697 EVT VDataType = VData.getValueType().getScalarType();
9698 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9699 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9700
9701 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
9702 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
9703 }
9704 case Intrinsic::amdgcn_raw_buffer_load_lds:
9705 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9706 case Intrinsic::amdgcn_struct_buffer_load_lds:
9707 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9708 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9709 unsigned Opc;
9710 bool HasVIndex =
9711 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9712 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9713 unsigned OpOffset = HasVIndex ? 1 : 0;
9714 SDValue VOffset = Op.getOperand(i: 5 + OpOffset);
9715 bool HasVOffset = !isNullConstant(V: VOffset);
9716 unsigned Size = Op->getConstantOperandVal(Num: 4);
9717
9718 switch (Size) {
9719 default:
9720 return SDValue();
9721 case 1:
9722 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9723 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9724 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9725 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9726 break;
9727 case 2:
9728 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9729 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9730 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9731 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9732 break;
9733 case 4:
9734 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9735 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9736 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9737 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9738 break;
9739 }
9740
9741 SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 3));
9742
9743 SmallVector<SDValue, 8> Ops;
9744
9745 if (HasVIndex && HasVOffset)
9746 Ops.push_back(Elt: DAG.getBuildVector(VT: MVT::v2i32, DL,
9747 Ops: { Op.getOperand(i: 5), // VIndex
9748 VOffset }));
9749 else if (HasVIndex)
9750 Ops.push_back(Elt: Op.getOperand(i: 5));
9751 else if (HasVOffset)
9752 Ops.push_back(Elt: VOffset);
9753
9754 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
9755 Ops.push_back(Elt: Rsrc);
9756 Ops.push_back(Elt: Op.getOperand(i: 6 + OpOffset)); // soffset
9757 Ops.push_back(Elt: Op.getOperand(i: 7 + OpOffset)); // imm offset
9758 unsigned Aux = Op.getConstantOperandVal(i: 8 + OpOffset);
9759 Ops.push_back(
9760 Elt: DAG.getTargetConstant(Val: Aux & AMDGPU::CPol::ALL, DL, VT: MVT::i8)); // cpol
9761 Ops.push_back(Elt: DAG.getTargetConstant(
9762 Val: Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, VT: MVT::i8)); // swz
9763 Ops.push_back(Elt: M0Val.getValue(R: 0)); // Chain
9764 Ops.push_back(Elt: M0Val.getValue(R: 1)); // Glue
9765
9766 auto *M = cast<MemSDNode>(Val&: Op);
9767 MachineMemOperand *LoadMMO = M->getMemOperand();
9768 // Don't set the offset value here because the pointer points to the base of
9769 // the buffer.
9770 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9771
9772 MachinePointerInfo StorePtrI = LoadPtrI;
9773 LoadPtrI.V = PoisonValue::get(
9774 T: PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
9775 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9776 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9777
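    // The buffer-to-LDS load reads from global memory and writes to LDS, so it
    // gets two memory operands below: one tagged MOLoad for the global side
    // and one tagged MOStore for the LDS side.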
9778 auto F = LoadMMO->getFlags() &
9779 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9780 LoadMMO =
9781 MF.getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad, Size,
9782 BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo());
9783
9784 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9785 PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore, Size: sizeof(int32_t),
9786 BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo());
9787
9788 auto Load = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: M->getVTList(), Ops);
9789 DAG.setNodeMemRefs(N: Load, NewMemRefs: {LoadMMO, StoreMMO});
9790
9791 return SDValue(Load, 0);
9792 }
9793 case Intrinsic::amdgcn_global_load_lds: {
9794 unsigned Opc;
9795 unsigned Size = Op->getConstantOperandVal(Num: 4);
9796 switch (Size) {
9797 default:
9798 return SDValue();
9799 case 1:
9800 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9801 break;
9802 case 2:
9803 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9804 break;
9805 case 4:
9806 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9807 break;
9808 }
9809
9810 auto *M = cast<MemSDNode>(Val&: Op);
9811 SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 3));
9812
9813 SmallVector<SDValue, 6> Ops;
9814
9815 SDValue Addr = Op.getOperand(i: 2); // Global ptr
9816 SDValue VOffset;
9817 // Try to split SAddr and VOffset. Global and LDS pointers share the same
9818 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
9819 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
9820 SDValue LHS = Addr.getOperand(i: 0);
9821 SDValue RHS = Addr.getOperand(i: 1);
9822
9823 if (LHS->isDivergent())
9824 std::swap(a&: LHS, b&: RHS);
9825
9826 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
9827 RHS.getOperand(i: 0).getValueType() == MVT::i32) {
9828 // add (i64 sgpr), (zero_extend (i32 vgpr))
9829 Addr = LHS;
9830 VOffset = RHS.getOperand(i: 0);
9831 }
9832 }
9833
9834 Ops.push_back(Elt: Addr);
9835 if (!Addr->isDivergent()) {
9836 Opc = AMDGPU::getGlobalSaddrOp(Opcode: Opc);
9837 if (!VOffset)
9838 VOffset = SDValue(
9839 DAG.getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32, dl: DL, VT: MVT::i32,
9840 Op1: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)), 0);
9841 Ops.push_back(Elt: VOffset);
9842 }
9843
9844 Ops.push_back(Elt: Op.getOperand(i: 5)); // Offset
9845 Ops.push_back(Elt: Op.getOperand(i: 6)); // CPol
9846 Ops.push_back(Elt: M0Val.getValue(R: 0)); // Chain
9847 Ops.push_back(Elt: M0Val.getValue(R: 1)); // Glue
9848
9849 MachineMemOperand *LoadMMO = M->getMemOperand();
9850 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9851 LoadPtrI.Offset = Op->getConstantOperandVal(Num: 5);
9852 MachinePointerInfo StorePtrI = LoadPtrI;
9853 LoadPtrI.V = PoisonValue::get(
9854 T: PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
9855 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9856 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9857 auto F = LoadMMO->getFlags() &
9858 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9859 LoadMMO =
9860 MF.getMachineMemOperand(PtrInfo: LoadPtrI, F: F | MachineMemOperand::MOLoad, Size,
9861 BaseAlignment: LoadMMO->getBaseAlign(), AAInfo: LoadMMO->getAAInfo());
9862 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9863 PtrInfo: StorePtrI, F: F | MachineMemOperand::MOStore, Size: sizeof(int32_t), BaseAlignment: Align(4),
9864 AAInfo: LoadMMO->getAAInfo());
9865
9866 auto Load = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
9867 DAG.setNodeMemRefs(N: Load, NewMemRefs: {LoadMMO, StoreMMO});
9868
9869 return SDValue(Load, 0);
9870 }
9871 case Intrinsic::amdgcn_end_cf:
9872 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::SI_END_CF, dl: DL, VT: MVT::Other,
9873 Op1: Op->getOperand(Num: 2), Op2: Chain), 0);
9874 case Intrinsic::amdgcn_s_barrier_init:
9875 case Intrinsic::amdgcn_s_barrier_join:
9876 case Intrinsic::amdgcn_s_wakeup_barrier: {
9877 SDValue Chain = Op->getOperand(Num: 0);
9878 SmallVector<SDValue, 2> Ops;
9879 SDValue BarOp = Op->getOperand(Num: 2);
9880 unsigned Opc;
9881 bool IsInlinableBarID = false;
9882 int64_t BarVal;
9883
9884 if (isa<ConstantSDNode>(Val: BarOp)) {
9885 BarVal = cast<ConstantSDNode>(Val&: BarOp)->getSExtValue();
9886 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(Literal: BarVal);
9887 }
9888
9889 if (IsInlinableBarID) {
9890 switch (IntrinsicID) {
9891 default:
9892 return SDValue();
9893 case Intrinsic::amdgcn_s_barrier_init:
9894 Opc = AMDGPU::S_BARRIER_INIT_IMM;
9895 break;
9896 case Intrinsic::amdgcn_s_barrier_join:
9897 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
9898 break;
9899 case Intrinsic::amdgcn_s_wakeup_barrier:
9900 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
9901 break;
9902 }
9903
9904 SDValue K = DAG.getTargetConstant(Val: BarVal, DL, VT: MVT::i32);
9905 Ops.push_back(Elt: K);
9906 } else {
9907 switch (IntrinsicID) {
9908 default:
9909 return SDValue();
9910 case Intrinsic::amdgcn_s_barrier_init:
9911 Opc = AMDGPU::S_BARRIER_INIT_M0;
9912 break;
9913 case Intrinsic::amdgcn_s_barrier_join:
9914 Opc = AMDGPU::S_BARRIER_JOIN_M0;
9915 break;
9916 case Intrinsic::amdgcn_s_wakeup_barrier:
9917 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
9918 break;
9919 }
9920 }
9921
9922 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
9923 SDValue M0Val;
9924 // Member count will be read from M0[16:22]
9925 M0Val = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: Op.getOperand(i: 3),
9926 N2: DAG.getShiftAmountConstant(Val: 16, VT: MVT::i32, DL));
9927
9928 if (!IsInlinableBarID) {
        // If the reference to the barrier ID is not an inline constant, it
        // must be passed in M0[4:0]. OR it with the member count so that both
        // values are encoded in M0.
9932 M0Val = SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_OR_B32, dl: DL, VT: MVT::i32,
9933 Op1: Op.getOperand(i: 2), Op2: M0Val),
9934 0);
9935 }
9936 Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: 0));
9937 } else if (!IsInlinableBarID) {
9938 Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: BarOp).getValue(R: 0));
9939 }
9940
9941 auto NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
9942 return SDValue(NewMI, 0);
9943 }
9944 default: {
9945 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9946 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID))
9947 return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true);
9948
9949 return Op;
9950 }
9951 }
9952}
9953
9954// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
9955// offset (the offset that is included in bounds checking and swizzling, to be
9956// split between the instruction's voffset and immoffset fields) and soffset
9957// (the offset that is excluded from bounds checking and swizzling, to go in
9958// the instruction's soffset field). This function takes the first kind of
9959// offset and figures out how to split it between voffset and immoffset.
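// Illustrative example (assuming a maximum immediate of 4095): a combined
// offset of 4100 is split into an immoffset of 4 and a voffset add of 4096;
// keeping the voffset part a large power of two makes that add more likely to
// be CSEd across neighbouring accesses.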
9960std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
9961 SDValue Offset, SelectionDAG &DAG) const {
9962 SDLoc DL(Offset);
9963 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget);
9964 SDValue N0 = Offset;
9965 ConstantSDNode *C1 = nullptr;
9966
9967 if ((C1 = dyn_cast<ConstantSDNode>(Val&: N0)))
9968 N0 = SDValue();
9969 else if (DAG.isBaseWithConstantOffset(Op: N0)) {
9970 C1 = cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
9971 N0 = N0.getOperand(i: 0);
9972 }
9973
9974 if (C1) {
9975 unsigned ImmOffset = C1->getZExtValue();
9976 // If the immediate value is too big for the immoffset field, put only bits
9977 // that would normally fit in the immoffset field. The remaining value that
9978 // is copied/added for the voffset field is a large power of 2, and it
9979 // stands more chance of being CSEd with the copy/add for another similar
9980 // load/store.
    // However, do not do that rounding down if it yields a negative value, as
    // it appears to be illegal to have a negative offset in the VGPR, even if
    // adding the immediate offset makes it positive.
9984 unsigned Overflow = ImmOffset & ~MaxImm;
9985 ImmOffset -= Overflow;
9986 if ((int32_t)Overflow < 0) {
9987 Overflow += ImmOffset;
9988 ImmOffset = 0;
9989 }
9990 C1 = cast<ConstantSDNode>(Val: DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32));
9991 if (Overflow) {
9992 auto OverflowVal = DAG.getConstant(Val: Overflow, DL, VT: MVT::i32);
9993 if (!N0)
9994 N0 = OverflowVal;
9995 else {
9996 SDValue Ops[] = { N0, OverflowVal };
9997 N0 = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, Ops);
9998 }
9999 }
10000 }
10001 if (!N0)
10002 N0 = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
10003 if (!C1)
10004 C1 = cast<ConstantSDNode>(Val: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
10005 return {N0, SDValue(C1, 0)};
10006}
10007
10008// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10009// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10010// pointed to by Offsets.
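// The three entries are consumed as the voffset, soffset and instoffset
// operands of the resulting buffer instruction; if the combined offset cannot
// be split, the whole value is placed in voffset with a zero (or SGPR_NULL on
// targets with a restricted soffset) soffset and a zero instoffset.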
10011void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10012 SelectionDAG &DAG, SDValue *Offsets,
10013 Align Alignment) const {
10014 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10015 SDLoc DL(CombinedOffset);
10016 if (auto *C = dyn_cast<ConstantSDNode>(Val&: CombinedOffset)) {
10017 uint32_t Imm = C->getZExtValue();
10018 uint32_t SOffset, ImmOffset;
10019 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10020 Offsets[0] = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
10021 Offsets[1] = DAG.getConstant(Val: SOffset, DL, VT: MVT::i32);
10022 Offsets[2] = DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32);
10023 return;
10024 }
10025 }
10026 if (DAG.isBaseWithConstantOffset(Op: CombinedOffset)) {
10027 SDValue N0 = CombinedOffset.getOperand(i: 0);
10028 SDValue N1 = CombinedOffset.getOperand(i: 1);
10029 uint32_t SOffset, ImmOffset;
10030 int Offset = cast<ConstantSDNode>(Val&: N1)->getSExtValue();
10031 if (Offset >= 0 &&
10032 TII->splitMUBUFOffset(Imm: Offset, SOffset, ImmOffset, Alignment)) {
10033 Offsets[0] = N0;
10034 Offsets[1] = DAG.getConstant(Val: SOffset, DL, VT: MVT::i32);
10035 Offsets[2] = DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32);
10036 return;
10037 }
10038 }
10039
10040 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10041 ? DAG.getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32)
10042 : DAG.getConstant(Val: 0, DL, VT: MVT::i32);
10043
10044 Offsets[0] = CombinedOffset;
10045 Offsets[1] = SOffsetZero;
10046 Offsets[2] = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32);
10047}
10048
10049SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10050 SelectionDAG &DAG) const {
10051 if (!MaybePointer.getValueType().isScalarInteger())
10052 return MaybePointer;
10053
10054 SDLoc DL(MaybePointer);
10055
10056 SDValue Rsrc = DAG.getBitcast(VT: MVT::v4i32, V: MaybePointer);
10057 return Rsrc;
10058}
10059
// Wrap a global or flat pointer into a buffer resource (rsrc) value, using the
// stride, number of records, and flags specified in the intrinsic.
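// The resulting v4i32 resource is laid out as: word0 = low 32 bits of the
// pointer, word1 = pointer bits [47:32] with the 16-bit stride in its upper
// half, word2 = NumRecords, word3 = Flags.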
10062SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10063 SelectionDAG &DAG) const {
10064 SDLoc Loc(Op);
10065
10066 SDValue Pointer = Op->getOperand(Num: 1);
10067 SDValue Stride = Op->getOperand(Num: 2);
10068 SDValue NumRecords = Op->getOperand(Num: 3);
10069 SDValue Flags = Op->getOperand(Num: 4);
10070
10071 auto [LowHalf, HighHalf] = DAG.SplitScalar(N: Pointer, DL: Loc, LoVT: MVT::i32, HiVT: MVT::i32);
10072 SDValue Mask = DAG.getConstant(Val: 0x0000ffff, DL: Loc, VT: MVT::i32);
10073 SDValue Masked = DAG.getNode(Opcode: ISD::AND, DL: Loc, VT: MVT::i32, N1: HighHalf, N2: Mask);
10074 std::optional<uint32_t> ConstStride = std::nullopt;
10075 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Val&: Stride))
10076 ConstStride = ConstNode->getZExtValue();
10077
10078 SDValue NewHighHalf = Masked;
10079 if (!ConstStride || *ConstStride != 0) {
10080 SDValue ShiftedStride;
10081 if (ConstStride) {
10082 ShiftedStride = DAG.getConstant(Val: *ConstStride << 16, DL: Loc, VT: MVT::i32);
10083 } else {
10084 SDValue ExtStride = DAG.getAnyExtOrTrunc(Op: Stride, DL: Loc, VT: MVT::i32);
10085 ShiftedStride =
10086 DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i32, N1: ExtStride,
10087 N2: DAG.getShiftAmountConstant(Val: 16, VT: MVT::i32, DL: Loc));
10088 }
10089 NewHighHalf = DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i32, N1: Masked, N2: ShiftedStride);
10090 }
10091
10092 SDValue Rsrc = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v4i32, N1: LowHalf,
10093 N2: NewHighHalf, N3: NumRecords, N4: Flags);
10094 SDValue RsrcPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: Loc, VT: MVT::i128, Operand: Rsrc);
10095 return RsrcPtr;
10096}
10097
// Handle 8-bit and 16-bit buffer loads.
10099SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
10100 EVT LoadVT, SDLoc DL,
10101 ArrayRef<SDValue> Ops,
10102 MachineMemOperand *MMO,
10103 bool IsTFE) const {
10104 EVT IntVT = LoadVT.changeTypeToInteger();
10105
10106 if (IsTFE) {
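    // With TFE the instruction returns two dwords: the loaded data in
    // element 0 and the status code in element 1, so load v2i32 and split the
    // result below.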
10107 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
10108 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
10109 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
10110 MachineFunction &MF = DAG.getMachineFunction();
10111 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, Offset: 0, Size: 8);
10112 SDVTList VTs = DAG.getVTList(VT1: MVT::v2i32, VT2: MVT::Other);
10113 SDValue Op = getMemIntrinsicNode(Opcode: Opc, DL, VTList: VTs, Ops, MemVT: MVT::v2i32, MMO: OpMMO, DAG);
10114 SDValue Status = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
10115 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
10116 SDValue Data = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
10117 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i32));
10118 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: Data);
10119 SDValue Value = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: Trunc);
10120 return DAG.getMergeValues(Ops: {Value, Status, SDValue(Op.getNode(), 1)}, dl: DL);
10121 }
10122
10123 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
10124 AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
10125
10126 SDVTList ResList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other);
10127 SDValue BufferLoad =
10128 DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: IntVT, MMO);
10129 SDValue LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: BufferLoad);
10130 LoadVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: LoadVal);
10131
10132 return DAG.getMergeValues(Ops: {LoadVal, BufferLoad.getValue(R: 1)}, dl: DL);
10133}
10134
// Handle 8-bit and 16-bit buffer stores.
10136SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10137 EVT VDataType, SDLoc DL,
10138 SDValue Ops[],
10139 MemSDNode *M) const {
10140 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10141 Ops[1] = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i16, Operand: Ops[1]);
10142
10143 SDValue BufferStoreExt = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Ops[1]);
10144 Ops[1] = BufferStoreExt;
10145 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
10146 AMDGPUISD::BUFFER_STORE_SHORT;
10147 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10148 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: M->getVTList(), Ops: OpsRef, MemVT: VDataType,
10149 MMO: M->getMemOperand());
10150}
10151
10152static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
10153 ISD::LoadExtType ExtType, SDValue Op,
10154 const SDLoc &SL, EVT VT) {
10155 if (VT.bitsLT(VT: Op.getValueType()))
10156 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Op);
10157
10158 switch (ExtType) {
10159 case ISD::SEXTLOAD:
10160 return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: SL, VT, Operand: Op);
10161 case ISD::ZEXTLOAD:
10162 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT, Operand: Op);
10163 case ISD::EXTLOAD:
10164 return DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT, Operand: Op);
10165 case ISD::NON_EXTLOAD:
10166 return Op;
10167 }
10168
10169 llvm_unreachable("invalid ext type");
10170}
10171
// Try to turn 8-bit and 16-bit scalar loads into SMEM-eligible 32-bit loads.
// TODO: Skip this on GFX12, which does have scalar sub-dword loads.
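// For example, a uniform zero-extending i8 load from the constant address
// space becomes an aligned 32-bit load followed by a zero-extend-in-register
// of the low 8 bits, which can then be selected to a scalar (SMEM) load.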
10174SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
10175 SelectionDAG &DAG = DCI.DAG;
10176 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10177 return SDValue();
10178
10179 // FIXME: Constant loads should all be marked invariant.
10180 unsigned AS = Ld->getAddressSpace();
10181 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10182 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10183 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10184 return SDValue();
10185
10186 // Don't do this early, since it may interfere with adjacent load merging for
10187 // illegal types. We can avoid losing alignment information for exotic types
10188 // pre-legalize.
10189 EVT MemVT = Ld->getMemoryVT();
10190 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10191 MemVT.getSizeInBits() >= 32)
10192 return SDValue();
10193
10194 SDLoc SL(Ld);
10195
10196 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10197 "unexpected vector extload");
10198
10199 // TODO: Drop only high part of range.
10200 SDValue Ptr = Ld->getBasePtr();
10201 SDValue NewLoad = DAG.getLoad(
10202 AM: ISD::UNINDEXED, ExtType: ISD::NON_EXTLOAD, VT: MVT::i32, dl: SL, Chain: Ld->getChain(), Ptr,
10203 Offset: Ld->getOffset(), PtrInfo: Ld->getPointerInfo(), MemVT: MVT::i32, Alignment: Ld->getAlign(),
10204 MMOFlags: Ld->getMemOperand()->getFlags(), AAInfo: Ld->getAAInfo(),
10205 Ranges: nullptr); // Drop ranges
10206
10207 EVT TruncVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
10208 if (MemVT.isFloatingPoint()) {
10209 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
10210 "unexpected fp extload");
10211 TruncVT = MemVT.changeTypeToInteger();
10212 }
10213
10214 SDValue Cvt = NewLoad;
10215 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10216 Cvt = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL: SL, VT: MVT::i32, N1: NewLoad,
10217 N2: DAG.getValueType(TruncVT));
10218 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10219 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10220 Cvt = DAG.getZeroExtendInReg(Op: NewLoad, DL: SL, VT: TruncVT);
10221 } else {
10222 assert(Ld->getExtensionType() == ISD::EXTLOAD);
10223 }
10224
10225 EVT VT = Ld->getValueType(ResNo: 0);
10226 EVT IntVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: VT.getSizeInBits());
10227
10228 DCI.AddToWorklist(N: Cvt.getNode());
10229
10230 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10231 // the appropriate extension from the 32-bit load.
10232 Cvt = getLoadExtOrTrunc(DAG, ExtType: Ld->getExtensionType(), Op: Cvt, SL, VT: IntVT);
10233 DCI.AddToWorklist(N: Cvt.getNode());
10234
10235 // Handle conversion back to floating point if necessary.
10236 Cvt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Cvt);
10237
10238 return DAG.getMergeValues(Ops: { Cvt, NewLoad.getValue(R: 1) }, dl: SL);
10239}
10240
10241static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10242 const SIMachineFunctionInfo &Info) {
10243 // TODO: Should check if the address can definitely not access stack.
10244 if (Info.isEntryFunction())
10245 return Info.getUserSGPRInfo().hasFlatScratchInit();
10246 return true;
10247}
10248
10249SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10250 SDLoc DL(Op);
10251 LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
10252 ISD::LoadExtType ExtType = Load->getExtensionType();
10253 EVT MemVT = Load->getMemoryVT();
10254
10255 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10256 if (MemVT == MVT::i16 && isTypeLegal(VT: MVT::i16))
10257 return SDValue();
10258
10259 // FIXME: Copied from PPC
10260 // First, load into 32 bits, then truncate to 1 bit.
10261
10262 SDValue Chain = Load->getChain();
10263 SDValue BasePtr = Load->getBasePtr();
10264 MachineMemOperand *MMO = Load->getMemOperand();
10265
10266 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10267
10268 SDValue NewLD = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl: DL, VT: MVT::i32, Chain,
10269 Ptr: BasePtr, MemVT: RealMemVT, MMO);
10270
10271 if (!MemVT.isVector()) {
10272 SDValue Ops[] = {
10273 DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVT, Operand: NewLD),
10274 NewLD.getValue(R: 1)
10275 };
10276
10277 return DAG.getMergeValues(Ops, dl: DL);
10278 }
10279
10280 SmallVector<SDValue, 3> Elts;
10281 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10282 SDValue Elt = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: NewLD,
10283 N2: DAG.getConstant(Val: I, DL, VT: MVT::i32));
10284
10285 Elts.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Elt));
10286 }
10287
10288 SDValue Ops[] = {
10289 DAG.getBuildVector(VT: MemVT, DL, Ops: Elts),
10290 NewLD.getValue(R: 1)
10291 };
10292
10293 return DAG.getMergeValues(Ops, dl: DL);
10294 }
10295
10296 if (!MemVT.isVector())
10297 return SDValue();
10298
10299 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10300 "Custom lowering for non-i32 vectors hasn't been implemented.");
10301
10302 Align Alignment = Load->getAlign();
10303 unsigned AS = Load->getAddressSpace();
10304 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10305 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10306 return SplitVectorLoad(Op, DAG);
10307 }
10308
10309 MachineFunction &MF = DAG.getMachineFunction();
10310 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that a flat instruction accesses scratch memory,
  // then we need to use the same legalization rules we use for private.
10313 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10314 !Subtarget->hasMultiDwordFlatScratchAddressing())
10315 AS = addressMayBeAccessedAsPrivate(MMO: Load->getMemOperand(), Info: *MFI) ?
10316 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10317
10318 unsigned NumElements = MemVT.getVectorNumElements();
10319
10320 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10321 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
10322 if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
10323 if (MemVT.isPow2VectorType() ||
10324 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10325 return SDValue();
10326 return WidenOrSplitVectorLoad(Op, DAG);
10327 }
10328 // Non-uniform loads will be selected to MUBUF instructions, so they
10329 // have the same legalization requirements as global and private
10330 // loads.
10331 //
10332 }
10333
10334 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10335 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10336 AS == AMDGPUAS::GLOBAL_ADDRESS) {
10337 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
10338 Load->isSimple() && isMemOpHasNoClobberedMemOperand(N: Load) &&
10339 Alignment >= Align(4) && NumElements < 32) {
10340 if (MemVT.isPow2VectorType() ||
10341 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10342 return SDValue();
10343 return WidenOrSplitVectorLoad(Op, DAG);
10344 }
10345 // Non-uniform loads will be selected to MUBUF instructions, so they
10346 // have the same legalization requirements as global and private
10347 // loads.
10348 //
10349 }
10350 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10351 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10352 AS == AMDGPUAS::GLOBAL_ADDRESS ||
10353 AS == AMDGPUAS::FLAT_ADDRESS) {
10354 if (NumElements > 4)
10355 return SplitVectorLoad(Op, DAG);
10356 // v3 loads not supported on SI.
10357 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10358 return WidenOrSplitVectorLoad(Op, DAG);
10359
10360 // v3 and v4 loads are supported for private and global memory.
10361 return SDValue();
10362 }
10363 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10364 // Depending on the setting of the private_element_size field in the
10365 // resource descriptor, we can only make private accesses up to a certain
10366 // size.
10367 switch (Subtarget->getMaxPrivateElementSize()) {
10368 case 4: {
10369 SDValue Ops[2];
10370 std::tie(args&: Ops[0], args&: Ops[1]) = scalarizeVectorLoad(LD: Load, DAG);
10371 return DAG.getMergeValues(Ops, dl: DL);
10372 }
10373 case 8:
10374 if (NumElements > 2)
10375 return SplitVectorLoad(Op, DAG);
10376 return SDValue();
10377 case 16:
10378 // Same as global/flat
10379 if (NumElements > 4)
10380 return SplitVectorLoad(Op, DAG);
10381 // v3 loads not supported on SI.
10382 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10383 return WidenOrSplitVectorLoad(Op, DAG);
10384
10385 return SDValue();
10386 default:
10387 llvm_unreachable("unsupported private_element_size");
10388 }
10389 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10390 unsigned Fast = 0;
10391 auto Flags = Load->getMemOperand()->getFlags();
10392 if (allowsMisalignedMemoryAccessesImpl(Size: MemVT.getSizeInBits(), AddrSpace: AS,
10393 Alignment: Load->getAlign(), Flags, IsFast: &Fast) &&
10394 Fast > 1)
10395 return SDValue();
10396
10397 if (MemVT.isVector())
10398 return SplitVectorLoad(Op, DAG);
10399 }
10400
10401 if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
10402 VT: MemVT, MMO: *Load->getMemOperand())) {
10403 SDValue Ops[2];
10404 std::tie(args&: Ops[0], args&: Ops[1]) = expandUnalignedLoad(LD: Load, DAG);
10405 return DAG.getMergeValues(Ops, dl: DL);
10406 }
10407
10408 return SDValue();
10409}
10410
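// 128/256/512-bit selects are split; 64-bit selects are lowered by bitcasting
// the operands to v2i32, selecting the low and high halves separately, and
// recombining the result.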
10411SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10412 EVT VT = Op.getValueType();
10413 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10414 VT.getSizeInBits() == 512)
10415 return splitTernaryVectorOp(Op, DAG);
10416
10417 assert(VT.getSizeInBits() == 64);
10418
10419 SDLoc DL(Op);
10420 SDValue Cond = Op.getOperand(i: 0);
10421
10422 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
10423 SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
10424
10425 SDValue LHS = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1));
10426 SDValue RHS = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 2));
10427
10428 SDValue Lo0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: LHS, N2: Zero);
10429 SDValue Lo1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: RHS, N2: Zero);
10430
10431 SDValue Lo = DAG.getSelect(DL, VT: MVT::i32, Cond, LHS: Lo0, RHS: Lo1);
10432
10433 SDValue Hi0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: LHS, N2: One);
10434 SDValue Hi1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: RHS, N2: One);
10435
10436 SDValue Hi = DAG.getSelect(DL, VT: MVT::i32, Cond, LHS: Hi0, RHS: Hi1);
10437
10438 SDValue Res = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Lo, Hi});
10439 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Res);
10440}
10441
10442// Catch division cases where we can use shortcuts with rcp and rsq
10443// instructions.
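// When approximate math is allowed, 1.0/x folds to rcp(x), -1.0/x to rcp(-x),
// and x/y to x * rcp(y); the f16 forms are also used without it, subject to
// the per-operation fast-math flag checks noted below.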
10444SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10445 SelectionDAG &DAG) const {
10446 SDLoc SL(Op);
10447 SDValue LHS = Op.getOperand(i: 0);
10448 SDValue RHS = Op.getOperand(i: 1);
10449 EVT VT = Op.getValueType();
10450 const SDNodeFlags Flags = Op->getFlags();
10451
10452 bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
10453 DAG.getTarget().Options.UnsafeFPMath;
10454
10455 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) {
    // Without !fpmath accuracy information, we can't do more because we don't
    // know exactly whether rcp is accurate enough to meet the !fpmath
    // requirement. f16 is always accurate enough.
10459 if (!AllowInaccurateRcp && VT != MVT::f16)
10460 return SDValue();
10461
10462 if (CLHS->isExactlyValue(V: 1.0)) {
      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
      // the CI documentation they have a worst-case error of 1 ulp.
10465 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10466 // use it as long as we aren't trying to use denormals.
10467 //
      // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
10469
10470 // 1.0 / sqrt(x) -> rsq(x)
10471
10472 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10473 // error seems really high at 2^29 ULP.
10474 // 1.0 / x -> rcp(x)
10475 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS);
10476 }
10477
10478 // Same as for 1.0, but expand the sign out of the constant.
10479 if (CLHS->isExactlyValue(V: -1.0)) {
10480 // -1.0 / x -> rcp (fneg x)
10481 SDValue FNegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
10482 return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: FNegRHS);
10483 }
10484 }
10485
10486 // For f16 require afn or arcp.
10487 // For f32 require afn.
10488 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10489 return SDValue();
10490
10491 // Turn into multiply by the reciprocal.
10492 // x / y -> x * (1.0 / y)
10493 SDValue Recip = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS);
10494 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LHS, N2: Recip, Flags);
10495}
10496
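// Approximate f64 division: compute r ~= 1/y with rcp, refine r with two
// FMA-based Newton-Raphson steps, form x * r, and apply one final FMA
// correction step to the quotient.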
10497SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10498 SelectionDAG &DAG) const {
10499 SDLoc SL(Op);
10500 SDValue X = Op.getOperand(i: 0);
10501 SDValue Y = Op.getOperand(i: 1);
10502 EVT VT = Op.getValueType();
10503 const SDNodeFlags Flags = Op->getFlags();
10504
10505 bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
10506 DAG.getTarget().Options.UnsafeFPMath;
10507 if (!AllowInaccurateDiv)
10508 return SDValue();
10509
10510 SDValue NegY = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Y);
10511 SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);
10512
10513 SDValue R = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: Y);
10514 SDValue Tmp0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One);
10515
10516 R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp0, N2: R, N3: R);
10517 SDValue Tmp1 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One);
10518 R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp1, N2: R, N3: R);
10519 SDValue Ret = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: R);
10520 SDValue Tmp2 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: Ret, N3: X);
10521 return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp2, N2: R, N3: Ret);
10522}
10523
10524static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10525 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10526 SDNodeFlags Flags) {
10527 if (GlueChain->getNumValues() <= 1) {
10528 return DAG.getNode(Opcode, DL: SL, VT, N1: A, N2: B, Flags);
10529 }
10530
10531 assert(GlueChain->getNumValues() == 3);
10532
10533 SDVTList VTList = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue);
10534 switch (Opcode) {
10535 default: llvm_unreachable("no chain equivalent for opcode");
10536 case ISD::FMUL:
10537 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10538 break;
10539 }
10540
10541 return DAG.getNode(Opcode, DL: SL, VTList,
10542 Ops: {GlueChain.getValue(R: 1), A, B, GlueChain.getValue(R: 2)},
10543 Flags);
10544}
10545
10546static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10547 EVT VT, SDValue A, SDValue B, SDValue C,
10548 SDValue GlueChain, SDNodeFlags Flags) {
10549 if (GlueChain->getNumValues() <= 1) {
10550 return DAG.getNode(Opcode, DL: SL, VT, Ops: {A, B, C}, Flags);
10551 }
10552
10553 assert(GlueChain->getNumValues() == 3);
10554
10555 SDVTList VTList = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue);
10556 switch (Opcode) {
10557 default: llvm_unreachable("no chain equivalent for opcode");
10558 case ISD::FMA:
10559 Opcode = AMDGPUISD::FMA_W_CHAIN;
10560 break;
10561 }
10562
10563 return DAG.getNode(Opcode, DL: SL, VTList,
10564 Ops: {GlueChain.getValue(R: 1), A, B, C, GlueChain.getValue(R: 2)},
10565 Flags);
10566}
10567
10568SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10569 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10570 return FastLowered;
10571
10572 SDLoc SL(Op);
10573 SDValue Src0 = Op.getOperand(i: 0);
10574 SDValue Src1 = Op.getOperand(i: 1);
10575
10576 SDValue CvtSrc0 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src0);
10577 SDValue CvtSrc1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src1);
10578
10579 SDValue RcpSrc1 = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: CvtSrc1);
10580 SDValue Quot = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: CvtSrc0, N2: RcpSrc1);
10581
10582 SDValue FPRoundFlag = DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32);
10583 SDValue BestQuot = DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::f16, N1: Quot, N2: FPRoundFlag);
10584
10585 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f16, N1: BestQuot, N2: Src1, N3: Src0);
10586}
10587
10588// Faster 2.5 ULP division that does not support denormals.
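// Very large denominators (|rhs| > 2^96) are pre-scaled by 2^-32 before the
// rcp, and the quotient is rescaled by the same factor afterwards to keep the
// intermediate values in range.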
10589SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10590 SDNodeFlags Flags = Op->getFlags();
10591 SDLoc SL(Op);
10592 SDValue LHS = Op.getOperand(i: 1);
10593 SDValue RHS = Op.getOperand(i: 2);
10594
10595 SDValue r1 = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: MVT::f32, Operand: RHS, Flags);
10596
10597 const APFloat K0Val(0x1p+96f);
10598 const SDValue K0 = DAG.getConstantFP(Val: K0Val, DL: SL, VT: MVT::f32);
10599
10600 const APFloat K1Val(0x1p-32f);
10601 const SDValue K1 = DAG.getConstantFP(Val: K1Val, DL: SL, VT: MVT::f32);
10602
10603 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f32);
10604
10605 EVT SetCCVT =
10606 getSetCCResultType(DL: DAG.getDataLayout(), Ctx&: *DAG.getContext(), VT: MVT::f32);
10607
10608 SDValue r2 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: r1, RHS: K0, Cond: ISD::SETOGT);
10609
10610 SDValue r3 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f32, N1: r2, N2: K1, N3: One, Flags);
10611
10612 r1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: RHS, N2: r3, Flags);
10613
10614 // rcp does not support denormals.
10615 SDValue r0 = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: r1, Flags);
10616
10617 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: LHS, N2: r0, Flags);
10618
10619 return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: r3, N2: Mul, Flags);
10620}
10621
// Returns the immediate value used to set the F32 denormal mode with the
// S_DENORM_MODE instruction.
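// The mode operand packs the FP32 denormal mode into bits [1:0] and the
// current FP64/FP16 mode into bits [3:2].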
10624static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10625 const SIMachineFunctionInfo *Info,
10626 const GCNSubtarget *ST) {
10627 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10628 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10629 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10630 return DAG.getTargetConstant(Val: Mode, DL: SDLoc(), VT: MVT::i32);
10631}
10632
10633SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10634 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10635 return FastLowered;
10636
  // The selection matcher assumes that anything with a chain selects to a
10638 // mayRaiseFPException machine instruction. Since we're introducing a chain
10639 // here, we need to explicitly report nofpexcept for the regular fdiv
10640 // lowering.
10641 SDNodeFlags Flags = Op->getFlags();
10642 Flags.setNoFPExcept(true);
10643
10644 SDLoc SL(Op);
10645 SDValue LHS = Op.getOperand(i: 0);
10646 SDValue RHS = Op.getOperand(i: 1);
10647
10648 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f32);
10649
10650 SDVTList ScaleVT = DAG.getVTList(VT1: MVT::f32, VT2: MVT::i1);
10651
10652 SDValue DenominatorScaled = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT,
10653 Ops: {RHS, RHS, LHS}, Flags);
10654 SDValue NumeratorScaled = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT,
10655 Ops: {LHS, RHS, LHS}, Flags);
10656
10657 // Denominator is scaled to not be denormal, so using rcp is ok.
10658 SDValue ApproxRcp = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32,
10659 Operand: DenominatorScaled, Flags);
10660 SDValue NegDivScale0 = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32,
10661 Operand: DenominatorScaled, Flags);
10662
10663 using namespace AMDGPU::Hwreg;
10664 const unsigned Denorm32Reg = HwregEncoding::encode(Values: ID_MODE, Values: 4, Values: 2);
10665 const SDValue BitField = DAG.getTargetConstant(Val: Denorm32Reg, DL: SL, VT: MVT::i32);
10666
10667 const MachineFunction &MF = DAG.getMachineFunction();
10668 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10669 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10670
10671 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10672 const bool HasDynamicDenormals =
10673 (DenormMode.Input == DenormalMode::Dynamic) ||
10674 (DenormMode.Output == DenormalMode::Dynamic);
10675
10676 SDValue SavedDenormMode;
10677
10678 if (!PreservesDenormals) {
10679 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10680 // lowering. The chain dependence is insufficient, and we need glue. We do
10681 // not need the glue variants in a strictfp function.
10682
10683 SDVTList BindParamVTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
10684
10685 SDValue Glue = DAG.getEntryNode();
10686 if (HasDynamicDenormals) {
10687 SDNode *GetReg = DAG.getMachineNode(Opcode: AMDGPU::S_GETREG_B32, dl: SL,
10688 VTs: DAG.getVTList(VT1: MVT::i32, VT2: MVT::Glue),
10689 Ops: {BitField, Glue});
10690 SavedDenormMode = SDValue(GetReg, 0);
10691
10692 Glue = DAG.getMergeValues(
10693 Ops: {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, dl: SL);
10694 }
10695
10696 SDNode *EnableDenorm;
10697 if (Subtarget->hasDenormModeInst()) {
10698 const SDValue EnableDenormValue =
10699 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, ST: Subtarget);
10700
10701 EnableDenorm = DAG.getNode(Opcode: AMDGPUISD::DENORM_MODE, DL: SL, VTList: BindParamVTs, N1: Glue,
10702 N2: EnableDenormValue)
10703 .getNode();
10704 } else {
10705 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
10706 DL: SL, VT: MVT::i32);
10707 EnableDenorm = DAG.getMachineNode(Opcode: AMDGPU::S_SETREG_B32, dl: SL, VTs: BindParamVTs,
10708 Ops: {EnableDenormValue, BitField, Glue});
10709 }
10710
10711 SDValue Ops[3] = {
10712 NegDivScale0,
10713 SDValue(EnableDenorm, 0),
10714 SDValue(EnableDenorm, 1)
10715 };
10716
10717 NegDivScale0 = DAG.getMergeValues(Ops, dl: SL);
10718 }
10719
10720 SDValue Fma0 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0,
10721 B: ApproxRcp, C: One, GlueChain: NegDivScale0, Flags);
10722
10723 SDValue Fma1 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: Fma0, B: ApproxRcp,
10724 C: ApproxRcp, GlueChain: Fma0, Flags);
10725
10726 SDValue Mul = getFPBinOp(DAG, Opcode: ISD::FMUL, SL, VT: MVT::f32, A: NumeratorScaled,
10727 B: Fma1, GlueChain: Fma1, Flags);
10728
10729 SDValue Fma2 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0, B: Mul,
10730 C: NumeratorScaled, GlueChain: Mul, Flags);
10731
10732 SDValue Fma3 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32,
10733 A: Fma2, B: Fma1, C: Mul, GlueChain: Fma2, Flags);
10734
10735 SDValue Fma4 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0, B: Fma3,
10736 C: NumeratorScaled, GlueChain: Fma3, Flags);
10737
10738 if (!PreservesDenormals) {
10739 SDNode *DisableDenorm;
10740 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10741 const SDValue DisableDenormValue = getSPDenormModeValue(
10742 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, ST: Subtarget);
10743
10744 DisableDenorm = DAG.getNode(Opcode: AMDGPUISD::DENORM_MODE, DL: SL, VT: MVT::Other,
10745 N1: Fma4.getValue(R: 1), N2: DisableDenormValue,
10746 N3: Fma4.getValue(R: 2)).getNode();
10747 } else {
10748 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10749 const SDValue DisableDenormValue =
10750 HasDynamicDenormals
10751 ? SavedDenormMode
10752 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, DL: SL, VT: MVT::i32);
10753
10754 DisableDenorm = DAG.getMachineNode(
10755 Opcode: AMDGPU::S_SETREG_B32, dl: SL, VT: MVT::Other,
10756 Ops: {DisableDenormValue, BitField, Fma4.getValue(R: 1), Fma4.getValue(R: 2)});
10757 }
10758
10759 SDValue OutputChain = DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other,
10760 N1: SDValue(DisableDenorm, 0), N2: DAG.getRoot());
10761 DAG.setRoot(OutputChain);
10762 }
10763
10764 SDValue Scale = NumeratorScaled.getValue(R: 1);
10765 SDValue Fmas = DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL: SL, VT: MVT::f32,
10766 Ops: {Fma4, Fma1, Fma3, Scale}, Flags);
10767
10768 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f32, N1: Fmas, N2: RHS, N3: LHS, Flags);
10769}
10770
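// Full-precision f64 division: scale the operands with div_scale, refine rcp
// of the scaled denominator with FMA-based Newton-Raphson steps, and resolve
// the result with div_fmas and div_fixup.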
10771SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
10772 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
10773 return FastLowered;
10774
10775 SDLoc SL(Op);
10776 SDValue X = Op.getOperand(i: 0);
10777 SDValue Y = Op.getOperand(i: 1);
10778
10779 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f64);
10780
10781 SDVTList ScaleVT = DAG.getVTList(VT1: MVT::f64, VT2: MVT::i1);
10782
10783 SDValue DivScale0 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: Y, N2: Y, N3: X);
10784
10785 SDValue NegDivScale0 = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f64, Operand: DivScale0);
10786
10787 SDValue Rcp = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f64, Operand: DivScale0);
10788
10789 SDValue Fma0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Rcp, N3: One);
10790
10791 SDValue Fma1 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: Rcp, N2: Fma0, N3: Rcp);
10792
10793 SDValue Fma2 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Fma1, N3: One);
10794
10795 SDValue DivScale1 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: X, N2: Y, N3: X);
10796
10797 SDValue Fma3 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: Fma1, N2: Fma2, N3: Fma1);
10798 SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f64, N1: DivScale1, N2: Fma3);
10799
10800 SDValue Fma4 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64,
10801 N1: NegDivScale0, N2: Mul, N3: DivScale1);
10802
10803 SDValue Scale;
10804
10805 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
10806 // Workaround a hardware bug on SI where the condition output from div_scale
10807 // is not usable.
10808
10809 const SDValue Hi = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
10810
    // Figure out the scale to use for div_fmas.
10812 SDValue NumBC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: X);
10813 SDValue DenBC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Y);
10814 SDValue Scale0BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: DivScale0);
10815 SDValue Scale1BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: DivScale1);
10816
10817 SDValue NumHi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: NumBC, N2: Hi);
10818 SDValue DenHi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: DenBC, N2: Hi);
10819
10820 SDValue Scale0Hi
10821 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Scale0BC, N2: Hi);
10822 SDValue Scale1Hi
10823 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Scale1BC, N2: Hi);
10824
10825 SDValue CmpDen = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: DenHi, RHS: Scale0Hi, Cond: ISD::SETEQ);
10826 SDValue CmpNum = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: NumHi, RHS: Scale1Hi, Cond: ISD::SETEQ);
10827 Scale = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: CmpNum, N2: CmpDen);
10828 } else {
10829 Scale = DivScale1.getValue(R: 1);
10830 }
10831
10832 SDValue Fmas = DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL: SL, VT: MVT::f64,
10833 N1: Fma4, N2: Fma3, N3: Mul, N4: Scale);
10834
10835 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f64, N1: Fmas, N2: Y, N3: X);
10836}
10837
10838SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
10839 EVT VT = Op.getValueType();
10840
10841 if (VT == MVT::f32)
10842 return LowerFDIV32(Op, DAG);
10843
10844 if (VT == MVT::f64)
10845 return LowerFDIV64(Op, DAG);
10846
10847 if (VT == MVT::f16)
10848 return LowerFDIV16(Op, DAG);
10849
10850 llvm_unreachable("Unexpected type for fdiv");
10851}
10852
10853SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
10854 SDLoc dl(Op);
10855 SDValue Val = Op.getOperand(i: 0);
10856 EVT VT = Val.getValueType();
10857 EVT ResultExpVT = Op->getValueType(ResNo: 1);
10858 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
10859
10860 SDValue Mant = DAG.getNode(
10861 Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT,
10862 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_frexp_mant, DL: dl, VT: MVT::i32), N2: Val);
10863
10864 SDValue Exp = DAG.getNode(
10865 Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: InstrExpVT,
10866 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_frexp_exp, DL: dl, VT: MVT::i32), N2: Val);
10867
10868 if (Subtarget->hasFractBug()) {
10869 SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: dl, VT, Operand: Val);
10870 SDValue Inf = DAG.getConstantFP(
10871 Val: APFloat::getInf(Sem: SelectionDAG::EVTToAPFloatSemantics(VT)), DL: dl, VT);
10872
10873 SDValue IsFinite = DAG.getSetCC(DL: dl, VT: MVT::i1, LHS: Fabs, RHS: Inf, Cond: ISD::SETOLT);
10874 SDValue Zero = DAG.getConstant(Val: 0, DL: dl, VT: InstrExpVT);
10875 Exp = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: InstrExpVT, N1: IsFinite, N2: Exp, N3: Zero);
10876 Mant = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT, N1: IsFinite, N2: Mant, N3: Val);
10877 }
10878
10879 SDValue CastExp = DAG.getSExtOrTrunc(Op: Exp, DL: dl, VT: ResultExpVT);
10880 return DAG.getMergeValues(Ops: {Mant, CastExp}, dl);
10881}
10882
10883SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
10884 SDLoc DL(Op);
10885 StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
10886 EVT VT = Store->getMemoryVT();
10887
10888 if (VT == MVT::i1) {
10889 return DAG.getTruncStore(Chain: Store->getChain(), dl: DL,
10890 Val: DAG.getSExtOrTrunc(Op: Store->getValue(), DL, VT: MVT::i32),
10891 Ptr: Store->getBasePtr(), SVT: MVT::i1, MMO: Store->getMemOperand());
10892 }
10893
10894 assert(VT.isVector() &&
10895 Store->getValue().getValueType().getScalarType() == MVT::i32);
10896
10897 unsigned AS = Store->getAddressSpace();
10898 if (Subtarget->hasLDSMisalignedBug() &&
10899 AS == AMDGPUAS::FLAT_ADDRESS &&
10900 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
10901 return SplitVectorStore(Op, DAG);
10902 }
10903
10904 MachineFunction &MF = DAG.getMachineFunction();
10905 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10906  // If there is a possibility that flat instructions access scratch memory,
10907  // then we need to use the same legalization rules we use for private.
10908 if (AS == AMDGPUAS::FLAT_ADDRESS &&
10909 !Subtarget->hasMultiDwordFlatScratchAddressing())
10910 AS = addressMayBeAccessedAsPrivate(MMO: Store->getMemOperand(), Info: *MFI) ?
10911 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10912
10913 unsigned NumElements = VT.getVectorNumElements();
10914 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
10915 AS == AMDGPUAS::FLAT_ADDRESS) {
10916 if (NumElements > 4)
10917 return SplitVectorStore(Op, DAG);
10918 // v3 stores not supported on SI.
10919 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10920 return SplitVectorStore(Op, DAG);
10921
10922 if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
10923 VT, MMO: *Store->getMemOperand()))
10924 return expandUnalignedStore(ST: Store, DAG);
10925
10926 return SDValue();
10927 }
10928 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10929 switch (Subtarget->getMaxPrivateElementSize()) {
10930 case 4:
10931 return scalarizeVectorStore(ST: Store, DAG);
10932 case 8:
10933 if (NumElements > 2)
10934 return SplitVectorStore(Op, DAG);
10935 return SDValue();
10936 case 16:
10937 if (NumElements > 4 ||
10938 (NumElements == 3 && !Subtarget->enableFlatScratch()))
10939 return SplitVectorStore(Op, DAG);
10940 return SDValue();
10941 default:
10942 llvm_unreachable("unsupported private_element_size");
10943 }
10944 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10945 unsigned Fast = 0;
10946 auto Flags = Store->getMemOperand()->getFlags();
10947 if (allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace: AS,
10948 Alignment: Store->getAlign(), Flags, IsFast: &Fast) &&
10949 Fast > 1)
10950 return SDValue();
10951
10952 if (VT.isVector())
10953 return SplitVectorStore(Op, DAG);
10954
10955 return expandUnalignedStore(ST: Store, DAG);
10956 }
10957
10958 // Probably an invalid store. If so we'll end up emitting a selection error.
10959 return SDValue();
10960}
10961
10962// Avoid the full correct expansion for f32 sqrt when promoting from f16.
10963SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
10964 SDLoc SL(Op);
10965 assert(!Subtarget->has16BitInsts());
10966 SDNodeFlags Flags = Op->getFlags();
10967 SDValue Ext =
10968 DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Op.getOperand(i: 0), Flags);
10969
10970 SDValue SqrtID = DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL: SL, VT: MVT::i32);
10971 SDValue Sqrt =
10972 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::f32, N1: SqrtID, N2: Ext, Flags);
10973
10974 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::f16, N1: Sqrt,
10975 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
10976}
10977
10978SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
10979 SDLoc DL(Op);
10980 SDNodeFlags Flags = Op->getFlags();
10981 MVT VT = Op.getValueType().getSimpleVT();
10982 const SDValue X = Op.getOperand(i: 0);
10983
10984 if (allowApproxFunc(DAG, Flags)) {
10985 // Instruction is 1ulp but ignores denormals.
10986 return DAG.getNode(
10987 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT,
10988 N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL, VT: MVT::i32), N2: X, Flags);
10989 }
10990
10991 SDValue ScaleThreshold = DAG.getConstantFP(Val: 0x1.0p-96f, DL, VT);
10992 SDValue NeedScale = DAG.getSetCC(DL, VT: MVT::i1, LHS: X, RHS: ScaleThreshold, Cond: ISD::SETOLT);
10993
10994 SDValue ScaleUpFactor = DAG.getConstantFP(Val: 0x1.0p+32f, DL, VT);
10995
10996 SDValue ScaledX = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: X, N2: ScaleUpFactor, Flags);
10997
10998 SDValue SqrtX =
10999 DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledX, N3: X, Flags);
11000
11001 SDValue SqrtS;
11002 if (needsDenormHandlingF32(DAG, Src: X, Flags)) {
11003 SDValue SqrtID =
11004 DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL, VT: MVT::i32);
11005 SqrtS = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: SqrtID, N2: SqrtX, Flags);
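    // Refine the hardware estimate by testing the values one ulp below and
    // above it: the FMA residuals x - s_down * s and x - s_up * s decide
    // whether to step the result down or up by one ulp.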
11006
11007 SDValue SqrtSAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: SqrtS);
11008 SDValue SqrtSNextDownInt = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SqrtSAsInt,
11009 N2: DAG.getConstant(Val: -1, DL, VT: MVT::i32));
11010 SDValue SqrtSNextDown = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextDownInt);
11011
11012 SDValue NegSqrtSNextDown =
11013 DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextDown, Flags);
11014
11015 SDValue SqrtVP =
11016 DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextDown, N2: SqrtS, N3: SqrtX, Flags);
11017
11018 SDValue SqrtSNextUpInt = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SqrtSAsInt,
11019 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
11020 SDValue SqrtSNextUp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextUpInt);
11021
11022 SDValue NegSqrtSNextUp = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextUp, Flags);
11023 SDValue SqrtVS =
11024 DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextUp, N2: SqrtS, N3: SqrtX, Flags);
11025
11026 SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL, VT);
11027 SDValue SqrtVPLE0 = DAG.getSetCC(DL, VT: MVT::i1, LHS: SqrtVP, RHS: Zero, Cond: ISD::SETOLE);
11028
11029 SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPLE0, N2: SqrtSNextDown, N3: SqrtS,
11030 Flags);
11031
11032 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, VT: MVT::i1, LHS: SqrtVS, RHS: Zero, Cond: ISD::SETOGT);
11033 SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPVSGT0, N2: SqrtSNextUp, N3: SqrtS,
11034 Flags);
11035 } else {
11036 SDValue SqrtR = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: SqrtX, Flags);
11037
11038 SqrtS = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtX, N2: SqrtR, Flags);
11039
11040 SDValue Half = DAG.getConstantFP(Val: 0.5f, DL, VT);
11041 SDValue SqrtH = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtR, N2: Half, Flags);
11042 SDValue NegSqrtH = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtH, Flags);
11043
11044 SDValue SqrtE = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtH, N2: SqrtS, N3: Half, Flags);
11045 SqrtH = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtH, N2: SqrtE, N3: SqrtH, Flags);
11046 SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtS, N2: SqrtE, N3: SqrtS, Flags);
11047
11048 SDValue NegSqrtS = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtS, Flags);
11049 SDValue SqrtD =
11050 DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtS, N2: SqrtS, N3: SqrtX, Flags);
11051 SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtD, N2: SqrtH, N3: SqrtS, Flags);
11052 }
11053
11054 SDValue ScaleDownFactor = DAG.getConstantFP(Val: 0x1.0p-16f, DL, VT);
11055
11056 SDValue ScaledDown =
11057 DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtS, N2: ScaleDownFactor, Flags);
11058
11059 SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledDown, N3: SqrtS, Flags);
11060 SDValue IsZeroOrInf =
11061 DAG.getNode(Opcode: ISD::IS_FPCLASS, DL, VT: MVT::i1, N1: SqrtX,
11062 N2: DAG.getTargetConstant(Val: fcZero | fcPosInf, DL, VT: MVT::i32));
11063
11064 return DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsZeroOrInf, N2: SqrtX, N3: SqrtS, Flags);
11065}
11066
11067SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11068  // For the f64 type, the SQRT and RSQ instructions don't have the required
11069  // precision, so we apply Goldschmidt's algorithm to improve the result:
11070 //
11071 // y0 = rsq(x)
11072 // g0 = x * y0
11073 // h0 = 0.5 * y0
11074 //
11075 // r0 = 0.5 - h0 * g0
11076 // g1 = g0 * r0 + g0
11077 // h1 = h0 * r0 + h0
11078 //
11079 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11080 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11081 // h2 = h1 * r1 + h1
11082 //
11083 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11084 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11085 //
11086 // sqrt(x) = g3
11087
11088 SDNodeFlags Flags = Op->getFlags();
11089
11090 SDLoc DL(Op);
11091
11092 SDValue X = Op.getOperand(i: 0);
11093 SDValue ScaleConstant = DAG.getConstantFP(Val: 0x1.0p-767, DL, VT: MVT::f64);
11094
11095 SDValue Scaling = DAG.getSetCC(DL, VT: MVT::i1, LHS: X, RHS: ScaleConstant, Cond: ISD::SETOLT);
11096
11097 SDValue ZeroInt = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
11098
11099 // Scale up input if it is too small.
11100 SDValue ScaleUpFactor = DAG.getConstant(Val: 256, DL, VT: MVT::i32);
11101 SDValue ScaleUp =
11102 DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::i32, N1: Scaling, N2: ScaleUpFactor, N3: ZeroInt);
11103 SDValue SqrtX = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: X, N2: ScaleUp, Flags);
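  // sqrt(x * 2^256) == sqrt(x) * 2^128, so the result is scaled back down by
  // ldexp(-128) below.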
11104
11105 SDValue SqrtY = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT: MVT::f64, Operand: SqrtX);
11106
11107 SDValue SqrtS0 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: SqrtX, N2: SqrtY);
11108
11109 SDValue Half = DAG.getConstantFP(Val: 0.5, DL, VT: MVT::f64);
11110 SDValue SqrtH0 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: SqrtY, N2: Half);
11111
11112 SDValue NegSqrtH0 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtH0);
11113 SDValue SqrtR0 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtH0, N2: SqrtS0, N3: Half);
11114
11115 SDValue SqrtH1 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtH0, N2: SqrtR0, N3: SqrtH0);
11116
11117 SDValue SqrtS1 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtS0, N2: SqrtR0, N3: SqrtS0);
11118
11119 SDValue NegSqrtS1 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtS1);
11120 SDValue SqrtD0 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtS1, N2: SqrtS1, N3: SqrtX);
11121
11122 SDValue SqrtS2 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtD0, N2: SqrtH1, N3: SqrtS1);
11123
11124 SDValue NegSqrtS2 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtS2);
11125 SDValue SqrtD1 =
11126 DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtS2, N2: SqrtS2, N3: SqrtX);
11127
11128 SDValue SqrtRet = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtD1, N2: SqrtH1, N3: SqrtS2);
11129
11130 SDValue ScaleDownFactor = DAG.getConstant(Val: -128, DL, VT: MVT::i32);
11131 SDValue ScaleDown =
11132 DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::i32, N1: Scaling, N2: ScaleDownFactor, N3: ZeroInt);
11133 SqrtRet = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: SqrtRet, N2: ScaleDown, Flags);
11134
11135 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11136 // with finite only or nsz because rsq(+/-0) = +/-inf
11137
11138 // TODO: Check for DAZ and expand to subnormals
11139 SDValue IsZeroOrInf =
11140 DAG.getNode(Opcode: ISD::IS_FPCLASS, DL, VT: MVT::i1, N1: SqrtX,
11141 N2: DAG.getTargetConstant(Val: fcZero | fcPosInf, DL, VT: MVT::i32));
11142
11143 // If x is +INF, +0, or -0, use its original value
11144 return DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::f64, N1: IsZeroOrInf, N2: SqrtX, N3: SqrtRet,
11145 Flags);
11146}
11147
11148SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11149 SDLoc DL(Op);
11150 EVT VT = Op.getValueType();
11151 SDValue Arg = Op.getOperand(i: 0);
11152 SDValue TrigVal;
11153
11154 // Propagate fast-math flags so that the multiply we introduce can be folded
11155 // if Arg is already the result of a multiply by constant.
11156 auto Flags = Op->getFlags();
11157
11158 SDValue OneOver2Pi = DAG.getConstantFP(Val: 0.5 * numbers::inv_pi, DL, VT);
11159
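  // The hardware sin/cos expect the argument pre-multiplied by 1/(2*pi).
  // Subtargets with a reduced valid input range additionally normalize the
  // scaled argument into [0, 1) with FRACT.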
11160 if (Subtarget->hasTrigReducedRange()) {
11161 SDValue MulVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags);
11162 TrigVal = DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: MulVal, Flags);
11163 } else {
11164 TrigVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags);
11165 }
11166
11167 switch (Op.getOpcode()) {
11168 case ISD::FCOS:
11169 return DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL: SDLoc(Op), VT, Operand: TrigVal, Flags);
11170 case ISD::FSIN:
11171 return DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL: SDLoc(Op), VT, Operand: TrigVal, Flags);
11172 default:
11173 llvm_unreachable("Wrong trig opcode");
11174 }
11175}
11176
11177SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
11178 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Val&: Op);
11179 assert(AtomicNode->isCompareAndSwap());
11180 unsigned AS = AtomicNode->getAddressSpace();
11181
11182 // No custom lowering required for local address space
11183 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11184 return Op;
11185
11186  // Non-local address spaces require custom lowering for atomic compare and
11187  // swap; the new and compare values are packed into a v2i32 (v2i64 for _X2).
11188 SDLoc DL(Op);
11189 SDValue ChainIn = Op.getOperand(i: 0);
11190 SDValue Addr = Op.getOperand(i: 1);
11191 SDValue Old = Op.getOperand(i: 2);
11192 SDValue New = Op.getOperand(i: 3);
11193 EVT VT = Op.getValueType();
11194 MVT SimpleVT = VT.getSimpleVT();
11195 MVT VecType = MVT::getVectorVT(VT: SimpleVT, NumElements: 2);
11196
11197 SDValue NewOld = DAG.getBuildVector(VT: VecType, DL, Ops: {New, Old});
11198 SDValue Ops[] = { ChainIn, Addr, NewOld };
11199
11200 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::ATOMIC_CMP_SWAP, dl: DL, VTList: Op->getVTList(),
11201 Ops, MemVT: VT, MMO: AtomicNode->getMemOperand());
11202}
11203
11204//===----------------------------------------------------------------------===//
11205// Custom DAG optimizations
11206//===----------------------------------------------------------------------===//
11207
11208SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
11209 DAGCombinerInfo &DCI) const {
11210 EVT VT = N->getValueType(ResNo: 0);
11211 EVT ScalarVT = VT.getScalarType();
11212 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11213 return SDValue();
11214
11215 SelectionDAG &DAG = DCI.DAG;
11216 SDLoc DL(N);
11217
11218 SDValue Src = N->getOperand(Num: 0);
11219 EVT SrcVT = Src.getValueType();
11220
11221 // TODO: We could try to match extracting the higher bytes, which would be
11222 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11223 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11224 // about in practice.
11225 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11226 if (DAG.MaskedValueIsZero(Op: Src, Mask: APInt::getHighBitsSet(numBits: 32, hiBitsSet: 24))) {
11227 SDValue Cvt = DAG.getNode(Opcode: AMDGPUISD::CVT_F32_UBYTE0, DL, VT: MVT::f32, Operand: Src);
11228 DCI.AddToWorklist(N: Cvt.getNode());
11229
11230 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11231 if (ScalarVT != MVT::f32) {
11232 Cvt = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Cvt,
11233 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
11234 }
11235 return Cvt;
11236 }
11237 }
11238
11239 return SDValue();
11240}
11241
11242SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11243 DAGCombinerInfo &DCI) const {
11244 SDValue MagnitudeOp = N->getOperand(Num: 0);
11245 SDValue SignOp = N->getOperand(Num: 1);
11246 SelectionDAG &DAG = DCI.DAG;
11247 SDLoc DL(N);
11248
11249 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11250 // lower half with a copy.
11251 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11252 if (MagnitudeOp.getValueType() == MVT::f64) {
11253 SDValue MagAsVector = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2f32, Operand: MagnitudeOp);
11254 SDValue MagLo =
11255 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: MagAsVector,
11256 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i32));
11257 SDValue MagHi =
11258 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: MagAsVector,
11259 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
11260
11261 SDValue HiOp =
11262 DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: MVT::f32, N1: MagHi, N2: SignOp);
11263
11264 SDValue Vector = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: MVT::v2f32, N1: MagLo, N2: HiOp);
11265
11266 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f64, Operand: Vector);
11267 }
11268
11269 if (SignOp.getValueType() != MVT::f64)
11270 return SDValue();
11271
11272  // Reduce the width of the sign operand; we only need the highest bit.
11273 //
11274 // fcopysign f64:x, f64:y ->
11275 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11276 // TODO: In some cases it might make sense to go all the way to f16.
11277 SDValue SignAsVector = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2f32, Operand: SignOp);
11278 SDValue SignAsF32 =
11279 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: SignAsVector,
11280 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
11281
11282 return DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 0),
11283 N2: SignAsF32);
11284}
11285
11286// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11287// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11288// bits
11289
11290// This is a variant of
11291// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11292//
11293// The normal DAG combiner will do this, but only if the add has one use since
11294// that would increase the number of instructions.
11295//
11296// This prevents us from seeing a constant offset that can be folded into a
11297// memory instruction's addressing mode. If we know the resulting add offset of
11298// a pointer can be folded into an addressing offset, we can replace the pointer
11299// operand with the add of new constant offset. This eliminates one of the uses,
11300// and may allow the remaining use to also be simplified.
11301//
11302SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
11303 unsigned AddrSpace,
11304 EVT MemVT,
11305 DAGCombinerInfo &DCI) const {
11306 SDValue N0 = N->getOperand(Num: 0);
11307 SDValue N1 = N->getOperand(Num: 1);
11308
11309 // We only do this to handle cases where it's profitable when there are
11310 // multiple uses of the add, so defer to the standard combine.
11311 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11312 N0->hasOneUse())
11313 return SDValue();
11314
11315 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Val&: N1);
11316 if (!CN1)
11317 return SDValue();
11318
11319 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
11320 if (!CAdd)
11321 return SDValue();
11322
11323 SelectionDAG &DAG = DCI.DAG;
11324
11325 if (N0->getOpcode() == ISD::OR &&
11326 !DAG.haveNoCommonBitsSet(A: N0.getOperand(i: 0), B: N0.getOperand(i: 1)))
11327 return SDValue();
11328
11329 // If the resulting offset is too large, we can't fold it into the
11330 // addressing mode offset.
11331 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11332 Type *Ty = MemVT.getTypeForEVT(Context&: *DCI.DAG.getContext());
11333
11334 AddrMode AM;
11335 AM.HasBaseReg = true;
11336 AM.BaseOffs = Offset.getSExtValue();
11337 if (!isLegalAddressingMode(DL: DCI.DAG.getDataLayout(), AM, Ty, AS: AddrSpace))
11338 return SDValue();
11339
11340 SDLoc SL(N);
11341 EVT VT = N->getValueType(ResNo: 0);
11342
11343 SDValue ShlX = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: N0.getOperand(i: 0), N2: N1);
11344 SDValue COffset = DAG.getConstant(Val: Offset, DL: SL, VT);
11345
11346 SDNodeFlags Flags;
11347 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
11348 (N0.getOpcode() == ISD::OR ||
11349 N0->getFlags().hasNoUnsignedWrap()));
11350
11351 return DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ShlX, N2: COffset, Flags);
11352}
11353
11354/// MemSDNode::getBasePtr() does not work for intrinsics, which need an offset
11355/// past the chain and intrinsic ID. Theoretically we would also need to check
11356/// the specific intrinsic, but they all place the pointer operand first.
11357static unsigned getBasePtrIndex(const MemSDNode *N) {
11358 switch (N->getOpcode()) {
11359 case ISD::STORE:
11360 case ISD::INTRINSIC_W_CHAIN:
11361 case ISD::INTRINSIC_VOID:
11362 return 2;
11363 default:
11364 return 1;
11365 }
11366}
11367
11368SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11369 DAGCombinerInfo &DCI) const {
11370 SelectionDAG &DAG = DCI.DAG;
11371 SDLoc SL(N);
11372
11373 unsigned PtrIdx = getBasePtrIndex(N);
11374 SDValue Ptr = N->getOperand(Num: PtrIdx);
11375
11376 // TODO: We could also do this for multiplies.
11377 if (Ptr.getOpcode() == ISD::SHL) {
11378 SDValue NewPtr = performSHLPtrCombine(N: Ptr.getNode(), AddrSpace: N->getAddressSpace(),
11379 MemVT: N->getMemoryVT(), DCI);
11380 if (NewPtr) {
11381 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
11382
11383 NewOps[PtrIdx] = NewPtr;
11384 return SDValue(DAG.UpdateNodeOperands(N, Ops: NewOps), 0);
11385 }
11386 }
11387
11388 return SDValue();
11389}
11390
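// Returns true if the 32-bit op (Opc x, Val) trivially reduces to a constant or
// to x itself: and with 0 or -1, or with -1 or 0, xor with 0.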
11391static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11392 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11393 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11394 (Opc == ISD::XOR && Val == 0);
11395}
11396
11397// Break up a 64-bit bitwise operation with a constant into two 32-bit
11398// and/or/xor ops. This will typically happen anyway for a VALU 64-bit op. It
11399// exposes other 32-bit integer combine opportunities since most 64-bit
11400// operations are decomposed this way. TODO: We won't want this for SALU,
11401// especially if the constant is an inline immediate.
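// For example, (and i64:x, 0x00000000ffffffff) splits into (and lo_32(x), -1)
// and (and hi_32(x), 0), both of which are trivially reducible.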
11402SDValue SITargetLowering::splitBinaryBitConstantOp(
11403 DAGCombinerInfo &DCI,
11404 const SDLoc &SL,
11405 unsigned Opc, SDValue LHS,
11406 const ConstantSDNode *CRHS) const {
11407 uint64_t Val = CRHS->getZExtValue();
11408 uint32_t ValLo = Lo_32(Value: Val);
11409 uint32_t ValHi = Hi_32(Value: Val);
11410 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11411
11412 if ((bitOpWithConstantIsReducible(Opc, Val: ValLo) ||
11413 bitOpWithConstantIsReducible(Opc, Val: ValHi)) ||
11414 (CRHS->hasOneUse() && !TII->isInlineConstant(Imm: CRHS->getAPIntValue()))) {
11415 // If we need to materialize a 64-bit immediate, it will be split up later
11416 // anyway. Avoid creating the harder to understand 64-bit immediate
11417 // materialization.
11418 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11419 }
11420
11421 return SDValue();
11422}
11423
11424bool llvm::isBoolSGPR(SDValue V) {
11425 if (V.getValueType() != MVT::i1)
11426 return false;
11427 switch (V.getOpcode()) {
11428 default:
11429 break;
11430 case ISD::SETCC:
11431 case AMDGPUISD::FP_CLASS:
11432 return true;
11433 case ISD::AND:
11434 case ISD::OR:
11435 case ISD::XOR:
11436 return isBoolSGPR(V: V.getOperand(i: 0)) && isBoolSGPR(V: V.getOperand(i: 1));
11437 }
11438 return false;
11439}
11440
11441// If each byte of a constant is either all zeroes or all ones, return it.
11442// Otherwise return 0.
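// For example, 0x00ff00ff and 0xff0000ff are returned unchanged, while
// 0x00f0ffff yields 0 because byte 2 is only partially selected.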
11443static uint32_t getConstantPermuteMask(uint32_t C) {
11444 // 0xff for any zero byte in the mask
11445 uint32_t ZeroByteMask = 0;
11446 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
11447 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
11448 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
11449 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
11450 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11451 if ((NonZeroByteMask & C) != NonZeroByteMask)
11452 return 0; // Partial bytes selected.
11453 return C;
11454}
11455
11456// Check if a node selects whole bytes from its operand 0 starting at a byte
11457// boundary while masking the rest. Returns the select mask as used by
11458// v_perm_b32, or ~0 if it does not.
11459// Note byte select encoding:
11460// value 0-3 selects corresponding source byte;
11461// value 0xc selects zero;
11462// value 0xff selects 0xff.
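// For example: (and x, 0x0000ffff) -> 0x0c0c0100, (shl x, 16) -> 0x01000c0c,
// and (srl x, 16) -> 0x0c0c0302.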
11463static uint32_t getPermuteMask(SDValue V) {
11464 assert(V.getValueSizeInBits() == 32);
11465
11466 if (V.getNumOperands() != 2)
11467 return ~0;
11468
11469 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(Val: V.getOperand(i: 1));
11470 if (!N1)
11471 return ~0;
11472
11473 uint32_t C = N1->getZExtValue();
11474
11475 switch (V.getOpcode()) {
11476 default:
11477 break;
11478 case ISD::AND:
11479 if (uint32_t ConstMask = getConstantPermuteMask(C))
11480 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11481 break;
11482
11483 case ISD::OR:
11484 if (uint32_t ConstMask = getConstantPermuteMask(C))
11485 return (0x03020100 & ~ConstMask) | ConstMask;
11486 break;
11487
11488 case ISD::SHL:
11489 if (C % 8)
11490 return ~0;
11491
11492 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11493
11494 case ISD::SRL:
11495 if (C % 8)
11496 return ~0;
11497
11498 return uint32_t(0x0c0c0c0c03020100ull >> C);
11499 }
11500
11501 return ~0;
11502}
11503
11504SDValue SITargetLowering::performAndCombine(SDNode *N,
11505 DAGCombinerInfo &DCI) const {
11506 if (DCI.isBeforeLegalize())
11507 return SDValue();
11508
11509 SelectionDAG &DAG = DCI.DAG;
11510 EVT VT = N->getValueType(ResNo: 0);
11511 SDValue LHS = N->getOperand(Num: 0);
11512 SDValue RHS = N->getOperand(Num: 1);
11513
11514
11515 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
11516 if (VT == MVT::i64 && CRHS) {
11517 if (SDValue Split
11518 = splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::AND, LHS, CRHS))
11519 return Split;
11520 }
11521
11522 if (CRHS && VT == MVT::i32) {
11523 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11524 // nb = number of trailing zeroes in mask
11525 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11526    // given that we are selecting 8 or 16 bit fields starting at a byte boundary.
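    // For example, (and (srl x, 8), 0x00ff0000) becomes (shl (bfe x, 24, 8), 16).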
11527 uint64_t Mask = CRHS->getZExtValue();
11528 unsigned Bits = llvm::popcount(Value: Mask);
11529 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11530 (Bits == 8 || Bits == 16) && isShiftedMask_64(Value: Mask) && !(Mask & 1)) {
11531 if (auto *CShift = dyn_cast<ConstantSDNode>(Val: LHS->getOperand(Num: 1))) {
11532 unsigned Shift = CShift->getZExtValue();
11533 unsigned NB = CRHS->getAPIntValue().countr_zero();
11534 unsigned Offset = NB + Shift;
11535 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11536 SDLoc SL(N);
11537 SDValue BFE = DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT: MVT::i32,
11538 N1: LHS->getOperand(Num: 0),
11539 N2: DAG.getConstant(Val: Offset, DL: SL, VT: MVT::i32),
11540 N3: DAG.getConstant(Val: Bits, DL: SL, VT: MVT::i32));
11541 EVT NarrowVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: Bits);
11542 SDValue Ext = DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT, N1: BFE,
11543 N2: DAG.getValueType(NarrowVT));
11544 SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL: SDLoc(LHS), VT, N1: Ext,
11545 N2: DAG.getConstant(Val: NB, DL: SDLoc(CRHS), VT: MVT::i32));
11546 return Shl;
11547 }
11548 }
11549 }
11550
11551 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11552 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11553 isa<ConstantSDNode>(Val: LHS.getOperand(i: 2))) {
11554 uint32_t Sel = getConstantPermuteMask(C: Mask);
11555 if (!Sel)
11556 return SDValue();
11557
11558 // Select 0xc for all zero bytes
11559 Sel = (LHS.getConstantOperandVal(i: 2) & Sel) | (~Sel & 0x0c0c0c0c);
11560 SDLoc DL(N);
11561 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
11562 N2: LHS.getOperand(i: 1), N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
11563 }
11564 }
11565
11566 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11567 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11568 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11569 ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: 2))->get();
11570 ISD::CondCode RCC = cast<CondCodeSDNode>(Val: RHS.getOperand(i: 2))->get();
11571
11572 SDValue X = LHS.getOperand(i: 0);
11573 SDValue Y = RHS.getOperand(i: 0);
11574 if (Y.getOpcode() != ISD::FABS || Y.getOperand(i: 0) != X ||
11575 !isTypeLegal(VT: X.getValueType()))
11576 return SDValue();
11577
11578 if (LCC == ISD::SETO) {
11579 if (X != LHS.getOperand(i: 1))
11580 return SDValue();
11581
11582 if (RCC == ISD::SETUNE) {
11583 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(Val: RHS.getOperand(i: 1));
11584 if (!C1 || !C1->isInfinity() || C1->isNegative())
11585 return SDValue();
11586
11587 const uint32_t Mask = SIInstrFlags::N_NORMAL |
11588 SIInstrFlags::N_SUBNORMAL |
11589 SIInstrFlags::N_ZERO |
11590 SIInstrFlags::P_ZERO |
11591 SIInstrFlags::P_SUBNORMAL |
11592 SIInstrFlags::P_NORMAL;
11593
11594 static_assert(((~(SIInstrFlags::S_NAN |
11595 SIInstrFlags::Q_NAN |
11596 SIInstrFlags::N_INFINITY |
11597 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
11598 "mask not equal");
11599
11600 SDLoc DL(N);
11601 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1,
11602 N1: X, N2: DAG.getConstant(Val: Mask, DL, VT: MVT::i32));
11603 }
11604 }
11605 }
11606
11607 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11608 std::swap(a&: LHS, b&: RHS);
11609
11610 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11611 RHS.hasOneUse()) {
11612 ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: 2))->get();
11613    // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(s_nan | q_nan)
11614    // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (s_nan | q_nan)
11615 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: 1));
11616 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11617 (RHS.getOperand(i: 0) == LHS.getOperand(i: 0) &&
11618 LHS.getOperand(i: 0) == LHS.getOperand(i: 1))) {
11619 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11620 unsigned NewMask = LCC == ISD::SETO ?
11621 Mask->getZExtValue() & ~OrdMask :
11622 Mask->getZExtValue() & OrdMask;
11623
11624 SDLoc DL(N);
11625 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: RHS.getOperand(i: 0),
11626 N2: DAG.getConstant(Val: NewMask, DL, VT: MVT::i32));
11627 }
11628 }
11629
11630 if (VT == MVT::i32 &&
11631 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11632 // and x, (sext cc from i1) => select cc, x, 0
11633 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11634 std::swap(a&: LHS, b&: RHS);
11635 if (isBoolSGPR(V: RHS.getOperand(i: 0)))
11636 return DAG.getSelect(DL: SDLoc(N), VT: MVT::i32, Cond: RHS.getOperand(i: 0),
11637 LHS, RHS: DAG.getConstant(Val: 0, DL: SDLoc(N), VT: MVT::i32));
11638 }
11639
11640 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11641 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11642 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11643 N->isDivergent() && TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {
11644 uint32_t LHSMask = getPermuteMask(V: LHS);
11645 uint32_t RHSMask = getPermuteMask(V: RHS);
11646 if (LHSMask != ~0u && RHSMask != ~0u) {
11647 // Canonicalize the expression in an attempt to have fewer unique masks
11648 // and therefore fewer registers used to hold the masks.
11649 if (LHSMask > RHSMask) {
11650 std::swap(a&: LHSMask, b&: RHSMask);
11651 std::swap(a&: LHS, b&: RHS);
11652 }
11653
11654      // Select 0xc for each lane used from a source operand. Zero bytes have the
11655      // 0xc mask set, 0xff bytes have 0xff, and actual lanes are in the 0-3 range.
11656 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11657 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11658
11659      // Check if we need to combine values from two sources within a byte.
11660 if (!(LHSUsedLanes & RHSUsedLanes) &&
11661          // If we select the high and the low word, keep it for SDWA.
11662 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11663 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11664        // Each byte of each mask is either a selector value 0-3, or has higher
11665        // bits set: 0xff selects the constant 0xff and 0x0c selects zero. If
11666        // either mask has 0x0c for a byte, the result byte must be 0x0c.
11667        // Otherwise the mask that is not 0xff wins. ANDing the two masks gives
11668        // the correct result, except that such a byte must be forced back to 0x0c.
11669 uint32_t Mask = LHSMask & RHSMask;
11670 for (unsigned I = 0; I < 32; I += 8) {
11671 uint32_t ByteSel = 0xff << I;
11672 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11673 Mask &= (0x0c << I) & 0xffffffff;
11674 }
11675
11676 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11677 // or 0x0c.
11678 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11679 SDLoc DL(N);
11680
11681 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32,
11682 N1: LHS.getOperand(i: 0), N2: RHS.getOperand(i: 0),
11683 N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
11684 }
11685 }
11686 }
11687
11688 return SDValue();
11689}
11690
11691// A key component of v_perm is a mapping between the byte positions of the src
11692// operands and the byte positions of the dest. To provide such a mapping, we
11693// need: 1. the node that provides byte x of the dest of the OR, and 2. the byte
11694// of that node used to provide byte x. calculateByteProvider finds which node
11695// provides a certain byte of the dest of the OR, and calculateSrcByte takes
11696// that node and finds the ultimate src and byte position. For example, the
11697// supported LoadCombine pattern for vector loads is as follows
11698// t1
11699// or
11700// / \
11701// t2 t3
11702// zext shl
11703// | | \
11704// t4 t5 16
11705// or anyext
11706// / \ |
11707// t6 t7 t8
11708// srl shl or
11709// / | / \ / \
11710// t9 t10 t11 t12 t13 t14
11711// trunc* 8 trunc* 8 and and
11712// | | / | | \
11713// t15 t16 t17 t18 t19 t20
11714// trunc* 255 srl -256
11715// | / \
11716// t15 t15 16
11717//
11718// *In this example, the truncs are from i32->i16
11719//
11720// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11721// respectively. calculateSrcByte would find (given node) -> ultimate src &
11722// byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11723// After finding the mapping, we can combine the tree into vperm t15, t16,
11724// 0x05000407
11725
11726// Find the source and byte position from a node.
11727// \p DestByte is the byte position of the dest of the or that the src
11728// ultimately provides. \p SrcIndex is the byte of the src that maps to this
11729// byte of the dest of the or. \p Depth tracks how many recursive iterations we
11730// have performed.
11730// performed.
11731static const std::optional<ByteProvider<SDValue>>
11732calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11733 unsigned Depth = 0) {
11734 // We may need to recursively traverse a series of SRLs
11735 if (Depth >= 6)
11736 return std::nullopt;
11737
11738 if (Op.getValueSizeInBits() < 8)
11739 return std::nullopt;
11740
11741 if (Op.getValueType().isVector())
11742 return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex);
11743
11744 switch (Op->getOpcode()) {
11745 case ISD::TRUNCATE: {
11746 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
11747 }
11748
11749 case ISD::SIGN_EXTEND:
11750 case ISD::ZERO_EXTEND:
11751 case ISD::SIGN_EXTEND_INREG: {
11752 SDValue NarrowOp = Op->getOperand(Num: 0);
11753 auto NarrowVT = NarrowOp.getValueType();
11754 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11755 auto *VTSign = cast<VTSDNode>(Val: Op->getOperand(Num: 1));
11756 NarrowVT = VTSign->getVT();
11757 }
11758 if (!NarrowVT.isByteSized())
11759 return std::nullopt;
11760 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
11761
11762 if (SrcIndex >= NarrowByteWidth)
11763 return std::nullopt;
11764 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
11765 }
11766
11767 case ISD::SRA:
11768 case ISD::SRL: {
11769 auto ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
11770 if (!ShiftOp)
11771 return std::nullopt;
11772
11773 uint64_t BitShift = ShiftOp->getZExtValue();
11774
11775 if (BitShift % 8 != 0)
11776 return std::nullopt;
11777
11778 SrcIndex += BitShift / 8;
11779
11780 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
11781 }
11782
11783 default: {
11784 return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex);
11785 }
11786 }
11787 llvm_unreachable("fully handled switch");
11788}
11789
11790// For a byte position in the result of an Or, traverse the tree and find the
11791// node (and the byte of the node) which ultimately provides this {Or,
11792// BytePosition}. \p Op is the operand we are currently examining. \p Index is
11793// the byte position of the Op that corresponds with the originally requested
11794// byte of the Or. \p Depth tracks how many recursive iterations we have
11795// performed. \p StartingIndex is the originally requested byte of the Or
11796static const std::optional<ByteProvider<SDValue>>
11797calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
11798 unsigned StartingIndex = 0) {
11799 // Finding Src tree of RHS of or typically requires at least 1 additional
11800 // depth
11801 if (Depth > 6)
11802 return std::nullopt;
11803
11804 unsigned BitWidth = Op.getScalarValueSizeInBits();
11805 if (BitWidth % 8 != 0)
11806 return std::nullopt;
11807 if (Index > BitWidth / 8 - 1)
11808 return std::nullopt;
11809
11810 bool IsVec = Op.getValueType().isVector();
11811 switch (Op.getOpcode()) {
11812 case ISD::OR: {
11813 if (IsVec)
11814 return std::nullopt;
11815
11816 auto RHS = calculateByteProvider(Op: Op.getOperand(i: 1), Index, Depth: Depth + 1,
11817 StartingIndex);
11818 if (!RHS)
11819 return std::nullopt;
11820 auto LHS = calculateByteProvider(Op: Op.getOperand(i: 0), Index, Depth: Depth + 1,
11821 StartingIndex);
11822 if (!LHS)
11823 return std::nullopt;
11824 // A well formed Or will have two ByteProviders for each byte, one of which
11825 // is constant zero
11826 if (!LHS->isConstantZero() && !RHS->isConstantZero())
11827 return std::nullopt;
11828 if (!LHS || LHS->isConstantZero())
11829 return RHS;
11830 if (!RHS || RHS->isConstantZero())
11831 return LHS;
11832 return std::nullopt;
11833 }
11834
11835 case ISD::AND: {
11836 if (IsVec)
11837 return std::nullopt;
11838
11839 auto BitMaskOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
11840 if (!BitMaskOp)
11841 return std::nullopt;
11842
11843 uint32_t BitMask = BitMaskOp->getZExtValue();
11844    // Bits we expect for our Index
11845 uint32_t IndexMask = 0xFF << (Index * 8);
11846
11847 if ((IndexMask & BitMask) != IndexMask) {
11848      // If the result of the and only partially provides the byte, then it
11849      // is not well formed
11850 if (IndexMask & BitMask)
11851 return std::nullopt;
11852 return ByteProvider<SDValue>::getConstantZero();
11853 }
11854
11855 return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte: StartingIndex, SrcIndex: Index);
11856 }
11857
11858 case ISD::FSHR: {
11859 if (IsVec)
11860 return std::nullopt;
11861
11862 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
11863 auto ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 2));
11864 if (!ShiftOp || Op.getValueType().isVector())
11865 return std::nullopt;
11866
11867 uint64_t BitsProvided = Op.getValueSizeInBits();
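  // On subtargets with the fract bug, the hardware frexp results are not the
  // expected mant == input, exp == 0 for infinities and NaNs, so select those
  // values explicitly for non-finite inputs.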
11868 if (BitsProvided % 8 != 0)
11869 return std::nullopt;
11870
11871 uint64_t BitShift = ShiftOp->getAPIntValue().urem(RHS: BitsProvided);
11872 if (BitShift % 8)
11873 return std::nullopt;
11874
11875 uint64_t ConcatSizeInBytes = BitsProvided / 4;
11876 uint64_t ByteShift = BitShift / 8;
11877
11878 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
11879 uint64_t BytesProvided = BitsProvided / 8;
11880 SDValue NextOp = Op.getOperand(i: NewIndex >= BytesProvided ? 0 : 1);
11881 NewIndex %= BytesProvided;
11882 return calculateByteProvider(Op: NextOp, Index: NewIndex, Depth: Depth + 1, StartingIndex);
11883 }
11884
11885 case ISD::SRA:
11886 case ISD::SRL: {
11887 if (IsVec)
11888 return std::nullopt;
11889
11890 auto ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
11891 if (!ShiftOp)
11892 return std::nullopt;
11893
11894 uint64_t BitShift = ShiftOp->getZExtValue();
11895 if (BitShift % 8)
11896 return std::nullopt;
11897
11898 auto BitsProvided = Op.getScalarValueSizeInBits();
11899 if (BitsProvided % 8 != 0)
11900 return std::nullopt;
11901
11902 uint64_t BytesProvided = BitsProvided / 8;
11903 uint64_t ByteShift = BitShift / 8;
11904 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
11905 // If the byte we are trying to provide (as tracked by index) falls in this
11906 // range, then the SRL provides the byte. The byte of interest of the src of
11907 // the SRL is Index + ByteShift
11908 return BytesProvided - ByteShift > Index
11909 ? calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte: StartingIndex,
11910 SrcIndex: Index + ByteShift)
11911 : ByteProvider<SDValue>::getConstantZero();
11912 }
11913
11914 case ISD::SHL: {
11915 if (IsVec)
11916 return std::nullopt;
11917
11918 auto ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
11919 if (!ShiftOp)
11920 return std::nullopt;
11921
11922 uint64_t BitShift = ShiftOp->getZExtValue();
11923 if (BitShift % 8 != 0)
11924 return std::nullopt;
11925 uint64_t ByteShift = BitShift / 8;
11926
11927 // If we are shifting by an amount greater than (or equal to)
11928 // the index we are trying to provide, then it provides 0s. If not,
11929    // then the byte is not definitively 0, and the corresponding byte
11930    // of interest is byte Index - ByteShift of the src.
11931 return Index < ByteShift
11932 ? ByteProvider<SDValue>::getConstantZero()
11933 : calculateByteProvider(Op: Op.getOperand(i: 0), Index: Index - ByteShift,
11934 Depth: Depth + 1, StartingIndex);
11935 }
11936 case ISD::ANY_EXTEND:
11937 case ISD::SIGN_EXTEND:
11938 case ISD::ZERO_EXTEND:
11939 case ISD::SIGN_EXTEND_INREG:
11940 case ISD::AssertZext:
11941 case ISD::AssertSext: {
11942 if (IsVec)
11943 return std::nullopt;
11944
11945 SDValue NarrowOp = Op->getOperand(Num: 0);
11946 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
11947 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
11948 Op->getOpcode() == ISD::AssertZext ||
11949 Op->getOpcode() == ISD::AssertSext) {
11950 auto *VTSign = cast<VTSDNode>(Val: Op->getOperand(Num: 1));
11951 NarrowBitWidth = VTSign->getVT().getSizeInBits();
11952 }
11953 if (NarrowBitWidth % 8 != 0)
11954 return std::nullopt;
11955 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11956
11957 if (Index >= NarrowByteWidth)
11958 return Op.getOpcode() == ISD::ZERO_EXTEND
11959 ? std::optional<ByteProvider<SDValue>>(
11960 ByteProvider<SDValue>::getConstantZero())
11961 : std::nullopt;
11962 return calculateByteProvider(Op: NarrowOp, Index, Depth: Depth + 1, StartingIndex);
11963 }
11964
11965 case ISD::TRUNCATE: {
11966 if (IsVec)
11967 return std::nullopt;
11968
11969 uint64_t NarrowByteWidth = BitWidth / 8;
11970
11971 if (NarrowByteWidth >= Index) {
11972 return calculateByteProvider(Op: Op.getOperand(i: 0), Index, Depth: Depth + 1,
11973 StartingIndex);
11974 }
11975
11976 return std::nullopt;
11977 }
11978
11979 case ISD::CopyFromReg: {
11980 if (BitWidth / 8 > Index)
11981 return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index);
11982
11983 return std::nullopt;
11984 }
11985
11986 case ISD::LOAD: {
11987 auto L = cast<LoadSDNode>(Val: Op.getNode());
11988
11989 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
11990 if (NarrowBitWidth % 8 != 0)
11991 return std::nullopt;
11992 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
11993
11994    // If the width of the load does not reach the byte we are trying to provide
11995 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
11996 // question
11997 if (Index >= NarrowByteWidth) {
11998 return L->getExtensionType() == ISD::ZEXTLOAD
11999 ? std::optional<ByteProvider<SDValue>>(
12000 ByteProvider<SDValue>::getConstantZero())
12001 : std::nullopt;
12002 }
12003
12004 if (NarrowByteWidth > Index) {
12005 return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index);
12006 }
12007
12008 return std::nullopt;
12009 }
12010
12011 case ISD::BSWAP: {
12012 if (IsVec)
12013 return std::nullopt;
12014
12015 return calculateByteProvider(Op: Op->getOperand(Num: 0), Index: BitWidth / 8 - Index - 1,
12016 Depth: Depth + 1, StartingIndex);
12017 }
12018
12019 case ISD::EXTRACT_VECTOR_ELT: {
12020 auto IdxOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
12021 if (!IdxOp)
12022 return std::nullopt;
12023 auto VecIdx = IdxOp->getZExtValue();
12024 auto ScalarSize = Op.getScalarValueSizeInBits();
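    // For sub-dword elements, translate the element index into a byte index of
    // the source vector: each i8 element is one byte, each i16 element is two.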
12025 if (ScalarSize < 32)
12026 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12027 return calculateSrcByte(Op: ScalarSize >= 32 ? Op : Op.getOperand(i: 0),
12028 DestByte: StartingIndex, SrcIndex: Index);
12029 }
12030
12031 case AMDGPUISD::PERM: {
12032 if (IsVec)
12033 return std::nullopt;
12034
12035 auto PermMask = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 2));
12036 if (!PermMask)
12037 return std::nullopt;
12038
12039 auto IdxMask =
12040 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12041 if (IdxMask > 0x07 && IdxMask != 0x0c)
12042 return std::nullopt;
12043
12044 auto NextOp = Op.getOperand(i: IdxMask > 0x03 ? 0 : 1);
12045 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12046
12047 return IdxMask != 0x0c ? calculateSrcByte(Op: NextOp, DestByte: StartingIndex, SrcIndex: NextIndex)
12048 : ByteProvider<SDValue>(
12049 ByteProvider<SDValue>::getConstantZero());
12050 }
12051
12052 default: {
12053 return std::nullopt;
12054 }
12055 }
12056
12057 llvm_unreachable("fully handled switch");
12058}
12059
12060// Returns true if the Operand is a scalar that is extended or loaded from a
12061// 16-bit value
12061static bool isExtendedFrom16Bits(SDValue &Operand) {
12062
12063 switch (Operand.getOpcode()) {
12064 case ISD::ANY_EXTEND:
12065 case ISD::SIGN_EXTEND:
12066 case ISD::ZERO_EXTEND: {
12067 auto OpVT = Operand.getOperand(i: 0).getValueType();
12068 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12069 }
12070 case ISD::LOAD: {
12071 LoadSDNode *L = cast<LoadSDNode>(Val: Operand.getNode());
12072 auto ExtType = cast<LoadSDNode>(Val: L)->getExtensionType();
12073 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12074 ExtType == ISD::EXTLOAD) {
12075 auto MemVT = L->getMemoryVT();
12076 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12077 }
12078 return L->getMemoryVT().getSizeInBits() == 16;
12079 }
12080 default:
12081 return false;
12082 }
12083}
12084
12085// Returns true if the mask selects consecutive bytes and the first byte
12086// begins at an even (16-bit aligned) offset from byte 0.
12087static bool addresses16Bits(int Mask) {
12088 int Low8 = Mask & 0xff;
12089 int Hi8 = (Mask & 0xff00) >> 8;
12090
12091 assert(Low8 < 8 && Hi8 < 8);
12092 // Are the bytes contiguous in the order of increasing addresses.
12093 bool IsConsecutive = (Hi8 - Low8 == 1);
12094  // Is the first byte at a location that is aligned for 16 bit instructions.
12095 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
12096 // In this case, we still need code to extract the 16 bit operand, so it
12097 // is better to use i8 v_perm
12098 bool Is16Aligned = !(Low8 % 2);
12099
12100 return IsConsecutive && Is16Aligned;
12101}
12102
12103// Do not lower into v_perm if the operands are actually 16 bit
12104// and the selected bits (based on PermMask) correspond with two
12105// easily addressable 16 bit operands.
12106static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12107 SDValue &OtherOp) {
12108 int Low16 = PermMask & 0xffff;
12109 int Hi16 = (PermMask & 0xffff0000) >> 16;
12110
12111 auto TempOp = peekThroughBitcasts(V: Op);
12112 auto TempOtherOp = peekThroughBitcasts(V: OtherOp);
12113
12114 auto OpIs16Bit =
12115 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(Operand&: TempOp);
12116 if (!OpIs16Bit)
12117 return true;
12118
12119 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12120 isExtendedFrom16Bits(Operand&: TempOtherOp);
12121 if (!OtherOpIs16Bit)
12122 return true;
12123
12124 // Do we cleanly address both
12125 return !addresses16Bits(Mask: Low16) || !addresses16Bits(Mask: Hi16);
12126}
12127
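// Extract the 32-bit dword at dword index DWordOffset from Src (scalar or
// vector), returned as an i32.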
12128static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12129 unsigned DWordOffset) {
12130 SDValue Ret;
12131
12132 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12133 // ByteProvider must be at least 8 bits
12134 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12135
12136 if (TypeSize <= 32)
12137 return DAG.getBitcastedAnyExtOrTrunc(Op: Src, DL: SL, VT: MVT::i32);
12138
12139 if (Src.getValueType().isVector()) {
12140 auto ScalarTySize = Src.getScalarValueSizeInBits();
12141 auto ScalarTy = Src.getValueType().getScalarType();
12142 if (ScalarTySize == 32) {
12143 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Src,
12144 N2: DAG.getConstant(Val: DWordOffset, DL: SL, VT: MVT::i32));
12145 }
12146 if (ScalarTySize > 32) {
12147 Ret = DAG.getNode(
12148 Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ScalarTy, N1: Src,
12149 N2: DAG.getConstant(Val: DWordOffset / (ScalarTySize / 32), DL: SL, VT: MVT::i32));
12150 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12151 if (ShiftVal)
12152 Ret = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: Ret.getValueType(), N1: Ret,
12153 N2: DAG.getConstant(Val: ShiftVal, DL: SL, VT: MVT::i32));
12154 return DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
12155 }
12156
12157 assert(ScalarTySize < 32);
12158 auto NumElements = TypeSize / ScalarTySize;
12159 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12160 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12161 auto NumElementsIn32 = 32 / ScalarTySize;
12162 auto NumAvailElements = DWordOffset < Trunc32Elements
12163 ? NumElementsIn32
12164 : NumElements - NormalizedTrunc;
12165
12166 SmallVector<SDValue, 4> VecSrcs;
12167 DAG.ExtractVectorElements(Op: Src, Args&: VecSrcs, Start: DWordOffset * NumElementsIn32,
12168 Count: NumAvailElements);
12169
12170 Ret = DAG.getBuildVector(
12171 VT: MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: ScalarTySize), NumElements: NumAvailElements), DL: SL,
12172 Ops: VecSrcs);
12173 return Ret = DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
12174 }
12175
12176  // Scalar type.
12177 auto ShiftVal = 32 * DWordOffset;
12178 Ret = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: Src.getValueType(), N1: Src,
12179 N2: DAG.getConstant(Val: ShiftVal, DL: SL, VT: MVT::i32));
12180 return DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
12181}
12182
12183static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12184 SelectionDAG &DAG = DCI.DAG;
12185 [[maybe_unused]] EVT VT = N->getValueType(ResNo: 0);
12186 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12187
12188 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12189 assert(VT == MVT::i32);
12190 for (int i = 0; i < 4; i++) {
12191 // Find the ByteProvider that provides the ith byte of the result of OR
12192 std::optional<ByteProvider<SDValue>> P =
12193 calculateByteProvider(Op: SDValue(N, 0), Index: i, Depth: 0, /*StartingIndex = */ i);
12194 // TODO support constantZero
12195 if (!P || P->isConstantZero())
12196 return SDValue();
12197
12198 PermNodes.push_back(Elt: *P);
12199 }
12200 if (PermNodes.size() != 4)
12201 return SDValue();
12202
12203 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12204 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12205 uint64_t PermMask = 0x00000000;
12206 for (size_t i = 0; i < PermNodes.size(); i++) {
12207 auto PermOp = PermNodes[i];
12208 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12209 // by sizeof(Src2) = 4
12210 int SrcByteAdjust = 4;
12211
12212 // If the Src uses a byte from a different DWORD, then it corresponds
12213    // with a different source
12214 if (!PermOp.hasSameSrc(Other: PermNodes[FirstSrc.first]) ||
12215 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12216 if (SecondSrc)
12217 if (!PermOp.hasSameSrc(Other: PermNodes[SecondSrc->first]) ||
12218 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12219 return SDValue();
12220
12221 // Set the index of the second distinct Src node
12222 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12223 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12224 SrcByteAdjust = 0;
12225 }
12226 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12227 assert(!DAG.getDataLayout().isBigEndian());
12228 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12229 }
12230 SDLoc DL(N);
12231 SDValue Op = *PermNodes[FirstSrc.first].Src;
12232 Op = getDWordFromOffset(DAG, SL: DL, Src: Op, DWordOffset: FirstSrc.second);
12233 assert(Op.getValueSizeInBits() == 32);
12234
12235 // Check that we are not just extracting the bytes in order from an op
12236 if (!SecondSrc) {
12237 int Low16 = PermMask & 0xffff;
12238 int Hi16 = (PermMask & 0xffff0000) >> 16;
12239
12240 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12241 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12242
12243 // The perm op would really just produce Op. So combine into Op
12244 if (WellFormedLow && WellFormedHi)
12245 return DAG.getBitcast(VT: MVT::getIntegerVT(BitWidth: 32), V: Op);
12246 }
12247
12248 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12249
12250 if (SecondSrc) {
12251 OtherOp = getDWordFromOffset(DAG, SL: DL, Src: OtherOp, DWordOffset: SecondSrc->second);
12252 assert(OtherOp.getValueSizeInBits() == 32);
12253 }
12254
12255 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12256
12257 assert(Op.getValueType().isByteSized() &&
12258 OtherOp.getValueType().isByteSized());
12259
12260 // If the ultimate src is less than 32 bits, then we will only be
12261 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12262    // calculateByteProvider would not have returned Op as a source if we
12263    // used a byte that is outside its ValueType. Thus, we are free to
12264    // ANY_EXTEND as the extended bits are don't-cares.
12265 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, VT: MVT::i32);
12266 OtherOp = DAG.getBitcastedAnyExtOrTrunc(Op: OtherOp, DL, VT: MVT::i32);
12267
12268 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: Op, N2: OtherOp,
12269 N3: DAG.getConstant(Val: PermMask, DL, VT: MVT::i32));
12270 }
12271 return SDValue();
12272}
12273
12274SDValue SITargetLowering::performOrCombine(SDNode *N,
12275 DAGCombinerInfo &DCI) const {
12276 SelectionDAG &DAG = DCI.DAG;
12277 SDValue LHS = N->getOperand(Num: 0);
12278 SDValue RHS = N->getOperand(Num: 1);
12279
12280 EVT VT = N->getValueType(ResNo: 0);
12281 if (VT == MVT::i1) {
12282 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12283 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12284 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12285 SDValue Src = LHS.getOperand(i: 0);
12286 if (Src != RHS.getOperand(i: 0))
12287 return SDValue();
12288
12289 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
12290 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: 1));
12291 if (!CLHS || !CRHS)
12292 return SDValue();
12293
12294 // Only 10 bits are used.
12295 static const uint32_t MaxMask = 0x3ff;
12296
12297 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12298 SDLoc DL(N);
12299 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1,
12300 N1: Src, N2: DAG.getConstant(Val: NewMask, DL, VT: MVT::i32));
12301 }
12302
12303 return SDValue();
12304 }
12305
12306 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12307 if (isa<ConstantSDNode>(Val: RHS) && LHS.hasOneUse() &&
12308 LHS.getOpcode() == AMDGPUISD::PERM &&
12309 isa<ConstantSDNode>(Val: LHS.getOperand(i: 2))) {
12310 uint32_t Sel = getConstantPermuteMask(C: N->getConstantOperandVal(Num: 1));
12311 if (!Sel)
12312 return SDValue();
12313
12314 Sel |= LHS.getConstantOperandVal(i: 2);
12315 SDLoc DL(N);
12316 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
12317 N2: LHS.getOperand(i: 1), N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
12318 }
12319
12320 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12321 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12322 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12323 N->isDivergent() && TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {
12324
12325 // If all the uses of an or need to extract the individual elements, do not
12326 // attempt to lower into v_perm
12327 auto usesCombinedOperand = [](SDNode *OrUse) {
12328 // A use that is not a bitcast to a vector consumes the combined value directly, so it is a candidate for v_perm
12329 if (OrUse->getOpcode() != ISD::BITCAST ||
12330 !OrUse->getValueType(ResNo: 0).isVector())
12331 return true;
12332
12333 // If we have any non-vectorized use, then it is a candidate for v_perm
12334 for (auto VUse : OrUse->uses()) {
12335 if (!VUse->getValueType(ResNo: 0).isVector())
12336 return true;
12337
12338 // If the use of a vector is a store, then combining via a v_perm
12339 // is beneficial.
12340 // TODO -- whitelist more uses
12341 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12342 if (VUse->getOpcode() == VectorwiseOp)
12343 return true;
12344 }
12345 return false;
12346 };
12347
12348 if (!any_of(Range: N->uses(), P: usesCombinedOperand))
12349 return SDValue();
12350
12351 uint32_t LHSMask = getPermuteMask(V: LHS);
12352 uint32_t RHSMask = getPermuteMask(V: RHS);
12353
12354 if (LHSMask != ~0u && RHSMask != ~0u) {
12355 // Canonicalize the expression in an attempt to have fewer unique masks
12356 // and therefore fewer registers used to hold the masks.
12357 if (LHSMask > RHSMask) {
12358 std::swap(a&: LHSMask, b&: RHSMask);
12359 std::swap(a&: LHS, b&: RHS);
12360 }
12361
12362 // Select 0xc for each lane used from a source operand. Zero lanes have 0xc
12363 // in the mask, 0xff lanes have 0xff, and actual source lanes are in the 0-3 range.
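// Illustrative note (an assumption about the select encoding, not from the
// original comments): in the resulting v_perm, select values 4-7 pick bytes of
// the first operand (hence the +4 adjustment below), 0-3 pick bytes of the
// second operand, and 0x0c yields zero; e.g. a combined mask of 0x07060100
// takes the two high bytes from the first source and the two low bytes from
// the second.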
12364 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12365 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12366
12367 // Check if we need to combine values from two sources within a byte.
12368 if (!(LHSUsedLanes & RHSUsedLanes) &&
12369 // If we select the high word from LHS and the low word from RHS, keep it for SDWA.
12370 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12371 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12372 // Kill zero bytes selected by other mask. Zero value is 0xc.
12373 LHSMask &= ~RHSUsedLanes;
12374 RHSMask &= ~LHSUsedLanes;
12375 // Add 4 to each active LHS lane
12376 LHSMask |= LHSUsedLanes & 0x04040404;
12377 // Combine masks
12378 uint32_t Sel = LHSMask | RHSMask;
12379 SDLoc DL(N);
12380
12381 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32,
12382 N1: LHS.getOperand(i: 0), N2: RHS.getOperand(i: 0),
12383 N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
12384 }
12385 }
12386 if (LHSMask == ~0u || RHSMask == ~0u) {
12387 if (SDValue Perm = matchPERM(N, DCI))
12388 return Perm;
12389 }
12390 }
12391
12392 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12393 return SDValue();
12394
12395 // TODO: This could be a generic combine with a predicate for extracting the
12396 // high half of an integer being free.
12397
12398 // (or i64:x, (zero_extend i32:y)) ->
12399 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12400 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12401 RHS.getOpcode() != ISD::ZERO_EXTEND)
12402 std::swap(a&: LHS, b&: RHS);
12403
12404 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12405 SDValue ExtSrc = RHS.getOperand(i: 0);
12406 EVT SrcVT = ExtSrc.getValueType();
12407 if (SrcVT == MVT::i32) {
12408 SDLoc SL(N);
12409 SDValue LowLHS, HiBits;
12410 std::tie(args&: LowLHS, args&: HiBits) = split64BitValue(Op: LHS, DAG);
12411 SDValue LowOr = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: LowLHS, N2: ExtSrc);
12412
12413 DCI.AddToWorklist(N: LowOr.getNode());
12414 DCI.AddToWorklist(N: HiBits.getNode());
12415
12416 SDValue Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32,
12417 N1: LowOr, N2: HiBits);
12418 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
12419 }
12420 }
12421
12422 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
12423 if (CRHS) {
12424 if (SDValue Split
12425 = splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::OR,
12426 LHS: N->getOperand(Num: 0), CRHS))
12427 return Split;
12428 }
12429
12430 return SDValue();
12431}
12432
12433SDValue SITargetLowering::performXorCombine(SDNode *N,
12434 DAGCombinerInfo &DCI) const {
12435 if (SDValue RV = reassociateScalarOps(N, DAG&: DCI.DAG))
12436 return RV;
12437
12438 SDValue LHS = N->getOperand(Num: 0);
12439 SDValue RHS = N->getOperand(Num: 1);
12440
12441 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
12442 SelectionDAG &DAG = DCI.DAG;
12443
12444 EVT VT = N->getValueType(ResNo: 0);
12445 if (CRHS && VT == MVT::i64) {
12446 if (SDValue Split
12447 = splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::XOR, LHS, CRHS))
12448 return Split;
12449 }
12450
12451 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12452 // fneg-like xors into 64-bit select.
12453 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12454 // This looks like an fneg, try to fold as a source modifier.
12455 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12456 shouldFoldFNegIntoSrc(FNeg: N, FNegSrc: LHS)) {
12457 // xor (select c, a, b), 0x80000000 ->
12458 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12459 SDLoc DL(N);
12460 SDValue CastLHS =
12461 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: LHS->getOperand(Num: 1));
12462 SDValue CastRHS =
12463 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: LHS->getOperand(Num: 2));
12464 SDValue FNegLHS = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f32, Operand: CastLHS);
12465 SDValue FNegRHS = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f32, Operand: CastRHS);
12466 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::f32,
12467 N1: LHS->getOperand(Num: 0), N2: FNegLHS, N3: FNegRHS);
12468 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewSelect);
12469 }
12470 }
12471
12472 return SDValue();
12473}
12474
12475SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12476 DAGCombinerInfo &DCI) const {
12477 if (!Subtarget->has16BitInsts() ||
12478 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12479 return SDValue();
12480
12481 EVT VT = N->getValueType(ResNo: 0);
12482 if (VT != MVT::i32)
12483 return SDValue();
12484
12485 SDValue Src = N->getOperand(Num: 0);
12486 if (Src.getValueType() != MVT::i16)
12487 return SDValue();
12488
12489 return SDValue();
12490}
12491
12492SDValue
12493SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12494 DAGCombinerInfo &DCI) const {
12495 SDValue Src = N->getOperand(Num: 0);
12496 auto *VTSign = cast<VTSDNode>(Val: N->getOperand(Num: 1));
12497
12498 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12499 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12500 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12501 VTSign->getVT() == MVT::i8) ||
12502 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12503 VTSign->getVT() == MVT::i16))) {
12504 assert(Subtarget->hasScalarSubwordLoads() &&
12505 "s_buffer_load_{u8, i8} are supported "
12506 "in GFX12 (or newer) architectures.");
12507 EVT VT = Src.getValueType();
12508 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12509 ? AMDGPUISD::SBUFFER_LOAD_BYTE
12510 : AMDGPUISD::SBUFFER_LOAD_SHORT;
12511 SDLoc DL(N);
12512 SDVTList ResList = DCI.DAG.getVTList(VT: MVT::i32);
12513 SDValue Ops[] = {
12514 Src.getOperand(i: 0), // source register
12515 Src.getOperand(i: 1), // offset
12516 Src.getOperand(i: 2) // cachePolicy
12517 };
12518 auto *M = cast<MemSDNode>(Val&: Src);
12519 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12520 Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
12521 SDValue LoadVal = DCI.DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
12522 return LoadVal;
12523 }
12524 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12525 VTSign->getVT() == MVT::i8) ||
12526 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12527 VTSign->getVT() == MVT::i16)) &&
12528 Src.hasOneUse()) {
12529 auto *M = cast<MemSDNode>(Val&: Src);
12530 SDValue Ops[] = {
12531 Src.getOperand(i: 0), // Chain
12532 Src.getOperand(i: 1), // rsrc
12533 Src.getOperand(i: 2), // vindex
12534 Src.getOperand(i: 3), // voffset
12535 Src.getOperand(i: 4), // soffset
12536 Src.getOperand(i: 5), // offset
12537 Src.getOperand(i: 6),
12538 Src.getOperand(i: 7)
12539 };
12540 // replace with BUFFER_LOAD_BYTE/SHORT
12541 SDVTList ResList = DCI.DAG.getVTList(VT1: MVT::i32,
12542 VT2: Src.getOperand(i: 0).getValueType());
12543 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
12544 AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
12545 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opcode: Opc, dl: SDLoc(N),
12546 VTList: ResList,
12547 Ops, MemVT: M->getMemoryVT(),
12548 MMO: M->getMemOperand());
12549 return DCI.DAG.getMergeValues(Ops: {BufferLoadSignExt,
12550 BufferLoadSignExt.getValue(R: 1)}, dl: SDLoc(N));
12551 }
12552 return SDValue();
12553}
12554
12555SDValue SITargetLowering::performClassCombine(SDNode *N,
12556 DAGCombinerInfo &DCI) const {
12557 SelectionDAG &DAG = DCI.DAG;
12558 SDValue Mask = N->getOperand(Num: 1);
12559
12560 // fp_class x, 0 -> false
12561 if (isNullConstant(V: Mask))
12562 return DAG.getConstant(Val: 0, DL: SDLoc(N), VT: MVT::i1);
12563
12564 if (N->getOperand(Num: 0).isUndef())
12565 return DAG.getUNDEF(VT: MVT::i1);
12566
12567 return SDValue();
12568}
12569
12570SDValue SITargetLowering::performRcpCombine(SDNode *N,
12571 DAGCombinerInfo &DCI) const {
12572 EVT VT = N->getValueType(ResNo: 0);
12573 SDValue N0 = N->getOperand(Num: 0);
12574
12575 if (N0.isUndef()) {
12576 return DCI.DAG.getConstantFP(
12577 Val: APFloat::getQNaN(Sem: SelectionDAG::EVTToAPFloatSemantics(VT)), DL: SDLoc(N),
12578 VT);
12579 }
12580
12581 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12582 N0.getOpcode() == ISD::SINT_TO_FP)) {
12583 return DCI.DAG.getNode(Opcode: AMDGPUISD::RCP_IFLAG, DL: SDLoc(N), VT, Operand: N0,
12584 Flags: N->getFlags());
12585 }
12586
12587 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12588 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12589 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12590 return DCI.DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc(N), VT,
12591 Operand: N0.getOperand(i: 0), Flags: N->getFlags());
12592 }
12593
12594 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
12595}
12596
12597bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12598 unsigned MaxDepth) const {
12599 unsigned Opcode = Op.getOpcode();
12600 if (Opcode == ISD::FCANONICALIZE)
12601 return true;
12602
12603 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
12604 const auto &F = CFP->getValueAPF();
12605 if (F.isNaN() && F.isSignaling())
12606 return false;
12607 if (!F.isDenormal())
12608 return true;
12609
12610 DenormalMode Mode =
12611 DAG.getMachineFunction().getDenormalMode(FPType: F.getSemantics());
12612 return Mode == DenormalMode::getIEEE();
12613 }
12614
12615 // If source is a result of another standard FP operation it is already in
12616 // canonical form.
12617 if (MaxDepth == 0)
12618 return false;
12619
12620 switch (Opcode) {
12621 // These will flush denorms if required.
12622 case ISD::FADD:
12623 case ISD::FSUB:
12624 case ISD::FMUL:
12625 case ISD::FCEIL:
12626 case ISD::FFLOOR:
12627 case ISD::FMA:
12628 case ISD::FMAD:
12629 case ISD::FSQRT:
12630 case ISD::FDIV:
12631 case ISD::FREM:
12632 case ISD::FP_ROUND:
12633 case ISD::FP_EXTEND:
12634 case ISD::FP16_TO_FP:
12635 case ISD::FP_TO_FP16:
12636 case ISD::BF16_TO_FP:
12637 case ISD::FP_TO_BF16:
12638 case ISD::FLDEXP:
12639 case AMDGPUISD::FMUL_LEGACY:
12640 case AMDGPUISD::FMAD_FTZ:
12641 case AMDGPUISD::RCP:
12642 case AMDGPUISD::RSQ:
12643 case AMDGPUISD::RSQ_CLAMP:
12644 case AMDGPUISD::RCP_LEGACY:
12645 case AMDGPUISD::RCP_IFLAG:
12646 case AMDGPUISD::LOG:
12647 case AMDGPUISD::EXP:
12648 case AMDGPUISD::DIV_SCALE:
12649 case AMDGPUISD::DIV_FMAS:
12650 case AMDGPUISD::DIV_FIXUP:
12651 case AMDGPUISD::FRACT:
12652 case AMDGPUISD::CVT_PKRTZ_F16_F32:
12653 case AMDGPUISD::CVT_F32_UBYTE0:
12654 case AMDGPUISD::CVT_F32_UBYTE1:
12655 case AMDGPUISD::CVT_F32_UBYTE2:
12656 case AMDGPUISD::CVT_F32_UBYTE3:
12657 case AMDGPUISD::FP_TO_FP16:
12658 case AMDGPUISD::SIN_HW:
12659 case AMDGPUISD::COS_HW:
12660 return true;
12661
12662 // It can/will be lowered or combined as a bit operation.
12663 // Need to check their input recursively to handle.
12664 case ISD::FNEG:
12665 case ISD::FABS:
12666 case ISD::FCOPYSIGN:
12667 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
12668
12669 case ISD::AND:
12670 if (Op.getValueType() == MVT::i32) {
12671 // Be careful: we only know the operand is a bitcast of some floating point
12672 // type. It could be f32 or v2f16; we have no way of knowing. Luckily the
12673 // constant value that we optimize for, which comes up in fp32 to bf16
12674 // conversions, is valid to optimize for all types.
12675 if (auto *RHS = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
12676 if (RHS->getZExtValue() == 0xffff0000) {
12677 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
12678 }
12679 }
12680 }
12681 break;
12682
12683 case ISD::FSIN:
12684 case ISD::FCOS:
12685 case ISD::FSINCOS:
12686 return Op.getValueType().getScalarType() != MVT::f16;
12687
12688 case ISD::FMINNUM:
12689 case ISD::FMAXNUM:
12690 case ISD::FMINNUM_IEEE:
12691 case ISD::FMAXNUM_IEEE:
12692 case ISD::FMINIMUM:
12693 case ISD::FMAXIMUM:
12694 case AMDGPUISD::CLAMP:
12695 case AMDGPUISD::FMED3:
12696 case AMDGPUISD::FMAX3:
12697 case AMDGPUISD::FMIN3:
12698 case AMDGPUISD::FMAXIMUM3:
12699 case AMDGPUISD::FMINIMUM3: {
12700 // FIXME: Shouldn't treat the generic operations differently based on these.
12701 // However, we aren't really required to flush the result from
12702 // minnum/maxnum.
12703
12704 // snans will be quieted, so we only need to worry about denormals.
12705 if (Subtarget->supportsMinMaxDenormModes() ||
12706 // FIXME: denormalsEnabledForType is broken for dynamic
12707 denormalsEnabledForType(DAG, VT: Op.getValueType()))
12708 return true;
12709
12710 // Flushing may be required.
12711 // On pre-GFX9 targets, V_MIN_F32 and others do not flush denorms. For such
12712 // targets we need to check the inputs recursively.
12713
12714 // FIXME: Does this apply with clamp? It's implemented with max.
12715 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12716 if (!isCanonicalized(DAG, Op: Op.getOperand(i: I), MaxDepth: MaxDepth - 1))
12717 return false;
12718 }
12719
12720 return true;
12721 }
12722 case ISD::SELECT: {
12723 return isCanonicalized(DAG, Op: Op.getOperand(i: 1), MaxDepth: MaxDepth - 1) &&
12724 isCanonicalized(DAG, Op: Op.getOperand(i: 2), MaxDepth: MaxDepth - 1);
12725 }
12726 case ISD::BUILD_VECTOR: {
12727 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12728 SDValue SrcOp = Op.getOperand(i);
12729 if (!isCanonicalized(DAG, Op: SrcOp, MaxDepth: MaxDepth - 1))
12730 return false;
12731 }
12732
12733 return true;
12734 }
12735 case ISD::EXTRACT_VECTOR_ELT:
12736 case ISD::EXTRACT_SUBVECTOR: {
12737 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
12738 }
12739 case ISD::INSERT_VECTOR_ELT: {
12740 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1) &&
12741 isCanonicalized(DAG, Op: Op.getOperand(i: 1), MaxDepth: MaxDepth - 1);
12742 }
12743 case ISD::UNDEF:
12744 // Could be anything.
12745 return false;
12746
12747 case ISD::BITCAST:
12748 // TODO: This is incorrect as it loses track of the operand's type. We may
12749 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12750 // same bits that are canonicalized in one type need not be in the other.
12751 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), MaxDepth: MaxDepth - 1);
12752 case ISD::TRUNCATE: {
12753 // Hack around the mess we make when legalizing extract_vector_elt
12754 if (Op.getValueType() == MVT::i16) {
12755 SDValue TruncSrc = Op.getOperand(i: 0);
12756 if (TruncSrc.getValueType() == MVT::i32 &&
12757 TruncSrc.getOpcode() == ISD::BITCAST &&
12758 TruncSrc.getOperand(i: 0).getValueType() == MVT::v2f16) {
12759 return isCanonicalized(DAG, Op: TruncSrc.getOperand(i: 0), MaxDepth: MaxDepth - 1);
12760 }
12761 }
12762 return false;
12763 }
12764 case ISD::INTRINSIC_WO_CHAIN: {
12765 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
12766 // TODO: Handle more intrinsics
12767 switch (IntrinsicID) {
12768 case Intrinsic::amdgcn_cvt_pkrtz:
12769 case Intrinsic::amdgcn_cubeid:
12770 case Intrinsic::amdgcn_frexp_mant:
12771 case Intrinsic::amdgcn_fdot2:
12772 case Intrinsic::amdgcn_rcp:
12773 case Intrinsic::amdgcn_rsq:
12774 case Intrinsic::amdgcn_rsq_clamp:
12775 case Intrinsic::amdgcn_rcp_legacy:
12776 case Intrinsic::amdgcn_rsq_legacy:
12777 case Intrinsic::amdgcn_trig_preop:
12778 case Intrinsic::amdgcn_log:
12779 case Intrinsic::amdgcn_exp2:
12780 case Intrinsic::amdgcn_sqrt:
12781 return true;
12782 default:
12783 break;
12784 }
12785
12786 break;
12787 }
12788 default:
12789 break;
12790 }
12791
12792 // FIXME: denormalsEnabledForType is broken for dynamic
12793 return denormalsEnabledForType(DAG, VT: Op.getValueType()) &&
12794 DAG.isKnownNeverSNaN(Op);
12795}
12796
12797bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
12798 unsigned MaxDepth) const {
12799 const MachineRegisterInfo &MRI = MF.getRegInfo();
12800 MachineInstr *MI = MRI.getVRegDef(Reg);
12801 unsigned Opcode = MI->getOpcode();
12802
12803 if (Opcode == AMDGPU::G_FCANONICALIZE)
12804 return true;
12805
12806 std::optional<FPValueAndVReg> FCR;
12807 // Constant splat (can be padded with undef) or scalar constant.
12808 if (mi_match(R: Reg, MRI, P: MIPatternMatch::m_GFCstOrSplat(FPValReg&: FCR))) {
12809 if (FCR->Value.isSignaling())
12810 return false;
12811 if (!FCR->Value.isDenormal())
12812 return true;
12813
12814 DenormalMode Mode = MF.getDenormalMode(FPType: FCR->Value.getSemantics());
12815 return Mode == DenormalMode::getIEEE();
12816 }
12817
12818 if (MaxDepth == 0)
12819 return false;
12820
12821 switch (Opcode) {
12822 case AMDGPU::G_FADD:
12823 case AMDGPU::G_FSUB:
12824 case AMDGPU::G_FMUL:
12825 case AMDGPU::G_FCEIL:
12826 case AMDGPU::G_FFLOOR:
12827 case AMDGPU::G_FRINT:
12828 case AMDGPU::G_FNEARBYINT:
12829 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
12830 case AMDGPU::G_INTRINSIC_TRUNC:
12831 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
12832 case AMDGPU::G_FMA:
12833 case AMDGPU::G_FMAD:
12834 case AMDGPU::G_FSQRT:
12835 case AMDGPU::G_FDIV:
12836 case AMDGPU::G_FREM:
12837 case AMDGPU::G_FPOW:
12838 case AMDGPU::G_FPEXT:
12839 case AMDGPU::G_FLOG:
12840 case AMDGPU::G_FLOG2:
12841 case AMDGPU::G_FLOG10:
12842 case AMDGPU::G_FPTRUNC:
12843 case AMDGPU::G_AMDGPU_RCP_IFLAG:
12844 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
12845 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
12846 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
12847 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
12848 return true;
12849 case AMDGPU::G_FNEG:
12850 case AMDGPU::G_FABS:
12851 case AMDGPU::G_FCOPYSIGN:
12852 return isCanonicalized(Reg: MI->getOperand(i: 1).getReg(), MF, MaxDepth: MaxDepth - 1);
12853 case AMDGPU::G_FMINNUM:
12854 case AMDGPU::G_FMAXNUM:
12855 case AMDGPU::G_FMINNUM_IEEE:
12856 case AMDGPU::G_FMAXNUM_IEEE:
12857 case AMDGPU::G_FMINIMUM:
12858 case AMDGPU::G_FMAXIMUM: {
12859 if (Subtarget->supportsMinMaxDenormModes() ||
12860 // FIXME: denormalsEnabledForType is broken for dynamic
12861 denormalsEnabledForType(Ty: MRI.getType(Reg), MF))
12862 return true;
12863
12864 [[fallthrough]];
12865 }
12866 case AMDGPU::G_BUILD_VECTOR:
12867 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI->operands()))
12868 if (!isCanonicalized(Reg: MO.getReg(), MF, MaxDepth: MaxDepth - 1))
12869 return false;
12870 return true;
12871 case AMDGPU::G_INTRINSIC:
12872 case AMDGPU::G_INTRINSIC_CONVERGENT:
12873 switch (cast<GIntrinsic>(Val: MI)->getIntrinsicID()) {
12874 case Intrinsic::amdgcn_fmul_legacy:
12875 case Intrinsic::amdgcn_fmad_ftz:
12876 case Intrinsic::amdgcn_sqrt:
12877 case Intrinsic::amdgcn_fmed3:
12878 case Intrinsic::amdgcn_sin:
12879 case Intrinsic::amdgcn_cos:
12880 case Intrinsic::amdgcn_log:
12881 case Intrinsic::amdgcn_exp2:
12882 case Intrinsic::amdgcn_log_clamp:
12883 case Intrinsic::amdgcn_rcp:
12884 case Intrinsic::amdgcn_rcp_legacy:
12885 case Intrinsic::amdgcn_rsq:
12886 case Intrinsic::amdgcn_rsq_clamp:
12887 case Intrinsic::amdgcn_rsq_legacy:
12888 case Intrinsic::amdgcn_div_scale:
12889 case Intrinsic::amdgcn_div_fmas:
12890 case Intrinsic::amdgcn_div_fixup:
12891 case Intrinsic::amdgcn_fract:
12892 case Intrinsic::amdgcn_cvt_pkrtz:
12893 case Intrinsic::amdgcn_cubeid:
12894 case Intrinsic::amdgcn_cubema:
12895 case Intrinsic::amdgcn_cubesc:
12896 case Intrinsic::amdgcn_cubetc:
12897 case Intrinsic::amdgcn_frexp_mant:
12898 case Intrinsic::amdgcn_fdot2:
12899 case Intrinsic::amdgcn_trig_preop:
12900 return true;
12901 default:
12902 break;
12903 }
12904
12905 [[fallthrough]];
12906 default:
12907 return false;
12908 }
12909
12910 llvm_unreachable("invalid operation");
12911}
12912
12913// Constant fold canonicalize.
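// Worked example (illustrative, not from the original comments): with FP32
// denormals set to preserve-sign, the smallest positive f32 denormal folds to
// +0.0 and its negation to -0.0; a signaling NaN folds to the default quiet
// NaN bit pattern.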
12914SDValue SITargetLowering::getCanonicalConstantFP(
12915 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
12916 // Flush denormals to 0 if not enabled.
12917 if (C.isDenormal()) {
12918 DenormalMode Mode =
12919 DAG.getMachineFunction().getDenormalMode(FPType: C.getSemantics());
12920 if (Mode == DenormalMode::getPreserveSign()) {
12921 return DAG.getConstantFP(
12922 Val: APFloat::getZero(Sem: C.getSemantics(), Negative: C.isNegative()), DL: SL, VT);
12923 }
12924
12925 if (Mode != DenormalMode::getIEEE())
12926 return SDValue();
12927 }
12928
12929 if (C.isNaN()) {
12930 APFloat CanonicalQNaN = APFloat::getQNaN(Sem: C.getSemantics());
12931 if (C.isSignaling()) {
12932 // Quiet a signaling NaN.
12933 // FIXME: Is this supposed to preserve payload bits?
12934 return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT);
12935 }
12936
12937 // Make sure it is the canonical NaN bitpattern.
12938 //
12939 // TODO: Can we use -1 as the canonical NaN value since it's an inline
12940 // immediate?
12941 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
12942 return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT);
12943 }
12944
12945 // Already canonical.
12946 return DAG.getConstantFP(Val: C, DL: SL, VT);
12947}
12948
12949static bool vectorEltWillFoldAway(SDValue Op) {
12950 return Op.isUndef() || isa<ConstantFPSDNode>(Val: Op);
12951}
12952
12953SDValue SITargetLowering::performFCanonicalizeCombine(
12954 SDNode *N,
12955 DAGCombinerInfo &DCI) const {
12956 SelectionDAG &DAG = DCI.DAG;
12957 SDValue N0 = N->getOperand(Num: 0);
12958 EVT VT = N->getValueType(ResNo: 0);
12959
12960 // fcanonicalize undef -> qnan
12961 if (N0.isUndef()) {
12962 APFloat QNaN = APFloat::getQNaN(Sem: SelectionDAG::EVTToAPFloatSemantics(VT));
12963 return DAG.getConstantFP(Val: QNaN, DL: SDLoc(N), VT);
12964 }
12965
12966 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N: N0)) {
12967 EVT VT = N->getValueType(ResNo: 0);
12968 return getCanonicalConstantFP(DAG, SL: SDLoc(N), VT, C: CFP->getValueAPF());
12969 }
12970
12971 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
12972 // (fcanonicalize k)
12973 //
12974 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
12975
12976 // TODO: This could be better with wider vectors that will be split to v2f16,
12977 // and to consider uses since there aren't that many packed operations.
12978 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
12979 isTypeLegal(VT: MVT::v2f16)) {
12980 SDLoc SL(N);
12981 SDValue NewElts[2];
12982 SDValue Lo = N0.getOperand(i: 0);
12983 SDValue Hi = N0.getOperand(i: 1);
12984 EVT EltVT = Lo.getValueType();
12985
12986 if (vectorEltWillFoldAway(Op: Lo) || vectorEltWillFoldAway(Op: Hi)) {
12987 for (unsigned I = 0; I != 2; ++I) {
12988 SDValue Op = N0.getOperand(i: I);
12989 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
12990 NewElts[I] = getCanonicalConstantFP(DAG, SL, VT: EltVT,
12991 C: CFP->getValueAPF());
12992 } else if (Op.isUndef()) {
12993 // Handled below based on what the other operand is.
12994 NewElts[I] = Op;
12995 } else {
12996 NewElts[I] = DAG.getNode(Opcode: ISD::FCANONICALIZE, DL: SL, VT: EltVT, Operand: Op);
12997 }
12998 }
12999
13000 // If one half is undef, and one is constant, prefer a splat vector rather
13001 // than the normal qNaN. If it's a register, prefer 0.0 since that's
13002 // cheaper to use and may be free with a packed operation.
13003 if (NewElts[0].isUndef()) {
13004 NewElts[0] = isa<ConstantFPSDNode>(Val: NewElts[1]) ?
13005 NewElts[1] : DAG.getConstantFP(Val: 0.0f, DL: SL, VT: EltVT);
13006 }
13008
13009 if (NewElts[1].isUndef()) {
13010 NewElts[1] = isa<ConstantFPSDNode>(Val: NewElts[0]) ?
13011 NewElts[0] : DAG.getConstantFP(Val: 0.0f, DL: SL, VT: EltVT);
13012 }
13013
13014 return DAG.getBuildVector(VT, DL: SL, Ops: NewElts);
13015 }
13016 }
13017
13018 return SDValue();
13019}
13020
13021static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13022 switch (Opc) {
13023 case ISD::FMAXNUM:
13024 case ISD::FMAXNUM_IEEE:
13025 return AMDGPUISD::FMAX3;
13026 case ISD::FMAXIMUM:
13027 return AMDGPUISD::FMAXIMUM3;
13028 case ISD::SMAX:
13029 return AMDGPUISD::SMAX3;
13030 case ISD::UMAX:
13031 return AMDGPUISD::UMAX3;
13032 case ISD::FMINNUM:
13033 case ISD::FMINNUM_IEEE:
13034 return AMDGPUISD::FMIN3;
13035 case ISD::FMINIMUM:
13036 return AMDGPUISD::FMINIMUM3;
13037 case ISD::SMIN:
13038 return AMDGPUISD::SMIN3;
13039 case ISD::UMIN:
13040 return AMDGPUISD::UMIN3;
13041 default:
13042 llvm_unreachable("Not a min/max opcode");
13043 }
13044}
13045
13046SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13047 const SDLoc &SL, SDValue Src,
13048 SDValue MinVal,
13049 SDValue MaxVal,
13050 bool Signed) const {
13051
13052 // med3 comes from
13053 // min(max(x, K0), K1), K0 < K1
13054 // max(min(x, K0), K1), K1 < K0
13055 //
13056 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13057 // min/max op.
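// Worked example (illustrative): a signed clamp min(max(x, 0), 255) has
// K0 = 0 < K1 = 255, so it becomes med3(x, 0, 255) below.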
13058 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(Val&: MinVal);
13059 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(Val&: MaxVal);
13060
13061 if (!MinK || !MaxK)
13062 return SDValue();
13063
13064 if (Signed) {
13065 if (MaxK->getAPIntValue().sge(RHS: MinK->getAPIntValue()))
13066 return SDValue();
13067 } else {
13068 if (MaxK->getAPIntValue().uge(RHS: MinK->getAPIntValue()))
13069 return SDValue();
13070 }
13071
13072 EVT VT = MinK->getValueType(ResNo: 0);
13073 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13074 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13075 return DAG.getNode(Opcode: Med3Opc, DL: SL, VT, N1: Src, N2: MaxVal, N3: MinVal);
13076
13077 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13078 // not available, but this is unlikely to be profitable as constants
13079 // will often need to be materialized & extended, especially on
13080 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13081 return SDValue();
13082}
13083
13084static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13085 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op))
13086 return C;
13087
13088 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Val&: Op)) {
13089 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13090 return C;
13091 }
13092
13093 return nullptr;
13094}
13095
13096SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13097 const SDLoc &SL,
13098 SDValue Op0,
13099 SDValue Op1) const {
13100 ConstantFPSDNode *K1 = getSplatConstantFP(Op: Op1);
13101 if (!K1)
13102 return SDValue();
13103
13104 ConstantFPSDNode *K0 = getSplatConstantFP(Op: Op0.getOperand(i: 1));
13105 if (!K0)
13106 return SDValue();
13107
13108 // Ordered >= (although NaN inputs should have folded away by now).
13109 if (K0->getValueAPF() > K1->getValueAPF())
13110 return SDValue();
13111
13112 const MachineFunction &MF = DAG.getMachineFunction();
13113 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13114
13115 // TODO: Check IEEE bit enabled?
13116 EVT VT = Op0.getValueType();
13117 if (Info->getMode().DX10Clamp) {
13118 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13119 // hardware fmed3 behavior converting to a min.
13120 // FIXME: Should this be allowing -0.0?
13121 if (K1->isExactlyValue(V: 1.0) && K0->isExactlyValue(V: 0.0))
13122 return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Op0.getOperand(i: 0));
13123 }
13124
13125 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13126 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13127 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13128 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13129 // then give the other result, which is different from med3 with a NaN
13130 // input.
13131 SDValue Var = Op0.getOperand(i: 0);
13132 if (!DAG.isKnownNeverSNaN(Op: Var))
13133 return SDValue();
13134
13135 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13136
13137 if ((!K0->hasOneUse() || TII->isInlineConstant(Imm: K0->getValueAPF())) &&
13138 (!K1->hasOneUse() || TII->isInlineConstant(Imm: K1->getValueAPF()))) {
13139 return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT: K0->getValueType(ResNo: 0),
13140 N1: Var, N2: SDValue(K0, 0), N3: SDValue(K1, 0));
13141 }
13142 }
13143
13144 return SDValue();
13145}
13146
13147/// \return true if the subtarget supports minimum3 and maximum3 with the given
13148/// base min/max opcode \p Opc for type \p VT.
13149static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
13150 EVT VT) {
13151 switch (Opc) {
13152 case ISD::FMINNUM:
13153 case ISD::FMAXNUM:
13154 case ISD::FMINNUM_IEEE:
13155 case ISD::FMAXNUM_IEEE:
13156 case AMDGPUISD::FMIN_LEGACY:
13157 case AMDGPUISD::FMAX_LEGACY:
13158 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
13159 case ISD::FMINIMUM:
13160 case ISD::FMAXIMUM:
13161 return (VT == MVT::f32 || VT == MVT::f16) && Subtarget.hasIEEEMinMax3();
13162 case ISD::SMAX:
13163 case ISD::SMIN:
13164 case ISD::UMAX:
13165 case ISD::UMIN:
13166 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
13167 default:
13168 return false;
13169 }
13170
13171 llvm_unreachable("not a min/max opcode");
13172}
13173
13174SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13175 DAGCombinerInfo &DCI) const {
13176 SelectionDAG &DAG = DCI.DAG;
13177
13178 EVT VT = N->getValueType(ResNo: 0);
13179 unsigned Opc = N->getOpcode();
13180 SDValue Op0 = N->getOperand(Num: 0);
13181 SDValue Op1 = N->getOperand(Num: 1);
13182
13183 // Only do this if the inner op has one use since this will just increase
13184 // register pressure for no benefit.
13185
13186 if (supportsMin3Max3(Subtarget: *Subtarget, Opc, VT)) {
13187 // max(max(a, b), c) -> max3(a, b, c)
13188 // min(min(a, b), c) -> min3(a, b, c)
13189 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13190 SDLoc DL(N);
13191 return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc),
13192 DL,
13193 VT: N->getValueType(ResNo: 0),
13194 N1: Op0.getOperand(i: 0),
13195 N2: Op0.getOperand(i: 1),
13196 N3: Op1);
13197 }
13198
13199 // Try commuted.
13200 // max(a, max(b, c)) -> max3(a, b, c)
13201 // min(a, min(b, c)) -> min3(a, b, c)
13202 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13203 SDLoc DL(N);
13204 return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc),
13205 DL,
13206 VT: N->getValueType(ResNo: 0),
13207 N1: Op0,
13208 N2: Op1.getOperand(i: 0),
13209 N3: Op1.getOperand(i: 1));
13210 }
13211 }
13212
13213 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13214 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13215 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13216 if (SDValue Med3 = performIntMed3ImmCombine(
13217 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op1, MaxVal: Op0->getOperand(Num: 1), Signed: true))
13218 return Med3;
13219 }
13220 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13221 if (SDValue Med3 = performIntMed3ImmCombine(
13222 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op0->getOperand(Num: 1), MaxVal: Op1, Signed: true))
13223 return Med3;
13224 }
13225
13226 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13227 if (SDValue Med3 = performIntMed3ImmCombine(
13228 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op1, MaxVal: Op0->getOperand(Num: 1), Signed: false))
13229 return Med3;
13230 }
13231 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13232 if (SDValue Med3 = performIntMed3ImmCombine(
13233 DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op0->getOperand(Num: 1), MaxVal: Op1, Signed: false))
13234 return Med3;
13235 }
13236
13237 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13238 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13239 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13240 (Opc == AMDGPUISD::FMIN_LEGACY &&
13241 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13242 (VT == MVT::f32 || VT == MVT::f64 ||
13243 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13244 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13245 Op0.hasOneUse()) {
13246 if (SDValue Res = performFPMed3ImmCombine(DAG, SL: SDLoc(N), Op0, Op1))
13247 return Res;
13248 }
13249
13250 return SDValue();
13251}
13252
13253static bool isClampZeroToOne(SDValue A, SDValue B) {
13254 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(Val&: A)) {
13255 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(Val&: B)) {
13256 // FIXME: Should this be allowing -0.0?
13257 return (CA->isExactlyValue(V: 0.0) && CB->isExactlyValue(V: 1.0)) ||
13258 (CA->isExactlyValue(V: 1.0) && CB->isExactlyValue(V: 0.0));
13259 }
13260 }
13261
13262 return false;
13263}
13264
13265 // FIXME: Should only worry about snans for the version with a chain.
13266SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13267 DAGCombinerInfo &DCI) const {
13268 EVT VT = N->getValueType(ResNo: 0);
13269 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13270 // NaNs. With a NaN input, the order of the operands may change the result.
13271
13272 SelectionDAG &DAG = DCI.DAG;
13273 SDLoc SL(N);
13274
13275 SDValue Src0 = N->getOperand(Num: 0);
13276 SDValue Src1 = N->getOperand(Num: 1);
13277 SDValue Src2 = N->getOperand(Num: 2);
13278
13279 if (isClampZeroToOne(A: Src0, B: Src1)) {
13280 // const_a, const_b, x -> clamp is safe in all cases including signaling
13281 // nans.
13282 // FIXME: Should this be allowing -0.0?
13283 return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src2);
13284 }
13285
13286 const MachineFunction &MF = DAG.getMachineFunction();
13287 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13288
13289 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13290 // handling no dx10-clamp?
13291 if (Info->getMode().DX10Clamp) {
13292 // If NaNs are clamped to 0, we are free to reorder the inputs.
13293
13294 if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1))
13295 std::swap(a&: Src0, b&: Src1);
13296
13297 if (isa<ConstantFPSDNode>(Val: Src1) && !isa<ConstantFPSDNode>(Val: Src2))
13298 std::swap(a&: Src1, b&: Src2);
13299
13300 if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1))
13301 std::swap(a&: Src0, b&: Src1);
13302
13303 if (isClampZeroToOne(A: Src1, B: Src2))
13304 return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src0);
13305 }
13306
13307 return SDValue();
13308}
13309
13310SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13311 DAGCombinerInfo &DCI) const {
13312 SDValue Src0 = N->getOperand(Num: 0);
13313 SDValue Src1 = N->getOperand(Num: 1);
13314 if (Src0.isUndef() && Src1.isUndef())
13315 return DCI.DAG.getUNDEF(VT: N->getValueType(ResNo: 0));
13316 return SDValue();
13317}
13318
13319// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13320// expanded into a set of cmp/select instructions.
13321bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
13322 unsigned NumElem,
13323 bool IsDivergentIdx,
13324 const GCNSubtarget *Subtarget) {
13325 if (UseDivergentRegisterIndexing)
13326 return false;
13327
13328 unsigned VecSize = EltSize * NumElem;
13329
13330 // Sub-dword vectors of two dwords or less have a better implementation.
13331 if (VecSize <= 64 && EltSize < 32)
13332 return false;
13333
13334 // Always expand the remaining sub-dword cases, otherwise they will be
13335 // lowered via memory.
13336 if (EltSize < 32)
13337 return true;
13338
13339 // Always do this if var-idx is divergent, otherwise it will become a loop.
13340 if (IsDivergentIdx)
13341 return true;
13342
13343 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13344 unsigned NumInsts = NumElem /* Number of compares */ +
13345 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
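// Illustrative numbers (not from the original comments): a uniform index into
// <8 x i32> gives NumInsts = 8 + 8 = 16, so it is expanded only when movrel is
// unavailable; <4 x i64> gives 4 + 8 = 12 and is expanded either way.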
13346
13347 // On some architectures (GFX9) movrel is not available and it's better
13348 // to expand.
13349 if (!Subtarget->hasMovrel())
13350 return NumInsts <= 16;
13351
13352 // If movrel is available, prefer it over the expansion once the expansion
13353 // would need 16 or more instructions (e.g. a vector of 8 dword elements).
13354 return NumInsts <= 15;
13355}
13356
13357bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13358 SDValue Idx = N->getOperand(Num: N->getNumOperands() - 1);
13359 if (isa<ConstantSDNode>(Val: Idx))
13360 return false;
13361
13362 SDValue Vec = N->getOperand(Num: 0);
13363 EVT VecVT = Vec.getValueType();
13364 EVT EltVT = VecVT.getVectorElementType();
13365 unsigned EltSize = EltVT.getSizeInBits();
13366 unsigned NumElem = VecVT.getVectorNumElements();
13367
13368 return SITargetLowering::shouldExpandVectorDynExt(
13369 EltSize, NumElem, IsDivergentIdx: Idx->isDivergent(), Subtarget: getSubtarget());
13370}
13371
13372SDValue SITargetLowering::performExtractVectorEltCombine(
13373 SDNode *N, DAGCombinerInfo &DCI) const {
13374 SDValue Vec = N->getOperand(Num: 0);
13375 SelectionDAG &DAG = DCI.DAG;
13376
13377 EVT VecVT = Vec.getValueType();
13378 EVT VecEltVT = VecVT.getVectorElementType();
13379 EVT ResVT = N->getValueType(ResNo: 0);
13380
13381 unsigned VecSize = VecVT.getSizeInBits();
13382 unsigned VecEltSize = VecEltVT.getSizeInBits();
13383
13384 if ((Vec.getOpcode() == ISD::FNEG ||
13385 Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
13386 SDLoc SL(N);
13387 SDValue Idx = N->getOperand(Num: 1);
13388 SDValue Elt =
13389 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec.getOperand(i: 0), N2: Idx);
13390 return DAG.getNode(Opcode: Vec.getOpcode(), DL: SL, VT: ResVT, Operand: Elt);
13391 }
13392
13393 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13394 // =>
13395 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13396 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13397 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13398 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13399 SDLoc SL(N);
13400 SDValue Idx = N->getOperand(Num: 1);
13401 unsigned Opc = Vec.getOpcode();
13402
13403 switch(Opc) {
13404 default:
13405 break;
13406 // TODO: Support other binary operations.
13407 case ISD::FADD:
13408 case ISD::FSUB:
13409 case ISD::FMUL:
13410 case ISD::ADD:
13411 case ISD::UMIN:
13412 case ISD::UMAX:
13413 case ISD::SMIN:
13414 case ISD::SMAX:
13415 case ISD::FMAXNUM:
13416 case ISD::FMINNUM:
13417 case ISD::FMAXNUM_IEEE:
13418 case ISD::FMINNUM_IEEE:
13419 case ISD::FMAXIMUM:
13420 case ISD::FMINIMUM: {
13421 SDValue Elt0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT,
13422 N1: Vec.getOperand(i: 0), N2: Idx);
13423 SDValue Elt1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT,
13424 N1: Vec.getOperand(i: 1), N2: Idx);
13425
13426 DCI.AddToWorklist(N: Elt0.getNode());
13427 DCI.AddToWorklist(N: Elt1.getNode());
13428 return DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT, N1: Elt0, N2: Elt1, Flags: Vec->getFlags());
13429 }
13430 }
13431 }
13432
13433 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13434 if (shouldExpandVectorDynExt(N)) {
13435 SDLoc SL(N);
13436 SDValue Idx = N->getOperand(Num: 1);
13437 SDValue V;
13438 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13439 SDValue IC = DAG.getVectorIdxConstant(Val: I, DL: SL);
13440 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec, N2: IC);
13441 if (I == 0)
13442 V = Elt;
13443 else
13444 V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Elt, False: V, Cond: ISD::SETEQ);
13445 }
13446 return V;
13447 }
13448
13449 if (!DCI.isBeforeLegalize())
13450 return SDValue();
13451
13452 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13453 // elements. This exposes more load reduction opportunities by replacing
13454 // multiple small extract_vector_elements with a single 32-bit extract.
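// Illustrative example (an assumption about the resulting DAG): extracting
// element 5 of a loaded <8 x i8> becomes a bitcast to v2i32, an extract of
// dword 1, a logical shift right by 8, and a truncate to the element type.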
13455 auto *Idx = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
13456 if (isa<MemSDNode>(Val: Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13457 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13458 EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: VecVT);
13459
13460 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13461 unsigned EltIdx = BitIndex / 32;
13462 unsigned LeftoverBitIdx = BitIndex % 32;
13463 SDLoc SL(N);
13464
13465 SDValue Cast = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: Vec);
13466 DCI.AddToWorklist(N: Cast.getNode());
13467
13468 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Cast,
13469 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
13470 DCI.AddToWorklist(N: Elt.getNode());
13471 SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: Elt,
13472 N2: DAG.getConstant(Val: LeftoverBitIdx, DL: SL, VT: MVT::i32));
13473 DCI.AddToWorklist(N: Srl.getNode());
13474
13475 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13476 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: VecEltAsIntVT, Operand: Srl);
13477 DCI.AddToWorklist(N: Trunc.getNode());
13478
13479 if (VecEltVT == ResVT) {
13480 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecEltVT, Operand: Trunc);
13481 }
13482
13483 assert(ResVT.isScalarInteger());
13484 return DAG.getAnyExtOrTrunc(Op: Trunc, DL: SL, VT: ResVT);
13485 }
13486
13487 return SDValue();
13488}
13489
13490SDValue
13491SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13492 DAGCombinerInfo &DCI) const {
13493 SDValue Vec = N->getOperand(Num: 0);
13494 SDValue Idx = N->getOperand(Num: 2);
13495 EVT VecVT = Vec.getValueType();
13496 EVT EltVT = VecVT.getVectorElementType();
13497
13498 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13499 // => BUILD_VECTOR n x select (e, const-idx)
13500 if (!shouldExpandVectorDynExt(N))
13501 return SDValue();
13502
13503 SelectionDAG &DAG = DCI.DAG;
13504 SDLoc SL(N);
13505 SDValue Ins = N->getOperand(Num: 1);
13506 EVT IdxVT = Idx.getValueType();
13507
13508 SmallVector<SDValue, 16> Ops;
13509 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13510 SDValue IC = DAG.getConstant(Val: I, DL: SL, VT: IdxVT);
13511 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec, N2: IC);
13512 SDValue V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Ins, False: Elt, Cond: ISD::SETEQ);
13513 Ops.push_back(Elt: V);
13514 }
13515
13516 return DAG.getBuildVector(VT: VecVT, DL: SL, Ops);
13517}
13518
13519/// Return the source of an fp_extend from f16 to f32, or a converted FP
13520/// constant.
13521static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13522 if (Src.getOpcode() == ISD::FP_EXTEND &&
13523 Src.getOperand(i: 0).getValueType() == MVT::f16) {
13524 return Src.getOperand(i: 0);
13525 }
13526
13527 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Src)) {
13528 APFloat Val = CFP->getValueAPF();
13529 bool LosesInfo = true;
13530 Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
13531 if (!LosesInfo)
13532 return DAG.getConstantFP(Val, DL: SDLoc(Src), VT: MVT::f16);
13533 }
13534
13535 return SDValue();
13536}
13537
13538SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13539 DAGCombinerInfo &DCI) const {
13540 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13541 "combine only useful on gfx8");
13542
13543 SDValue TruncSrc = N->getOperand(Num: 0);
13544 EVT VT = N->getValueType(ResNo: 0);
13545 if (VT != MVT::f16)
13546 return SDValue();
13547
13548 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13549 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13550 return SDValue();
13551
13552 SelectionDAG &DAG = DCI.DAG;
13553 SDLoc SL(N);
13554
13555 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13556 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13557 // casting back.
13558
13559 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13560 // fmin(fmax(a, b), fmax(fmin(a, b), c))
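// (One way to see this: the median of three values is the smaller of
// max(a, b) and max(min(a, b), c), which is exactly what is built below.)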
13561 SDValue A = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 0));
13562 if (!A)
13563 return SDValue();
13564
13565 SDValue B = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 1));
13566 if (!B)
13567 return SDValue();
13568
13569 SDValue C = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 2));
13570 if (!C)
13571 return SDValue();
13572
13573 // This changes signaling nan behavior. If an input is a signaling nan, it
13574 // would have been quieted by the fpext originally. We don't care because
13575 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13576 // we would be worse off than just doing the promotion.
13577 SDValue A1 = DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: A, N2: B);
13578 SDValue B1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A, N2: B);
13579 SDValue C1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A1, N2: C);
13580 return DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: B1, N2: C1);
13581}
13582
13583unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13584 const SDNode *N0,
13585 const SDNode *N1) const {
13586 EVT VT = N0->getValueType(ResNo: 0);
13587
13588 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13589 // support denormals ever.
13590 if (((VT == MVT::f32 &&
13591 denormalModeIsFlushAllF32(MF: DAG.getMachineFunction())) ||
13592 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13593 denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction()))) &&
13594 isOperationLegal(Op: ISD::FMAD, VT))
13595 return ISD::FMAD;
13596
13597 const TargetOptions &Options = DAG.getTarget().Options;
13598 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13599 (N0->getFlags().hasAllowContract() &&
13600 N1->getFlags().hasAllowContract())) &&
13601 isFMAFasterThanFMulAndFAdd(MF: DAG.getMachineFunction(), VT)) {
13602 return ISD::FMA;
13603 }
13604
13605 return 0;
13606}
13607
13608// For a reassociatable opcode perform:
13609// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13610SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13611 SelectionDAG &DAG) const {
13612 EVT VT = N->getValueType(ResNo: 0);
13613 if (VT != MVT::i32 && VT != MVT::i64)
13614 return SDValue();
13615
13616 if (DAG.isBaseWithConstantOffset(Op: SDValue(N, 0)))
13617 return SDValue();
13618
13619 unsigned Opc = N->getOpcode();
13620 SDValue Op0 = N->getOperand(Num: 0);
13621 SDValue Op1 = N->getOperand(Num: 1);
13622
13623 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13624 return SDValue();
13625
13626 if (Op0->isDivergent())
13627 std::swap(a&: Op0, b&: Op1);
13628
13629 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13630 return SDValue();
13631
13632 SDValue Op2 = Op1.getOperand(i: 1);
13633 Op1 = Op1.getOperand(i: 0);
13634 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13635 return SDValue();
13636
13637 if (Op1->isDivergent())
13638 std::swap(a&: Op1, b&: Op2);
13639
13640 SDLoc SL(N);
13641 SDValue Add1 = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Op0, N2: Op1);
13642 return DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Add1, N2: Op2);
13643}
13644
13645static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
13646 EVT VT,
13647 SDValue N0, SDValue N1, SDValue N2,
13648 bool Signed) {
13649 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13650 SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT::i1);
13651 SDValue Mad = DAG.getNode(Opcode: MadOpc, DL: SL, VTList: VTs, N1: N0, N2: N1, N3: N2);
13652 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Mad);
13653}
13654
13655// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13656// multiplies, if any.
13657//
13658// Full 64-bit multiplies that feed into an addition are lowered here instead
13659// of using the generic expansion. The generic expansion ends up with
13660// a tree of ADD nodes that prevents us from using the "add" part of the
13661// MAD instruction. The expansion produced here results in a chain of ADDs
13662// instead of a tree.
13663SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13664 DAGCombinerInfo &DCI) const {
13665 assert(N->getOpcode() == ISD::ADD);
13666
13667 SelectionDAG &DAG = DCI.DAG;
13668 EVT VT = N->getValueType(ResNo: 0);
13669 SDLoc SL(N);
13670 SDValue LHS = N->getOperand(Num: 0);
13671 SDValue RHS = N->getOperand(Num: 1);
13672
13673 if (VT.isVector())
13674 return SDValue();
13675
13676 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13677 // result in scalar registers for uniform values.
13678 if (!N->isDivergent() && Subtarget->hasSMulHi())
13679 return SDValue();
13680
13681 unsigned NumBits = VT.getScalarSizeInBits();
13682 if (NumBits <= 32 || NumBits > 64)
13683 return SDValue();
13684
13685 if (LHS.getOpcode() != ISD::MUL) {
13686 assert(RHS.getOpcode() == ISD::MUL);
13687 std::swap(a&: LHS, b&: RHS);
13688 }
13689
13690 // Avoid the fold if it would unduly increase the number of multiplies due to
13691 // multiple uses, except on hardware with full-rate multiply-add (which is
13692 // part of full-rate 64-bit ops).
13693 if (!Subtarget->hasFullRate64Ops()) {
13694 unsigned NumUsers = 0;
13695 for (SDNode *Use : LHS->uses()) {
13696 // There is a use that does not feed into addition, so the multiply can't
13697 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13698 if (Use->getOpcode() != ISD::ADD)
13699 return SDValue();
13700
13701 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13702 // MUL + 3xADD + 3xADDC over 3xMAD.
13703 ++NumUsers;
13704 if (NumUsers >= 3)
13705 return SDValue();
13706 }
13707 }
13708
13709 SDValue MulLHS = LHS.getOperand(i: 0);
13710 SDValue MulRHS = LHS.getOperand(i: 1);
13711 SDValue AddRHS = RHS;
13712
13713 // Always check whether operands are small unsigned values, since that
13714 // knowledge is useful in more cases. Check for small signed values only if
13715 // doing so can unlock a shorter code sequence.
13716 bool MulLHSUnsigned32 = numBitsUnsigned(Op: MulLHS, DAG) <= 32;
13717 bool MulRHSUnsigned32 = numBitsUnsigned(Op: MulRHS, DAG) <= 32;
13718
13719 bool MulSignedLo = false;
13720 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13721 MulSignedLo = numBitsSigned(Op: MulLHS, DAG) <= 32 &&
13722 numBitsSigned(Op: MulRHS, DAG) <= 32;
13723 }
13724
13725 // The operands and final result all have the same number of bits. If
13726 // operands need to be extended, they can be extended with garbage. The
13727 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
13728 // truncated away in the end.
13729 if (VT != MVT::i64) {
13730 MulLHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: MulLHS);
13731 MulRHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: MulRHS);
13732 AddRHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: AddRHS);
13733 }
13734
13735 // The basic code generated is conceptually straightforward. Pseudo code:
13736 //
13737 // accum = mad_64_32 lhs.lo, rhs.lo, accum
13738 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
13739 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
13740 //
13741 // The second and third lines are optional, depending on whether the factors
13742 // are {sign,zero}-extended or not.
13743 //
13744 // The actual DAG is noisier than the pseudo code, but only due to
13745 // instructions that disassemble values into low and high parts, and
13746 // assemble the final result.
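// Illustrative arithmetic (not from the original comments): writing lhs and
// rhs as hi:lo 32-bit halves, lhs * rhs + accum (mod 2^64) equals
// mad_64_32(lhs.lo, rhs.lo, accum) plus (lhs.hi * rhs.lo + lhs.lo * rhs.hi)
// added into the high dword; the lhs.hi * rhs.hi term only affects bits at or
// above 64 and is dropped.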
13747 SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);
13748
13749 auto MulLHSLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulLHS);
13750 auto MulRHSLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulRHS);
13751 SDValue Accum =
13752 getMad64_32(DAG, SL, VT: MVT::i64, N0: MulLHSLo, N1: MulRHSLo, N2: AddRHS, Signed: MulSignedLo);
13753
13754 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13755 SDValue AccumLo, AccumHi;
13756 std::tie(args&: AccumLo, args&: AccumHi) = DAG.SplitScalar(N: Accum, DL: SL, LoVT: MVT::i32, HiVT: MVT::i32);
13757
13758 if (!MulLHSUnsigned32) {
13759 auto MulLHSHi =
13760 DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: SL, VT: MVT::i32, N1: MulLHS, N2: One);
13761 SDValue MulHi = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT: MVT::i32, N1: MulLHSHi, N2: MulRHSLo);
13762 AccumHi = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: MulHi, N2: AccumHi);
13763 }
13764
13765 if (!MulRHSUnsigned32) {
13766 auto MulRHSHi =
13767 DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: SL, VT: MVT::i32, N1: MulRHS, N2: One);
13768 SDValue MulHi = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT: MVT::i32, N1: MulLHSLo, N2: MulRHSHi);
13769 AccumHi = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: MulHi, N2: AccumHi);
13770 }
13771
13772 Accum = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {AccumLo, AccumHi});
13773 Accum = DAG.getBitcast(VT: MVT::i64, V: Accum);
13774 }
13775
13776 if (VT != MVT::i64)
13777 Accum = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Accum);
13778 return Accum;
13779}
13780
13781 // Collect the ultimate src of each of the mul node's operands, and confirm
13782 // each operand occupies no more than a single byte (8 bits).
13783static std::optional<ByteProvider<SDValue>>
13784handleMulOperand(const SDValue &MulOperand) {
13785 auto Byte0 = calculateByteProvider(Op: MulOperand, Index: 0, Depth: 0);
13786 if (!Byte0 || Byte0->isConstantZero()) {
13787 return std::nullopt;
13788 }
13789 auto Byte1 = calculateByteProvider(Op: MulOperand, Index: 1, Depth: 0);
13790 if (Byte1 && !Byte1->isConstantZero()) {
13791 return std::nullopt;
13792 }
13793 return Byte0;
13794}
13795
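// Merge two v_perm select masks whose used byte positions are disjoint: in
// this file a used byte holds a select value in the 0-3 range and an unused
// byte holds the zero select 0x0c. Worked example (illustrative):
// addPermMasks(0x0c0c0c00, 0x0c0c020c) == 0x0c0c0200, i.e. byte 0 keeps the
// first mask's select, byte 1 keeps the second mask's select, and the upper
// two bytes remain zero selects.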
13796static unsigned addPermMasks(unsigned First, unsigned Second) {
13797 unsigned FirstCs = First & 0x0c0c0c0c;
13798 unsigned SecondCs = Second & 0x0c0c0c0c;
13799 unsigned FirstNoCs = First & ~0x0c0c0c0c;
13800 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
13801
13802 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
13803 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
13804 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
13805 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
13806
13807 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
13808}
13809
13810struct DotSrc {
13811 SDValue SrcOp;
13812 int64_t PermMask;
13813 int64_t DWordOffset;
13814};
13815
13816static void placeSources(ByteProvider<SDValue> &Src0,
13817 ByteProvider<SDValue> &Src1,
13818 SmallVectorImpl<DotSrc> &Src0s,
13819 SmallVectorImpl<DotSrc> &Src1s, int Step) {
13820
13821 assert(Src0.Src.has_value() && Src1.Src.has_value());
13822 // Src0s and Src1s are empty, just place arbitrarily.
13823 if (Step == 0) {
13824 Src0s.push_back(Elt: {.SrcOp: *Src0.Src, .PermMask: ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
13825 .DWordOffset: Src0.SrcOffset / 4});
13826 Src1s.push_back(Elt: {.SrcOp: *Src1.Src, .PermMask: ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
13827 .DWordOffset: Src1.SrcOffset / 4});
13828 return;
13829 }
13830
13831 for (int BPI = 0; BPI < 2; BPI++) {
13832 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
13833 if (BPI == 1) {
13834 BPP = {Src1, Src0};
13835 }
13836 unsigned ZeroMask = 0x0c0c0c0c;
13837 unsigned FMask = 0xFF << (8 * (3 - Step));
13838
13839 unsigned FirstMask =
13840 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13841 unsigned SecondMask =
13842 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13843 // Attempt to find the Src vector which contains our SDValue; if found, add
13844 // our perm mask to the existing one. If we are unable to find a match for
13845 // the first SDValue, attempt to find a match for the second.
13846 int FirstGroup = -1;
13847 for (int I = 0; I < 2; I++) {
13848 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
13849 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
13850 return IterElt.SrcOp == *BPP.first.Src &&
13851 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
13852 };
13853
13854 auto Match = llvm::find_if(Range&: Srcs, P: MatchesFirst);
13855 if (Match != Srcs.end()) {
13856 Match->PermMask = addPermMasks(First: FirstMask, Second: Match->PermMask);
13857 FirstGroup = I;
13858 break;
13859 }
13860 }
13861 if (FirstGroup != -1) {
13862 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
13863 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
13864 return IterElt.SrcOp == *BPP.second.Src &&
13865 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
13866 };
13867 auto Match = llvm::find_if(Range&: Srcs, P: MatchesSecond);
13868 if (Match != Srcs.end()) {
13869 Match->PermMask = addPermMasks(First: SecondMask, Second: Match->PermMask);
13870 } else
13871 Srcs.push_back(Elt: {.SrcOp: *BPP.second.Src, .PermMask: SecondMask, .DWordOffset: BPP.second.SrcOffset / 4});
13872 return;
13873 }
13874 }
13875
13876 // If we have made it here, then we could not find a match in Src0s or Src1s
13877 // for either Src0 or Src1, so just place them arbitrarily.
13878
13879 unsigned ZeroMask = 0x0c0c0c0c;
13880 unsigned FMask = 0xFF << (8 * (3 - Step));
13881
13882 Src0s.push_back(
13883 Elt: {.SrcOp: *Src0.Src,
13884 .PermMask: ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13885 .DWordOffset: Src0.SrcOffset / 4});
13886 Src1s.push_back(
13887 Elt: {.SrcOp: *Src1.Src,
13888 .PermMask: ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13889 .DWordOffset: Src1.SrcOffset / 4});
13890
13891 return;
13892}
13893
13894static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
13895 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
13896 bool IsAny) {
13897
13898 // If we just have one source, just permute it accordingly.
13899 if (Srcs.size() == 1) {
13900 auto Elt = Srcs.begin();
13901 auto EltOp = getDWordFromOffset(DAG, SL, Src: Elt->SrcOp, DWordOffset: Elt->DWordOffset);
13902
13903 // v_perm will produce the original value
13904 if (Elt->PermMask == 0x3020100)
13905 return EltOp;
13906
13907 return DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: EltOp, N2: EltOp,
13908 N3: DAG.getConstant(Val: Elt->PermMask, DL: SL, VT: MVT::i32));
13909 }
13910
13911 auto FirstElt = Srcs.begin();
13912 auto SecondElt = std::next(x: FirstElt);
13913
13914 SmallVector<SDValue, 2> Perms;
13915
13916 // If we have multiple sources in the chain, combine them via perms (using
13917 // the calculated perm masks) and Ors.
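  // Note on v_perm operand order below: byte selectors 0-3 read from the
  // second source operand and selectors 4-7 read from the first, which is why
  // the first element's selectors are biased by 4 (see FirstPlusFour) while
  // the second element's selectors are used as-is.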
13918 while (true) {
13919 auto FirstMask = FirstElt->PermMask;
13920 auto SecondMask = SecondElt->PermMask;
13921
13922 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
13923 unsigned FirstPlusFour = FirstMask | 0x04040404;
13924 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
13925 // original 0x0C.
13926 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
13927
13928 auto PermMask = addPermMasks(First: FirstMask, Second: SecondMask);
13929 auto FirstVal =
13930 getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
13931 auto SecondVal =
13932 getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp, DWordOffset: SecondElt->DWordOffset);
13933
13934 Perms.push_back(Elt: DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: FirstVal,
13935 N2: SecondVal,
13936 N3: DAG.getConstant(Val: PermMask, DL: SL, VT: MVT::i32)));
13937
13938 FirstElt = std::next(x: SecondElt);
13939 if (FirstElt == Srcs.end())
13940 break;
13941
13942 SecondElt = std::next(x: FirstElt);
13943 // If we only have a FirstElt, then just combine that into the cumulative
13944 // source node.
13945 if (SecondElt == Srcs.end()) {
13946 auto EltOp =
13947 getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
13948
13949 Perms.push_back(
13950 Elt: DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: EltOp, N2: EltOp,
13951 N3: DAG.getConstant(Val: FirstElt->PermMask, DL: SL, VT: MVT::i32)));
13952 break;
13953 }
13954 }
13955
13956 assert(Perms.size() == 1 || Perms.size() == 2);
13957 return Perms.size() == 2
13958 ? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Perms[0], N2: Perms[1])
13959 : Perms[0];
13960}
13961
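// Rewrite masks that were built assuming a chain of length 4 for a shorter
// chain: shift the real selectors down into the low bytes and refill the now
// unused high bytes with 0x0c (constant zero). Illustrative example for
// ChainLength == 2: a mask of 0x01000c0c becomes 0x0c0c0100.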
13962static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
13963 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
13964 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
13965 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
13966 EntryMask += ZeroMask;
13967 }
13968}
13969
13970static bool isMul(const SDValue Op) {
13971 auto Opcode = Op.getOpcode();
13972
13973 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
13974 Opcode == AMDGPUISD::MUL_I24);
13975}
13976
13977static std::optional<bool>
13978checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
13979 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
13980 const SDValue &S1Op, const SelectionDAG &DAG) {
13981 // If both ops are i8s (pre legalize-dag), then the signedness semantics
13982 // of the dot4 are irrelevant.
13983 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
13984 return false;
13985
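  // Classify each op from its known bits: a zero-extended operand has known
  // leading zeros (e.g. a zext from i8 gives at least 24), while a
  // sign-extended operand that is known negative has known leading ones.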
13986 auto Known0 = DAG.computeKnownBits(Op: S0Op, Depth: 0);
13987 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
13988 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
13989 auto Known1 = DAG.computeKnownBits(Op: S1Op, Depth: 0);
13990 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
13991 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
13992
13993 assert(!(S0IsUnsigned && S0IsSigned));
13994 assert(!(S1IsUnsigned && S1IsSigned));
13995
13996 // There are 9 possible permutations of
13997 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
13998
13999 // In two permutations, the sign bits are known to be the same for both Ops,
14000 // so simply return Signed / Unsigned corresponding to the MSB
14001
14002 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14003 return S0IsSigned;
14004
14005 // In another two permutations, the sign bits are known to be opposite. In
14006 // this case return std::nullopt to indicate a bad match.
14007
14008 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14009 return std::nullopt;
14010
14011 // In the remaining five permutations, we don't know the value of the sign
14012 // bit for at least one Op. Since we have a valid ByteProvider, we know that
14013 // the upper bits must be extension bits. Thus, the only way for the sign
14014 // bit to be unknown is if it was sign extended from an unknown value or if
14015 // it was any extended. In either case, it is correct to use the signed
14016 // version of the dot4 signedness semantics.
14017
14018 // In two such permutations, we know the sign bit is set for
14019 // one op, and the other is unknown. It is okay to use the signed version of
14020 // dot4.
14021 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14022 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14023 return true;
14024
14025 // In one such permutation, we don't know either of the sign bits. It is okay
14026 // to use the signed version of dot4.
14027 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14028 return true;
14029
14030 // In the remaining two permutations, we know the sign bit is unset for
14031 // one op, and the other is unknown. Return std::nullopt to indicate a
14032 // bad match.
14033 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14034 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14035 return std::nullopt;
14036
14037 llvm_unreachable("Fully covered condition");
14038}
14039
14040SDValue SITargetLowering::performAddCombine(SDNode *N,
14041 DAGCombinerInfo &DCI) const {
14042 SelectionDAG &DAG = DCI.DAG;
14043 EVT VT = N->getValueType(ResNo: 0);
14044 SDLoc SL(N);
14045 SDValue LHS = N->getOperand(Num: 0);
14046 SDValue RHS = N->getOperand(Num: 1);
14047
14048 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14049 if (Subtarget->hasMad64_32()) {
14050 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14051 return Folded;
14052 }
14053 }
14054
14055 if (SDValue V = reassociateScalarOps(N, DAG)) {
14056 return V;
14057 }
14058
14059 if ((isMul(Op: LHS) || isMul(Op: RHS)) && Subtarget->hasDot7Insts() &&
14060 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14061 SDValue TempNode(N, 0);
14062 std::optional<bool> IsSigned;
14063 SmallVector<DotSrc, 4> Src0s;
14064 SmallVector<DotSrc, 4> Src1s;
14065 SmallVector<SDValue, 4> Src2s;
14066
14067 // Match the v_dot4 tree, while collecting src nodes.
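    // The expected shape is a chain of adds feeding byte-sized muls, e.g.
    // add(mul(x0, y0), add(mul(x1, y1), add(mul(x2, y2), mul(x3, y3)))),
    // which maps onto v_dot4 with x0..x3 and y0..y3 packed into dwords.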
14068 int ChainLength = 0;
14069 for (int I = 0; I < 4; I++) {
14070 auto MulIdx = isMul(Op: LHS) ? 0 : isMul(Op: RHS) ? 1 : -1;
14071 if (MulIdx == -1)
14072 break;
14073 auto Src0 = handleMulOperand(MulOperand: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 0));
14074 if (!Src0)
14075 break;
14076 auto Src1 = handleMulOperand(MulOperand: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 1));
14077 if (!Src1)
14078 break;
14079
14080 auto IterIsSigned = checkDot4MulSignedness(
14081 N: TempNode->getOperand(Num: MulIdx), Src0&: *Src0, Src1&: *Src1,
14082 S0Op: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 0),
14083 S1Op: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 1), DAG);
14084 if (!IterIsSigned)
14085 break;
14086 if (!IsSigned)
14087 IsSigned = *IterIsSigned;
14088 if (*IterIsSigned != *IsSigned)
14089 break;
14090 placeSources(Src0&: *Src0, Src1&: *Src1, Src0s, Src1s, Step: I);
14091 auto AddIdx = 1 - MulIdx;
14092 // Allow the special case where add (add (mul24, 0), mul24) became ->
14093 // add (mul24, mul24).
14094 if (I == 2 && isMul(Op: TempNode->getOperand(Num: AddIdx))) {
14095 Src2s.push_back(Elt: TempNode->getOperand(Num: AddIdx));
14096 auto Src0 =
14097 handleMulOperand(MulOperand: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 0));
14098 if (!Src0)
14099 break;
14100 auto Src1 =
14101 handleMulOperand(MulOperand: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 1));
14102 if (!Src1)
14103 break;
14104 auto IterIsSigned = checkDot4MulSignedness(
14105 N: TempNode->getOperand(Num: AddIdx), Src0&: *Src0, Src1&: *Src1,
14106 S0Op: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 0),
14107 S1Op: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 1), DAG);
14108 if (!IterIsSigned)
14109 break;
14110 assert(IsSigned);
14111 if (*IterIsSigned != *IsSigned)
14112 break;
14113 placeSources(Src0&: *Src0, Src1&: *Src1, Src0s, Src1s, Step: I + 1);
14114 Src2s.push_back(Elt: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
14115 ChainLength = I + 2;
14116 break;
14117 }
14118
14119 TempNode = TempNode->getOperand(Num: AddIdx);
14120 Src2s.push_back(Elt: TempNode);
14121 ChainLength = I + 1;
14122 if (TempNode->getNumOperands() < 2)
14123 break;
14124 LHS = TempNode->getOperand(Num: 0);
14125 RHS = TempNode->getOperand(Num: 1);
14126 }
14127
14128 if (ChainLength < 2)
14129 return SDValue();
14130
14131 // Masks were constructed with the assumption that we would find a chain of
14132 // length 4. If not, then we need to zero out the unused high bytes (via a
14133 // perm selector of 0x0c) so they do not affect the dot calculation.
14134 if (ChainLength < 4) {
14135 fixMasks(Srcs&: Src0s, ChainLength);
14136 fixMasks(Srcs&: Src1s, ChainLength);
14137 }
14138
14139 SDValue Src0, Src1;
14140
14141 // If we are just using a single source for both, and have permuted the
14142 // bytes consistently, we can just use the sources without permuting
14143 // (commutation).
14144 bool UseOriginalSrc = false;
14145 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14146 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14147 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14148 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14149 SmallVector<unsigned, 4> SrcBytes;
14150 auto Src0Mask = Src0s.begin()->PermMask;
14151 SrcBytes.push_back(Elt: Src0Mask & 0xFF000000);
14152 bool UniqueEntries = true;
14153 for (auto I = 1; I < 4; I++) {
14154 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14155
14156 if (is_contained(Range&: SrcBytes, Element: NextByte)) {
14157 UniqueEntries = false;
14158 break;
14159 }
14160 SrcBytes.push_back(Elt: NextByte);
14161 }
14162
14163 if (UniqueEntries) {
14164 UseOriginalSrc = true;
14165
14166 auto FirstElt = Src0s.begin();
14167 auto FirstEltOp =
14168 getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
14169
14170 auto SecondElt = Src1s.begin();
14171 auto SecondEltOp = getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp,
14172 DWordOffset: SecondElt->DWordOffset);
14173
14174 Src0 = DAG.getBitcastedAnyExtOrTrunc(Op: FirstEltOp, DL: SL,
14175 VT: MVT::getIntegerVT(BitWidth: 32));
14176 Src1 = DAG.getBitcastedAnyExtOrTrunc(Op: SecondEltOp, DL: SL,
14177 VT: MVT::getIntegerVT(BitWidth: 32));
14178 }
14179 }
14180
14181 if (!UseOriginalSrc) {
14182 Src0 = resolveSources(DAG, SL, Srcs&: Src0s, IsSigned: false, IsAny: true);
14183 Src1 = resolveSources(DAG, SL, Srcs&: Src1s, IsSigned: false, IsAny: true);
14184 }
14185
14186 assert(IsSigned);
14187 SDValue Src2 =
14188 DAG.getExtOrTrunc(IsSigned: *IsSigned, Op: Src2s[ChainLength - 1], DL: SL, VT: MVT::i32);
14189
14190 SDValue IID = DAG.getTargetConstant(Val: *IsSigned ? Intrinsic::amdgcn_sdot4
14191 : Intrinsic::amdgcn_udot4,
14192 DL: SL, VT: MVT::i64);
14193
14194 assert(!VT.isVector());
14195 auto Dot = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32, N1: IID, N2: Src0,
14196 N3: Src1, N4: Src2, N5: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i1));
14197
14198 return DAG.getExtOrTrunc(IsSigned: *IsSigned, Op: Dot, DL: SL, VT);
14199 }
14200
14201 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14202 return SDValue();
14203
14204 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14205 // add x, sext (setcc) => usubo_carry x, 0, setcc
14206 unsigned Opc = LHS.getOpcode();
14207 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14208 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14209 std::swap(a&: RHS, b&: LHS);
14210
14211 Opc = RHS.getOpcode();
14212 switch (Opc) {
14213 default: break;
14214 case ISD::ZERO_EXTEND:
14215 case ISD::SIGN_EXTEND:
14216 case ISD::ANY_EXTEND: {
14217 auto Cond = RHS.getOperand(i: 0);
14218 // If this won't be a real VOPC output, we would still need to insert an
14219 // extra instruction anyway.
14220 if (!isBoolSGPR(V: Cond))
14221 break;
14222 SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1);
14223 SDValue Args[] = { LHS, DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), Cond };
14224 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14225 return DAG.getNode(Opcode: Opc, DL: SL, VTList, Ops: Args);
14226 }
14227 case ISD::UADDO_CARRY: {
14228 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14229 if (!isNullConstant(V: RHS.getOperand(i: 1)))
14230 break;
14231 SDValue Args[] = { LHS, RHS.getOperand(i: 0), RHS.getOperand(i: 2) };
14232 return DAG.getNode(Opcode: ISD::UADDO_CARRY, DL: SDLoc(N), VTList: RHS->getVTList(), Ops: Args);
14233 }
14234 }
14235 return SDValue();
14236}
14237
14238SDValue SITargetLowering::performSubCombine(SDNode *N,
14239 DAGCombinerInfo &DCI) const {
14240 SelectionDAG &DAG = DCI.DAG;
14241 EVT VT = N->getValueType(ResNo: 0);
14242
14243 if (VT != MVT::i32)
14244 return SDValue();
14245
14246 SDLoc SL(N);
14247 SDValue LHS = N->getOperand(Num: 0);
14248 SDValue RHS = N->getOperand(Num: 1);
14249
14250 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14251 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14252 unsigned Opc = RHS.getOpcode();
14253 switch (Opc) {
14254 default: break;
14255 case ISD::ZERO_EXTEND:
14256 case ISD::SIGN_EXTEND:
14257 case ISD::ANY_EXTEND: {
14258 auto Cond = RHS.getOperand(i: 0);
14259 // If this won't be a real VOPC output, we would still need to insert an
14260 // extra instruction anyway.
14261 if (!isBoolSGPR(V: Cond))
14262 break;
14263 SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1);
14264 SDValue Args[] = { LHS, DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), Cond };
14265 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14266 return DAG.getNode(Opcode: Opc, DL: SL, VTList, Ops: Args);
14267 }
14268 }
14269
14270 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14271 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14272 if (!isNullConstant(V: LHS.getOperand(i: 1)))
14273 return SDValue();
14274 SDValue Args[] = { LHS.getOperand(i: 0), RHS, LHS.getOperand(i: 2) };
14275 return DAG.getNode(Opcode: ISD::USUBO_CARRY, DL: SDLoc(N), VTList: LHS->getVTList(), Ops: Args);
14276 }
14277 return SDValue();
14278}
14279
14280SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14281 DAGCombinerInfo &DCI) const {
14282
14283 if (N->getValueType(ResNo: 0) != MVT::i32)
14284 return SDValue();
14285
14286 if (!isNullConstant(V: N->getOperand(Num: 1)))
14287 return SDValue();
14288
14289 SelectionDAG &DAG = DCI.DAG;
14290 SDValue LHS = N->getOperand(Num: 0);
14291
14292 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14293 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14294 unsigned LHSOpc = LHS.getOpcode();
14295 unsigned Opc = N->getOpcode();
14296 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14297 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14298 SDValue Args[] = { LHS.getOperand(i: 0), LHS.getOperand(i: 1), N->getOperand(Num: 2) };
14299 return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VTList: N->getVTList(), Ops: Args);
14300 }
14301 return SDValue();
14302}
14303
14304SDValue SITargetLowering::performFAddCombine(SDNode *N,
14305 DAGCombinerInfo &DCI) const {
14306 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14307 return SDValue();
14308
14309 SelectionDAG &DAG = DCI.DAG;
14310 EVT VT = N->getValueType(ResNo: 0);
14311
14312 SDLoc SL(N);
14313 SDValue LHS = N->getOperand(Num: 0);
14314 SDValue RHS = N->getOperand(Num: 1);
14315
14316 // These should really be instruction patterns, but writing patterns with
14317 // source modifiers is a pain.
14318
14319 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14320 if (LHS.getOpcode() == ISD::FADD) {
14321 SDValue A = LHS.getOperand(i: 0);
14322 if (A == LHS.getOperand(i: 1)) {
14323 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode());
14324 if (FusedOp != 0) {
14325 const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
14326 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: RHS);
14327 }
14328 }
14329 }
14330
14331 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14332 if (RHS.getOpcode() == ISD::FADD) {
14333 SDValue A = RHS.getOperand(i: 0);
14334 if (A == RHS.getOperand(i: 1)) {
14335 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode());
14336 if (FusedOp != 0) {
14337 const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
14338 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: LHS);
14339 }
14340 }
14341 }
14342
14343 return SDValue();
14344}
14345
14346SDValue SITargetLowering::performFSubCombine(SDNode *N,
14347 DAGCombinerInfo &DCI) const {
14348 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14349 return SDValue();
14350
14351 SelectionDAG &DAG = DCI.DAG;
14352 SDLoc SL(N);
14353 EVT VT = N->getValueType(ResNo: 0);
14354 assert(!VT.isVector());
14355
14356 // Try to get the fneg to fold into the source modifier. This undoes generic
14357 // DAG combines and folds them into the mad.
14358 //
14359 // Only do this if we are not trying to support denormals. v_mad_f32 does
14360 // not support denormals ever.
14361 SDValue LHS = N->getOperand(Num: 0);
14362 SDValue RHS = N->getOperand(Num: 1);
14363 if (LHS.getOpcode() == ISD::FADD) {
14364 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14365 SDValue A = LHS.getOperand(i: 0);
14366 if (A == LHS.getOperand(i: 1)) {
14367 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode());
14368 if (FusedOp != 0){
14369 const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
14370 SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
14371
14372 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: NegRHS);
14373 }
14374 }
14375 }
14376
14377 if (RHS.getOpcode() == ISD::FADD) {
14378 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14379
14380 SDValue A = RHS.getOperand(i: 0);
14381 if (A == RHS.getOperand(i: 1)) {
14382 unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode());
14383 if (FusedOp != 0){
14384 const SDValue NegTwo = DAG.getConstantFP(Val: -2.0, DL: SL, VT);
14385 return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: NegTwo, N3: LHS);
14386 }
14387 }
14388 }
14389
14390 return SDValue();
14391}
14392
14393SDValue SITargetLowering::performFDivCombine(SDNode *N,
14394 DAGCombinerInfo &DCI) const {
14395 SelectionDAG &DAG = DCI.DAG;
14396 SDLoc SL(N);
14397 EVT VT = N->getValueType(ResNo: 0);
14398 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14399 return SDValue();
14400
14401 SDValue LHS = N->getOperand(Num: 0);
14402 SDValue RHS = N->getOperand(Num: 1);
14403
14404 SDNodeFlags Flags = N->getFlags();
14405 SDNodeFlags RHSFlags = RHS->getFlags();
14406 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14407 !RHS->hasOneUse())
14408 return SDValue();
14409
14410 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) {
14411 bool IsNegative = false;
14412 if (CLHS->isExactlyValue(V: 1.0) ||
14413 (IsNegative = CLHS->isExactlyValue(V: -1.0))) {
14414 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14415 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14416 if (RHS.getOpcode() == ISD::FSQRT) {
14417 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14418 SDValue Rsq =
14419 DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SL, VT, Operand: RHS.getOperand(i: 0), Flags);
14420 return IsNegative ? DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Rsq, Flags) : Rsq;
14421 }
14422 }
14423 }
14424
14425 return SDValue();
14426}
14427
14428SDValue SITargetLowering::performFMACombine(SDNode *N,
14429 DAGCombinerInfo &DCI) const {
14430 SelectionDAG &DAG = DCI.DAG;
14431 EVT VT = N->getValueType(ResNo: 0);
14432 SDLoc SL(N);
14433
14434 if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
14435 return SDValue();
14436
14437 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14438 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
14439 SDValue Op1 = N->getOperand(Num: 0);
14440 SDValue Op2 = N->getOperand(Num: 1);
14441 SDValue FMA = N->getOperand(Num: 2);
14442
14443 if (FMA.getOpcode() != ISD::FMA ||
14444 Op1.getOpcode() != ISD::FP_EXTEND ||
14445 Op2.getOpcode() != ISD::FP_EXTEND)
14446 return SDValue();
14447
14448 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14449 // regardless of the denorm mode setting. Therefore,
14450 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14451 const TargetOptions &Options = DAG.getTarget().Options;
14452 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14453 (N->getFlags().hasAllowContract() &&
14454 FMA->getFlags().hasAllowContract())) {
14455 Op1 = Op1.getOperand(i: 0);
14456 Op2 = Op2.getOperand(i: 0);
14457 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14458 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14459 return SDValue();
14460
14461 SDValue Vec1 = Op1.getOperand(i: 0);
14462 SDValue Idx1 = Op1.getOperand(i: 1);
14463 SDValue Vec2 = Op2.getOperand(i: 0);
14464
14465 SDValue FMAOp1 = FMA.getOperand(i: 0);
14466 SDValue FMAOp2 = FMA.getOperand(i: 1);
14467 SDValue FMAAcc = FMA.getOperand(i: 2);
14468
14469 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14470 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14471 return SDValue();
14472
14473 FMAOp1 = FMAOp1.getOperand(i: 0);
14474 FMAOp2 = FMAOp2.getOperand(i: 0);
14475 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14476 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14477 return SDValue();
14478
14479 SDValue Vec3 = FMAOp1.getOperand(i: 0);
14480 SDValue Vec4 = FMAOp2.getOperand(i: 0);
14481 SDValue Idx2 = FMAOp1.getOperand(i: 1);
14482
14483 if (Idx1 != Op2.getOperand(i: 1) || Idx2 != FMAOp2.getOperand(i: 1) ||
14484 // Idx1 and Idx2 cannot be the same.
14485 Idx1 == Idx2)
14486 return SDValue();
14487
14488 if (Vec1 == Vec2 || Vec3 == Vec4)
14489 return SDValue();
14490
14491 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14492 return SDValue();
14493
14494 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
14495 (Vec1 == Vec4 && Vec2 == Vec3)) {
14496 return DAG.getNode(Opcode: AMDGPUISD::FDOT2, DL: SL, VT: MVT::f32, N1: Vec1, N2: Vec2, N3: FMAAcc,
14497 N4: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i1));
14498 }
14499 }
14500 return SDValue();
14501}
14502
14503SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14504 DAGCombinerInfo &DCI) const {
14505 SelectionDAG &DAG = DCI.DAG;
14506 SDLoc SL(N);
14507
14508 SDValue LHS = N->getOperand(Num: 0);
14509 SDValue RHS = N->getOperand(Num: 1);
14510 EVT VT = LHS.getValueType();
14511 ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
14512
14513 auto CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
14514 if (!CRHS) {
14515 CRHS = dyn_cast<ConstantSDNode>(Val&: LHS);
14516 if (CRHS) {
14517 std::swap(a&: LHS, b&: RHS);
14518 CC = getSetCCSwappedOperands(Operation: CC);
14519 }
14520 }
14521
14522 if (CRHS) {
14523 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14524 isBoolSGPR(V: LHS.getOperand(i: 0))) {
14525 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14526 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14527 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14528 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14529 if ((CRHS->isAllOnes() &&
14530 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14531 (CRHS->isZero() &&
14532 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14533 return DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: 0),
14534 N2: DAG.getConstant(Val: -1, DL: SL, VT: MVT::i1));
14535 if ((CRHS->isAllOnes() &&
14536 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14537 (CRHS->isZero() &&
14538 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14539 return LHS.getOperand(i: 0);
14540 }
14541
14542 const APInt &CRHSVal = CRHS->getAPIntValue();
14543 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14544 LHS.getOpcode() == ISD::SELECT &&
14545 isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) &&
14546 isa<ConstantSDNode>(Val: LHS.getOperand(i: 2)) &&
14547 LHS.getConstantOperandVal(i: 1) != LHS.getConstantOperandVal(i: 2) &&
14548 isBoolSGPR(V: LHS.getOperand(i: 0))) {
14549 // Given CT != FT:
14550 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14551 // setcc (select cc, CT, CF), CF, ne => cc
14552 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14553 // setcc (select cc, CT, CF), CT, eq => cc
14554 const APInt &CT = LHS.getConstantOperandAPInt(i: 1);
14555 const APInt &CF = LHS.getConstantOperandAPInt(i: 2);
14556
14557 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14558 (CT == CRHSVal && CC == ISD::SETNE))
14559 return DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: 0),
14560 N2: DAG.getConstant(Val: -1, DL: SL, VT: MVT::i1));
14561 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14562 (CT == CRHSVal && CC == ISD::SETEQ))
14563 return LHS.getOperand(i: 0);
14564 }
14565 }
14566
14567 if (VT != MVT::f32 && VT != MVT::f64 &&
14568 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14569 return SDValue();
14570
14571 // Match isinf/isfinite pattern
14572 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14573 // (fcmp one (fabs x), inf) -> (fp_class x,
14574 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
14575 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
14576 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
14577 if (!CRHS)
14578 return SDValue();
14579
14580 const APFloat &APF = CRHS->getValueAPF();
14581 if (APF.isInfinity() && !APF.isNegative()) {
14582 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
14583 SIInstrFlags::N_INFINITY;
14584 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
14585 SIInstrFlags::P_ZERO |
14586 SIInstrFlags::N_NORMAL |
14587 SIInstrFlags::P_NORMAL |
14588 SIInstrFlags::N_SUBNORMAL |
14589 SIInstrFlags::P_SUBNORMAL;
14590 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14591 return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: 0),
14592 N2: DAG.getConstant(Val: Mask, DL: SL, VT: MVT::i32));
14593 }
14594 }
14595
14596 return SDValue();
14597}
14598
14599SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14600 DAGCombinerInfo &DCI) const {
14601 SelectionDAG &DAG = DCI.DAG;
14602 SDLoc SL(N);
14603 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14604
14605 SDValue Src = N->getOperand(Num: 0);
14606 SDValue Shift = N->getOperand(Num: 0);
14607
14608 // TODO: Extend type shouldn't matter (assuming legal types).
14609 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14610 Shift = Shift.getOperand(i: 0);
14611
14612 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14613 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14614 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14615 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14616 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14617 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
14618 if (auto *C = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1))) {
14619 SDValue Shifted = DAG.getZExtOrTrunc(Op: Shift.getOperand(i: 0),
14620 DL: SDLoc(Shift.getOperand(i: 0)), VT: MVT::i32);
14621
14622 unsigned ShiftOffset = 8 * Offset;
14623 if (Shift.getOpcode() == ISD::SHL)
14624 ShiftOffset -= C->getZExtValue();
14625 else
14626 ShiftOffset += C->getZExtValue();
14627
14628 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14629 return DAG.getNode(Opcode: AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, DL: SL,
14630 VT: MVT::f32, Operand: Shifted);
14631 }
14632 }
14633 }
14634
14635 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14636 APInt DemandedBits = APInt::getBitsSet(numBits: 32, loBit: 8 * Offset, hiBit: 8 * Offset + 8);
14637 if (TLI.SimplifyDemandedBits(Op: Src, DemandedBits, DCI)) {
14638 // We simplified Src. If this node is not dead, visit it again so it is
14639 // folded properly.
14640 if (N->getOpcode() != ISD::DELETED_NODE)
14641 DCI.AddToWorklist(N);
14642 return SDValue(N, 0);
14643 }
14644
14645 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
14646 if (SDValue DemandedSrc =
14647 TLI.SimplifyMultipleUseDemandedBits(Op: Src, DemandedBits, DAG))
14648 return DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: MVT::f32, Operand: DemandedSrc);
14649
14650 return SDValue();
14651}
14652
14653SDValue SITargetLowering::performClampCombine(SDNode *N,
14654 DAGCombinerInfo &DCI) const {
14655 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: 0));
14656 if (!CSrc)
14657 return SDValue();
14658
14659 const MachineFunction &MF = DCI.DAG.getMachineFunction();
14660 const APFloat &F = CSrc->getValueAPF();
14661 APFloat Zero = APFloat::getZero(Sem: F.getSemantics());
14662 if (F < Zero ||
14663 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14664 return DCI.DAG.getConstantFP(Val: Zero, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
14665 }
14666
14667 APFloat One(F.getSemantics(), "1.0");
14668 if (F > One)
14669 return DCI.DAG.getConstantFP(Val: One, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
14670
14671 return SDValue(CSrc, 0);
14672}
14673
14674
14675SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
14676 DAGCombinerInfo &DCI) const {
14677 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
14678 return SDValue();
14679 switch (N->getOpcode()) {
14680 case ISD::ADD:
14681 return performAddCombine(N, DCI);
14682 case ISD::SUB:
14683 return performSubCombine(N, DCI);
14684 case ISD::UADDO_CARRY:
14685 case ISD::USUBO_CARRY:
14686 return performAddCarrySubCarryCombine(N, DCI);
14687 case ISD::FADD:
14688 return performFAddCombine(N, DCI);
14689 case ISD::FSUB:
14690 return performFSubCombine(N, DCI);
14691 case ISD::FDIV:
14692 return performFDivCombine(N, DCI);
14693 case ISD::SETCC:
14694 return performSetCCCombine(N, DCI);
14695 case ISD::FMAXNUM:
14696 case ISD::FMINNUM:
14697 case ISD::FMAXNUM_IEEE:
14698 case ISD::FMINNUM_IEEE:
14699 case ISD::FMAXIMUM:
14700 case ISD::FMINIMUM:
14701 case ISD::SMAX:
14702 case ISD::SMIN:
14703 case ISD::UMAX:
14704 case ISD::UMIN:
14705 case AMDGPUISD::FMIN_LEGACY:
14706 case AMDGPUISD::FMAX_LEGACY:
14707 return performMinMaxCombine(N, DCI);
14708 case ISD::FMA:
14709 return performFMACombine(N, DCI);
14710 case ISD::AND:
14711 return performAndCombine(N, DCI);
14712 case ISD::OR:
14713 return performOrCombine(N, DCI);
14714 case ISD::FSHR: {
14715 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14716 if (N->getValueType(ResNo: 0) == MVT::i32 && N->isDivergent() &&
14717 TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {
14718 return matchPERM(N, DCI);
14719 }
14720 break;
14721 }
14722 case ISD::XOR:
14723 return performXorCombine(N, DCI);
14724 case ISD::ZERO_EXTEND:
14725 return performZeroExtendCombine(N, DCI);
14726 case ISD::SIGN_EXTEND_INREG:
14727 return performSignExtendInRegCombine(N , DCI);
14728 case AMDGPUISD::FP_CLASS:
14729 return performClassCombine(N, DCI);
14730 case ISD::FCANONICALIZE:
14731 return performFCanonicalizeCombine(N, DCI);
14732 case AMDGPUISD::RCP:
14733 return performRcpCombine(N, DCI);
14734 case ISD::FLDEXP:
14735 case AMDGPUISD::FRACT:
14736 case AMDGPUISD::RSQ:
14737 case AMDGPUISD::RCP_LEGACY:
14738 case AMDGPUISD::RCP_IFLAG:
14739 case AMDGPUISD::RSQ_CLAMP: {
14740 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
14741 SDValue Src = N->getOperand(Num: 0);
14742 if (Src.isUndef())
14743 return Src;
14744 break;
14745 }
14746 case ISD::SINT_TO_FP:
14747 case ISD::UINT_TO_FP:
14748 return performUCharToFloatCombine(N, DCI);
14749 case ISD::FCOPYSIGN:
14750 return performFCopySignCombine(N, DCI);
14751 case AMDGPUISD::CVT_F32_UBYTE0:
14752 case AMDGPUISD::CVT_F32_UBYTE1:
14753 case AMDGPUISD::CVT_F32_UBYTE2:
14754 case AMDGPUISD::CVT_F32_UBYTE3:
14755 return performCvtF32UByteNCombine(N, DCI);
14756 case AMDGPUISD::FMED3:
14757 return performFMed3Combine(N, DCI);
14758 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14759 return performCvtPkRTZCombine(N, DCI);
14760 case AMDGPUISD::CLAMP:
14761 return performClampCombine(N, DCI);
14762 case ISD::SCALAR_TO_VECTOR: {
14763 SelectionDAG &DAG = DCI.DAG;
14764 EVT VT = N->getValueType(ResNo: 0);
14765
14766 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
14767 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
14768 SDLoc SL(N);
14769 SDValue Src = N->getOperand(Num: 0);
14770 EVT EltVT = Src.getValueType();
14771 if (EltVT != MVT::i16)
14772 Src = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Src);
14773
14774 SDValue Ext = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Src);
14775 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Ext);
14776 }
14777
14778 break;
14779 }
14780 case ISD::EXTRACT_VECTOR_ELT:
14781 return performExtractVectorEltCombine(N, DCI);
14782 case ISD::INSERT_VECTOR_ELT:
14783 return performInsertVectorEltCombine(N, DCI);
14784 case ISD::FP_ROUND:
14785 return performFPRoundCombine(N, DCI);
14786 case ISD::LOAD: {
14787 if (SDValue Widened = widenLoad(Ld: cast<LoadSDNode>(Val: N), DCI))
14788 return Widened;
14789 [[fallthrough]];
14790 }
14791 default: {
14792 if (!DCI.isBeforeLegalize()) {
14793 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(Val: N))
14794 return performMemSDNodeCombine(N: MemNode, DCI);
14795 }
14796
14797 break;
14798 }
14799 }
14800
14801 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
14802}
14803
14804/// Helper function for adjustWritemask
14805static unsigned SubIdx2Lane(unsigned Idx) {
14806 switch (Idx) {
14807 default: return ~0u;
14808 case AMDGPU::sub0: return 0;
14809 case AMDGPU::sub1: return 1;
14810 case AMDGPU::sub2: return 2;
14811 case AMDGPU::sub3: return 3;
14812 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
14813 }
14814}
14815
14816/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
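/// For example, if only the x and z components of an image sample with
/// dmask 0xf are actually extracted, the dmask can be shrunk to 0x5 and the
/// result narrowed from four registers to two.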
14817SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
14818 SelectionDAG &DAG) const {
14819 unsigned Opcode = Node->getMachineOpcode();
14820
14821 // Subtract 1 because the vdata output is not a MachineSDNode operand.
14822 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, NamedIdx: AMDGPU::OpName::d16) - 1;
14823 if (D16Idx >= 0 && Node->getConstantOperandVal(Num: D16Idx))
14824 return Node; // not implemented for D16
14825
14826 SDNode *Users[5] = { nullptr };
14827 unsigned Lane = 0;
14828 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, NamedIdx: AMDGPU::OpName::dmask) - 1;
14829 unsigned OldDmask = Node->getConstantOperandVal(Num: DmaskIdx);
14830 unsigned NewDmask = 0;
14831 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, NamedIdx: AMDGPU::OpName::tfe) - 1;
14832 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, NamedIdx: AMDGPU::OpName::lwe) - 1;
14833 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(Num: TFEIdx)) ||
14834 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(Num: LWEIdx)))
14835 ? true
14836 : false;
14837 unsigned TFCLane = 0;
14838 bool HasChain = Node->getNumValues() > 1;
14839
14840 if (OldDmask == 0) {
14841 // These are folded out, but on the off chance it happens, don't assert.
14842 return Node;
14843 }
14844
14845 unsigned OldBitsSet = llvm::popcount(Value: OldDmask);
14846 // Work out which is the TFE/LWE lane if that is enabled.
14847 if (UsesTFC) {
14848 TFCLane = OldBitsSet;
14849 }
14850
14851 // Try to figure out the used register components
14852 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
14853 I != E; ++I) {
14854
14855 // Don't look at users of the chain.
14856 if (I.getUse().getResNo() != 0)
14857 continue;
14858
14859 // Abort if we can't understand the usage
14860 if (!I->isMachineOpcode() ||
14861 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
14862 return Node;
14863
14864 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
14865 // Note that subregs are packed, i.e. Lane==0 is the first bit set
14866 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
14867 // set, etc.
14868 Lane = SubIdx2Lane(Idx: I->getConstantOperandVal(Num: 1));
14869 if (Lane == ~0u)
14870 return Node;
14871
14872 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
14873 if (UsesTFC && Lane == TFCLane) {
14874 Users[Lane] = *I;
14875 } else {
14876 // Set which texture component corresponds to the lane.
14877 unsigned Comp;
14878 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
14879 Comp = llvm::countr_zero(Val: Dmask);
14880 Dmask &= ~(1 << Comp);
14881 }
14882
14883 // Abort if we have more than one user per component.
14884 if (Users[Lane])
14885 return Node;
14886
14887 Users[Lane] = *I;
14888 NewDmask |= 1 << Comp;
14889 }
14890 }
14891
14892 // Don't allow 0 dmask, as hardware assumes one channel enabled.
14893 bool NoChannels = !NewDmask;
14894 if (NoChannels) {
14895 if (!UsesTFC) {
14896 // No uses of the result and not using TFC. Then do nothing.
14897 return Node;
14898 }
14899 // If the original dmask has one channel - then nothing to do
14900 if (OldBitsSet == 1)
14901 return Node;
14902 // Use an arbitrary dmask - required for the instruction to work
14903 NewDmask = 1;
14904 }
14905 // Abort if there's no change
14906 if (NewDmask == OldDmask)
14907 return Node;
14908
14909 unsigned BitsSet = llvm::popcount(Value: NewDmask);
14910
14911 // Check for TFE or LWE - increase the number of channels by one to account
14912 // for the extra return value
14913 // This will need adjustment for D16 if this is also included in
14914 // adjustWritemask (this function), but at present D16 is excluded.
14915 unsigned NewChannels = BitsSet + UsesTFC;
14916
14917 int NewOpcode =
14918 AMDGPU::getMaskedMIMGOp(Opc: Node->getMachineOpcode(), NewChannels);
14919 assert(NewOpcode != -1 &&
14920 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
14921 "failed to find equivalent MIMG op");
14922
14923 // Adjust the writemask in the node
14924 SmallVector<SDValue, 12> Ops;
14925 Ops.insert(I: Ops.end(), From: Node->op_begin(), To: Node->op_begin() + DmaskIdx);
14926 Ops.push_back(Elt: DAG.getTargetConstant(Val: NewDmask, DL: SDLoc(Node), VT: MVT::i32));
14927 Ops.insert(I: Ops.end(), From: Node->op_begin() + DmaskIdx + 1, To: Node->op_end());
14928
14929 MVT SVT = Node->getValueType(ResNo: 0).getVectorElementType().getSimpleVT();
14930
14931 MVT ResultVT = NewChannels == 1 ?
14932 SVT : MVT::getVectorVT(VT: SVT, NumElements: NewChannels == 3 ? 4 :
14933 NewChannels == 5 ? 8 : NewChannels);
14934 SDVTList NewVTList = HasChain ?
14935 DAG.getVTList(VT1: ResultVT, VT2: MVT::Other) : DAG.getVTList(VT: ResultVT);
14936
14937
14938 MachineSDNode *NewNode = DAG.getMachineNode(Opcode: NewOpcode, dl: SDLoc(Node),
14939 VTs: NewVTList, Ops);
14940
14941 if (HasChain) {
14942 // Update chain.
14943 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: Node->memoperands());
14944 DAG.ReplaceAllUsesOfValueWith(From: SDValue(Node, 1), To: SDValue(NewNode, 1));
14945 }
14946
14947 if (NewChannels == 1) {
14948 assert(Node->hasNUsesOfValue(1, 0));
14949 SDNode *Copy = DAG.getMachineNode(Opcode: TargetOpcode::COPY,
14950 dl: SDLoc(Node), VT: Users[Lane]->getValueType(ResNo: 0),
14951 Op1: SDValue(NewNode, 0));
14952 DAG.ReplaceAllUsesWith(From: Users[Lane], To: Copy);
14953 return nullptr;
14954 }
14955
14956 // Update the users of the node with the new indices
14957 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
14958 SDNode *User = Users[i];
14959 if (!User) {
14960 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
14961 // Users[0] is still nullptr because channel 0 doesn't really have a use.
14962 if (i || !NoChannels)
14963 continue;
14964 } else {
14965 SDValue Op = DAG.getTargetConstant(Val: Idx, DL: SDLoc(User), VT: MVT::i32);
14966 SDNode *NewUser = DAG.UpdateNodeOperands(N: User, Op1: SDValue(NewNode, 0), Op2: Op);
14967 if (NewUser != User) {
14968 DAG.ReplaceAllUsesWith(From: SDValue(User, 0), To: SDValue(NewUser, 0));
14969 DAG.RemoveDeadNode(N: User);
14970 }
14971 }
14972
14973 switch (Idx) {
14974 default: break;
14975 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
14976 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
14977 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
14978 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
14979 }
14980 }
14981
14982 DAG.RemoveDeadNode(N: Node);
14983 return nullptr;
14984}
14985
14986static bool isFrameIndexOp(SDValue Op) {
14987 if (Op.getOpcode() == ISD::AssertZext)
14988 Op = Op.getOperand(i: 0);
14989
14990 return isa<FrameIndexSDNode>(Val: Op);
14991}
14992
14993/// Legalize target independent instructions (e.g. INSERT_SUBREG)
14994/// with frame index operands.
14995 /// LLVM assumes that inputs to these instructions are registers.
14996SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
14997 SelectionDAG &DAG) const {
14998 if (Node->getOpcode() == ISD::CopyToReg) {
14999 RegisterSDNode *DestReg = cast<RegisterSDNode>(Val: Node->getOperand(Num: 1));
15000 SDValue SrcVal = Node->getOperand(Num: 2);
15001
15002 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
15003 // to try understanding copies to physical registers.
15004 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
15005 SDLoc SL(Node);
15006 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15007 SDValue VReg = DAG.getRegister(
15008 Reg: MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_1RegClass), VT: MVT::i1);
15009
15010 SDNode *Glued = Node->getGluedNode();
15011 SDValue ToVReg
15012 = DAG.getCopyToReg(Chain: Node->getOperand(Num: 0), dl: SL, Reg: VReg, N: SrcVal,
15013 Glue: SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
15014 SDValue ToResultReg
15015 = DAG.getCopyToReg(Chain: ToVReg, dl: SL, Reg: SDValue(DestReg, 0),
15016 N: VReg, Glue: ToVReg.getValue(R: 1));
15017 DAG.ReplaceAllUsesWith(From: Node, To: ToResultReg.getNode());
15018 DAG.RemoveDeadNode(N: Node);
15019 return ToResultReg.getNode();
15020 }
15021 }
15022
15023 SmallVector<SDValue, 8> Ops;
15024 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15025 if (!isFrameIndexOp(Op: Node->getOperand(Num: i))) {
15026 Ops.push_back(Elt: Node->getOperand(Num: i));
15027 continue;
15028 }
15029
15030 SDLoc DL(Node);
15031 Ops.push_back(Elt: SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL,
15032 VT: Node->getOperand(Num: i).getValueType(),
15033 Op1: Node->getOperand(Num: i)), 0));
15034 }
15035
15036 return DAG.UpdateNodeOperands(N: Node, Ops);
15037}
15038
15039/// Fold the instructions after selecting them.
15040/// Returns null if users were already updated.
15041SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
15042 SelectionDAG &DAG) const {
15043 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15044 unsigned Opcode = Node->getMachineOpcode();
15045
15046 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15047 !TII->isGather4(Opcode) &&
15048 AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::dmask)) {
15049 return adjustWritemask(Node, DAG);
15050 }
15051
15052 if (Opcode == AMDGPU::INSERT_SUBREG ||
15053 Opcode == AMDGPU::REG_SEQUENCE) {
15054 legalizeTargetIndependentNode(Node, DAG);
15055 return Node;
15056 }
15057
15058 switch (Opcode) {
15059 case AMDGPU::V_DIV_SCALE_F32_e64:
15060 case AMDGPU::V_DIV_SCALE_F64_e64: {
15061 // Satisfy the operand register constraint when one of the inputs is
15062 // undefined. Ordinarily each undef value will have its own implicit_def of
15063 // a vreg, so force these to use a single register.
15064 SDValue Src0 = Node->getOperand(Num: 1);
15065 SDValue Src1 = Node->getOperand(Num: 3);
15066 SDValue Src2 = Node->getOperand(Num: 5);
15067
15068 if ((Src0.isMachineOpcode() &&
15069 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15070 (Src0 == Src1 || Src0 == Src2))
15071 break;
15072
15073 MVT VT = Src0.getValueType().getSimpleVT();
15074 const TargetRegisterClass *RC =
15075 getRegClassFor(VT, isDivergent: Src0.getNode()->isDivergent());
15076
15077 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
15078 SDValue UndefReg = DAG.getRegister(Reg: MRI.createVirtualRegister(RegClass: RC), VT);
15079
15080 SDValue ImpDef = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: SDLoc(Node),
15081 Reg: UndefReg, N: Src0, Glue: SDValue());
15082
15083 // src0 must be the same register as src1 or src2, even if the value is
15084 // undefined, so make sure we don't violate this constraint.
15085 if (Src0.isMachineOpcode() &&
15086 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15087 if (Src1.isMachineOpcode() &&
15088 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15089 Src0 = Src1;
15090 else if (Src2.isMachineOpcode() &&
15091 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15092 Src0 = Src2;
15093 else {
15094 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15095 Src0 = UndefReg;
15096 Src1 = UndefReg;
15097 }
15098 } else
15099 break;
15100
15101 SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
15102 Ops[1] = Src0;
15103 Ops[3] = Src1;
15104 Ops[5] = Src2;
15105 Ops.push_back(Elt: ImpDef.getValue(R: 1));
15106 return DAG.getMachineNode(Opcode, dl: SDLoc(Node), VTs: Node->getVTList(), Ops);
15107 }
15108 default:
15109 break;
15110 }
15111
15112 return Node;
15113}
15114
15115// Any MIMG instructions that use tfe or lwe require an initialization of the
15116// result register that will be written in the case of a memory access failure.
15117// The required code is also added to tie this init code to the result of the
15118// img instruction.
15119void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
15120 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15121 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15122 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15123 MachineBasicBlock &MBB = *MI.getParent();
15124
15125 int DstIdx =
15126 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), NamedIdx: AMDGPU::OpName::vdata);
15127 unsigned InitIdx = 0;
15128
15129 if (TII->isImage(MI)) {
15130 MachineOperand *TFE = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe);
15131 MachineOperand *LWE = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::lwe);
15132 MachineOperand *D16 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::d16);
15133
15134 if (!TFE && !LWE) // intersect_ray
15135 return;
15136
15137 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15138 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15139 unsigned D16Val = D16 ? D16->getImm() : 0;
15140
15141 if (!TFEVal && !LWEVal)
15142 return;
15143
15144 // At least one of TFE or LWE is non-zero, so we have to insert a suitable
15145 // initialization of the result value and tie this to the dest of the image
15146 // instruction.
15147
15148 // Calculate which dword we have to initialize to 0.
15149 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dmask);
15150
15151 // Check that the dmask operand was found.
15152 assert(MO_Dmask && "Expected dmask operand in instruction");
15153
15154 unsigned dmask = MO_Dmask->getImm();
15155 // Determine the number of active lanes taking into account the
15156 // Gather4 special case
15157 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(Value: dmask);
15158
15159 bool Packed = !Subtarget->hasUnpackedD16VMem();
15160
15161 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
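    // For example, dmask 0xf with TFE and no D16 gives ActiveLanes = 4 and
    // InitIdx = 5 (four data dwords plus the TFE/LWE status dword); with
    // packed D16 the same dmask gives ((4 + 1) >> 1) + 1 = 3.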
15162
15163 // Abandon the attempt if the dst size isn't large enough
15164 // - this is in fact an error, but it is picked up elsewhere and
15165 // reported correctly.
15166 uint32_t DstSize =
15167 TRI.getRegSizeInBits(RC: *TII->getOpRegClass(MI, OpNo: DstIdx)) / 32;
15168 if (DstSize < InitIdx)
15169 return;
15170 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(Opc: MI.getOpcode())) {
15171 InitIdx = TRI.getRegSizeInBits(RC: *TII->getOpRegClass(MI, OpNo: DstIdx)) / 32;
15172 } else {
15173 return;
15174 }
15175
15176 const DebugLoc &DL = MI.getDebugLoc();
15177
15178 // Create a register for the initialization value.
15179 Register PrevDst = MRI.cloneVirtualRegister(VReg: MI.getOperand(i: DstIdx).getReg());
15180 unsigned NewDst = 0; // Final initialized value will be in here
15181
15182 // If PRTStrictNull feature is enabled (the default) then initialize
15183 // all the result registers to 0, otherwise just the error indication
15184 // register (VGPRn+1)
15185 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15186 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
15187
15188 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: PrevDst);
15189 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15190 NewDst = MRI.createVirtualRegister(RegClass: TII->getOpRegClass(MI, OpNo: DstIdx));
15191 // Initialize dword
15192 Register SubReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
15193 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: SubReg)
15194 .addImm(Val: 0);
15195 // Insert into the super-reg
15196 BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: NewDst)
15197 .addReg(RegNo: PrevDst)
15198 .addReg(RegNo: SubReg)
15199 .addImm(Val: SIRegisterInfo::getSubRegFromChannel(Channel: CurrIdx));
15200
15201 PrevDst = NewDst;
15202 }
15203
15204 // Add as an implicit operand
15205 MI.addOperand(Op: MachineOperand::CreateReg(Reg: NewDst, isDef: false, isImp: true));
15206
15207 // Tie the just added implicit operand to the dst
15208 MI.tieOperands(DefIdx: DstIdx, UseIdx: MI.getNumOperands() - 1);
15209}
15210
15211/// Assign the register class depending on the number of
15212/// bits set in the writemask
15213void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
15214 SDNode *Node) const {
15215 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15216
15217 MachineFunction *MF = MI.getParent()->getParent();
15218 MachineRegisterInfo &MRI = MF->getRegInfo();
15219 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
15220
15221 if (TII->isVOP3(Opcode: MI.getOpcode())) {
15222 // Make sure constant bus requirements are respected.
15223 TII->legalizeOperandsVOP3(MRI, MI);
15224
15225 // Prefer VGPRs over AGPRs in mAI instructions where possible.
15226 // This saves a chain-copy of registers and better balances register
15227 // use between vgprs and agprs, as agpr tuples tend to be big.
15228 if (!MI.getDesc().operands().empty()) {
15229 unsigned Opc = MI.getOpcode();
15230 bool HasAGPRs = Info->mayNeedAGPRs();
15231 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15232 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::src2);
15233 for (auto I :
15234 {AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::src0),
15235 AMDGPU::getNamedOperandIdx(Opcode: Opc, NamedIdx: AMDGPU::OpName::src1), Src2Idx}) {
15236 if (I == -1)
15237 break;
15238 if ((I == Src2Idx) && (HasAGPRs))
15239 break;
15240 MachineOperand &Op = MI.getOperand(i: I);
15241 if (!Op.isReg() || !Op.getReg().isVirtual())
15242 continue;
15243 auto *RC = TRI->getRegClassForReg(MRI, Reg: Op.getReg());
15244 if (!TRI->hasAGPRs(RC))
15245 continue;
15246 auto *Src = MRI.getUniqueVRegDef(Reg: Op.getReg());
15247 if (!Src || !Src->isCopy() ||
15248 !TRI->isSGPRReg(MRI, Reg: Src->getOperand(i: 1).getReg()))
15249 continue;
15250 auto *NewRC = TRI->getEquivalentVGPRClass(SRC: RC);
15251 // All uses of agpr64 and agpr32 can also accept vgpr except for
15252 // v_accvgpr_read, but we do not produce agpr reads during selection,
15253 // so no use checks are needed.
15254 MRI.setRegClass(Reg: Op.getReg(), RC: NewRC);
15255 }
15256
15257 if (!HasAGPRs)
15258 return;
15259
15260 // Resolve the rest of AV operands to AGPRs.
15261 if (auto *Src2 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src2)) {
15262 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15263 auto *RC = TRI->getRegClassForReg(MRI, Reg: Src2->getReg());
15264 if (TRI->isVectorSuperClass(RC)) {
15265 auto *NewRC = TRI->getEquivalentAGPRClass(SRC: RC);
15266 MRI.setRegClass(Reg: Src2->getReg(), RC: NewRC);
15267 if (Src2->isTied())
15268 MRI.setRegClass(Reg: MI.getOperand(i: 0).getReg(), RC: NewRC);
15269 }
15270 }
15271 }
15272 }
15273
15274 return;
15275 }
15276
15277 if (TII->isImage(MI))
15278 TII->enforceOperandRCAlignment(MI, OpName: AMDGPU::OpName::vaddr);
15279}
15280
15281static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
15282 uint64_t Val) {
15283 SDValue K = DAG.getTargetConstant(Val, DL, VT: MVT::i32);
15284 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: K), 0);
15285}
15286
15287MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
15288 const SDLoc &DL,
15289 SDValue Ptr) const {
15290 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15291
  // Build the constant half of the descriptor as its own 64-bit register
  // before building the full 128-bit register. If we are building multiple
  // resource descriptors, this allows the 2-component constant register to be
  // CSE'd.
15295 const SDValue Ops0[] = {
15296 DAG.getTargetConstant(Val: AMDGPU::SGPR_64RegClassID, DL, VT: MVT::i32),
15297 buildSMovImm32(DAG, DL, Val: 0),
15298 DAG.getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
15299 buildSMovImm32(DAG, DL, Val: TII->getDefaultRsrcDataFormat() >> 32),
15300 DAG.getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32)
15301 };
15302
15303 SDValue SubRegHi = SDValue(DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL,
15304 VT: MVT::v2i32, Ops: Ops0), 0);
15305
15306 // Combine the constants and the pointer.
15307 const SDValue Ops1[] = {
15308 DAG.getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32),
15309 Ptr,
15310 DAG.getTargetConstant(Val: AMDGPU::sub0_sub1, DL, VT: MVT::i32),
15311 SubRegHi,
15312 DAG.getTargetConstant(Val: AMDGPU::sub2_sub3, DL, VT: MVT::i32)
15313 };
15314
15315 return DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v4i32, Ops: Ops1);
15316}
15317
15318/// Return a resource descriptor with the 'Add TID' bit enabled
15319/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15320/// of the resource descriptor) to create an offset, which is added to
15321/// the resource pointer.
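/// For illustration (hypothetical caller, not taken from this file), a plain
/// descriptor around a 64-bit pointer could be built as:
///   MachineSDNode *Rsrc = buildRSRC(DAG, DL, Ptr, /*RsrcDword1=*/0,
///                                   TII->getDefaultRsrcDataFormat());
/// PtrLo/PtrHi end up in dwords 0-1, RsrcDword2And3 in dwords 2-3, and a
/// non-zero RsrcDword1 (which holds the stride field) would be OR'ed into the
/// pointer's high dword.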
15322MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
15323 SDValue Ptr, uint32_t RsrcDword1,
15324 uint64_t RsrcDword2And3) const {
15325 SDValue PtrLo = DAG.getTargetExtractSubreg(SRIdx: AMDGPU::sub0, DL, VT: MVT::i32, Operand: Ptr);
15326 SDValue PtrHi = DAG.getTargetExtractSubreg(SRIdx: AMDGPU::sub1, DL, VT: MVT::i32, Operand: Ptr);
15327 if (RsrcDword1) {
15328 PtrHi = SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_OR_B32, dl: DL, VT: MVT::i32, Op1: PtrHi,
15329 Op2: DAG.getConstant(Val: RsrcDword1, DL, VT: MVT::i32)),
15330 0);
15331 }
15332
15333 SDValue DataLo = buildSMovImm32(DAG, DL,
15334 Val: RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15335 SDValue DataHi = buildSMovImm32(DAG, DL, Val: RsrcDword2And3 >> 32);
15336
15337 const SDValue Ops[] = {
15338 DAG.getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32),
15339 PtrLo,
15340 DAG.getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
15341 PtrHi,
15342 DAG.getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32),
15343 DataLo,
15344 DAG.getTargetConstant(Val: AMDGPU::sub2, DL, VT: MVT::i32),
15345 DataHi,
15346 DAG.getTargetConstant(Val: AMDGPU::sub3, DL, VT: MVT::i32)
15347 };
15348
15349 return DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v4i32, Ops);
15350}
15351
15352//===----------------------------------------------------------------------===//
15353// SI Inline Assembly Support
15354//===----------------------------------------------------------------------===//
15355
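// A rough illustration of the constraints handled in this section (the
// instructions below are placeholders, not a recommendation):
//   asm("v_add_f32 %0, %1, %2" : "=v"(out) : "s"(a), "v"(b)); // class 'v'/'s'
//   asm("s_mov_b64 %0, 0" : "={s[0:1]}"(pair));               // named range
//   asm("v_mov_b32 %0, %1" : "=v"(x) : "I"(42));              // inline imm
// 's', 'v' and 'a' select a register class sized from the operand type,
// "{...}" names a specific register or register range, and the single-letter
// immediate constraints are validated in checkAsmConstraintVal().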
15356std::pair<unsigned, const TargetRegisterClass *>
15357SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
15358 StringRef Constraint,
15359 MVT VT) const {
15360 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15361
15362 const TargetRegisterClass *RC = nullptr;
15363 if (Constraint.size() == 1) {
15364 const unsigned BitWidth = VT.getSizeInBits();
15365 switch (Constraint[0]) {
15366 default:
15367 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15368 case 's':
15369 case 'r':
15370 switch (BitWidth) {
15371 case 16:
15372 RC = &AMDGPU::SReg_32RegClass;
15373 break;
15374 case 64:
15375 RC = &AMDGPU::SGPR_64RegClass;
15376 break;
15377 default:
15378 RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth);
15379 if (!RC)
15380 return std::pair(0U, nullptr);
15381 break;
15382 }
15383 break;
15384 case 'v':
15385 switch (BitWidth) {
15386 case 16:
15387 RC = &AMDGPU::VGPR_32RegClass;
15388 break;
15389 default:
15390 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15391 if (!RC)
15392 return std::pair(0U, nullptr);
15393 break;
15394 }
15395 break;
15396 case 'a':
15397 if (!Subtarget->hasMAIInsts())
15398 break;
15399 switch (BitWidth) {
15400 case 16:
15401 RC = &AMDGPU::AGPR_32RegClass;
15402 break;
15403 default:
15404 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15405 if (!RC)
15406 return std::pair(0U, nullptr);
15407 break;
15408 }
15409 break;
15410 }
15411 // We actually support i128, i16 and f16 as inline parameters
15412 // even if they are not reported as legal
15413 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15414 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15415 return std::pair(0U, RC);
15416 }
15417
15418 if (Constraint.starts_with(Prefix: "{") && Constraint.ends_with(Suffix: "}")) {
15419 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15420 if (RegName.consume_front(Prefix: "v")) {
15421 RC = &AMDGPU::VGPR_32RegClass;
15422 } else if (RegName.consume_front(Prefix: "s")) {
15423 RC = &AMDGPU::SGPR_32RegClass;
15424 } else if (RegName.consume_front(Prefix: "a")) {
15425 RC = &AMDGPU::AGPR_32RegClass;
15426 }
15427
15428 if (RC) {
15429 uint32_t Idx;
15430 if (RegName.consume_front(Prefix: "[")) {
15431 uint32_t End;
15432 bool Failed = RegName.consumeInteger(Radix: 10, Result&: Idx);
15433 Failed |= !RegName.consume_front(Prefix: ":");
15434 Failed |= RegName.consumeInteger(Radix: 10, Result&: End);
15435 Failed |= !RegName.consume_back(Suffix: "]");
15436 if (!Failed) {
15437 uint32_t Width = (End - Idx + 1) * 32;
15438 MCRegister Reg = RC->getRegister(i: Idx);
15439 if (SIRegisterInfo::isVGPRClass(RC))
15440 RC = TRI->getVGPRClassForBitWidth(BitWidth: Width);
15441 else if (SIRegisterInfo::isSGPRClass(RC))
15442 RC = TRI->getSGPRClassForBitWidth(BitWidth: Width);
15443 else if (SIRegisterInfo::isAGPRClass(RC))
15444 RC = TRI->getAGPRClassForBitWidth(BitWidth: Width);
15445 if (RC) {
15446 Reg = TRI->getMatchingSuperReg(Reg, SubIdx: AMDGPU::sub0, RC);
15447 return std::pair(Reg, RC);
15448 }
15449 }
15450 } else {
15451 bool Failed = RegName.getAsInteger(Radix: 10, Result&: Idx);
15452 if (!Failed && Idx < RC->getNumRegs())
15453 return std::pair(RC->getRegister(i: Idx), RC);
15454 }
15455 }
15456 }
15457
15458 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15459 if (Ret.first)
15460 Ret.second = TRI->getPhysRegBaseClass(Reg: Ret.first);
15461
15462 return Ret;
15463}
15464
15465static bool isImmConstraint(StringRef Constraint) {
15466 if (Constraint.size() == 1) {
15467 switch (Constraint[0]) {
15468 default: break;
15469 case 'I':
15470 case 'J':
15471 case 'A':
15472 case 'B':
15473 case 'C':
15474 return true;
15475 }
15476 } else if (Constraint == "DA" ||
15477 Constraint == "DB") {
15478 return true;
15479 }
15480 return false;
15481}
15482
15483SITargetLowering::ConstraintType
15484SITargetLowering::getConstraintType(StringRef Constraint) const {
15485 if (Constraint.size() == 1) {
15486 switch (Constraint[0]) {
15487 default: break;
15488 case 's':
15489 case 'v':
15490 case 'a':
15491 return C_RegisterClass;
15492 }
15493 }
15494 if (isImmConstraint(Constraint)) {
15495 return C_Other;
15496 }
15497 return TargetLowering::getConstraintType(Constraint);
15498}
15499
15500static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15501 if (!AMDGPU::isInlinableIntLiteral(Literal: Val)) {
15502 Val = Val & maskTrailingOnes<uint64_t>(N: Size);
15503 }
15504 return Val;
15505}
15506
15507void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
15508 StringRef Constraint,
15509 std::vector<SDValue> &Ops,
15510 SelectionDAG &DAG) const {
15511 if (isImmConstraint(Constraint)) {
15512 uint64_t Val;
15513 if (getAsmOperandConstVal(Op, Val) &&
15514 checkAsmConstraintVal(Op, Constraint, Val)) {
15515 Val = clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits());
15516 Ops.push_back(x: DAG.getTargetConstant(Val, DL: SDLoc(Op), VT: MVT::i64));
15517 }
15518 } else {
15519 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15520 }
15521}
15522
15523bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
15524 unsigned Size = Op.getScalarValueSizeInBits();
15525 if (Size > 64)
15526 return false;
15527
15528 if (Size == 16 && !Subtarget->has16BitInsts())
15529 return false;
15530
15531 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op)) {
15532 Val = C->getSExtValue();
15533 return true;
15534 }
15535 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
15536 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15537 return true;
15538 }
15539 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Val&: Op)) {
15540 if (Size != 16 || Op.getNumOperands() != 2)
15541 return false;
15542 if (Op.getOperand(i: 0).isUndef() || Op.getOperand(i: 1).isUndef())
15543 return false;
15544 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15545 Val = C->getSExtValue();
15546 return true;
15547 }
15548 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15549 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15550 return true;
15551 }
15552 }
15553
15554 return false;
15555}
15556
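// Validate an immediate operand against one of the immediate constraints.
// Roughly, mirroring the checks below: 'I' accepts an inline integer literal
// (-16..64), 'J' a signed 16-bit value, 'B' a signed 32-bit value, 'C' an
// unsigned 32-bit value or inline literal, 'A' an inline literal of the
// operand's size, "DA" a 64-bit value both of whose 32-bit halves are
// inlinable as 32-bit literals, and "DB" any 64-bit value.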
15557bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
15558 uint64_t Val) const {
15559 if (Constraint.size() == 1) {
15560 switch (Constraint[0]) {
15561 case 'I':
15562 return AMDGPU::isInlinableIntLiteral(Literal: Val);
15563 case 'J':
15564 return isInt<16>(x: Val);
15565 case 'A':
15566 return checkAsmConstraintValA(Op, Val);
15567 case 'B':
15568 return isInt<32>(x: Val);
15569 case 'C':
15570 return isUInt<32>(x: clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits())) ||
15571 AMDGPU::isInlinableIntLiteral(Literal: Val);
15572 default:
15573 break;
15574 }
15575 } else if (Constraint.size() == 2) {
15576 if (Constraint == "DA") {
15577 int64_t HiBits = static_cast<int32_t>(Val >> 32);
15578 int64_t LoBits = static_cast<int32_t>(Val);
15579 return checkAsmConstraintValA(Op, Val: HiBits, MaxSize: 32) &&
15580 checkAsmConstraintValA(Op, Val: LoBits, MaxSize: 32);
15581 }
15582 if (Constraint == "DB") {
15583 return true;
15584 }
15585 }
15586 llvm_unreachable("Invalid asm constraint");
15587}
15588
15589bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
15590 unsigned MaxSize) const {
15591 unsigned Size = std::min<unsigned>(a: Op.getScalarValueSizeInBits(), b: MaxSize);
15592 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
15593 if (Size == 16) {
15594 MVT VT = Op.getSimpleValueType();
15595 switch (VT.SimpleTy) {
15596 default:
15597 return false;
15598 case MVT::i16:
15599 return AMDGPU::isInlinableLiteralI16(Literal: Val, HasInv2Pi);
15600 case MVT::f16:
15601 return AMDGPU::isInlinableLiteralFP16(Literal: Val, HasInv2Pi);
15602 case MVT::bf16:
15603 return AMDGPU::isInlinableLiteralBF16(Literal: Val, HasInv2Pi);
15604 case MVT::v2i16:
15605 return AMDGPU::getInlineEncodingV2I16(Literal: Val).has_value();
15606 case MVT::v2f16:
15607 return AMDGPU::getInlineEncodingV2F16(Literal: Val).has_value();
15608 case MVT::v2bf16:
15609 return AMDGPU::getInlineEncodingV2BF16(Literal: Val).has_value();
15610 }
15611 }
15612 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Literal: Val, HasInv2Pi)) ||
15613 (Size == 64 && AMDGPU::isInlinableLiteral64(Literal: Val, HasInv2Pi)))
15614 return true;
15615 return false;
15616}
15617
15618static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
15619 switch (UnalignedClassID) {
15620 case AMDGPU::VReg_64RegClassID:
15621 return AMDGPU::VReg_64_Align2RegClassID;
15622 case AMDGPU::VReg_96RegClassID:
15623 return AMDGPU::VReg_96_Align2RegClassID;
15624 case AMDGPU::VReg_128RegClassID:
15625 return AMDGPU::VReg_128_Align2RegClassID;
15626 case AMDGPU::VReg_160RegClassID:
15627 return AMDGPU::VReg_160_Align2RegClassID;
15628 case AMDGPU::VReg_192RegClassID:
15629 return AMDGPU::VReg_192_Align2RegClassID;
15630 case AMDGPU::VReg_224RegClassID:
15631 return AMDGPU::VReg_224_Align2RegClassID;
15632 case AMDGPU::VReg_256RegClassID:
15633 return AMDGPU::VReg_256_Align2RegClassID;
15634 case AMDGPU::VReg_288RegClassID:
15635 return AMDGPU::VReg_288_Align2RegClassID;
15636 case AMDGPU::VReg_320RegClassID:
15637 return AMDGPU::VReg_320_Align2RegClassID;
15638 case AMDGPU::VReg_352RegClassID:
15639 return AMDGPU::VReg_352_Align2RegClassID;
15640 case AMDGPU::VReg_384RegClassID:
15641 return AMDGPU::VReg_384_Align2RegClassID;
15642 case AMDGPU::VReg_512RegClassID:
15643 return AMDGPU::VReg_512_Align2RegClassID;
15644 case AMDGPU::VReg_1024RegClassID:
15645 return AMDGPU::VReg_1024_Align2RegClassID;
15646 case AMDGPU::AReg_64RegClassID:
15647 return AMDGPU::AReg_64_Align2RegClassID;
15648 case AMDGPU::AReg_96RegClassID:
15649 return AMDGPU::AReg_96_Align2RegClassID;
15650 case AMDGPU::AReg_128RegClassID:
15651 return AMDGPU::AReg_128_Align2RegClassID;
15652 case AMDGPU::AReg_160RegClassID:
15653 return AMDGPU::AReg_160_Align2RegClassID;
15654 case AMDGPU::AReg_192RegClassID:
15655 return AMDGPU::AReg_192_Align2RegClassID;
15656 case AMDGPU::AReg_256RegClassID:
15657 return AMDGPU::AReg_256_Align2RegClassID;
15658 case AMDGPU::AReg_512RegClassID:
15659 return AMDGPU::AReg_512_Align2RegClassID;
15660 case AMDGPU::AReg_1024RegClassID:
15661 return AMDGPU::AReg_1024_Align2RegClassID;
15662 default:
15663 return -1;
15664 }
15665}
15666
15667// Figure out which registers should be reserved for stack access. Only after
15668// the function is legalized do we know all of the non-spill stack objects or if
15669// calls are present.
15670void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
15671 MachineRegisterInfo &MRI = MF.getRegInfo();
15672 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15673 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
15674 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15675 const SIInstrInfo *TII = ST.getInstrInfo();
15676
15677 if (Info->isEntryFunction()) {
    // Entry functions pick their stack-access registers here; callable
    // functions use fixed registers for stack access.
15679 reservePrivateMemoryRegs(TM: getTargetMachine(), MF, TRI: *TRI, Info&: *Info);
15680 }
15681
15682 // TODO: Move this logic to getReservedRegs()
15683 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
15684 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
15685 Register SReg = ST.isWave32()
15686 ? AMDGPU::SGPR_32RegClass.getRegister(i: MaxNumSGPRs - 1)
15687 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
15688 RC: &AMDGPU::SGPR_64RegClass);
15689 Info->setSGPRForEXECCopy(SReg);
15690
15691 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
15692 Info->getStackPtrOffsetReg()));
15693 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
15694 MRI.replaceRegWith(FromReg: AMDGPU::SP_REG, ToReg: Info->getStackPtrOffsetReg());
15695
15696 // We need to worry about replacing the default register with itself in case
15697 // of MIR testcases missing the MFI.
15698 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
15699 MRI.replaceRegWith(FromReg: AMDGPU::PRIVATE_RSRC_REG, ToReg: Info->getScratchRSrcReg());
15700
15701 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
15702 MRI.replaceRegWith(FromReg: AMDGPU::FP_REG, ToReg: Info->getFrameOffsetReg());
15703
15704 Info->limitOccupancy(MF);
15705
15706 if (ST.isWave32() && !MF.empty()) {
15707 for (auto &MBB : MF) {
15708 for (auto &MI : MBB) {
15709 TII->fixImplicitOperands(MI);
15710 }
15711 }
15712 }
15713
15714 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
15715 // classes if required. Ideally the register class constraints would differ
15716 // per-subtarget, but there's no easy way to achieve that right now. This is
15717 // not a problem for VGPRs because the correctly aligned VGPR class is implied
15718 // from using them as the register class for legal types.
15719 if (ST.needsAlignedVGPRs()) {
15720 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
15721 const Register Reg = Register::index2VirtReg(Index: I);
15722 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
15723 if (!RC)
15724 continue;
15725 int NewClassID = getAlignedAGPRClassID(UnalignedClassID: RC->getID());
15726 if (NewClassID != -1)
15727 MRI.setRegClass(Reg, RC: TRI->getRegClass(RCID: NewClassID));
15728 }
15729 }
15730
15731 TargetLoweringBase::finalizeLowering(MF);
15732}
15733
15734void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
15735 KnownBits &Known,
15736 const APInt &DemandedElts,
15737 const SelectionDAG &DAG,
15738 unsigned Depth) const {
15739 Known.resetAll();
15740 unsigned Opc = Op.getOpcode();
15741 switch (Opc) {
15742 case ISD::INTRINSIC_WO_CHAIN: {
15743 unsigned IID = Op.getConstantOperandVal(i: 0);
15744 switch (IID) {
15745 case Intrinsic::amdgcn_mbcnt_lo:
15746 case Intrinsic::amdgcn_mbcnt_hi: {
15747 const GCNSubtarget &ST =
15748 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
      // These return at most (wavefront size - 1) + src1.
      // As long as we know something about src1's bits we can bound the
      // known bits of the result.
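      // Worked example (illustrative): on wave64 the intrinsic contributes at
      // most 63 (6 bits). If src1 is known to fit in 8 bits, the result fits
      // in max(8, 6) + 1 = 9 bits, so the top 32 - 9 = 23 bits are known zero.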
15751 KnownBits Src1Known = DAG.computeKnownBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
15752 unsigned Src1ValBits = Src1Known.countMaxActiveBits();
15753 unsigned MaxActiveBits = std::max(a: Src1ValBits, b: ST.getWavefrontSizeLog2());
15754 // Cater for potential carry
15755 MaxActiveBits += Src1ValBits ? 1 : 0;
15756 unsigned Size = Op.getValueType().getSizeInBits();
15757 if (MaxActiveBits < Size)
15758 Known.Zero.setHighBits(Size - MaxActiveBits);
15759 return;
15760 }
15761 }
15762 break;
15763 }
15764 }
15765 return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
15766 Op, Known, DemandedElts, DAG, Depth);
15767}
15768
15769void SITargetLowering::computeKnownBitsForFrameIndex(
15770 const int FI, KnownBits &Known, const MachineFunction &MF) const {
15771 TargetLowering::computeKnownBitsForFrameIndex(FIOp: FI, Known, MF);
15772
15773 // Set the high bits to zero based on the maximum allowed scratch size per
15774 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
15775 // calculation won't overflow, so assume the sign bit is never set.
15776 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
15777}
15778
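// Set known-zero high bits for a workitem id based on its maximum value.
// Worked example (illustrative): if dimension 1 is bounded to 4 workitems
// (e.g. via reqd_work_group_size), MaxValue is 3 and countl_zero(3) == 30,
// so the top 30 bits are reported as zero.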
15779static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
15780 KnownBits &Known, unsigned Dim) {
15781 unsigned MaxValue =
15782 ST.getMaxWorkitemID(Kernel: KB.getMachineFunction().getFunction(), Dimension: Dim);
15783 Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue));
15784}
15785
15786void SITargetLowering::computeKnownBitsForTargetInstr(
15787 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
15788 const MachineRegisterInfo &MRI, unsigned Depth) const {
15789 const MachineInstr *MI = MRI.getVRegDef(Reg: R);
15790 switch (MI->getOpcode()) {
15791 case AMDGPU::G_INTRINSIC:
15792 case AMDGPU::G_INTRINSIC_CONVERGENT: {
15793 switch (cast<GIntrinsic>(Val: MI)->getIntrinsicID()) {
15794 case Intrinsic::amdgcn_workitem_id_x:
15795 knownBitsForWorkitemID(ST: *getSubtarget(), KB, Known, Dim: 0);
15796 break;
15797 case Intrinsic::amdgcn_workitem_id_y:
15798 knownBitsForWorkitemID(ST: *getSubtarget(), KB, Known, Dim: 1);
15799 break;
15800 case Intrinsic::amdgcn_workitem_id_z:
15801 knownBitsForWorkitemID(ST: *getSubtarget(), KB, Known, Dim: 2);
15802 break;
15803 case Intrinsic::amdgcn_mbcnt_lo:
15804 case Intrinsic::amdgcn_mbcnt_hi: {
15805 // These return at most the wavefront size - 1.
15806 unsigned Size = MRI.getType(Reg: R).getSizeInBits();
15807 Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
15808 break;
15809 }
15810 case Intrinsic::amdgcn_groupstaticsize: {
15811 // We can report everything over the maximum size as 0. We can't report
15812 // based on the actual size because we don't know if it's accurate or not
15813 // at any given point.
15814 Known.Zero.setHighBits(
15815 llvm::countl_zero(Val: getSubtarget()->getAddressableLocalMemorySize()));
15816 break;
15817 }
15818 }
15819 break;
15820 }
15821 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
15822 Known.Zero.setHighBits(24);
15823 break;
15824 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
15825 Known.Zero.setHighBits(16);
15826 break;
15827 case AMDGPU::G_AMDGPU_SMED3:
15828 case AMDGPU::G_AMDGPU_UMED3: {
15829 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
15830
15831 KnownBits Known2;
15832 KB.computeKnownBitsImpl(R: Src2, Known&: Known2, DemandedElts, Depth: Depth + 1);
15833 if (Known2.isUnknown())
15834 break;
15835
15836 KnownBits Known1;
15837 KB.computeKnownBitsImpl(R: Src1, Known&: Known1, DemandedElts, Depth: Depth + 1);
15838 if (Known1.isUnknown())
15839 break;
15840
15841 KnownBits Known0;
15842 KB.computeKnownBitsImpl(R: Src0, Known&: Known0, DemandedElts, Depth: Depth + 1);
15843 if (Known0.isUnknown())
15844 break;
15845
15846 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
15847 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
15848 Known.One = Known0.One & Known1.One & Known2.One;
15849 break;
15850 }
15851 }
15852}
15853
15854Align SITargetLowering::computeKnownAlignForTargetInstr(
15855 GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
15856 unsigned Depth) const {
15857 const MachineInstr *MI = MRI.getVRegDef(Reg: R);
15858 if (auto *GI = dyn_cast<GIntrinsic>(Val: MI)) {
15859 // FIXME: Can this move to generic code? What about the case where the call
15860 // site specifies a lower alignment?
15861 Intrinsic::ID IID = GI->getIntrinsicID();
15862 LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
15863 AttributeList Attrs = Intrinsic::getAttributes(C&: Ctx, id: IID);
15864 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
15865 return *RetAlign;
15866 }
15867 return Align(1);
15868}
15869
15870Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
15871 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
15872 const Align CacheLineAlign = Align(64);
15873
  // Pre-GFX10 targets do not benefit from loop alignment.
15875 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
15876 getSubtarget()->hasInstFwdPrefetchBug())
15877 return PrefAlign;
15878
  // On GFX10 the I$ consists of 4 x 64-byte cache lines.
  // By default the prefetcher keeps one cache line behind and reads two ahead.
  // With S_INST_PREFETCH we can switch it to keep two lines behind and read
  // one ahead, which helps larger loops.
  // Therefore aligning loop headers pays off if the loop fits in 192 bytes:
  // - If the loop fits in 64 bytes it never spans more than two cache lines
  //   and does not need alignment.
  // - Else, if the loop is at most 128 bytes, alignment suffices and the
  //   prefetch mode does not need to change.
  // - Else, if the loop is at most 192 bytes, we also switch to keeping two
  //   lines behind.
15888
15889 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15890 const MachineBasicBlock *Header = ML->getHeader();
15891 if (Header->getAlignment() != PrefAlign)
15892 return Header->getAlignment(); // Already processed.
15893
15894 unsigned LoopSize = 0;
15895 for (const MachineBasicBlock *MBB : ML->blocks()) {
    // If an inner loop block is aligned, assume on average half of the
    // alignment size is added as nops.
15898 if (MBB != Header)
15899 LoopSize += MBB->getAlignment().value() / 2;
15900
15901 for (const MachineInstr &MI : *MBB) {
15902 LoopSize += TII->getInstSizeInBytes(MI);
15903 if (LoopSize > 192)
15904 return PrefAlign;
15905 }
15906 }
15907
15908 if (LoopSize <= 64)
15909 return PrefAlign;
15910
15911 if (LoopSize <= 128)
15912 return CacheLineAlign;
15913
  // If any parent loop is already surrounded by prefetch instructions, do not
  // insert new ones for the inner loop; that would reset the parent's
  // settings.
15916 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
15917 if (MachineBasicBlock *Exit = P->getExitBlock()) {
15918 auto I = Exit->getFirstNonDebugInstr();
15919 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
15920 return CacheLineAlign;
15921 }
15922 }
15923
15924 MachineBasicBlock *Pre = ML->getLoopPreheader();
15925 MachineBasicBlock *Exit = ML->getExitBlock();
15926
15927 if (Pre && Exit) {
15928 auto PreTerm = Pre->getFirstTerminator();
15929 if (PreTerm == Pre->begin() ||
15930 std::prev(x: PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
15931 BuildMI(BB&: *Pre, I: PreTerm, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_INST_PREFETCH))
15932 .addImm(Val: 1); // prefetch 2 lines behind PC
15933
15934 auto ExitHead = Exit->getFirstNonDebugInstr();
15935 if (ExitHead == Exit->end() ||
15936 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
15937 BuildMI(BB&: *Exit, I: ExitHead, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_INST_PREFETCH))
15938 .addImm(Val: 2); // prefetch 1 line behind PC
15939 }
15940
15941 return CacheLineAlign;
15942}
15943
15944LLVM_ATTRIBUTE_UNUSED
15945static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
15946 assert(N->getOpcode() == ISD::CopyFromReg);
15947 do {
15948 // Follow the chain until we find an INLINEASM node.
15949 N = N->getOperand(Num: 0).getNode();
15950 if (N->getOpcode() == ISD::INLINEASM ||
15951 N->getOpcode() == ISD::INLINEASM_BR)
15952 return true;
15953 } while (N->getOpcode() == ISD::CopyFromReg);
15954 return false;
15955}
15956
15957bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
15958 FunctionLoweringInfo *FLI,
15959 UniformityInfo *UA) const {
15960 switch (N->getOpcode()) {
15961 case ISD::CopyFromReg: {
15962 const RegisterSDNode *R = cast<RegisterSDNode>(Val: N->getOperand(Num: 1));
15963 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
15964 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15965 Register Reg = R->getReg();
15966
15967 // FIXME: Why does this need to consider isLiveIn?
15968 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
15969 return !TRI->isSGPRReg(MRI, Reg);
15970
15971 if (const Value *V = FLI->getValueFromVirtualReg(Vreg: R->getReg()))
15972 return UA->isDivergent(V);
15973
15974 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
15975 return !TRI->isSGPRReg(MRI, Reg);
15976 }
15977 case ISD::LOAD: {
15978 const LoadSDNode *L = cast<LoadSDNode>(Val: N);
15979 unsigned AS = L->getAddressSpace();
15980 // A flat load may access private memory.
15981 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
15982 }
15983 case ISD::CALLSEQ_END:
15984 return true;
15985 case ISD::INTRINSIC_WO_CHAIN:
15986 return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: 0));
15987 case ISD::INTRINSIC_W_CHAIN:
15988 return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: 1));
15989 case AMDGPUISD::ATOMIC_CMP_SWAP:
15990 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
15991 case AMDGPUISD::BUFFER_ATOMIC_ADD:
15992 case AMDGPUISD::BUFFER_ATOMIC_SUB:
15993 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
15994 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
15995 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
15996 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
15997 case AMDGPUISD::BUFFER_ATOMIC_AND:
15998 case AMDGPUISD::BUFFER_ATOMIC_OR:
15999 case AMDGPUISD::BUFFER_ATOMIC_XOR:
16000 case AMDGPUISD::BUFFER_ATOMIC_INC:
16001 case AMDGPUISD::BUFFER_ATOMIC_DEC:
16002 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
16003 case AMDGPUISD::BUFFER_ATOMIC_CSUB:
16004 case AMDGPUISD::BUFFER_ATOMIC_FADD:
16005 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
16006 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
16007 // Target-specific read-modify-write atomics are sources of divergence.
16008 return true;
16009 default:
16010 if (auto *A = dyn_cast<AtomicSDNode>(Val: N)) {
16011 // Generic read-modify-write atomics are sources of divergence.
16012 return A->readMem() && A->writeMem();
16013 }
16014 return false;
16015 }
16016}
16017
16018bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
16019 EVT VT) const {
16020 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
16021 case MVT::f32:
16022 return !denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
16023 case MVT::f64:
16024 case MVT::f16:
16025 return !denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction());
16026 default:
16027 return false;
16028 }
16029}
16030
16031bool SITargetLowering::denormalsEnabledForType(
16032 LLT Ty, const MachineFunction &MF) const {
16033 switch (Ty.getScalarSizeInBits()) {
16034 case 32:
16035 return !denormalModeIsFlushAllF32(MF);
16036 case 64:
16037 case 16:
16038 return !denormalModeIsFlushAllF64F16(MF);
16039 default:
16040 return false;
16041 }
16042}
16043
16044bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
16045 const SelectionDAG &DAG,
16046 bool SNaN,
16047 unsigned Depth) const {
16048 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
16049 const MachineFunction &MF = DAG.getMachineFunction();
16050 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16051
16052 if (Info->getMode().DX10Clamp)
16053 return true; // Clamped to 0.
16054 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1);
16055 }
16056
16057 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
16058 SNaN, Depth);
16059}
16060
16061#if 0
16062// FIXME: This should be checked before unsafe fp atomics are enabled
16063// Global FP atomic instructions have a hardcoded FP mode and do not support
16064// FP32 denormals, and only support v2f16 denormals.
16065static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16066 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16067 auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16068 if (&Flt == &APFloat::IEEEsingle())
16069 return DenormMode == DenormalMode::getPreserveSign();
16070 return DenormMode == DenormalMode::getIEEE();
16071}
16072#endif
16073
16074// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16075// floating point atomic instructions. May generate more efficient code,
16076// but may not respect rounding and denormal modes, and may give incorrect
16077// results for certain memory destinations.
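// For illustration, the attribute appears in IR as a function attribute
// (hypothetical module):
//   define void @f(ptr %p) #0 { ... }
//   attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" }
// Any value other than the exact string "true" (including the attribute being
// absent) is treated as disabled here.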
static bool unsafeFPAtomicsDisabled(Function *F) {
16079 return F->getFnAttribute(Kind: "amdgpu-unsafe-fp-atomics").getValueAsString() !=
16080 "true";
16081}
16082
16083static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
16084 LLVMContext &Ctx = RMW->getContext();
16085 SmallVector<StringRef> SSNs;
16086 Ctx.getSyncScopeNames(SSNs);
16087 StringRef MemScope = SSNs[RMW->getSyncScopeID()].empty()
16088 ? "system"
16089 : SSNs[RMW->getSyncScopeID()];
16090
16091 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16092 << "Hardware instruction generated for atomic "
16093 << RMW->getOperationName(Op: RMW->getOperation())
16094 << " operation at memory scope " << MemScope;
16095}
16096
16097static bool isHalf2OrBFloat2(Type *Ty) {
16098 if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
16099 Type *EltTy = VT->getElementType();
16100 return VT->getNumElements() == 2 &&
16101 (EltTy->isHalfTy() || EltTy->isBFloatTy());
16102 }
16103
16104 return false;
16105}
16106
16107static bool isHalf2(Type *Ty) {
16108 FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty);
16109 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
16110}
16111
16112static bool isBFloat2(Type *Ty) {
16113 FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty);
16114 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
16115}
16116
16117TargetLowering::AtomicExpansionKind
16118SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16119 unsigned AS = RMW->getPointerAddressSpace();
16120 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16121 return AtomicExpansionKind::NotAtomic;
16122
16123 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16124 OptimizationRemarkEmitter ORE(RMW->getFunction());
16125 ORE.emit(RemarkBuilder: [=]() {
16126 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16127 });
16128 return Kind;
16129 };
16130
16131 auto SSID = RMW->getSyncScopeID();
16132 bool HasSystemScope =
16133 SSID == SyncScope::System ||
16134 SSID == RMW->getContext().getOrInsertSyncScopeID(SSN: "one-as");
16135
16136 switch (RMW->getOperation()) {
16137 case AtomicRMWInst::Sub:
16138 case AtomicRMWInst::Or:
16139 case AtomicRMWInst::Xor: {
    // Atomic sub/or/xor do not work over PCI express, but atomic add does.
    // InstCombine transforms these operations with a constant zero operand
    // into 'or 0', so undo that here.
16142 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16143 if (Constant *ConstVal = dyn_cast<Constant>(Val: RMW->getValOperand());
16144 ConstVal && ConstVal->isNullValue())
16145 return AtomicExpansionKind::Expand;
16146 }
16147
16148 break;
16149 }
16150 case AtomicRMWInst::FAdd: {
16151 Type *Ty = RMW->getType();
16152
16153 // TODO: Handle REGION_ADDRESS
16154 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16155 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16156 // is fixed to round-to-nearest-even.
16157 //
16158 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16159 // round-to-nearest-even.
16160 //
      // We ignore the rounding mode problem, even in strictfp. The C++
      // standard suggests it is acceptable for the floating-point mode not to
      // match that of the calling thread.
16164 if (Ty->isFloatTy()) {
16165 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
16166 : AtomicExpansionKind::CmpXChg;
16167 }
16168
16169 if (Ty->isDoubleTy()) {
16170 // Ignores denormal mode, but we don't consider flushing mandatory.
16171 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
16172 : AtomicExpansionKind::CmpXChg;
16173 }
16174
16175 if (Subtarget->hasAtomicDsPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16176 return AtomicExpansionKind::None;
16177
16178 return AtomicExpansionKind::CmpXChg;
16179 }
16180
16181 if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
16182 AS != AMDGPUAS::BUFFER_FAT_POINTER)
16183 return AtomicExpansionKind::CmpXChg;
16184
16185 if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16186 return AtomicExpansionKind::None;
16187
16188 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16189 // gfx940, gfx12
16190 // FIXME: Needs to account for no fine-grained memory
16191 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16192 return AtomicExpansionKind::None;
16193 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16194 // gfx90a, gfx940, gfx12
16195 // FIXME: Needs to account for no fine-grained memory
16196 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16197 return AtomicExpansionKind::None;
16198
16199 // gfx940, gfx12
16200 // FIXME: Needs to account for no fine-grained memory
16201 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16202 return AtomicExpansionKind::None;
16203 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16204 // gfx90a, gfx940, gfx12
16205 // FIXME: Needs to account for no fine-grained memory
16206 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16207 return AtomicExpansionKind::None;
16208
      // While gfx90a/gfx940 support v2bf16 for global/flat, they do not for
      // buffer. gfx12 does have the buffer version.
16211 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16212 return AtomicExpansionKind::None;
16213 }
16214
16215 if (unsafeFPAtomicsDisabled(F: RMW->getFunction()))
16216 return AtomicExpansionKind::CmpXChg;
16217
16218 // Always expand system scope fp atomics.
16219 if (HasSystemScope)
16220 return AtomicExpansionKind::CmpXChg;
16221
16222 // global and flat atomic fadd f64: gfx90a, gfx940.
16223 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16224 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16225
16226 if (AS != AMDGPUAS::FLAT_ADDRESS) {
16227 if (Ty->isFloatTy()) {
16228 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16229 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16230 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16231 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16232 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16233 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16234 } else {
16235 // gfx908
16236 if (RMW->use_empty() &&
16237 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && isHalf2(Ty))
16238 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16239 }
16240 }
16241
16242 // flat atomic fadd f32: gfx940, gfx11+.
16243 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16244 if (Subtarget->hasFlatAtomicFaddF32Inst())
16245 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16246
      // If the pointer is in the flat address space and the type is float, we
      // try to expand the operation when the target supports both global and
      // LDS atomic fadd. The expansion emits a runtime address-space check:
      // if the pointer is in the global address space we emit the global
      // atomic fadd, and if it is in the shared address space we emit the LDS
      // atomic fadd.
16252 if (Subtarget->hasLDSFPAtomicAddF32()) {
16253 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16254 return AtomicExpansionKind::Expand;
16255 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16256 return AtomicExpansionKind::Expand;
16257 }
16258 }
16259
16260 return AtomicExpansionKind::CmpXChg;
16261 }
16262 case AtomicRMWInst::FMin:
16263 case AtomicRMWInst::FMax: {
16264 Type *Ty = RMW->getType();
16265
16266 // LDS float and double fmin/fmax were always supported.
16267 if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy()))
16268 return AtomicExpansionKind::None;
16269
16270 if (unsafeFPAtomicsDisabled(F: RMW->getFunction()))
16271 return AtomicExpansionKind::CmpXChg;
16272
16273 // Always expand system scope fp atomics.
16274 if (HasSystemScope)
16275 return AtomicExpansionKind::CmpXChg;
16276
16277 // For flat and global cases:
16278 // float, double in gfx7. Manual claims denormal support.
16279 // Removed in gfx8.
16280 // float, double restored in gfx10.
16281 // double removed again in gfx11, so only f32 for gfx11/gfx12.
16282 //
16283 // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no
16284 // f32.
16285 //
16286 // FIXME: Check scope and fine grained memory
16287 if (AS == AMDGPUAS::FLAT_ADDRESS) {
16288 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
16289 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16290 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
16291 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16292 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
16293 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16294 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
16295 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16296 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
16297 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16298 }
16299
16300 return AtomicExpansionKind::CmpXChg;
16301 }
16302 case AtomicRMWInst::Min:
16303 case AtomicRMWInst::Max:
16304 case AtomicRMWInst::UMin:
16305 case AtomicRMWInst::UMax: {
16306 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
16307 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16308 // Always expand system scope min/max atomics.
16309 if (HasSystemScope)
16310 return AtomicExpansionKind::CmpXChg;
16311 }
16312 break;
16313 }
16314 default:
16315 break;
16316 }
16317
16318 return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
16319}
16320
16321TargetLowering::AtomicExpansionKind
16322SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
16323 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16324 ? AtomicExpansionKind::NotAtomic
16325 : AtomicExpansionKind::None;
16326}
16327
16328TargetLowering::AtomicExpansionKind
16329SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
16330 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16331 ? AtomicExpansionKind::NotAtomic
16332 : AtomicExpansionKind::None;
16333}
16334
16335TargetLowering::AtomicExpansionKind
16336SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
16337 return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16338 ? AtomicExpansionKind::NotAtomic
16339 : AtomicExpansionKind::None;
16340}
16341
16342const TargetRegisterClass *
16343SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16344 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, isDivergent: false);
16345 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16346 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16347 return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
16348 : &AMDGPU::SReg_32RegClass;
16349 if (!TRI->isSGPRClass(RC) && !isDivergent)
16350 return TRI->getEquivalentSGPRClass(VRC: RC);
16351 if (TRI->isSGPRClass(RC) && isDivergent)
16352 return TRI->getEquivalentVGPRClass(SRC: RC);
16353
16354 return RC;
16355}
16356
16357// FIXME: This is a workaround for DivergenceAnalysis not understanding always
16358// uniform values (as produced by the mask results of control flow intrinsics)
16359// used outside of divergent blocks. The phi users need to also be treated as
16360// always uniform.
16361//
// FIXME: DA is no longer in use. Does this still apply to UniformityAnalysis?
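// Illustrative IR for the pattern this catches (wave64; the mask type is the
// wave-size integer):
//   %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cc)
//   %mask = extractvalue { i1, i64 } %if, 1
//   ...
//   call void @llvm.amdgcn.end.cf.i64(i64 %mask)
// The i64 mask (and any phi of it) must stay in an SGPR even when it is used
// outside the block that defines it.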
16363static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
16364 unsigned WaveSize) {
  // FIXME: We assume we never cast the mask results of a control flow
  // intrinsic.
  // As a compile-time hack, exit early if the type cannot be a wave-size mask.
16368 IntegerType *IT = dyn_cast<IntegerType>(Val: V->getType());
16369 if (!IT || IT->getBitWidth() != WaveSize)
16370 return false;
16371
16372 if (!isa<Instruction>(Val: V))
16373 return false;
16374 if (!Visited.insert(Ptr: V).second)
16375 return false;
16376 bool Result = false;
16377 for (const auto *U : V->users()) {
16378 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: U)) {
16379 if (V == U->getOperand(i: 1)) {
16380 switch (Intrinsic->getIntrinsicID()) {
16381 default:
16382 Result = false;
16383 break;
16384 case Intrinsic::amdgcn_if_break:
16385 case Intrinsic::amdgcn_if:
16386 case Intrinsic::amdgcn_else:
16387 Result = true;
16388 break;
16389 }
16390 }
16391 if (V == U->getOperand(i: 0)) {
16392 switch (Intrinsic->getIntrinsicID()) {
16393 default:
16394 Result = false;
16395 break;
16396 case Intrinsic::amdgcn_end_cf:
16397 case Intrinsic::amdgcn_loop:
16398 Result = true;
16399 break;
16400 }
16401 }
16402 } else {
16403 Result = hasCFUser(V: U, Visited, WaveSize);
16404 }
16405 if (Result)
16406 break;
16407 }
16408 return Result;
16409}
16410
16411bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
16412 const Value *V) const {
16413 if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
16414 if (CI->isInlineAsm()) {
16415 // FIXME: This cannot give a correct answer. This should only trigger in
16416 // the case where inline asm returns mixed SGPR and VGPR results, used
16417 // outside the defining block. We don't have a specific result to
16418 // consider, so this assumes if any value is SGPR, the overall register
16419 // also needs to be SGPR.
16420 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16421 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
16422 DL: MF.getDataLayout(), TRI: Subtarget->getRegisterInfo(), Call: *CI);
16423 for (auto &TC : TargetConstraints) {
16424 if (TC.Type == InlineAsm::isOutput) {
16425 ComputeConstraintToUse(OpInfo&: TC, Op: SDValue());
16426 const TargetRegisterClass *RC = getRegForInlineAsmConstraint(
16427 TRI_: SIRI, Constraint: TC.ConstraintCode, VT: TC.ConstraintVT).second;
16428 if (RC && SIRI->isSGPRClass(RC))
16429 return true;
16430 }
16431 }
16432 }
16433 }
16434 SmallPtrSet<const Value *, 16> Visited;
16435 return hasCFUser(V, Visited, WaveSize: Subtarget->getWavefrontSize());
16436}
16437
16438bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
16439 SDNode::use_iterator I = N->use_begin(), E = N->use_end();
16440 for (; I != E; ++I) {
16441 if (MemSDNode *M = dyn_cast<MemSDNode>(Val: *I)) {
16442 if (getBasePtrIndex(N: M) == I.getOperandNo())
16443 return true;
16444 }
16445 }
16446 return false;
16447}
16448
16449bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
16450 SDValue N1) const {
16451 if (!N0.hasOneUse())
16452 return false;
  // Preserve the opportunity to keep N0 uniform.
16454 if (N0->isDivergent() || !N1->isDivergent())
16455 return true;
  // Check whether we have a good chance of forming a (base + constant offset)
  // memory access pattern.
16458 return (DAG.isBaseWithConstantOffset(Op: N0) &&
16459 hasMemSDNodeUser(N: *N0->use_begin()));
16460}
16461
16462bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
16463 Register N0, Register N1) const {
16464 return MRI.hasOneNonDBGUse(RegNo: N0); // FIXME: handle regbanks
16465}
16466
16467MachineMemOperand::Flags
16468SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
16469 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
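  // For illustration, annotated IR looks roughly like:
  //   %v = load i32, ptr addrspace(1) %p, !amdgpu.noclobber !0
  // The resulting MONoClobber / MOLastUse flags let later passes treat the
  // access as not clobbered by a potentially aliasing write, or as a
  // last-use access for cache-policy purposes.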
16470 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
16471 if (I.getMetadata(Kind: "amdgpu.noclobber"))
16472 Flags |= MONoClobber;
16473 if (I.getMetadata(Kind: "amdgpu.last.use"))
16474 Flags |= MOLastUse;
16475 return Flags;
16476}
16477
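// Illustrative scenario for the hook below: a comparison selected to an
// S_CMP_* instruction implicitly defines SCC, so a CopyToReg of its i1
// result carries a dependency on the physical register SCC; we report that
// register and the cost of copying out of it.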
16478bool SITargetLowering::checkForPhysRegDependency(
16479 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
16480 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
16481 if (User->getOpcode() != ISD::CopyToReg)
16482 return false;
16483 if (!Def->isMachineOpcode())
16484 return false;
16485 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Val: Def);
16486 if (!MDef)
16487 return false;
16488
16489 unsigned ResNo = User->getOperand(Num: Op).getResNo();
16490 if (User->getOperand(Num: Op)->getValueType(ResNo) != MVT::i1)
16491 return false;
16492 const MCInstrDesc &II = TII->get(Opcode: MDef->getMachineOpcode());
16493 if (II.isCompare() && II.hasImplicitDefOfPhysReg(Reg: AMDGPU::SCC)) {
16494 PhysReg = AMDGPU::SCC;
16495 const TargetRegisterClass *RC =
16496 TRI->getMinimalPhysRegClass(Reg: PhysReg, VT: Def->getSimpleValueType(ResNo));
16497 Cost = RC->getCopyCost();
16498 return true;
16499 }
16500 return false;
16501}
16502
16503void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
16504 AtomicRMWInst::BinOp Op = AI->getOperation();
16505
16506 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
16507 Op == AtomicRMWInst::Xor) {
16508 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16509 assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16510 "this cannot be replaced with add");
16511 AI->setOperation(AtomicRMWInst::Add);
16512 return;
16513 }
16514
16515 assert(Subtarget->hasAtomicFaddInsts() &&
16516 "target should have atomic fadd instructions");
16517 assert(AI->getType()->isFloatTy() &&
16518 AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
16519 "generic atomicrmw expansion only supports FP32 operand in flat "
16520 "address space");
16521 assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
16522
16523 // Given: atomicrmw fadd ptr %addr, float %val ordering
16524 //
16525 // With this expansion we produce the following code:
16526 // [...]
16527 // br label %atomicrmw.check.shared
16528 //
16529 // atomicrmw.check.shared:
16530 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
16531 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
16532 //
16533 // atomicrmw.shared:
16534 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
16535 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
16536 // float %val ordering
16537 // br label %atomicrmw.phi
16538 //
16539 // atomicrmw.check.private:
16540 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
16541 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
16542 //
16543 // atomicrmw.private:
16544 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
16545 // %loaded.private = load float, ptr addrspace(5) %cast.private
16546 // %val.new = fadd float %loaded.private, %val
16547 // store float %val.new, ptr addrspace(5) %cast.private
16548 // br label %atomicrmw.phi
16549 //
16550 // atomicrmw.global:
16551 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
16552 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
16553 // float %val ordering
16554 // br label %atomicrmw.phi
16555 //
16556 // atomicrmw.phi:
16557 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
16558 // [ %loaded.private, %atomicrmw.private ],
16559 // [ %loaded.global, %atomicrmw.global ]
16560 // br label %atomicrmw.end
16561 //
16562 // atomicrmw.end:
16563 // [...]
16564
16565 IRBuilder<> Builder(AI);
16566 LLVMContext &Ctx = Builder.getContext();
16567
16568 BasicBlock *BB = Builder.GetInsertBlock();
16569 Function *F = BB->getParent();
16570 BasicBlock *ExitBB =
16571 BB->splitBasicBlock(I: Builder.GetInsertPoint(), BBName: "atomicrmw.end");
16572 BasicBlock *CheckSharedBB =
16573 BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.check.shared", Parent: F, InsertBefore: ExitBB);
16574 BasicBlock *SharedBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.shared", Parent: F, InsertBefore: ExitBB);
16575 BasicBlock *CheckPrivateBB =
16576 BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.check.private", Parent: F, InsertBefore: ExitBB);
16577 BasicBlock *PrivateBB =
16578 BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.private", Parent: F, InsertBefore: ExitBB);
16579 BasicBlock *GlobalBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.global", Parent: F, InsertBefore: ExitBB);
16580 BasicBlock *PhiBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.phi", Parent: F, InsertBefore: ExitBB);
16581
16582 Value *Val = AI->getValOperand();
16583 Type *ValTy = Val->getType();
16584 Value *Addr = AI->getPointerOperand();
16585
16586 auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr,
16587 Value *Val) -> Value * {
16588 AtomicRMWInst *OldVal =
16589 Builder.CreateAtomicRMW(Op: AI->getOperation(), Ptr: Addr, Val, Align: AI->getAlign(),
16590 Ordering: AI->getOrdering(), SSID: AI->getSyncScopeID());
16591 SmallVector<std::pair<unsigned, MDNode *>> MDs;
16592 AI->getAllMetadata(MDs);
16593 for (auto &P : MDs)
16594 OldVal->setMetadata(KindID: P.first, Node: P.second);
16595 return OldVal;
16596 };
16597
16598 std::prev(x: BB->end())->eraseFromParent();
16599 Builder.SetInsertPoint(BB);
16600 Builder.CreateBr(Dest: CheckSharedBB);
16601
16602 Builder.SetInsertPoint(CheckSharedBB);
16603 CallInst *IsShared = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_is_shared, Types: {},
16604 Args: {Addr}, FMFSource: nullptr, Name: "is.shared");
16605 Builder.CreateCondBr(Cond: IsShared, True: SharedBB, False: CheckPrivateBB);
16606
16607 Builder.SetInsertPoint(SharedBB);
16608 Value *CastToLocal = Builder.CreateAddrSpaceCast(
16609 V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS));
16610 Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
16611 Builder.CreateBr(Dest: PhiBB);
16612
16613 Builder.SetInsertPoint(CheckPrivateBB);
16614 CallInst *IsPrivate = Builder.CreateIntrinsic(
16615 ID: Intrinsic::amdgcn_is_private, Types: {}, Args: {Addr}, FMFSource: nullptr, Name: "is.private");
16616 Builder.CreateCondBr(Cond: IsPrivate, True: PrivateBB, False: GlobalBB);
16617
16618 Builder.SetInsertPoint(PrivateBB);
16619 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
16620 V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::PRIVATE_ADDRESS));
16621 Value *LoadedPrivate =
16622 Builder.CreateLoad(Ty: ValTy, Ptr: CastToPrivate, Name: "loaded.private");
16623 Value *NewVal = Builder.CreateFAdd(L: LoadedPrivate, R: Val, Name: "val.new");
16624 Builder.CreateStore(Val: NewVal, Ptr: CastToPrivate);
16625 Builder.CreateBr(Dest: PhiBB);
16626
16627 Builder.SetInsertPoint(GlobalBB);
16628 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16629 V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
16630 Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
16631 Builder.CreateBr(Dest: PhiBB);
16632
16633 Builder.SetInsertPoint(PhiBB);
16634 PHINode *Loaded = Builder.CreatePHI(Ty: ValTy, NumReservedValues: 3, Name: "loaded.phi");
16635 Loaded->addIncoming(V: LoadedShared, BB: SharedBB);
16636 Loaded->addIncoming(V: LoadedPrivate, BB: PrivateBB);
16637 Loaded->addIncoming(V: LoadedGlobal, BB: GlobalBB);
16638 Builder.CreateBr(Dest: ExitBB);
16639
16640 AI->replaceAllUsesWith(V: Loaded);
16641 AI->eraseFromParent();
16642}
16643
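// Illustrative transform performed below (when the ordering is not release
// or stronger): an idempotent RMW such as
//   %old = atomicrmw or ptr %p, i32 0 acquire
// becomes
//   %old = load atomic i32, ptr %p acquire, align 4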
16644LoadInst *
16645SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
16646 IRBuilder<> Builder(AI);
16647 auto Order = AI->getOrdering();
16648
  // The optimization removes the store aspect of the atomicrmw. Therefore,
  // the cache must be flushed if the atomic ordering had release semantics.
  // That flush is not necessarily a fence; a release fence just happens to
  // perform it. Avoid replacing an atomicrmw that has release semantics.
16653 if (isReleaseOrStronger(AO: Order))
16654 return nullptr;
16655
16656 LoadInst *LI = Builder.CreateAlignedLoad(
16657 Ty: AI->getType(), Ptr: AI->getPointerOperand(), Align: AI->getAlign());
16658 LI->setAtomic(Ordering: Order, SSID: AI->getSyncScopeID());
16659 LI->copyMetadata(SrcInst: *AI);
16660 LI->takeName(V: AI);
16661 AI->replaceAllUsesWith(V: LI);
16662 AI->eraseFromParent();
16663 return LI;
16664}
16665