1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "AMDGPUSelectionDAGInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "GCNSubtarget.h"
21#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22#include "SIMachineFunctionInfo.h"
23#include "SIRegisterInfo.h"
24#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/FloatingPointMode.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/Analysis/OptimizationRemarkEmitter.h"
28#include "llvm/Analysis/UniformityAnalysis.h"
29#include "llvm/CodeGen/Analysis.h"
30#include "llvm/CodeGen/ByteProvider.h"
31#include "llvm/CodeGen/FunctionLoweringInfo.h"
32#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
33#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
34#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
35#include "llvm/CodeGen/MachineFrameInfo.h"
36#include "llvm/CodeGen/MachineFunction.h"
37#include "llvm/CodeGen/MachineLoopInfo.h"
38#include "llvm/CodeGen/PseudoSourceValueManager.h"
39#include "llvm/CodeGen/SDPatternMatch.h"
40#include "llvm/IR/DiagnosticInfo.h"
41#include "llvm/IR/IRBuilder.h"
42#include "llvm/IR/IntrinsicInst.h"
43#include "llvm/IR/IntrinsicsAMDGPU.h"
44#include "llvm/IR/IntrinsicsR600.h"
45#include "llvm/IR/MDBuilder.h"
46#include "llvm/Support/CommandLine.h"
47#include "llvm/Support/KnownBits.h"
48#include "llvm/Support/ModRef.h"
49#include "llvm/Transforms/Utils/LowerAtomic.h"
50#include <optional>
51
using namespace llvm;
using namespace llvm::SDPatternMatch;

#define DEBUG_TYPE "si-lower"

// Counts tail calls emitted by this lowering (reported with -stats).
STATISTIC(NumTailCalls, "Number of tail calls");

// Debug/tuning knob: suppress loop alignment and the associated prefetching.
static cl::opt<bool>
 DisableLoopAlignment("amdgpu-disable-loop-alignment",
 cl::desc("Do not align and prefetch loops"),
 cl::init(Val: false));

// Debug/tuning knob: force indirect (register-indexed) addressing instead of
// the default lowering when the index is divergent.
static cl::opt<bool> UseDivergentRegisterIndexing(
 "amdgpu-use-divergent-register-indexing", cl::Hidden,
 cl::desc("Use indirect register addressing for divergent indexes"),
 cl::init(Val: false));
68
69static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
70 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
71 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
72}
73
74static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
75 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
76 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
77}
78
79static unsigned findFirstFreeSGPR(CCState &CCInfo) {
80 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
81 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
82 if (!CCInfo.isAllocated(Reg: AMDGPU::SGPR0 + Reg)) {
83 return AMDGPU::SGPR0 + Reg;
84 }
85 }
86 llvm_unreachable("Cannot allocate sgpr");
87}
88
89SITargetLowering::SITargetLowering(const TargetMachine &TM,
90 const GCNSubtarget &STI)
91 : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
92 addRegisterClass(VT: MVT::i1, RC: &AMDGPU::VReg_1RegClass);
93 addRegisterClass(VT: MVT::i64, RC: &AMDGPU::SReg_64RegClass);
94
95 addRegisterClass(VT: MVT::i32, RC: &AMDGPU::SReg_32RegClass);
96
97 const SIRegisterInfo *TRI = STI.getRegisterInfo();
98 const TargetRegisterClass *V32RegClass =
99 TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 32);
100 addRegisterClass(VT: MVT::f32, RC: V32RegClass);
101
102 addRegisterClass(VT: MVT::v2i32, RC: &AMDGPU::SReg_64RegClass);
103
104 const TargetRegisterClass *V64RegClass =
105 TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 64);
106
107 addRegisterClass(VT: MVT::f64, RC: V64RegClass);
108 addRegisterClass(VT: MVT::v2f32, RC: V64RegClass);
109 addRegisterClass(VT: MVT::Untyped, RC: V64RegClass);
110
111 addRegisterClass(VT: MVT::v3i32, RC: &AMDGPU::SGPR_96RegClass);
112 addRegisterClass(VT: MVT::v3f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 96));
113
114 addRegisterClass(VT: MVT::v2i64, RC: &AMDGPU::SGPR_128RegClass);
115 addRegisterClass(VT: MVT::v2f64, RC: &AMDGPU::SGPR_128RegClass);
116
117 addRegisterClass(VT: MVT::v4i32, RC: &AMDGPU::SGPR_128RegClass);
118 addRegisterClass(VT: MVT::v4f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 128));
119
120 addRegisterClass(VT: MVT::v5i32, RC: &AMDGPU::SGPR_160RegClass);
121 addRegisterClass(VT: MVT::v5f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 160));
122
123 addRegisterClass(VT: MVT::v6i32, RC: &AMDGPU::SGPR_192RegClass);
124 addRegisterClass(VT: MVT::v6f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 192));
125
126 addRegisterClass(VT: MVT::v3i64, RC: &AMDGPU::SGPR_192RegClass);
127 addRegisterClass(VT: MVT::v3f64, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 192));
128
129 addRegisterClass(VT: MVT::v7i32, RC: &AMDGPU::SGPR_224RegClass);
130 addRegisterClass(VT: MVT::v7f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 224));
131
132 addRegisterClass(VT: MVT::v8i32, RC: &AMDGPU::SGPR_256RegClass);
133 addRegisterClass(VT: MVT::v8f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 256));
134
135 addRegisterClass(VT: MVT::v4i64, RC: &AMDGPU::SGPR_256RegClass);
136 addRegisterClass(VT: MVT::v4f64, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 256));
137
138 addRegisterClass(VT: MVT::v9i32, RC: &AMDGPU::SGPR_288RegClass);
139 addRegisterClass(VT: MVT::v9f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 288));
140
141 addRegisterClass(VT: MVT::v10i32, RC: &AMDGPU::SGPR_320RegClass);
142 addRegisterClass(VT: MVT::v10f32,
143 RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 320));
144
145 addRegisterClass(VT: MVT::v11i32, RC: &AMDGPU::SGPR_352RegClass);
146 addRegisterClass(VT: MVT::v11f32,
147 RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 352));
148
149 addRegisterClass(VT: MVT::v12i32, RC: &AMDGPU::SGPR_384RegClass);
150 addRegisterClass(VT: MVT::v12f32,
151 RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 384));
152
153 addRegisterClass(VT: MVT::v16i32, RC: &AMDGPU::SGPR_512RegClass);
154 addRegisterClass(VT: MVT::v16f32,
155 RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 512));
156
157 addRegisterClass(VT: MVT::v8i64, RC: &AMDGPU::SGPR_512RegClass);
158 addRegisterClass(VT: MVT::v8f64, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 512));
159
160 addRegisterClass(VT: MVT::v16i64, RC: &AMDGPU::SGPR_1024RegClass);
161 addRegisterClass(VT: MVT::v16f64,
162 RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 1024));
163
164 if (Subtarget->has16BitInsts()) {
165 if (Subtarget->useRealTrue16Insts()) {
166 addRegisterClass(VT: MVT::i16, RC: &AMDGPU::VGPR_16RegClass);
167 addRegisterClass(VT: MVT::f16, RC: &AMDGPU::VGPR_16RegClass);
168 addRegisterClass(VT: MVT::bf16, RC: &AMDGPU::VGPR_16RegClass);
169 } else {
170 addRegisterClass(VT: MVT::i16, RC: &AMDGPU::SReg_32RegClass);
171 addRegisterClass(VT: MVT::f16, RC: &AMDGPU::SReg_32RegClass);
172 addRegisterClass(VT: MVT::bf16, RC: &AMDGPU::SReg_32RegClass);
173 }
174
175 // Unless there are also VOP3P operations, not operations are really legal.
176 addRegisterClass(VT: MVT::v2i16, RC: &AMDGPU::SReg_32RegClass);
177 addRegisterClass(VT: MVT::v2f16, RC: &AMDGPU::SReg_32RegClass);
178 addRegisterClass(VT: MVT::v2bf16, RC: &AMDGPU::SReg_32RegClass);
179 addRegisterClass(VT: MVT::v4i16, RC: &AMDGPU::SReg_64RegClass);
180 addRegisterClass(VT: MVT::v4f16, RC: &AMDGPU::SReg_64RegClass);
181 addRegisterClass(VT: MVT::v4bf16, RC: &AMDGPU::SReg_64RegClass);
182 addRegisterClass(VT: MVT::v8i16, RC: &AMDGPU::SGPR_128RegClass);
183 addRegisterClass(VT: MVT::v8f16, RC: &AMDGPU::SGPR_128RegClass);
184 addRegisterClass(VT: MVT::v8bf16, RC: &AMDGPU::SGPR_128RegClass);
185 addRegisterClass(VT: MVT::v16i16, RC: &AMDGPU::SGPR_256RegClass);
186 addRegisterClass(VT: MVT::v16f16, RC: &AMDGPU::SGPR_256RegClass);
187 addRegisterClass(VT: MVT::v16bf16, RC: &AMDGPU::SGPR_256RegClass);
188 addRegisterClass(VT: MVT::v32i16, RC: &AMDGPU::SGPR_512RegClass);
189 addRegisterClass(VT: MVT::v32f16, RC: &AMDGPU::SGPR_512RegClass);
190 addRegisterClass(VT: MVT::v32bf16, RC: &AMDGPU::SGPR_512RegClass);
191 }
192
193 addRegisterClass(VT: MVT::v32i32, RC: &AMDGPU::VReg_1024RegClass);
194 addRegisterClass(VT: MVT::v32f32,
195 RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 1024));
196
197 computeRegisterProperties(TRI: Subtarget->getRegisterInfo());
198
199 // The boolean content concept here is too inflexible. Compares only ever
200 // really produce a 1-bit result. Any copy/extend from these will turn into a
201 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
202 // it's what most targets use.
203 setBooleanContents(ZeroOrOneBooleanContent);
204 setBooleanVectorContents(ZeroOrOneBooleanContent);
205
206 // We need to custom lower vector stores from local memory
207 setOperationAction(Ops: ISD::LOAD,
208 VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
209 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
210 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
211 MVT::i1, MVT::v32i32},
212 Action: Custom);
213
214 setOperationAction(Ops: ISD::STORE,
215 VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
216 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
217 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
218 MVT::i1, MVT::v32i32},
219 Action: Custom);
220
221 if (isTypeLegal(VT: MVT::bf16)) {
222 for (unsigned Opc :
223 {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
224 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
225 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
226 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
227 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
228 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
229 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
230 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
231 ISD::SETCC}) {
232 setOperationAction(Op: Opc, VT: MVT::bf16, Action: Promote);
233 }
234
235 setOperationAction(Op: ISD::FP_ROUND, VT: MVT::bf16, Action: Expand);
236
237 setOperationAction(Op: ISD::SELECT, VT: MVT::bf16, Action: Promote);
238 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::bf16, DestVT: MVT::i16);
239
240 setOperationAction(Op: ISD::FABS, VT: MVT::bf16, Action: Legal);
241 setOperationAction(Op: ISD::FNEG, VT: MVT::bf16, Action: Legal);
242 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::bf16, Action: Legal);
243
244 // We only need to custom lower because we can't specify an action for bf16
245 // sources.
246 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i32, Action: Custom);
247 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i32, Action: Custom);
248 }
249
250 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i16, Action: Expand);
251 setTruncStoreAction(ValVT: MVT::v3i32, MemVT: MVT::v3i16, Action: Expand);
252 setTruncStoreAction(ValVT: MVT::v4i32, MemVT: MVT::v4i16, Action: Expand);
253 setTruncStoreAction(ValVT: MVT::v8i32, MemVT: MVT::v8i16, Action: Expand);
254 setTruncStoreAction(ValVT: MVT::v16i32, MemVT: MVT::v16i16, Action: Expand);
255 setTruncStoreAction(ValVT: MVT::v32i32, MemVT: MVT::v32i16, Action: Expand);
256 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i8, Action: Expand);
257 setTruncStoreAction(ValVT: MVT::v4i32, MemVT: MVT::v4i8, Action: Expand);
258 setTruncStoreAction(ValVT: MVT::v8i32, MemVT: MVT::v8i8, Action: Expand);
259 setTruncStoreAction(ValVT: MVT::v16i32, MemVT: MVT::v16i8, Action: Expand);
260 setTruncStoreAction(ValVT: MVT::v32i32, MemVT: MVT::v32i8, Action: Expand);
261 setTruncStoreAction(ValVT: MVT::v2i16, MemVT: MVT::v2i8, Action: Expand);
262 setTruncStoreAction(ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Expand);
263 setTruncStoreAction(ValVT: MVT::v8i16, MemVT: MVT::v8i8, Action: Expand);
264 setTruncStoreAction(ValVT: MVT::v16i16, MemVT: MVT::v16i8, Action: Expand);
265 setTruncStoreAction(ValVT: MVT::v32i16, MemVT: MVT::v32i8, Action: Expand);
266
267 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i16, Action: Expand);
268 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i32, Action: Expand);
269 setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i8, Action: Expand);
270 setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i8, Action: Expand);
271 setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i16, Action: Expand);
272 setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i32, Action: Expand);
273 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i32, Action: Expand);
274
275 setOperationAction(Ops: ISD::GlobalAddress, VTs: {MVT::i32, MVT::i64}, Action: Custom);
276 setOperationAction(Ops: ISD::ExternalSymbol, VTs: {MVT::i32, MVT::i64}, Action: Custom);
277
278 setOperationAction(Op: ISD::SELECT, VT: MVT::i1, Action: Promote);
279 setOperationAction(Op: ISD::SELECT, VT: MVT::i64, Action: Custom);
280 setOperationAction(Op: ISD::SELECT, VT: MVT::f64, Action: Promote);
281 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::f64, DestVT: MVT::i64);
282
283 setOperationAction(Ops: ISD::FSQRT, VTs: {MVT::f32, MVT::f64}, Action: Custom);
284
285 setOperationAction(Ops: ISD::SELECT_CC,
286 VTs: {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Action: Expand);
287
288 setOperationAction(Op: ISD::SETCC, VT: MVT::i1, Action: Promote);
289 setOperationAction(Ops: ISD::SETCC, VTs: {MVT::v2i1, MVT::v4i1}, Action: Expand);
290 AddPromotedToType(Opc: ISD::SETCC, OrigVT: MVT::i1, DestVT: MVT::i32);
291
292 setOperationAction(Ops: ISD::TRUNCATE,
293 VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
294 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
295 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
296 Action: Expand);
297 setOperationAction(Ops: ISD::FP_ROUND,
298 VTs: {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
299 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
300 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
301 Action: Expand);
302
303 setOperationAction(Ops: ISD::SIGN_EXTEND_INREG,
304 VTs: {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
305 MVT::v3i16, MVT::v4i16, MVT::Other},
306 Action: Custom);
307
308 setOperationAction(Op: ISD::BRCOND, VT: MVT::Other, Action: Custom);
309 setOperationAction(Ops: ISD::BR_CC,
310 VTs: {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Action: Expand);
311
312 setOperationAction(Ops: {ISD::ABS, ISD::UADDO, ISD::USUBO}, VT: MVT::i32, Action: Legal);
313
314 setOperationAction(Ops: {ISD::UADDO_CARRY, ISD::USUBO_CARRY}, VT: MVT::i32, Action: Legal);
315
316 setOperationAction(Ops: {ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, VT: MVT::i64,
317 Action: Expand);
318
319#if 0
320 setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal);
321#endif
322
323 // We only support LOAD/STORE and vector manipulation ops for vectors
324 // with > 4 elements.
325 for (MVT VT :
326 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
327 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
328 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
329 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
330 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
331 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
332 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
333 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
334 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
335 switch (Op) {
336 case ISD::LOAD:
337 case ISD::STORE:
338 case ISD::BUILD_VECTOR:
339 case ISD::BITCAST:
340 case ISD::UNDEF:
341 case ISD::EXTRACT_VECTOR_ELT:
342 case ISD::INSERT_VECTOR_ELT:
343 case ISD::SCALAR_TO_VECTOR:
344 case ISD::IS_FPCLASS:
345 break;
346 case ISD::EXTRACT_SUBVECTOR:
347 case ISD::INSERT_SUBVECTOR:
348 case ISD::CONCAT_VECTORS:
349 setOperationAction(Op, VT, Action: Custom);
350 break;
351 default:
352 setOperationAction(Op, VT, Action: Expand);
353 break;
354 }
355 }
356 }
357
358 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::v4f32, Action: Expand);
359
360 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
361 // is expanded to avoid having two separate loops in case the index is a VGPR.
362
363 // Most operations are naturally 32-bit vector operations. We only support
364 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
365 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
366 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
367 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v4i32);
368
369 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
370 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v4i32);
371
372 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
373 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v4i32);
374
375 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
376 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v4i32);
377 }
378
379 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
380 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
381 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v6i32);
382
383 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
384 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v6i32);
385
386 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
387 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v6i32);
388
389 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
390 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v6i32);
391 }
392
393 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
394 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
395 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v8i32);
396
397 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
398 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v8i32);
399
400 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
401 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v8i32);
402
403 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
404 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v8i32);
405 }
406
407 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
408 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
409 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v16i32);
410
411 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
412 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v16i32);
413
414 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
415 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v16i32);
416
417 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
418 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v16i32);
419 }
420
421 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
422 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
423 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v32i32);
424
425 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
426 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v32i32);
427
428 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
429 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v32i32);
430
431 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
432 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v32i32);
433 }
434
435 setOperationAction(Ops: ISD::VECTOR_SHUFFLE,
436 VTs: {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
437 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
438 Action: Custom);
439
440 if (Subtarget->hasPkMovB32()) {
441 // TODO: 16-bit element vectors should be legal with even aligned elements.
442 // TODO: Can be legal with wider source types than the result with
443 // subregister extracts.
444 setOperationAction(Ops: ISD::VECTOR_SHUFFLE, VTs: {MVT::v2i32, MVT::v2f32}, Action: Legal);
445 }
446
447 setOperationAction(Ops: {ISD::AND, ISD::OR, ISD::XOR}, VT: MVT::v2i32, Action: Legal);
448 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
449 // instead lower to cndmask in SITargetLowering::LowerSELECT().
450 setOperationAction(Op: ISD::SELECT, VT: MVT::v2i32, Action: Custom);
451 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
452 // alignbit.
453 setOperationAction(Op: ISD::ROTR, VT: MVT::v2i32, Action: Custom);
454
455 setOperationAction(Ops: ISD::BUILD_VECTOR, VTs: {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
456 Action: Custom);
457
458 // Avoid stack access for these.
459 // TODO: Generalize to more vector types.
460 setOperationAction(Ops: {ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
461 VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
462 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
463 Action: Custom);
464
465 // Deal with vec3 vector operations when widened to vec4.
466 setOperationAction(Ops: ISD::INSERT_SUBVECTOR,
467 VTs: {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Action: Custom);
468
469 // Deal with vec5/6/7 vector operations when widened to vec8.
470 setOperationAction(Ops: ISD::INSERT_SUBVECTOR,
471 VTs: {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
472 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
473 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
474 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
475 Action: Custom);
476
477 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
478 // and output demarshalling
479 setOperationAction(Ops: ISD::ATOMIC_CMP_SWAP, VTs: {MVT::i32, MVT::i64}, Action: Custom);
480
481 // We can't return success/failure, only the old value,
482 // let LLVM add the comparison
483 setOperationAction(Ops: ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VTs: {MVT::i32, MVT::i64},
484 Action: Expand);
485
486 setOperationAction(Ops: ISD::ADDRSPACECAST, VTs: {MVT::i32, MVT::i64}, Action: Custom);
487
488 setOperationAction(Ops: ISD::BITREVERSE, VTs: {MVT::i32, MVT::i64}, Action: Legal);
489
490 // FIXME: This should be narrowed to i32, but that only happens if i64 is
491 // illegal.
492 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
493 setOperationAction(Ops: ISD::BSWAP, VTs: {MVT::i64, MVT::i32}, Action: Legal);
494
495 // On SI this is s_memtime and s_memrealtime on VI.
496 setOperationAction(Op: ISD::READCYCLECOUNTER, VT: MVT::i64, Action: Legal);
497
498 if (Subtarget->hasSMemRealTime() ||
499 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
500 setOperationAction(Op: ISD::READSTEADYCOUNTER, VT: MVT::i64, Action: Legal);
501 setOperationAction(Ops: {ISD::TRAP, ISD::DEBUGTRAP}, VT: MVT::Other, Action: Custom);
502
503 if (Subtarget->has16BitInsts()) {
504 setOperationAction(Ops: {ISD::FPOW, ISD::FPOWI}, VT: MVT::f16, Action: Promote);
505 setOperationAction(Ops: {ISD::FLOG, ISD::FEXP, ISD::FLOG10}, VT: MVT::f16, Action: Custom);
506 setOperationAction(Ops: ISD::IS_FPCLASS, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Legal);
507 setOperationAction(Ops: {ISD::FLOG2, ISD::FEXP2}, VT: MVT::f16, Action: Legal);
508 setOperationAction(Op: ISD::FCANONICALIZE, VT: MVT::f16, Action: Legal);
509 } else {
510 setOperationAction(Op: ISD::FSQRT, VT: MVT::f16, Action: Custom);
511 }
512
513 if (Subtarget->hasMadMacF32Insts())
514 setOperationAction(Op: ISD::FMAD, VT: MVT::f32, Action: Legal);
515
516 setOperationAction(Ops: {ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT: MVT::i32, Action: Custom);
517 setOperationAction(Ops: {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, VT: MVT::i32, Action: Custom);
518
519 // We only really have 32-bit BFE instructions (and 16-bit on VI).
520 //
521 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
522 // effort to match them now. We want this to be false for i64 cases when the
523 // extraction isn't restricted to the upper or lower half. Ideally we would
524 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
525 // span the midpoint are probably relatively rare, so don't worry about them
526 // for now.
527 setHasExtractBitsInsn(true);
528
529 // Clamp modifier on add/sub
530 if (Subtarget->hasIntClamp())
531 setOperationAction(Ops: {ISD::UADDSAT, ISD::USUBSAT}, VT: MVT::i32, Action: Legal);
532
533 if (Subtarget->hasAddNoCarryInsts())
534 setOperationAction(Ops: {ISD::SADDSAT, ISD::SSUBSAT}, VTs: {MVT::i16, MVT::i32},
535 Action: Legal);
536
537 setOperationAction(
538 Ops: {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
539 VTs: {MVT::f32, MVT::f64}, Action: Custom);
540
541 // These are really only legal for ieee_mode functions. We should be avoiding
542 // them for functions that don't have ieee_mode enabled, so just say they are
543 // legal.
544 setOperationAction(Ops: {ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
545 VTs: {MVT::f32, MVT::f64}, Action: Legal);
546
547 if (Subtarget->haveRoundOpsF64())
548 setOperationAction(Ops: {ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, VT: MVT::f64,
549 Action: Legal);
550 else
551 setOperationAction(Ops: {ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
552 VT: MVT::f64, Action: Custom);
553
554 setOperationAction(Op: ISD::FFLOOR, VT: MVT::f64, Action: Legal);
555 setOperationAction(Ops: {ISD::FLDEXP, ISD::STRICT_FLDEXP}, VTs: {MVT::f32, MVT::f64},
556 Action: Legal);
557 setOperationAction(Ops: ISD::FFREXP, VTs: {MVT::f32, MVT::f64}, Action: Custom);
558
559 setOperationAction(Ops: {ISD::FSIN, ISD::FCOS, ISD::FDIV}, VT: MVT::f32, Action: Custom);
560 setOperationAction(Op: ISD::FDIV, VT: MVT::f64, Action: Custom);
561
562 setOperationAction(Ops: ISD::BF16_TO_FP, VTs: {MVT::i16, MVT::f32, MVT::f64}, Action: Expand);
563 setOperationAction(Ops: ISD::FP_TO_BF16, VTs: {MVT::i16, MVT::f32, MVT::f64}, Action: Expand);
564
565 setOperationAction(Ops: {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT: MVT::i32,
566 Action: Custom);
567 setOperationAction(Ops: {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT: MVT::i16,
568 Action: Custom);
569 setOperationAction(Ops: {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT: MVT::i1,
570 Action: Custom);
571
572 // Custom lower these because we can't specify a rule based on an illegal
573 // source bf16.
574 setOperationAction(Ops: {ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, VT: MVT::f32, Action: Custom);
575 setOperationAction(Ops: {ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, VT: MVT::f64, Action: Custom);
576
577 if (Subtarget->has16BitInsts()) {
578 setOperationAction(Ops: {ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
579 ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
580 VT: MVT::i16, Action: Legal);
581
582 AddPromotedToType(Opc: ISD::SIGN_EXTEND, OrigVT: MVT::i16, DestVT: MVT::i32);
583
584 setOperationAction(Ops: {ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC},
585 VT: MVT::i16, Action: Expand);
586
587 setOperationAction(Ops: {ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
588 ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
589 ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
590 ISD::CTPOP},
591 VT: MVT::i16, Action: Promote);
592
593 setOperationAction(Op: ISD::LOAD, VT: MVT::i16, Action: Custom);
594
595 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i16, Action: Expand);
596
597 setOperationAction(Op: ISD::FP16_TO_FP, VT: MVT::i16, Action: Promote);
598 AddPromotedToType(Opc: ISD::FP16_TO_FP, OrigVT: MVT::i16, DestVT: MVT::i32);
599 setOperationAction(Op: ISD::FP_TO_FP16, VT: MVT::i16, Action: Promote);
600 AddPromotedToType(Opc: ISD::FP_TO_FP16, OrigVT: MVT::i16, DestVT: MVT::i32);
601
602 setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT: MVT::i16, Action: Custom);
603 setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i16, Action: Custom);
604 setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i1, Action: Custom);
605
606 setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i32, Action: Custom);
607
608 // F16 - Constant Actions.
609 setOperationAction(Op: ISD::ConstantFP, VT: MVT::f16, Action: Legal);
610 setOperationAction(Op: ISD::ConstantFP, VT: MVT::bf16, Action: Legal);
611
612 // F16 - Load/Store Actions.
613 setOperationAction(Op: ISD::LOAD, VT: MVT::f16, Action: Promote);
614 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f16, DestVT: MVT::i16);
615 setOperationAction(Op: ISD::STORE, VT: MVT::f16, Action: Promote);
616 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f16, DestVT: MVT::i16);
617
618 // BF16 - Load/Store Actions.
619 setOperationAction(Op: ISD::LOAD, VT: MVT::bf16, Action: Promote);
620 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::bf16, DestVT: MVT::i16);
621 setOperationAction(Op: ISD::STORE, VT: MVT::bf16, Action: Promote);
622 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::bf16, DestVT: MVT::i16);
623
624 // F16 - VOP1 Actions.
625 setOperationAction(Ops: {ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
626 ISD::FSIN, ISD::FROUND},
627 VT: MVT::f16, Action: Custom);
628
629 // BF16 - VOP1 Actions.
630 if (Subtarget->hasBF16TransInsts())
631 setOperationAction(Ops: {ISD::FCOS, ISD::FSIN, ISD::FDIV}, VT: MVT::bf16, Action: Custom);
632
633 setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
634 ISD::FP_TO_UINT_SAT},
635 VT: MVT::f16, Action: Promote);
636 setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
637 ISD::FP_TO_UINT_SAT},
638 VT: MVT::bf16, Action: Promote);
639
640 // F16 - VOP2 Actions.
641 setOperationAction(Ops: {ISD::BR_CC, ISD::SELECT_CC}, VTs: {MVT::f16, MVT::bf16},
642 Action: Expand);
643 setOperationAction(Ops: {ISD::FLDEXP, ISD::STRICT_FLDEXP}, VT: MVT::f16, Action: Custom);
644 setOperationAction(Op: ISD::FFREXP, VT: MVT::f16, Action: Custom);
645 setOperationAction(Op: ISD::FDIV, VT: MVT::f16, Action: Custom);
646
647 // F16 - VOP3 Actions.
648 setOperationAction(Op: ISD::FMA, VT: MVT::f16, Action: Legal);
649 if (STI.hasMadF16())
650 setOperationAction(Op: ISD::FMAD, VT: MVT::f16, Action: Legal);
651
652 for (MVT VT :
653 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
654 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
655 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
656 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
657 switch (Op) {
658 case ISD::LOAD:
659 case ISD::STORE:
660 case ISD::BUILD_VECTOR:
661 case ISD::BITCAST:
662 case ISD::UNDEF:
663 case ISD::EXTRACT_VECTOR_ELT:
664 case ISD::INSERT_VECTOR_ELT:
665 case ISD::INSERT_SUBVECTOR:
666 case ISD::SCALAR_TO_VECTOR:
667 case ISD::IS_FPCLASS:
668 break;
669 case ISD::EXTRACT_SUBVECTOR:
670 case ISD::CONCAT_VECTORS:
671 case ISD::FSIN:
672 case ISD::FCOS:
673 setOperationAction(Op, VT, Action: Custom);
674 break;
675 default:
676 setOperationAction(Op, VT, Action: Expand);
677 break;
678 }
679 }
680 }
681
682 // v_perm_b32 can handle either of these.
683 setOperationAction(Ops: ISD::BSWAP, VTs: {MVT::i16, MVT::v2i16}, Action: Legal);
684 setOperationAction(Op: ISD::BSWAP, VT: MVT::v4i16, Action: Custom);
685
686 // XXX - Do these do anything? Vector constants turn into build_vector.
687 setOperationAction(Ops: ISD::Constant, VTs: {MVT::v2i16, MVT::v2f16}, Action: Legal);
688
689 setOperationAction(Ops: ISD::UNDEF, VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
690 Action: Legal);
691
692 setOperationAction(Op: ISD::STORE, VT: MVT::v2i16, Action: Promote);
693 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2i16, DestVT: MVT::i32);
694 setOperationAction(Op: ISD::STORE, VT: MVT::v2f16, Action: Promote);
695 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f16, DestVT: MVT::i32);
696
697 setOperationAction(Op: ISD::LOAD, VT: MVT::v2i16, Action: Promote);
698 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2i16, DestVT: MVT::i32);
699 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f16, Action: Promote);
700 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f16, DestVT: MVT::i32);
701
702 setOperationAction(Op: ISD::AND, VT: MVT::v2i16, Action: Promote);
703 AddPromotedToType(Opc: ISD::AND, OrigVT: MVT::v2i16, DestVT: MVT::i32);
704 setOperationAction(Op: ISD::OR, VT: MVT::v2i16, Action: Promote);
705 AddPromotedToType(Opc: ISD::OR, OrigVT: MVT::v2i16, DestVT: MVT::i32);
706 setOperationAction(Op: ISD::XOR, VT: MVT::v2i16, Action: Promote);
707 AddPromotedToType(Opc: ISD::XOR, OrigVT: MVT::v2i16, DestVT: MVT::i32);
708
709 setOperationAction(Op: ISD::LOAD, VT: MVT::v4i16, Action: Promote);
710 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
711 setOperationAction(Op: ISD::LOAD, VT: MVT::v4f16, Action: Promote);
712 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
713 setOperationAction(Op: ISD::LOAD, VT: MVT::v4bf16, Action: Promote);
714 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4bf16, DestVT: MVT::v2i32);
715
716 setOperationAction(Op: ISD::STORE, VT: MVT::v4i16, Action: Promote);
717 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
718 setOperationAction(Op: ISD::STORE, VT: MVT::v4f16, Action: Promote);
719 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
720 setOperationAction(Op: ISD::STORE, VT: MVT::v4bf16, Action: Promote);
721 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4bf16, DestVT: MVT::v2i32);
722
723 setOperationAction(Op: ISD::LOAD, VT: MVT::v8i16, Action: Promote);
724 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8i16, DestVT: MVT::v4i32);
725 setOperationAction(Op: ISD::LOAD, VT: MVT::v8f16, Action: Promote);
726 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f16, DestVT: MVT::v4i32);
727 setOperationAction(Op: ISD::LOAD, VT: MVT::v8bf16, Action: Promote);
728 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8bf16, DestVT: MVT::v4i32);
729
730 setOperationAction(Op: ISD::STORE, VT: MVT::v4i16, Action: Promote);
731 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
732 setOperationAction(Op: ISD::STORE, VT: MVT::v4f16, Action: Promote);
733 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
734
735 setOperationAction(Op: ISD::STORE, VT: MVT::v8i16, Action: Promote);
736 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8i16, DestVT: MVT::v4i32);
737 setOperationAction(Op: ISD::STORE, VT: MVT::v8f16, Action: Promote);
738 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f16, DestVT: MVT::v4i32);
739 setOperationAction(Op: ISD::STORE, VT: MVT::v8bf16, Action: Promote);
740 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8bf16, DestVT: MVT::v4i32);
741
742 setOperationAction(Op: ISD::LOAD, VT: MVT::v16i16, Action: Promote);
743 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16i16, DestVT: MVT::v8i32);
744 setOperationAction(Op: ISD::LOAD, VT: MVT::v16f16, Action: Promote);
745 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f16, DestVT: MVT::v8i32);
746 setOperationAction(Op: ISD::LOAD, VT: MVT::v16bf16, Action: Promote);
747 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16bf16, DestVT: MVT::v8i32);
748
749 setOperationAction(Op: ISD::STORE, VT: MVT::v16i16, Action: Promote);
750 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16i16, DestVT: MVT::v8i32);
751 setOperationAction(Op: ISD::STORE, VT: MVT::v16f16, Action: Promote);
752 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f16, DestVT: MVT::v8i32);
753 setOperationAction(Op: ISD::STORE, VT: MVT::v16bf16, Action: Promote);
754 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16bf16, DestVT: MVT::v8i32);
755
756 setOperationAction(Op: ISD::LOAD, VT: MVT::v32i16, Action: Promote);
757 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32i16, DestVT: MVT::v16i32);
758 setOperationAction(Op: ISD::LOAD, VT: MVT::v32f16, Action: Promote);
759 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32f16, DestVT: MVT::v16i32);
760 setOperationAction(Op: ISD::LOAD, VT: MVT::v32bf16, Action: Promote);
761 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32bf16, DestVT: MVT::v16i32);
762
763 setOperationAction(Op: ISD::STORE, VT: MVT::v32i16, Action: Promote);
764 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32i16, DestVT: MVT::v16i32);
765 setOperationAction(Op: ISD::STORE, VT: MVT::v32f16, Action: Promote);
766 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32f16, DestVT: MVT::v16i32);
767 setOperationAction(Op: ISD::STORE, VT: MVT::v32bf16, Action: Promote);
768 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32bf16, DestVT: MVT::v16i32);
769
770 setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
771 VT: MVT::v2i32, Action: Expand);
772 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::v2f32, Action: Expand);
773
774 setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
775 VT: MVT::v4i32, Action: Expand);
776
777 setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
778 VT: MVT::v8i32, Action: Expand);
779
780 setOperationAction(Ops: ISD::BUILD_VECTOR, VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
781 Action: Subtarget->hasVOP3PInsts() ? Legal : Custom);
782
783 setOperationAction(Ops: ISD::FNEG, VTs: {MVT::v2f16, MVT::v2bf16}, Action: Legal);
784 // This isn't really legal, but this avoids the legalizer unrolling it (and
785 // allows matching fneg (fabs x) patterns)
786 setOperationAction(Ops: ISD::FABS, VTs: {MVT::v2f16, MVT::v2bf16}, Action: Legal);
787
788 // Can do this in one BFI plus a constant materialize.
789 setOperationAction(Ops: ISD::FCOPYSIGN,
790 VTs: {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
791 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
792 MVT::v32f16, MVT::v32bf16},
793 Action: Custom);
794
795 setOperationAction(
796 Ops: {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
797 VT: MVT::f16, Action: Custom);
798 setOperationAction(Ops: {ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, VT: MVT::f16, Action: Legal);
799
800 setOperationAction(Ops: {ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
801 ISD::FMAXIMUMNUM},
802 VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
803 Action: Custom);
804
805 setOperationAction(Ops: {ISD::FMINNUM, ISD::FMAXNUM},
806 VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
807 Action: Expand);
808
809 for (MVT Vec16 :
810 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
811 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
812 setOperationAction(
813 Ops: {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
814 VT: Vec16, Action: Custom);
815 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec16, Action: Expand);
816 }
817 }
818
819 if (Subtarget->hasVOP3PInsts()) {
820 setOperationAction(Ops: {ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
821 ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
822 ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
823 VT: MVT::v2i16, Action: Legal);
824
825 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
826 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
827 VT: MVT::v2f16, Action: Legal);
828
829 setOperationAction(Ops: ISD::EXTRACT_VECTOR_ELT,
830 VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Action: Custom);
831
832 setOperationAction(Ops: ISD::VECTOR_SHUFFLE,
833 VTs: {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
834 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
835 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
836 Action: Custom);
837
838 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
839 // Split vector operations.
840 setOperationAction(Ops: {ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
841 ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN,
842 ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
843 ISD::SSUBSAT},
844 VT, Action: Custom);
845
846 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
847 // Split vector operations.
848 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
849 VT, Action: Custom);
850
851 setOperationAction(
852 Ops: {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
853 VTs: {MVT::v2f16, MVT::v4f16}, Action: Custom);
854
855 setOperationAction(Op: ISD::FEXP, VT: MVT::v2f16, Action: Custom);
856 setOperationAction(Ops: ISD::SELECT, VTs: {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
857 Action: Custom);
858
859 if (Subtarget->hasBF16PackedInsts()) {
860 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
861 // Split vector operations.
862 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
863 VT, Action: Custom);
864 }
865
866 if (Subtarget->hasPackedFP32Ops()) {
867 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
868 VT: MVT::v2f32, Action: Legal);
869 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA},
870 VTs: {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
871 Action: Custom);
872 }
873 }
874
875 setOperationAction(Ops: {ISD::FNEG, ISD::FABS}, VT: MVT::v4f16, Action: Custom);
876
877 if (Subtarget->has16BitInsts()) {
878 setOperationAction(Op: ISD::SELECT, VT: MVT::v2i16, Action: Promote);
879 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2i16, DestVT: MVT::i32);
880 setOperationAction(Op: ISD::SELECT, VT: MVT::v2f16, Action: Promote);
881 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2f16, DestVT: MVT::i32);
882 } else {
883 // Legalization hack.
884 setOperationAction(Ops: ISD::SELECT, VTs: {MVT::v2i16, MVT::v2f16}, Action: Custom);
885
886 setOperationAction(Ops: {ISD::FNEG, ISD::FABS}, VT: MVT::v2f16, Action: Custom);
887 }
888
889 setOperationAction(Ops: ISD::SELECT,
890 VTs: {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
891 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
892 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
893 MVT::v32f16, MVT::v32bf16},
894 Action: Custom);
895
896 setOperationAction(Ops: {ISD::SMULO, ISD::UMULO}, VT: MVT::i64, Action: Custom);
897
898 if (Subtarget->hasVectorMulU64())
899 setOperationAction(Op: ISD::MUL, VT: MVT::i64, Action: Legal);
900 else if (Subtarget->hasScalarSMulU64())
901 setOperationAction(Op: ISD::MUL, VT: MVT::i64, Action: Custom);
902
903 if (Subtarget->hasMad64_32())
904 setOperationAction(Ops: {ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT: MVT::i32, Action: Custom);
905
906 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
907 setOperationAction(Op: ISD::PREFETCH, VT: MVT::Other, Action: Custom);
908
909 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
910 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM},
911 VTs: {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Action: Legal);
912 } else {
913 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
914 if (Subtarget->hasMinimum3Maximum3F32())
915 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM}, VT: MVT::f32, Action: Legal);
916
917 if (Subtarget->hasMinimum3Maximum3PKF16()) {
918 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM}, VT: MVT::v2f16, Action: Legal);
919
920 // If only the vector form is available, we need to widen to a vector.
921 if (!Subtarget->hasMinimum3Maximum3F16())
922 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM}, VT: MVT::f16, Action: Custom);
923 }
924 }
925
926 if (Subtarget->hasVOP3PInsts()) {
927 // We want to break these into v2f16 pieces, not scalarize.
928 setOperationAction(Ops: {ISD::FMINIMUM, ISD::FMAXIMUM},
929 VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
930 Action: Custom);
931 }
932
933 if (Subtarget->hasIntMinMax64())
934 setOperationAction(Ops: {ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, VT: MVT::i64,
935 Action: Legal);
936
937 setOperationAction(Ops: ISD::INTRINSIC_WO_CHAIN,
938 VTs: {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
939 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
940 MVT::i8},
941 Action: Custom);
942
943 setOperationAction(Ops: ISD::INTRINSIC_W_CHAIN,
944 VTs: {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
945 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
946 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
947 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
948 Action: Custom);
949
950 setOperationAction(Ops: ISD::INTRINSIC_VOID,
951 VTs: {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
952 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
953 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
954 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
955 Action: Custom);
956
957 setOperationAction(Op: ISD::STACKSAVE, VT: MVT::Other, Action: Custom);
958 setOperationAction(Op: ISD::GET_ROUNDING, VT: MVT::i32, Action: Custom);
959 setOperationAction(Op: ISD::SET_ROUNDING, VT: MVT::Other, Action: Custom);
960 setOperationAction(Op: ISD::GET_FPENV, VT: MVT::i64, Action: Custom);
961 setOperationAction(Op: ISD::SET_FPENV, VT: MVT::i64, Action: Custom);
962
963 // TODO: Could move this to custom lowering, could benefit from combines on
964 // extract of relevant bits.
965 setOperationAction(Op: ISD::GET_FPMODE, VT: MVT::i32, Action: Legal);
966
967 setOperationAction(Op: ISD::MUL, VT: MVT::i1, Action: Promote);
968
969 if (Subtarget->hasBF16ConversionInsts()) {
970 setOperationAction(Ops: ISD::FP_ROUND, VTs: {MVT::bf16, MVT::v2bf16}, Action: Custom);
971 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v2bf16, Action: Legal);
972 }
973
974 if (Subtarget->hasBF16PackedInsts()) {
975 setOperationAction(
976 Ops: {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
977 VT: MVT::v2bf16, Action: Legal);
978 }
979
980 if (Subtarget->hasBF16TransInsts()) {
981 setOperationAction(Ops: {ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, VT: MVT::bf16, Action: Legal);
982 }
983
984 if (Subtarget->hasCvtPkF16F32Inst()) {
985 setOperationAction(Ops: ISD::FP_ROUND,
986 VTs: {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
987 Action: Custom);
988 }
989
990 setTargetDAGCombine({ISD::ADD,
991 ISD::PTRADD,
992 ISD::UADDO_CARRY,
993 ISD::SUB,
994 ISD::USUBO_CARRY,
995 ISD::MUL,
996 ISD::FADD,
997 ISD::FSUB,
998 ISD::FDIV,
999 ISD::FMUL,
1000 ISD::FMINNUM,
1001 ISD::FMAXNUM,
1002 ISD::FMINNUM_IEEE,
1003 ISD::FMAXNUM_IEEE,
1004 ISD::FMINIMUM,
1005 ISD::FMAXIMUM,
1006 ISD::FMINIMUMNUM,
1007 ISD::FMAXIMUMNUM,
1008 ISD::FMA,
1009 ISD::SMIN,
1010 ISD::SMAX,
1011 ISD::UMIN,
1012 ISD::UMAX,
1013 ISD::SETCC,
1014 ISD::SELECT,
1015 ISD::SMIN,
1016 ISD::SMAX,
1017 ISD::UMIN,
1018 ISD::UMAX,
1019 ISD::AND,
1020 ISD::OR,
1021 ISD::XOR,
1022 ISD::SHL,
1023 ISD::SRL,
1024 ISD::SRA,
1025 ISD::FSHR,
1026 ISD::SINT_TO_FP,
1027 ISD::UINT_TO_FP,
1028 ISD::FCANONICALIZE,
1029 ISD::SCALAR_TO_VECTOR,
1030 ISD::ZERO_EXTEND,
1031 ISD::SIGN_EXTEND_INREG,
1032 ISD::ANY_EXTEND,
1033 ISD::EXTRACT_VECTOR_ELT,
1034 ISD::INSERT_VECTOR_ELT,
1035 ISD::FCOPYSIGN});
1036
1037 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1038 setTargetDAGCombine(ISD::FP_ROUND);
1039
1040 // All memory operations. Some folding on the pointer operand is done to help
1041 // matching the constant offsets in the addressing modes.
1042 setTargetDAGCombine({ISD::LOAD,
1043 ISD::STORE,
1044 ISD::ATOMIC_LOAD,
1045 ISD::ATOMIC_STORE,
1046 ISD::ATOMIC_CMP_SWAP,
1047 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1048 ISD::ATOMIC_SWAP,
1049 ISD::ATOMIC_LOAD_ADD,
1050 ISD::ATOMIC_LOAD_SUB,
1051 ISD::ATOMIC_LOAD_AND,
1052 ISD::ATOMIC_LOAD_OR,
1053 ISD::ATOMIC_LOAD_XOR,
1054 ISD::ATOMIC_LOAD_NAND,
1055 ISD::ATOMIC_LOAD_MIN,
1056 ISD::ATOMIC_LOAD_MAX,
1057 ISD::ATOMIC_LOAD_UMIN,
1058 ISD::ATOMIC_LOAD_UMAX,
1059 ISD::ATOMIC_LOAD_FADD,
1060 ISD::ATOMIC_LOAD_FMIN,
1061 ISD::ATOMIC_LOAD_FMAX,
1062 ISD::ATOMIC_LOAD_UINC_WRAP,
1063 ISD::ATOMIC_LOAD_UDEC_WRAP,
1064 ISD::ATOMIC_LOAD_USUB_COND,
1065 ISD::ATOMIC_LOAD_USUB_SAT,
1066 ISD::INTRINSIC_VOID,
1067 ISD::INTRINSIC_W_CHAIN});
1068
1069 // FIXME: In other contexts we pretend this is a per-function property.
1070 setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
1071
1072 setSchedulingPreference(Sched::RegPressure);
1073}
1074
1075const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1076
1077ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
1078 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1079 return RCRegs;
1080}
1081
1082//===----------------------------------------------------------------------===//
1083// TargetLowering queries
1084//===----------------------------------------------------------------------===//
1085
// v_mad_mix* / v_fma_mix* instructions fold an f16 (or bf16) -> f32
// conversion into the multiply-add, so such an fpext is free when the
// destination is f32.
//
// With f32 denormals enabled there is a special case we do not currently
// handle where this would still be OK to use, so for now require the
// flush-all f32 denormal mode.
bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
                                       EVT DestVT, EVT SrcVT) const {
  // Foldable when the destination scalar type is f32 and either:
  //  * the source is f16 and the subtarget has mad-mix (for FMAD) or
  //    fma-mix (for FMA) instructions, or
  //  * the source is bf16 and the subtarget has the bf16 fma-mix forms.
  return DestVT.getScalarType() == MVT::f32 &&
         ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
            (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
           SrcVT.getScalarType() == MVT::f16) ||
          (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
           SrcVT.getScalarType() == MVT::bf16)) &&
         // TODO: This probably only requires no input flushing?
         denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
}
1101
// GlobalISel variant of isFPExtFoldable. Unlike the SelectionDAG overload,
// this only handles 16-bit sources (no bf16 check here).
bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
                                       LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         DestTy.getScalarSizeInBits() == 32 &&
         SrcTy.getScalarSizeInBits() == 16 &&
         // TODO: This probably only requires no input flushing?
         denormalModeIsFlushAllF32(MF: *MI.getMF());
}
1111
1112bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
1113 // SI has some legal vector types, but no legal vector operations. Say no
1114 // shuffles are legal in order to prefer scalarizing some vector operations.
1115 return false;
1116}
1117
// Pick the register type used to pass a value of type \p VT for calling
// convention \p CC. Non-kernel conventions pack 16-bit vector elements in
// pairs (when 16-bit instructions exist) and split everything wider than 32
// bits into i32 pieces.
MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                    CallingConv::ID CC,
                                                    EVT VT) const {
  // Kernel arguments use the default (in-memory) lowering.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);

  if (VT.isVector()) {
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    if (Size == 16) {
      // Pairs of 16-bit elements share a 32-bit register when 16-bit
      // instructions are available; otherwise each widens to i32.
      return Subtarget->has16BitInsts()
                 ? MVT::getVectorVT(VT: ScalarVT.getSimpleVT(), NumElements: 2)
                 : MVT::i32;
    }

    // Sub-16-bit elements are promoted to the narrowest supported integer.
    if (Size < 16)
      return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
    // 32-bit elements keep their type; wider elements split into i32s.
    return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
  }

  // Scalar 16-bit values widen to i32 without 16-bit instructions.
  if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16)
    return MVT::i32;

  // Scalars wider than 32 bits are split into i32 pieces.
  if (VT.getSizeInBits() > 32)
    return MVT::i32;

  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
1146
1147unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1148 CallingConv::ID CC,
1149 EVT VT) const {
1150 if (CC == CallingConv::AMDGPU_KERNEL)
1151 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1152
1153 if (VT.isVector()) {
1154 unsigned NumElts = VT.getVectorNumElements();
1155 EVT ScalarVT = VT.getScalarType();
1156 unsigned Size = ScalarVT.getSizeInBits();
1157
1158 // FIXME: Should probably promote 8-bit vectors to i16.
1159 if (Size == 16)
1160 return (NumElts + 1) / 2;
1161
1162 if (Size <= 32)
1163 return NumElts;
1164
1165 if (Size > 32)
1166 return NumElts * ((Size + 31) / 32);
1167 } else if (VT.getSizeInBits() > 32)
1168 return (VT.getSizeInBits() + 31) / 32;
1169
1170 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1171}
1172
// Describe how a vector of type \p VT is decomposed into intermediate values
// and registers for calling convention \p CC. Returns the number of
// intermediate values; the branch order below matters because the Size == 16
// case must win over the generic Size <= 32 handling.
unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    EVT ScalarVT = VT.getScalarType();
    unsigned Size = ScalarVT.getSizeInBits();
    // FIXME: We should fix the ABI to be the same on targets without 16-bit
    // support, but unless we can properly handle 3-vectors, it will be still be
    // inconsistent.
    if (Size == 16) {
      // Pack 16-bit elements in pairs; without 16-bit instructions the
      // register holding each pair is a plain i32.
      MVT SimpleIntermediateVT =
          MVT::getVectorVT(VT: ScalarVT.getSimpleVT(), EC: ElementCount::getFixed(MinVal: 2));
      IntermediateVT = SimpleIntermediateVT;
      RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
      NumIntermediates = (NumElts + 1) / 2;
      return (NumElts + 1) / 2;
    }

    // 32-bit elements map one-to-one onto registers of the element type.
    if (Size == 32) {
      RegisterVT = ScalarVT.getSimpleVT();
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    // Sub-16-bit elements promote to i16 when 16-bit instructions exist.
    if (Size < 16 && Subtarget->has16BitInsts()) {
      // FIXME: Should probably form v2i16 pieces
      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    // Remaining narrow elements (no 16-bit instructions, or 17..31 bits)
    // each occupy an i32 register.
    if (Size != 16 && Size <= 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    // Wide elements split into ceil(Size / 32) i32 registers apiece.
    if (Size > 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;
    }
  }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}
1225
1226static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1227 const DataLayout &DL, Type *Ty,
1228 unsigned MaxNumLanes) {
1229 assert(MaxNumLanes != 0);
1230
1231 LLVMContext &Ctx = Ty->getContext();
1232 if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
1233 unsigned NumElts = std::min(a: MaxNumLanes, b: VT->getNumElements());
1234 return EVT::getVectorVT(Context&: Ctx, VT: TLI.getValueType(DL, Ty: VT->getElementType()),
1235 NumElements: NumElts);
1236 }
1237
1238 return TLI.getValueType(DL, Ty);
1239}
1240
1241// Peek through TFE struct returns to only use the data size.
1242static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1243 const DataLayout &DL, Type *Ty,
1244 unsigned MaxNumLanes) {
1245 auto *ST = dyn_cast<StructType>(Val: Ty);
1246 if (!ST)
1247 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1248
1249 // TFE intrinsics return an aggregate type.
1250 assert(ST->getNumContainedTypes() == 2 &&
1251 ST->getContainedType(1)->isIntegerTy(32));
1252 return memVTFromLoadIntrData(TLI, DL, Ty: ST->getContainedType(i: 0), MaxNumLanes);
1253}
1254
1255/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1256/// in-memory representation. This return value is a custom type because there
1257/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1258/// could cause issues during codegen, these address space 7 pointers will be
1259/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1260/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1261/// for cost modeling, to work. (This also sets us up decently for doing the
1262/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1263MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1264 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1265 return MVT::amdgpuBufferFatPointer;
1266 if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1267 DL.getPointerSizeInBits(AS) == 192)
1268 return MVT::amdgpuBufferStridedPointer;
1269 return AMDGPUTargetLowering::getPointerTy(DL, AS);
1270}
1271/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1272/// v8i32 when padding is added.
1273/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1274/// also v8i32 with padding.
1275MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1276 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1277 DL.getPointerSizeInBits(AS) == 160) ||
1278 (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1279 DL.getPointerSizeInBits(AS) == 192))
1280 return MVT::v8i32;
1281 return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1282}
1283
// Return the memory access width in bits for an async LDS, cooperative
// atomic, or monitor load/store intrinsic, derived from its _bN / NxMB
// name suffix. Asserts on any other intrinsic ID.
static unsigned getIntrMemWidth(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
    return 8;
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b32:
    return 32;
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b64:
    return 64;
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b128:
    return 128;
  default:
    llvm_unreachable("Unknown width");
  }
}
1318
1319static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI,
1320 unsigned ArgIdx) {
1321 Value *OrderingArg = CI.getArgOperand(i: ArgIdx);
1322 unsigned Ord = cast<ConstantInt>(Val: OrderingArg)->getZExtValue();
1323 switch (AtomicOrderingCABI(Ord)) {
1324 case AtomicOrderingCABI::acquire:
1325 return AtomicOrdering::Acquire;
1326 break;
1327 case AtomicOrderingCABI::release:
1328 return AtomicOrdering::Release;
1329 break;
1330 case AtomicOrderingCABI::seq_cst:
1331 return AtomicOrdering::SequentiallyConsistent;
1332 break;
1333 default:
1334 return AtomicOrdering::Monotonic;
1335 }
1336}
1337
1338static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx) {
1339 MDNode *ScopeMD = cast<MDNode>(
1340 Val: cast<MetadataAsValue>(Val: CI.getArgOperand(i: ArgIdx))->getMetadata());
1341 StringRef Scope = cast<MDString>(Val: ScopeMD->getOperand(I: 0))->getString();
1342 return CI.getContext().getOrInsertSyncScopeID(SSN: Scope);
1343}
1344
1345void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos,
1346 const CallBase &CI,
1347 MachineFunction &MF,
1348 unsigned IntrID) const {
1349 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
1350 if (CI.hasMetadata(KindID: LLVMContext::MD_invariant_load))
1351 Flags |= MachineMemOperand::MOInvariant;
1352 if (CI.hasMetadata(KindID: LLVMContext::MD_nontemporal))
1353 Flags |= MachineMemOperand::MONonTemporal;
1354 Flags |= getTargetMMOFlags(I: CI);
1355
1356 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1357 AMDGPU::lookupRsrcIntrinsic(Intr: IntrID)) {
1358 AttributeSet Attr =
1359 Intrinsic::getFnAttributes(C&: CI.getContext(), id: (Intrinsic::ID)IntrID);
1360 MemoryEffects ME = Attr.getMemoryEffects();
1361 if (ME.doesNotAccessMemory())
1362 return;
1363
1364 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1365 if (!IsSPrefetch) {
1366 auto *Aux = cast<ConstantInt>(Val: CI.getArgOperand(i: CI.arg_size() - 1));
1367 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1368 Flags |= MachineMemOperand::MOVolatile;
1369 }
1370 Flags |= MachineMemOperand::MODereferenceable;
1371
1372 IntrinsicInfo Info;
1373 // TODO: Should images get their own address space?
1374 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1375
1376 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1377 if (RsrcIntr->IsImage) {
1378 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1379 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID);
1380 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
1381 Info.align.reset();
1382 }
1383
1384 Value *RsrcArg = CI.getArgOperand(i: RsrcIntr->RsrcArg);
1385 if (auto *RsrcPtrTy = dyn_cast<PointerType>(Val: RsrcArg->getType())) {
1386 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1387 // We conservatively set the memory operand of a buffer intrinsic to the
1388 // base resource pointer, so that we can access alias information about
1389 // those pointers. Cases like "this points at the same value
1390 // but with a different offset" are handled in
1391 // areMemAccessesTriviallyDisjoint.
1392 Info.ptrVal = RsrcArg;
1393 }
1394
1395 if (ME.onlyReadsMemory()) {
1396 if (RsrcIntr->IsImage) {
1397 unsigned MaxNumLanes = 4;
1398
1399 if (!BaseOpcode->Gather4) {
1400 // If this isn't a gather, we may have excess loaded elements in the
1401 // IR type. Check the dmask for the real number of elements loaded.
1402 unsigned DMask =
1403 cast<ConstantInt>(Val: CI.getArgOperand(i: 0))->getZExtValue();
1404 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(Value: DMask);
1405 }
1406
1407 Info.memVT = memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(),
1408 Ty: CI.getType(), MaxNumLanes);
1409 } else {
1410 Info.memVT =
1411 memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(), Ty: CI.getType(),
1412 MaxNumLanes: std::numeric_limits<unsigned>::max());
1413 }
1414
1415 // FIXME: What does alignment mean for an image?
1416 Info.opc = ISD::INTRINSIC_W_CHAIN;
1417 Info.flags = Flags | MachineMemOperand::MOLoad;
1418 } else if (ME.onlyWritesMemory()) {
1419 Info.opc = ISD::INTRINSIC_VOID;
1420
1421 Type *DataTy = CI.getArgOperand(i: 0)->getType();
1422 if (RsrcIntr->IsImage) {
1423 unsigned DMask = cast<ConstantInt>(Val: CI.getArgOperand(i: 1))->getZExtValue();
1424 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(Value: DMask);
1425 Info.memVT = memVTFromLoadIntrData(TLI: *this, DL: MF.getDataLayout(), Ty: DataTy,
1426 MaxNumLanes: DMaskLanes);
1427 } else
1428 Info.memVT = getValueType(DL: MF.getDataLayout(), Ty: DataTy);
1429
1430 Info.flags = Flags | MachineMemOperand::MOStore;
1431 } else {
1432 // Atomic, NoReturn Sampler or prefetch
1433 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1434 : ISD::INTRINSIC_W_CHAIN;
1435
1436 switch (IntrID) {
1437 default:
1438 Info.flags = Flags | MachineMemOperand::MOLoad;
1439 if (!IsSPrefetch)
1440 Info.flags |= MachineMemOperand::MOStore;
1441
1442 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1443 // Fake memory access type for no return sampler intrinsics
1444 Info.memVT = MVT::i32;
1445 } else {
1446 // XXX - Should this be volatile without known ordering?
1447 Info.flags |= MachineMemOperand::MOVolatile;
1448 Info.memVT = MVT::getVT(Ty: CI.getArgOperand(i: 0)->getType());
1449 }
1450 break;
1451 case Intrinsic::amdgcn_raw_buffer_load_lds:
1452 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
1453 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1454 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
1455 case Intrinsic::amdgcn_struct_buffer_load_lds:
1456 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
1457 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
1458 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
1459 unsigned Width = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getZExtValue();
1460
1461 // Entry 0: Load from buffer.
1462 // Don't set an offset, since the pointer value always represents the
1463 // base of the buffer.
1464 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: Width * 8);
1465 Info.flags = Flags | MachineMemOperand::MOLoad;
1466 Infos.push_back(Elt: Info);
1467
1468 // Entry 1: Store to LDS.
1469 // Instruction offset is applied, and an additional per-lane offset
1470 // which we simulate using a larger memory type.
1471 Info.memVT = EVT::getIntegerVT(
1472 Context&: CI.getContext(), BitWidth: Width * 8 * Subtarget->getWavefrontSize());
1473 Info.ptrVal = CI.getArgOperand(i: 1); // LDS destination pointer
1474 Info.offset = cast<ConstantInt>(Val: CI.getArgOperand(i: CI.arg_size() - 2))
1475 ->getZExtValue();
1476 Info.fallbackAddressSpace = AMDGPUAS::LOCAL_ADDRESS;
1477 Info.flags = Flags | MachineMemOperand::MOStore;
1478 Infos.push_back(Elt: Info);
1479 return;
1480 }
1481 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1482 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1483 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1484 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1485 Info.memVT =
1486 memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(), Ty: CI.getType(),
1487 MaxNumLanes: std::numeric_limits<unsigned>::max());
1488 Info.flags = Flags | MachineMemOperand::MOLoad;
1489 Infos.push_back(Elt: Info);
1490 return;
1491 }
1492 }
1493 }
1494 Infos.push_back(Elt: Info);
1495 return;
1496 }
1497
1498 IntrinsicInfo Info;
1499 switch (IntrID) {
1500 case Intrinsic::amdgcn_ds_ordered_add:
1501 case Intrinsic::amdgcn_ds_ordered_swap: {
1502 Info.opc = ISD::INTRINSIC_W_CHAIN;
1503 Info.memVT = MVT::getVT(Ty: CI.getType());
1504 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1505 Info.align.reset();
1506 Info.flags = Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1507
1508 const ConstantInt *Vol = cast<ConstantInt>(Val: CI.getOperand(i_nocapture: 4));
1509 if (!Vol->isZero())
1510 Info.flags |= MachineMemOperand::MOVolatile;
1511
1512 Infos.push_back(Elt: Info);
1513 return;
1514 }
1515 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1516 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1517 Info.opc = ISD::INTRINSIC_W_CHAIN;
1518 Info.memVT = MVT::getVT(Ty: CI.getOperand(i_nocapture: 0)->getType());
1519 Info.ptrVal = nullptr;
1520 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1521 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1522 Infos.push_back(Elt: Info);
1523 return;
1524 }
1525 case Intrinsic::amdgcn_ds_append:
1526 case Intrinsic::amdgcn_ds_consume: {
1527 Info.opc = ISD::INTRINSIC_W_CHAIN;
1528 Info.memVT = MVT::getVT(Ty: CI.getType());
1529 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1530 Info.align.reset();
1531 Info.flags = Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1532
1533 const ConstantInt *Vol = cast<ConstantInt>(Val: CI.getOperand(i_nocapture: 1));
1534 if (!Vol->isZero())
1535 Info.flags |= MachineMemOperand::MOVolatile;
1536
1537 Infos.push_back(Elt: Info);
1538 return;
1539 }
1540 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1541 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1542 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1543 ? ISD::INTRINSIC_W_CHAIN
1544 : ISD::INTRINSIC_VOID;
1545 Info.memVT = MVT::getVT(Ty: CI.getType());
1546 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1547 Info.memVT = MVT::i64;
1548 Info.size = 8;
1549 Info.align.reset();
1550 Info.flags = Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1551 Infos.push_back(Elt: Info);
1552 return;
1553 }
1554 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1555 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1556 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1557 Info.opc = ISD::INTRINSIC_W_CHAIN;
1558 Info.memVT =
1559 MVT::getVT(Ty: IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1560 ? CI.getType()
1561 : cast<StructType>(Val: CI.getType())
1562 ->getElementType(N: 0)); // XXX: what is correct VT?
1563
1564 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1565 Info.align.reset();
1566 Info.flags = Flags | MachineMemOperand::MOLoad |
1567 MachineMemOperand::MODereferenceable;
1568 Infos.push_back(Elt: Info);
1569 return;
1570 }
1571 case Intrinsic::amdgcn_global_atomic_fmin_num:
1572 case Intrinsic::amdgcn_global_atomic_fmax_num:
1573 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1574 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1575 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1576 Info.opc = ISD::INTRINSIC_W_CHAIN;
1577 Info.memVT = MVT::getVT(Ty: CI.getType());
1578 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1579 Info.align.reset();
1580 Info.flags =
1581 Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
1582 MachineMemOperand::MODereferenceable | MachineMemOperand::MOVolatile;
1583 Infos.push_back(Elt: Info);
1584 return;
1585 }
1586 case Intrinsic::amdgcn_cluster_load_b32:
1587 case Intrinsic::amdgcn_cluster_load_b64:
1588 case Intrinsic::amdgcn_cluster_load_b128:
1589 case Intrinsic::amdgcn_ds_load_tr6_b96:
1590 case Intrinsic::amdgcn_ds_load_tr4_b64:
1591 case Intrinsic::amdgcn_ds_load_tr8_b64:
1592 case Intrinsic::amdgcn_ds_load_tr16_b128:
1593 case Intrinsic::amdgcn_global_load_tr6_b96:
1594 case Intrinsic::amdgcn_global_load_tr4_b64:
1595 case Intrinsic::amdgcn_global_load_tr_b64:
1596 case Intrinsic::amdgcn_global_load_tr_b128:
1597 case Intrinsic::amdgcn_ds_read_tr4_b64:
1598 case Intrinsic::amdgcn_ds_read_tr6_b96:
1599 case Intrinsic::amdgcn_ds_read_tr8_b64:
1600 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1601 Info.opc = ISD::INTRINSIC_W_CHAIN;
1602 Info.memVT = MVT::getVT(Ty: CI.getType());
1603 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1604 Info.align.reset();
1605 Info.flags = Flags | MachineMemOperand::MOLoad;
1606 Infos.push_back(Elt: Info);
1607 return;
1608 }
1609 case Intrinsic::amdgcn_flat_load_monitor_b32:
1610 case Intrinsic::amdgcn_flat_load_monitor_b64:
1611 case Intrinsic::amdgcn_flat_load_monitor_b128:
1612 case Intrinsic::amdgcn_global_load_monitor_b32:
1613 case Intrinsic::amdgcn_global_load_monitor_b64:
1614 case Intrinsic::amdgcn_global_load_monitor_b128: {
1615 Info.opc = ISD::INTRINSIC_W_CHAIN;
1616 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1617 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1618 Info.align.reset();
1619 Info.flags = MachineMemOperand::MOLoad;
1620 Info.order = parseAtomicOrderingCABIArg(CI, ArgIdx: 1);
1621 Info.ssid = parseSyncscopeMDArg(CI, ArgIdx: 2);
1622 Infos.push_back(Elt: Info);
1623 return;
1624 }
1625 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1626 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1627 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1628 Info.opc = ISD::INTRINSIC_W_CHAIN;
1629 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1630 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1631 Info.align.reset();
1632 Info.flags = (MachineMemOperand::MOLoad | MOCooperative);
1633 Info.order = parseAtomicOrderingCABIArg(CI, ArgIdx: 1);
1634 Info.ssid = parseSyncscopeMDArg(CI, ArgIdx: 2);
1635 Infos.push_back(Elt: Info);
1636 return;
1637 }
1638 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1639 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1640 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1641 Info.opc = ISD::INTRINSIC_VOID;
1642 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1643 Info.ptrVal = CI.getArgOperand(i: 0);
1644 Info.align.reset();
1645 Info.flags = (MachineMemOperand::MOStore | MOCooperative);
1646 Info.order = parseAtomicOrderingCABIArg(CI, ArgIdx: 2);
1647 Info.ssid = parseSyncscopeMDArg(CI, ArgIdx: 3);
1648 Infos.push_back(Elt: Info);
1649 return;
1650 }
1651 case Intrinsic::amdgcn_ds_gws_init:
1652 case Intrinsic::amdgcn_ds_gws_barrier:
1653 case Intrinsic::amdgcn_ds_gws_sema_v:
1654 case Intrinsic::amdgcn_ds_gws_sema_br:
1655 case Intrinsic::amdgcn_ds_gws_sema_p:
1656 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1657 Info.opc = ISD::INTRINSIC_VOID;
1658
1659 const GCNTargetMachine &TM =
1660 static_cast<const GCNTargetMachine &>(getTargetMachine());
1661
1662 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1663 Info.ptrVal = MFI->getGWSPSV(TM);
1664
1665 // This is an abstract access, but we need to specify a type and size.
1666 Info.memVT = MVT::i32;
1667 Info.size = 4;
1668 Info.align = Align(4);
1669
1670 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1671 Info.flags = Flags | MachineMemOperand::MOLoad;
1672 else
1673 Info.flags = Flags | MachineMemOperand::MOStore;
1674 Infos.push_back(Elt: Info);
1675 return;
1676 }
1677 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1678 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1679 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1680 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1681 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1682 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1683 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1684 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1685 // Entry 0: Load from source (global/flat).
1686 Info.opc = ISD::INTRINSIC_VOID;
1687 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1688 Info.ptrVal = CI.getArgOperand(i: 0); // Global pointer
1689 Info.offset = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getSExtValue();
1690 Info.flags = Flags | MachineMemOperand::MOLoad;
1691 Infos.push_back(Elt: Info);
1692
1693 // Entry 1: Store to LDS (same offset).
1694 Info.flags = Flags | MachineMemOperand::MOStore;
1695 Info.ptrVal = CI.getArgOperand(i: 1); // LDS pointer
1696 Infos.push_back(Elt: Info);
1697 return;
1698 }
1699 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1700 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1701 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1702 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1703 // Entry 0: Load from LDS.
1704 Info.opc = ISD::INTRINSIC_VOID;
1705 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1706 Info.ptrVal = CI.getArgOperand(i: 1); // LDS pointer
1707 Info.offset = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getSExtValue();
1708 Info.flags = Flags | MachineMemOperand::MOLoad;
1709 Infos.push_back(Elt: Info);
1710
1711 // Entry 1: Store to global (same offset).
1712 Info.flags = Flags | MachineMemOperand::MOStore;
1713 Info.ptrVal = CI.getArgOperand(i: 0); // Global pointer
1714 Infos.push_back(Elt: Info);
1715 return;
1716 }
1717 case Intrinsic::amdgcn_load_to_lds:
1718 case Intrinsic::amdgcn_load_async_to_lds:
1719 case Intrinsic::amdgcn_global_load_lds:
1720 case Intrinsic::amdgcn_global_load_async_lds: {
1721 unsigned Width = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getZExtValue();
1722 auto *Aux = cast<ConstantInt>(Val: CI.getArgOperand(i: CI.arg_size() - 1));
1723 bool IsVolatile = Aux->getZExtValue() & AMDGPU::CPol::VOLATILE;
1724 if (IsVolatile)
1725 Flags |= MachineMemOperand::MOVolatile;
1726
1727 // Entry 0: Load from source (global/flat).
1728 Info.opc = ISD::INTRINSIC_VOID;
1729 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: Width * 8);
1730 Info.ptrVal = CI.getArgOperand(i: 0); // Source pointer
1731 Info.offset = cast<ConstantInt>(Val: CI.getArgOperand(i: 3))->getSExtValue();
1732 Info.flags = Flags | MachineMemOperand::MOLoad;
1733 Infos.push_back(Elt: Info);
1734
1735 // Entry 1: Store to LDS.
1736 // Same offset from the instruction, but an additional per-lane offset is
1737 // added. Represent that using a wider memory type.
1738 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(),
1739 BitWidth: Width * 8 * Subtarget->getWavefrontSize());
1740 Info.ptrVal = CI.getArgOperand(i: 1); // LDS destination pointer
1741 Info.flags = Flags | MachineMemOperand::MOStore;
1742 Infos.push_back(Elt: Info);
1743 return;
1744 }
1745 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1746 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1747 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1748 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1749 Info.opc = ISD::INTRINSIC_W_CHAIN;
1750
1751 const GCNTargetMachine &TM =
1752 static_cast<const GCNTargetMachine &>(getTargetMachine());
1753
1754 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1755 Info.ptrVal = MFI->getGWSPSV(TM);
1756
1757 // This is an abstract access, but we need to specify a type and size.
1758 Info.memVT = MVT::i32;
1759 Info.size = 4;
1760 Info.align = Align(4);
1761
1762 Info.flags = Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1763 Infos.push_back(Elt: Info);
1764 return;
1765 }
1766 case Intrinsic::amdgcn_s_prefetch_data:
1767 case Intrinsic::amdgcn_flat_prefetch:
1768 case Intrinsic::amdgcn_global_prefetch: {
1769 Info.opc = ISD::INTRINSIC_VOID;
1770 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: 8);
1771 Info.ptrVal = CI.getArgOperand(i: 0);
1772 Info.flags = Flags | MachineMemOperand::MOLoad;
1773 Infos.push_back(Elt: Info);
1774 return;
1775 }
1776 default:
1777 return;
1778 }
1779}
1780
1781void SITargetLowering::CollectTargetIntrinsicOperands(
1782 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1783 switch (cast<IntrinsicInst>(Val: I).getIntrinsicID()) {
1784 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1785 // The DAG's ValueType loses the addrspaces.
1786 // Add them as 2 extra Constant operands "from" and "to".
1787 unsigned SrcAS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
1788 unsigned DstAS = I.getType()->getPointerAddressSpace();
1789 Ops.push_back(Elt: DAG.getTargetConstant(Val: SrcAS, DL: SDLoc(), VT: MVT::i32));
1790 Ops.push_back(Elt: DAG.getTargetConstant(Val: DstAS, DL: SDLoc(), VT: MVT::i32));
1791 break;
1792 }
1793 default:
1794 break;
1795 }
1796}
1797
1798bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
1799 SmallVectorImpl<Value *> &Ops,
1800 Type *&AccessTy) const {
1801 Value *Ptr = nullptr;
1802 switch (II->getIntrinsicID()) {
1803 case Intrinsic::amdgcn_cluster_load_b128:
1804 case Intrinsic::amdgcn_cluster_load_b64:
1805 case Intrinsic::amdgcn_cluster_load_b32:
1806 case Intrinsic::amdgcn_ds_append:
1807 case Intrinsic::amdgcn_ds_consume:
1808 case Intrinsic::amdgcn_ds_load_tr8_b64:
1809 case Intrinsic::amdgcn_ds_load_tr16_b128:
1810 case Intrinsic::amdgcn_ds_load_tr4_b64:
1811 case Intrinsic::amdgcn_ds_load_tr6_b96:
1812 case Intrinsic::amdgcn_ds_read_tr4_b64:
1813 case Intrinsic::amdgcn_ds_read_tr6_b96:
1814 case Intrinsic::amdgcn_ds_read_tr8_b64:
1815 case Intrinsic::amdgcn_ds_read_tr16_b64:
1816 case Intrinsic::amdgcn_ds_ordered_add:
1817 case Intrinsic::amdgcn_ds_ordered_swap:
1818 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1819 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1820 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1821 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1822 case Intrinsic::amdgcn_global_atomic_fmax_num:
1823 case Intrinsic::amdgcn_global_atomic_fmin_num:
1824 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1825 case Intrinsic::amdgcn_global_load_tr_b64:
1826 case Intrinsic::amdgcn_global_load_tr_b128:
1827 case Intrinsic::amdgcn_global_load_tr4_b64:
1828 case Intrinsic::amdgcn_global_load_tr6_b96:
1829 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1830 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1831 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1832 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1833 Ptr = II->getArgOperand(i: 0);
1834 break;
1835 case Intrinsic::amdgcn_load_to_lds:
1836 case Intrinsic::amdgcn_load_async_to_lds:
1837 case Intrinsic::amdgcn_global_load_lds:
1838 case Intrinsic::amdgcn_global_load_async_lds:
1839 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1840 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1841 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1842 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1843 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1844 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1845 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1846 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1847 Ptr = II->getArgOperand(i: 1);
1848 break;
1849 default:
1850 return false;
1851 }
1852 AccessTy = II->getType();
1853 Ops.push_back(Elt: Ptr);
1854 return true;
1855}
1856
1857bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1858 unsigned AddrSpace) const {
1859 if (!Subtarget->hasFlatInstOffsets()) {
1860 // Flat instructions do not have offsets, and only have the register
1861 // address.
1862 return AM.BaseOffs == 0 && AM.Scale == 0;
1863 }
1864
1865 decltype(SIInstrFlags::FLAT) FlatVariant =
1866 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ? SIInstrFlags::FlatGlobal
1867 : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch
1868 : SIInstrFlags::FLAT;
1869
1870 return AM.Scale == 0 &&
1871 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1872 Offset: AM.BaseOffs, AddrSpace, FlatVariant));
1873}
1874
1875bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1876 if (Subtarget->hasFlatGlobalInsts())
1877 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS);
1878
1879 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1880 // Assume the we will use FLAT for all global memory accesses
1881 // on VI.
1882 // FIXME: This assumption is currently wrong. On VI we still use
1883 // MUBUF instructions for the r + i addressing mode. As currently
1884 // implemented, the MUBUF instructions only work on buffer < 4GB.
1885 // It may be possible to support > 4GB buffers with MUBUF instructions,
1886 // by setting the stride value in the resource descriptor which would
1887 // increase the size limit to (stride * 4GB). However, this is risky,
1888 // because it has never been validated.
1889 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS);
1890 }
1891
1892 return isLegalMUBUFAddressingMode(AM);
1893}
1894
1895bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1896 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1897 // additionally can do r + r + i with addr64. 32-bit has more addressing
1898 // mode options. Depending on the resource constant, it can also do
1899 // (i64 r0) + (i32 r1) * (i14 i).
1900 //
1901 // Private arrays end up using a scratch buffer most of the time, so also
1902 // assume those use MUBUF instructions. Scratch loads / stores are currently
1903 // implemented as mubuf instructions with offen bit set, so slightly
1904 // different than the normal addr64.
1905 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1906 if (!TII->isLegalMUBUFImmOffset(Imm: AM.BaseOffs))
1907 return false;
1908
1909 // FIXME: Since we can split immediate into soffset and immediate offset,
1910 // would it make sense to allow any immediate?
1911
1912 switch (AM.Scale) {
1913 case 0: // r + i or just i, depending on HasBaseReg.
1914 return true;
1915 case 1:
1916 return true; // We have r + r or r + i.
1917 case 2:
1918 if (AM.HasBaseReg) {
1919 // Reject 2 * r + r.
1920 return false;
1921 }
1922
1923 // Allow 2 * r as r + r
1924 // Or 2 * r + i is allowed as r + r + i.
1925 return true;
1926 default: // Don't allow n * r
1927 return false;
1928 }
1929}
1930
/// Determine whether addressing mode \p AM (BaseGV + BaseReg +
/// Scale * IndexReg + BaseOffs) is natively supported for an access of type
/// \p Ty in address space \p AS. Dispatches to the per-instruction-class
/// helpers (FLAT, MUBUF) or applies the SMRD/SMEM and DS immediate-offset
/// encoding rules directly, which differ by subtarget generation.
bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                             const AddrMode &AM, Type *Ty,
                                             unsigned AS,
                                             Instruction *I) const {
  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return isLegalGlobalAddressingMode(AM);

  // Constant and buffer address spaces: these are serviced by scalar (SMRD /
  // SMEM) or buffer instructions, whose offset encodings vary by generation.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
      AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
    // If the offset isn't a multiple of 4, it probably isn't going to be
    // correctly aligned.
    // FIXME: Can we get the real alignment here?
    if (AM.BaseOffs % 4 != 0)
      return isLegalMUBUFAddressingMode(AM);

    if (!Subtarget->hasScalarSubwordLoads()) {
      // There are no SMRD extloads, so if we have to do a small type access we
      // will use a MUBUF load.
      // FIXME?: We also need to do this if unaligned, but we don't know the
      // alignment here.
      if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
        return isLegalGlobalAddressingMode(AM);
    }

    // Per-generation immediate-offset range checks. Note the SI/CI encodings
    // are in dwords (hence BaseOffs / 4); VI and later encode bytes.
    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
      // SMRD instructions have an 8-bit, dword offset on SI.
      if (!isUInt<8>(x: AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
      // On CI+, this can also be a 32-bit literal constant offset. If it fits
      // in 8-bits, it can use a smaller encoding.
      if (!isUInt<32>(x: AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
      if (!isUInt<20>(x: AM.BaseOffs))
        return false;
    } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
      // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
      // for S_BUFFER_* instructions).
      if (!isInt<21>(x: AM.BaseOffs))
        return false;
    } else {
      // On GFX12, all offsets are signed 24-bit in bytes.
      if (!isInt<24>(x: AM.BaseOffs))
        return false;
    }

    if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
         AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
        AM.BaseOffs < 0) {
      // Scalar (non-buffer) loads can only use a negative offset if
      // soffset+offset is non-negative. Since the compiler can only prove that
      // in a few special cases, it is safer to claim that negative offsets are
      // not supported.
      return false;
    }

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    // No scaled-index addressing for scalar/buffer accesses.
    return false;
  }

  // Private (scratch) accesses use flat-scratch when enabled, MUBUF otherwise.
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return Subtarget->hasFlatScratchEnabled()
               ? isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS)
               : isLegalMUBUFAddressingMode(AM);

  if (AS == AMDGPUAS::LOCAL_ADDRESS ||
      (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
    // field.
    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
    // an 8-bit dword offset but we don't know the alignment here.
    if (!isUInt<16>(x: AM.BaseOffs))
      return false;

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;
  }

  if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
    // For an unknown address space, this usually means that this is for some
    // reason being used for pure arithmetic, and not based on some addressing
    // computation. We don't have instructions that compute pointers with any
    // addressing modes, so treat them as having no offset like flat
    // instructions.
    return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS);
  }

  // Assume a user alias of global for unknown address spaces.
  return isLegalGlobalAddressingMode(AM);
}
2039
2040bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
2041 const MachineFunction &MF) const {
2042 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
2043 return (MemVT.getSizeInBits() <= 4 * 32);
2044 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
2045 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
2046 return (MemVT.getSizeInBits() <= MaxPrivateBits);
2047 }
2048 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
2049 return (MemVT.getSizeInBits() <= 2 * 32);
2050 return true;
2051}
2052
/// Report whether a misaligned access of \p Size bits in \p AddrSpace with
/// alignment \p Alignment is allowed, and optionally report a relative speed
/// estimate through \p IsFast. The IsFast values form a "speed rank", not an
/// absolute metric: 0/1 mean slow, larger values roughly track the bit width
/// the access effectively operates at (see the comments below).
bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
    unsigned Size, unsigned AddrSpace, Align Alignment,
    MachineMemOperand::Flags Flags, unsigned *IsFast) const {
  if (IsFast)
    *IsFast = 0;

  // LDS / GDS: DS instruction alignment rules.
  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS) {
    // Check if alignment requirements for ds_read/write instructions are
    // disabled.
    if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
      return false;

    Align RequiredAlignment(
        PowerOf2Ceil(A: divideCeil(Numerator: Size, Denominator: 8))); // Natural alignment.
    if (Subtarget->hasLDSMisalignedBugInWGPMode() && Size > 32 &&
        Alignment < RequiredAlignment)
      return false;

    // Either, the alignment requirements are "enabled", or there is an
    // unaligned LDS access related hardware bug though alignment requirements
    // are "disabled". In either case, we need to check for proper alignment
    // requirements.
    //
    // NOTE: The cases below may lower RequiredAlignment (e.g. when a pair of
    // narrower ds_read2/write2 accesses can emulate the wide one) and then
    // fall through to the common tail after the switch.
    switch (Size) {
    case 64:
      // SI has a hardware bug in the LDS / GDS bounds checking: if the base
      // address is negative, then the instruction is incorrectly treated as
      // out-of-bounds even if base + offsets is in bounds. Split vectorized
      // loads here to avoid emitting ds_read2_b32. We may re-combine the
      // load later in the SILoadStoreOptimizer.
      if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
        return false;

      // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
      // can do a 4 byte aligned, 8 byte access in a single operation using
      // ds_read2/write2_b32 with adjacent offsets.
      RequiredAlignment = Align(4);

      if (Subtarget->hasUnalignedDSAccessEnabled()) {
        // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
        // ds_write2_b32 depending on the alignment. In either case with either
        // alignment there is no faster way of doing this.

        // The numbers returned here and below are not additive, it is a 'speed
        // rank'. They are just meant to be compared to decide if a certain way
        // of lowering an operation is faster than another. For that purpose
        // naturally aligned operation gets it bitsize to indicate that "it
        // operates with a speed comparable to N-bit wide load". With the full
        // alignment ds128 is slower than ds96 for example. If underaligned it
        // is comparable to a speed of a single dword access, which would then
        // mean 32 < 128 and it is faster to issue a wide load regardless.
        // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a
        // wider load which will not be aligned anymore the latter is slower.
        if (IsFast)
          *IsFast = (Alignment >= RequiredAlignment) ? 64
                    : (Alignment < Align(4))         ? 32
                                                     : 1;
        return true;
      }

      break;
    case 96:
      if (!Subtarget->hasDS96AndDS128())
        return false;

      // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
      // gfx8 and older.

      if (Subtarget->hasUnalignedDSAccessEnabled()) {
        // Naturally aligned access is fastest. However, also report it is Fast
        // if memory is aligned less than DWORD. A narrow load or store will be
        // be equally slow as a single ds_read_b96/ds_write_b96, but there will
        // be more of them, so overall we will pay less penalty issuing a single
        // instruction.

        // See comment on the values above.
        if (IsFast)
          *IsFast = (Alignment >= RequiredAlignment) ? 96
                    : (Alignment < Align(4))         ? 32
                                                     : 1;
        return true;
      }

      break;
    case 128:
      if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
        return false;

      // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
      // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
      // single operation using ds_read2/write2_b64.
      RequiredAlignment = Align(8);

      if (Subtarget->hasUnalignedDSAccessEnabled()) {
        // Naturally aligned access is fastest. However, also report it is Fast
        // if memory is aligned less than DWORD. A narrow load or store will be
        // be equally slow as a single ds_read_b128/ds_write_b128, but there
        // will be more of them, so overall we will pay less penalty issuing a
        // single instruction.

        // See comment on the values above.
        if (IsFast)
          *IsFast = (Alignment >= RequiredAlignment) ? 128
                    : (Alignment < Align(4))         ? 32
                                                     : 1;
        return true;
      }

      break;
    default:
      // Sub-dword or single-dword sizes only beyond this point.
      if (Size > 32)
        return false;

      break;
    }

    // See comment on the values above.
    // Note that we have a single-dword or sub-dword here, so if underaligned
    // it is a slowest possible access, hence returned value is 0.
    if (IsFast)
      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||
           Subtarget->hasUnalignedDSAccessEnabled();
  }

  // FIXME: We have to be conservative here and assume that flat operations
  // will access scratch. If we had access to the IR function, then we
  // could determine if any private memory was used in the function.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
      AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
    bool AlignedBy4 = Alignment >= Align(4);
    if (Subtarget->hasUnalignedScratchAccessEnabled()) {
      if (IsFast)
        *IsFast = AlignedBy4 ? Size : 1;
      return true;
    }

    if (IsFast)
      *IsFast = AlignedBy4;

    return AlignedBy4;
  }

  // So long as they are correct, wide global memory operations perform better
  // than multiple smaller memory ops -- even when misaligned
  if (AMDGPU::isExtendedGlobalAddrSpace(AS: AddrSpace)) {
    if (IsFast)
      *IsFast = Size;

    return Alignment >= Align(4) ||
           Subtarget->hasUnalignedBufferAccessEnabled();
  }

  // Ensure robust out-of-bounds guarantees for buffer accesses are met if
  // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
  // out-of-bounds behavior, but in the edge case where an access starts
  // out-of-bounds and then enter in-bounds, the entire access would be treated
  // as out-of-bounds. Prevent misaligned memory accesses by requiring the
  // natural alignment of buffer accesses.
  if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
      AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
      AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
    if (!Subtarget->hasRelaxedBufferOOBMode() &&
        Alignment < Align(PowerOf2Ceil(A: divideCeil(Numerator: Size, Denominator: 8))))
      return false;
  }

  // Smaller than dword value must be aligned.
  if (Size < 32)
    return false;

  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
  // byte-address are ignored, thus forcing Dword alignment.
  // This applies to private, global, and constant memory.
  if (IsFast)
    *IsFast = 1;

  return Size >= 32 && Alignment >= Align(4);
}
2234
2235bool SITargetLowering::allowsMisalignedMemoryAccesses(
2236 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2237 unsigned *IsFast) const {
2238 return allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace,
2239 Alignment, Flags, IsFast);
2240}
2241
2242EVT SITargetLowering::getOptimalMemOpType(
2243 LLVMContext &Context, const MemOp &Op,
2244 const AttributeList &FuncAttributes) const {
2245 // FIXME: Should account for address space here.
2246
2247 // The default fallback uses the private pointer size as a guess for a type to
2248 // use. Make sure we switch these to 64-bit accesses.
2249
2250 if (Op.size() >= 16 &&
2251 Op.isDstAligned(AlignCheck: Align(4))) // XXX: Should only do for global
2252 return MVT::v4i32;
2253
2254 if (Op.size() >= 8 && Op.isDstAligned(AlignCheck: Align(4)))
2255 return MVT::v2i32;
2256
2257 // Use the default.
2258 return MVT::Other;
2259}
2260
2261bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
2262 const MemSDNode *MemNode = cast<MemSDNode>(Val: N);
2263 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2264}
2265
2266bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
2267 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
2268 AS == AMDGPUAS::PRIVATE_ADDRESS;
2269}
2270
2271bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
2272 unsigned DestAS) const {
2273 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2274 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2275 Subtarget->hasGloballyAddressableScratch()) {
2276 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2277 return false;
2278 }
2279
2280 // Flat -> private/local is a simple truncate.
2281 // Flat -> global is no-op
2282 return true;
2283 }
2284
2285 const GCNTargetMachine &TM =
2286 static_cast<const GCNTargetMachine &>(getTargetMachine());
2287 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2288}
2289
2290TargetLoweringBase::LegalizeTypeAction
2291SITargetLowering::getPreferredVectorAction(MVT VT) const {
2292 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2293 VT.getScalarType().bitsLE(VT: MVT::i16))
2294 return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
2295 return TargetLoweringBase::getPreferredVectorAction(VT);
2296}
2297
// Always prefer rematerializing a constant as an immediate over loading it
// from constant memory.
bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                         Type *Ty) const {
  // FIXME: Could be smarter if called for vector constants.
  return true;
}
2303
2304bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
2305 unsigned Index) const {
2306 if (!isOperationLegalOrCustom(Op: ISD::EXTRACT_SUBVECTOR, VT: ResVT))
2307 return false;
2308
2309 // TODO: Add more cases that are cheap.
2310 return Index == 0;
2311}
2312
2313bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2314 // TODO: This should be more aggressive, particular for 16-bit element
2315 // vectors. However there are some mixed improvements and regressions.
2316 EVT EltTy = VT.getVectorElementType();
2317 unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;
2318 return EltTy.getSizeInBits() % MinAlign == 0;
2319}
2320
2321bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
2322 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2323 switch (Op) {
2324 case ISD::LOAD:
2325 case ISD::STORE:
2326 return true;
2327 default:
2328 return false;
2329 }
2330 }
2331
2332 // SimplifySetCC uses this function to determine whether or not it should
2333 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2334 if (VT == MVT::i1 && Op == ISD::SETCC)
2335 return false;
2336
2337 return TargetLowering::isTypeDesirableForOp(Op, VT);
2338}
2339
2340MachinePointerInfo
2341SITargetLowering::getKernargSegmentPtrInfo(MachineFunction &MF) const {
2342 // This isn't really a constant pool but close enough.
2343 MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool());
2344 PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS;
2345 return PtrInfo;
2346}
2347
/// Build a pointer into the kernarg segment, \p Offset bytes past its base.
///
/// \return the preloaded kernarg segment pointer offset by \p Offset, or, if
/// no kernarg segment pointer was preloaded (a kernel with no arguments),
/// \p Offset itself as a plain constant address.
SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
                                                   const SDLoc &SL,
                                                   SDValue Chain,
                                                   uint64_t Offset) const {
  const DataLayout &DL = DAG.getDataLayout();
  MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  MVT PtrVT = getPointerTy(DL, AS: AMDGPUAS::CONSTANT_ADDRESS);

  auto [InputPtrReg, RC, ArgTy] =
      Info->getPreloadedValue(Value: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);

  // We may not have the kernarg segment argument if we have no kernel
  // arguments.
  if (!InputPtrReg)
    return DAG.getConstant(Val: Offset, DL: SL, VT: PtrVT);

  // Read the base pointer out of the live-in virtual register created for the
  // preloaded SGPR pair.
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  SDValue BasePtr = DAG.getCopyFromReg(
      Chain, dl: SL, Reg: MRI.getLiveInVirtReg(PReg: InputPtrReg->getRegister()), VT: PtrVT);

  return DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Offset));
}
2371
2372SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2373 const SDLoc &SL) const {
2374 uint64_t Offset =
2375 getImplicitParameterOffset(MF: DAG.getMachineFunction(), Param: FIRST_IMPLICIT);
2376 return lowerKernArgParameterPtr(DAG, SL, Chain: DAG.getEntryNode(), Offset);
2377}
2378
2379SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2380 const SDLoc &SL) const {
2381
2382 Function &F = DAG.getMachineFunction().getFunction();
2383 std::optional<uint32_t> KnownSize =
2384 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
2385 if (KnownSize.has_value())
2386 return DAG.getConstant(Val: *KnownSize, DL: SL, VT: MVT::i32);
2387 return SDValue();
2388}
2389
/// Convert an argument value loaded with in-memory type \p MemVT to the
/// expected value type \p VT, narrowing widened vectors and applying
/// integer extension/truncation or FP conversion as needed. \p Arg, when
/// given, supplies the sext/zext flags from the original IR argument.
SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                         const SDLoc &SL, SDValue Val,
                                         bool Signed,
                                         const ISD::InputArg *Arg) const {
  // First, if it is a widened vector, narrow it.
  if (VT.isVector() &&
      VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
    EVT NarrowedVT =
        EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(),
                         NumElements: VT.getVectorNumElements());
    Val = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: NarrowedVT, N1: Val,
                      N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
  }

  // Then convert the vector elements or scalar value.
  // Record the known extension of the promoted bits before truncating.
  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(VT: MemVT)) {
    unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
    Val = DAG.getNode(Opcode: Opc, DL: SL, VT: MemVT, N1: Val, N2: DAG.getValueType(VT));
  }

  if (MemVT.isFloatingPoint()) {
    if (VT.isFloatingPoint()) {
      Val = getFPExtOrFPRound(DAG, Op: Val, DL: SL, VT);
    } else {
      // FP in memory but an integer value type expected: reinterpret the bits
      // as a same-width integer, then resize to VT.
      assert(!MemVT.isVector());
      EVT IntVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
      SDValue Cast = DAG.getBitcast(VT: IntVT, V: Val);
      Val = DAG.getAnyExtOrTrunc(Op: Cast, DL: SL, VT);
    }
  } else if (Signed)
    Val = DAG.getSExtOrTrunc(Op: Val, DL: SL, VT);
  else
    Val = DAG.getZExtOrTrunc(Op: Val, DL: SL, VT);

  return Val;
}
2426
/// Load one kernel argument of in-memory type \p MemVT from the kernarg
/// segment at byte \p Offset and convert it to the expected type \p VT.
///
/// \return a merge of {converted value, load chain}.
SDValue SITargetLowering::lowerKernargMemParameter(
    SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
    uint64_t Offset, Align Alignment, bool Signed,
    const ISD::InputArg *Arg) const {

  MachinePointerInfo PtrInfo =
      getKernargSegmentPtrInfo(MF&: DAG.getMachineFunction());

  // Try to avoid using an extload by loading earlier than the argument address,
  // and extracting the relevant bits. The load should hopefully be merged with
  // the previous argument.
  if (MemVT.getStoreSize() < 4 && Alignment < 4) {
    // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
    int64_t AlignDownOffset = alignDown(Value: Offset, Align: 4);
    int64_t OffsetDiff = Offset - AlignDownOffset;

    EVT IntVT = MemVT.changeTypeToInteger();

    // TODO: If we passed in the base kernel offset we could have a better
    // alignment than 4, but we don't really need it.
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset: AlignDownOffset);
    SDValue Load = DAG.getLoad(VT: MVT::i32, dl: SL, Chain, Ptr,
                               PtrInfo: PtrInfo.getWithOffset(O: AlignDownOffset), Alignment: Align(4),
                               MMOFlags: MachineMemOperand::MODereferenceable |
                                   MachineMemOperand::MOInvariant);

    // Shift the argument's bits down to bit 0, then truncate to its real size.
    SDValue ShiftAmt = DAG.getConstant(Val: OffsetDiff * 8, DL: SL, VT: MVT::i32);
    SDValue Extract = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: Load, N2: ShiftAmt);

    SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: IntVT, Operand: Extract);
    ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MemVT, Operand: ArgVal);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, Val: ArgVal, Signed, Arg);

    return DAG.getMergeValues(Ops: {ArgVal, Load.getValue(R: 1)}, dl: SL);
  }

  // Common case: a direct, dereferenceable, invariant load from the segment.
  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
  SDValue Load = DAG.getLoad(
      VT: MemVT, dl: SL, Chain, Ptr, PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment,
      MMOFlags: MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Val: Load, Signed, Arg);
  return DAG.getMergeValues(Ops: {Val, Load.getValue(R: 1)}, dl: SL);
}
2471
/// Coerce an argument which was passed in a different ABI type to the original
/// expected value type.
///
/// Small integers are passed promoted to the 32-bit location type: SExt/ZExt
/// insert an Assert[SZ]ext recording the known extension before truncating,
/// AExt just truncates, and BCvt reinterprets the bits.
SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
                                                    SDValue Val,
                                                    CCValAssign &VA,
                                                    const SDLoc &SL) const {
  EVT ValVT = VA.getValVT();

  // If this is an 8 or 16-bit value, it is really passed promoted
  // to 32 bits. Insert an assert[sz]ext to capture this, then
  // truncate to the right size.
  switch (VA.getLocInfo()) {
  case CCValAssign::Full:
    return Val;
  case CCValAssign::BCvt:
    return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: ValVT, Operand: Val);
  case CCValAssign::SExt:
    Val = DAG.getNode(Opcode: ISD::AssertSext, DL: SL, VT: VA.getLocVT(), N1: Val,
                      N2: DAG.getValueType(ValVT));
    return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ValVT, Operand: Val);
  case CCValAssign::ZExt:
    Val = DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: VA.getLocVT(), N1: Val,
                      N2: DAG.getValueType(ValVT));
    return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ValVT, Operand: Val);
  case CCValAssign::AExt:
    return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ValVT, Operand: Val);
  default:
    llvm_unreachable("Unknown loc info!");
  }
}
2502
/// Lower an incoming argument that was assigned to a stack location: for
/// byval, return the fixed frame index directly; otherwise emit a (possibly
/// extending) load from the caller's stack slot and coerce the result to the
/// expected value type.
SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
                                              CCValAssign &VA, const SDLoc &SL,
                                              SDValue Chain,
                                              const ISD::InputArg &Arg) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  // A byval argument is referenced by address, not loaded.
  if (Arg.Flags.isByVal()) {
    unsigned Size = Arg.Flags.getByValSize();
    int FrameIdx = MFI.CreateFixedObject(Size, SPOffset: VA.getLocMemOffset(), IsImmutable: false);
    return DAG.getFrameIndex(FI: FrameIdx, VT: MVT::i32);
  }

  unsigned ArgOffset = VA.getLocMemOffset();
  unsigned ArgSize = VA.getValVT().getStoreSize();

  int FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset, IsImmutable: true);

  // Create load nodes to retrieve arguments from the stack.
  SDValue FIN = DAG.getFrameIndex(FI, VT: MVT::i32);

  // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
  MVT MemVT = VA.getValVT();

  // Choose the extension kind of the load from how the value was promoted.
  switch (VA.getLocInfo()) {
  default:
    break;
  case CCValAssign::BCvt:
    MemVT = VA.getLocVT();
    break;
  case CCValAssign::SExt:
    ExtType = ISD::SEXTLOAD;
    break;
  case CCValAssign::ZExt:
    ExtType = ISD::ZEXTLOAD;
    break;
  case CCValAssign::AExt:
    ExtType = ISD::EXTLOAD;
    break;
  }

  SDValue ArgValue = DAG.getExtLoad(
      ExtType, dl: SL, VT: VA.getLocVT(), Chain, Ptr: FIN,
      PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI), MemVT);

  // Only merge in the load's chain when a conversion actually happened.
  SDValue ConvertedVal = convertABITypeToValueType(DAG, Val: ArgValue, VA, SL);
  if (ConvertedVal == ArgValue)
    return ConvertedVal;

  return DAG.getMergeValues(Ops: {ConvertedVal, ArgValue.getValue(R: 1)}, dl: SL);
}
2555
/// Compute the grid-wide workgroup ID for one dimension, accounting for
/// clusters.
///
/// Without cluster support this is simply the preloaded workgroup ID. With
/// clusters, the preloaded "workgroup ID" is actually the cluster ID, so the
/// grid-wide ID is ClusterId * (ClusterMaxId + 1) + ClusterWorkGroupId. When
/// the cluster configuration is unknown at compile time, a runtime select on
/// a field of the ID_IB_STS2 hardware register (presumably whether clusters
/// are in use — confirm against the ISA docs) picks between the two forms.
SDValue SITargetLowering::lowerWorkGroupId(
    SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
    AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
    AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
    AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
  if (!Subtarget->hasClusters())
    return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);

  // Clusters are supported. Return the global position in the grid. If clusters
  // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.

  // WorkGroupIdXYZ = ClusterId == 0 ?
  //   ClusterIdXYZ :
  //   ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
  SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
  SDLoc SL(ClusterIdXYZ);
  SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
  SDValue One = DAG.getConstant(Val: 1, DL: SL, VT);
  SDValue ClusterSizeXYZ = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ClusterMaxIdXYZ, N2: One);
  SDValue ClusterWorkGroupIdXYZ =
      getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
  SDValue GlobalIdXYZ =
      DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ClusterWorkGroupIdXYZ,
                  N2: DAG.getNode(Opcode: ISD::MUL, DL: SL, VT, N1: ClusterIdXYZ, N2: ClusterSizeXYZ));

  switch (MFI.getClusterDims().getKind()) {
  case AMDGPU::ClusterDimsAttr::Kind::FixedDims:
  case AMDGPU::ClusterDimsAttr::Kind::VariableDims:
    return GlobalIdXYZ;
  case AMDGPU::ClusterDimsAttr::Kind::NoCluster:
    return ClusterIdXYZ;
  case AMDGPU::ClusterDimsAttr::Kind::Unknown: {
    // Cluster usage unknown at compile time: read the hwreg field and select.
    using namespace AMDGPU::Hwreg;
    SDValue ClusterIdField =
        DAG.getTargetConstant(Val: HwregEncoding::encode(Values: ID_IB_STS2, Values: 6, Values: 4), DL: SL, VT);
    SDNode *GetReg =
        DAG.getMachineNode(Opcode: AMDGPU::S_GETREG_B32_const, dl: SL, VT, Op1: ClusterIdField);
    SDValue ClusterId(GetReg, 0);
    SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT);
    return DAG.getNode(Opcode: ISD::SELECT_CC, DL: SL, VT, N1: ClusterId, N2: Zero, N3: ClusterIdXYZ,
                       N4: GlobalIdXYZ, N5: DAG.getCondCode(Cond: ISD::SETEQ));
  }
  }

  llvm_unreachable("nothing should reach here");
}
2602
/// Materialize the preloaded function input \p PVID as an SDValue of type
/// \p VT.
///
/// On subtargets with architected SGPRs, workgroup and cluster IDs live in
/// fixed TTMP register bitfields, described explicitly below; other inputs
/// fall back to the function's argument info. Returns a constant when the
/// value is statically known (fixed cluster dims), zero for a missing kernarg
/// segment pointer, and poison when the input was not allocated at all.
SDValue SITargetLowering::getPreloadedValue(
    SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
    AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
  const ArgDescriptor *Reg = nullptr;
  const TargetRegisterClass *RC;
  LLT Ty;

  CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
  const ArgDescriptor WorkGroupIDX =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP9);
  // If GridZ is not programmed in an entry function then the hardware will set
  // it to all zeros, so there is no need to mask the GridY value in the low
  // order bits.
  const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
      Reg: AMDGPU::TTMP7,
      Mask: AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
  const ArgDescriptor WorkGroupIDZ =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP7, Mask: 0xFFFF0000u);
  // The cluster workgroup IDs and max IDs are packed as 4-bit fields of TTMP6.
  const ArgDescriptor ClusterWorkGroupIDX =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0000000Fu);
  const ArgDescriptor ClusterWorkGroupIDY =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x000000F0u);
  const ArgDescriptor ClusterWorkGroupIDZ =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x00000F00u);
  const ArgDescriptor ClusterWorkGroupMaxIDX =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0000F000u);
  const ArgDescriptor ClusterWorkGroupMaxIDY =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x000F0000u);
  const ArgDescriptor ClusterWorkGroupMaxIDZ =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x00F00000u);
  const ArgDescriptor ClusterWorkGroupMaxFlatID =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0F000000u);

  // Helper to fold statically-known IDs to constants.
  auto LoadConstant = [&](unsigned N) {
    return DAG.getConstant(Val: N, DL: SDLoc(), VT);
  };

  if (Subtarget->hasArchitectedSGPRs() &&
      (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
    AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
    bool HasFixedDims = ClusterDims.isFixedDims();

    switch (PVID) {
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
      Reg = &WorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
      Reg = &WorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
      Reg = &WorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    // A fixed cluster dimension of 1 means the in-cluster ID is always 0.
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
      if (HasFixedDims && ClusterDims.getDims()[0] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
      if (HasFixedDims && ClusterDims.getDims()[1] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
      if (HasFixedDims && ClusterDims.getDims()[2] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    // With fixed dims the max IDs are compile-time constants (dim - 1).
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
      if (HasFixedDims)
        return LoadConstant(ClusterDims.getDims()[0] - 1);
      Reg = &ClusterWorkGroupMaxIDX;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
      if (HasFixedDims)
        return LoadConstant(ClusterDims.getDims()[1] - 1);
      Reg = &ClusterWorkGroupMaxIDY;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
      if (HasFixedDims)
        return LoadConstant(ClusterDims.getDims()[2] - 1);
      Reg = &ClusterWorkGroupMaxIDZ;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
      Reg = &ClusterWorkGroupMaxFlatID;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    default:
      break;
    }
  }

  // Anything not handled above comes from the function's argument info.
  if (!Reg)
    std::tie(args&: Reg, args&: RC, args&: Ty) = MFI.getPreloadedValue(Value: PVID);
  if (!Reg) {
    if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
      // It's possible for a kernarg intrinsic call to appear in a kernel with
      // no allocated segment, in which case we do not add the user sgpr
      // argument, so just return null.
      return DAG.getConstant(Val: 0, DL: SDLoc(), VT);
    }

    // It's undefined behavior if a function marked with the amdgpu-no-*
    // attributes uses the corresponding intrinsic.
    return DAG.getPOISON(VT);
  }

  return loadInputValue(DAG, RC, VT, SL: SDLoc(DAG.getEntryNode()), Arg: *Reg);
}
2730
// Filter incoming pixel shader arguments into \p Splits, skipping PS input
// slots that are unused and not already allocated. Marks allocated/enabled PS
// input slots in \p Info, and records skipped original argument indices in
// \p Skipped.
static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
                               CallingConv::ID CallConv,
                               ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
                               FunctionType *FType,
                               SIMachineFunctionInfo *Info) {
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
    const ISD::InputArg *Arg = &Ins[I];

    assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
           "vector type argument should have been split");

    // First check if it's a PS input addr.
    if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
        PSInputNum <= 15) {
      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(Index: PSInputNum);

      // Inconveniently only the first part of the split is marked as isSplit,
      // so skip to the end. We only want to increment PSInputNum once for the
      // entire split argument.
      if (Arg->Flags.isSplit()) {
        while (!Arg->Flags.isSplitEnd()) {
          assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
                 "unexpected vector split in ps argument type");
          if (!SkipArg)
            Splits.push_back(Elt: *Arg);
          Arg = &Ins[++I];
        }
      }

      if (SkipArg) {
        // We can safely skip PS inputs.
        Skipped.set(Arg->getOrigArgIndex());
        ++PSInputNum;
        continue;
      }

      Info->markPSInputAllocated(Index: PSInputNum);
      if (Arg->Used)
        Info->markPSInputEnabled(Index: PSInputNum);

      ++PSInputNum;
    }

    Splits.push_back(Elt: *Arg);
  }
}
2777
// Allocate special inputs passed in VGPRs: the workitem IDs for an entry
// function. With packed TID, Y and Z share VGPR0 with X as 10-bit bitfields;
// otherwise each ID occupies its own VGPR (VGPR0..VGPR2).
void SITargetLowering::allocateSpecialEntryInputVGPRs(
    CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
    SIMachineFunctionInfo &Info) const {
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  MachineRegisterInfo &MRI = MF.getRegInfo();

  if (Info.hasWorkItemIDX()) {
    Register Reg = AMDGPU::VGPR0;
    MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);

    CCInfo.AllocateReg(Reg);
    // Only mask X down to 10 bits when Y shares the register.
    unsigned Mask =
        (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
    Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
  }

  if (Info.hasWorkItemIDY()) {
    assert(Info.hasWorkItemIDX());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDY(
          ArgDescriptor::createRegister(Reg: AMDGPU::VGPR0, Mask: 0x3ff << 10));
    } else {
      unsigned Reg = AMDGPU::VGPR1;
      MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);

      CCInfo.AllocateReg(Reg);
      Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
    }
  }

  if (Info.hasWorkItemIDZ()) {
    assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDZ(
          ArgDescriptor::createRegister(Reg: AMDGPU::VGPR0, Mask: 0x3ff << 20));
    } else {
      unsigned Reg = AMDGPU::VGPR2;
      MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);

      CCInfo.AllocateReg(Reg);
      Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
    }
  }
}
2823
2824// Try to allocate a VGPR at the end of the argument list, or if no argument
2825// VGPRs are left allocating a stack slot.
// If \p Mask is given it indicates the bitfield position in the register.
// If \p Arg is given, use it with the new \p Mask instead of allocating a new
// register.
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
                                         ArgDescriptor Arg = ArgDescriptor()) {
  // Reuse the register/stack slot already chosen for a previous bitfield of
  // the same packed value, only changing the mask.
  if (Arg.isSet())
    return ArgDescriptor::createArg(Arg, Mask);

  // Consider only the first 32 argument VGPRs.
  ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
  unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgVGPRs);
  if (RegIdx == ArgVGPRs.size()) {
    // Spill to stack required.
    int64_t Offset = CCInfo.AllocateStack(Size: 4, Alignment: Align(4));

    return ArgDescriptor::createStack(Offset, Mask);
  }

  unsigned Reg = ArgVGPRs[RegIdx];
  Reg = CCInfo.AllocateReg(Reg);
  assert(Reg != AMDGPU::NoRegister);

  // Record the physical register as a live-in 32-bit scalar.
  MachineFunction &MF = CCInfo.getMachineFunction();
  Register LiveInVReg = MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass);
  MF.getRegInfo().setType(VReg: LiveInVReg, Ty: LLT::scalar(SizeInBits: 32));
  return ArgDescriptor::createRegister(Reg, Mask);
}
2851
// Allocate the next free argument register from \p RC for an implicit SGPR
// input, fataling when the window is exhausted (unlike VGPRs, these cannot
// spill to the stack).
// NOTE(review): \p NumArgRegs is unused — the search window is hard-coded to
// the first 32 registers of \p RC. Callers pass 16 for the 64-bit class;
// confirm whether a narrower window was intended there.
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
                                             const TargetRegisterClass *RC,
                                             unsigned NumArgRegs) {
  ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
  unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgSGPRs);
  if (RegIdx == ArgSGPRs.size())
    report_fatal_error(reason: "ran out of SGPRs for arguments");

  unsigned Reg = ArgSGPRs[RegIdx];
  Reg = CCInfo.AllocateReg(Reg);
  assert(Reg != AMDGPU::NoRegister);

  MachineFunction &MF = CCInfo.getMachineFunction();
  MF.addLiveIn(PReg: Reg, RC);
  return ArgDescriptor::createRegister(Reg);
}
2868
2869// If this has a fixed position, we still should allocate the register in the
2870// CCInfo state. Technically we could get away with this for values passed
2871// outside of the normal argument range.
2872static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2873 const TargetRegisterClass *RC,
2874 MCRegister Reg) {
2875 Reg = CCInfo.AllocateReg(Reg);
2876 assert(Reg != AMDGPU::NoRegister);
2877 MachineFunction &MF = CCInfo.getMachineFunction();
2878 MF.addLiveIn(PReg: Reg, RC);
2879}
2880
2881static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2882 if (Arg) {
2883 allocateFixedSGPRInputImpl(CCInfo, RC: &AMDGPU::SGPR_32RegClass,
2884 Reg: Arg.getRegister());
2885 } else
2886 Arg = allocateSGPR32InputImpl(CCInfo, RC: &AMDGPU::SGPR_32RegClass, NumArgRegs: 32);
2887}
2888
2889static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2890 if (Arg) {
2891 allocateFixedSGPRInputImpl(CCInfo, RC: &AMDGPU::SGPR_64RegClass,
2892 Reg: Arg.getRegister());
2893 } else
2894 Arg = allocateSGPR32InputImpl(CCInfo, RC: &AMDGPU::SGPR_64RegClass, NumArgRegs: 16);
2895}
2896
/// Allocate implicit function VGPR arguments at the end of allocated user
/// arguments. The workitem IDs are packed into one VGPR as consecutive 10-bit
/// fields (Y and Z reuse the register chosen for X); if argument VGPRs run
/// out, they are assigned stack slots instead.
void SITargetLowering::allocateSpecialInputVGPRs(
    CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
    SIMachineFunctionInfo &Info) const {
  const unsigned Mask = 0x3ff;
  ArgDescriptor Arg;

  if (Info.hasWorkItemIDX()) {
    Arg = allocateVGPR32Input(CCInfo, Mask);
    Info.setWorkItemIDX(Arg);
  }

  if (Info.hasWorkItemIDY()) {
    Arg = allocateVGPR32Input(CCInfo, Mask: Mask << 10, Arg);
    Info.setWorkItemIDY(Arg);
  }

  if (Info.hasWorkItemIDZ())
    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask: Mask << 20, Arg));
}
2918
2919/// Allocate implicit function VGPR arguments in fixed registers.
2920void SITargetLowering::allocateSpecialInputVGPRsFixed(
2921 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2922 SIMachineFunctionInfo &Info) const {
2923 Register Reg = CCInfo.AllocateReg(Reg: AMDGPU::VGPR31);
2924 if (!Reg)
2925 report_fatal_error(reason: "failed to allocate VGPR for implicit arguments");
2926
2927 const unsigned Mask = 0x3ff;
2928 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2929 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask: Mask << 10));
2930 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask: Mask << 20));
2931}
2932
// Allocate the implicit SGPR inputs of a non-entry function (dispatch/queue/
// implicit-arg/kernarg pointers, dispatch ID, workgroup IDs, LDS kernel id):
// each either reuses a register already recorded in ArgInfo or takes the next
// free argument SGPR.
void SITargetLowering::allocateSpecialInputSGPRs(
    CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
    SIMachineFunctionInfo &Info) const {
  auto &ArgInfo = Info.getArgInfo();
  const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();

  // TODO: Unify handling with private memory pointers.
  if (UserSGPRInfo.hasDispatchPtr())
    allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchPtr);

  if (UserSGPRInfo.hasQueuePtr())
    allocateSGPR64Input(CCInfo, Arg&: ArgInfo.QueuePtr);

  // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
  // constant offset from the kernarg segment.
  if (Info.hasImplicitArgPtr())
    allocateSGPR64Input(CCInfo, Arg&: ArgInfo.ImplicitArgPtr);

  if (UserSGPRInfo.hasDispatchID())
    allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchID);

  // flat_scratch_init is not applicable for non-kernel functions.

  if (Info.hasWorkGroupIDX())
    allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDX);

  if (Info.hasWorkGroupIDY())
    allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDY);

  if (Info.hasWorkGroupIDZ())
    allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDZ);

  if (Info.hasLDSKernelId())
    allocateSGPR32Input(CCInfo, Arg&: ArgInfo.LDSKernelId);
}
2968
// Allocate special inputs passed in user SGPRs. Each enabled input is added
// to the SIMachineFunctionInfo, marked live-in, and reserved in CCInfo.
// NOTE(review): the order of these checks determines the user SGPR layout —
// presumably it must match the ABI's fixed ordering; keep in sync with
// GCNUserSGPRUsageInfo.
void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
                                            MachineFunction &MF,
                                            const SIRegisterInfo &TRI,
                                            SIMachineFunctionInfo &Info) const {
  const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
  if (UserSGPRInfo.hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(PReg: ImplicitBufferPtrReg, RC: &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(Reg: ImplicitBufferPtrReg);
  }

  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PReg: PrivateSegmentBufferReg, RC: &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(Reg: PrivateSegmentBufferReg);
  }

  if (UserSGPRInfo.hasDispatchPtr()) {
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(PReg: DispatchPtrReg, RC: &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(Reg: DispatchPtrReg);
  }

  if (UserSGPRInfo.hasQueuePtr()) {
    Register QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(PReg: QueuePtrReg, RC: &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(Reg: QueuePtrReg);
  }

  if (UserSGPRInfo.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    CCInfo.AllocateReg(Reg: InputPtrReg);

    // The kernarg segment pointer is a 64-bit constant-address-space pointer.
    Register VReg = MF.addLiveIn(PReg: InputPtrReg, RC: &AMDGPU::SGPR_64RegClass);
    MRI.setType(VReg, Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
  }

  if (UserSGPRInfo.hasDispatchID()) {
    Register DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(PReg: DispatchIDReg, RC: &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(Reg: DispatchIDReg);
  }

  if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(PReg: FlatScratchInitReg, RC: &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(Reg: FlatScratchInitReg);
  }

  if (UserSGPRInfo.hasPrivateSegmentSize()) {
    Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
    MF.addLiveIn(PReg: PrivateSegmentSizeReg, RC: &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(Reg: PrivateSegmentSizeReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}
3030
// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
// sequential starting from the first argument.
void SITargetLowering::allocatePreloadKernArgSGPRs(
    CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
    const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
    const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
  Function &F = MF.getFunction();
  unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
  GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
  // Once any argument fails to preload, no later argument may preload either.
  bool InPreloadSequence = true;
  unsigned InIdx = 0;
  // Hidden (implicit) kernel arguments start at an aligned offset past the
  // explicit ones; the adjustment is computed lazily on the first hidden arg.
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    // Only a contiguous prefix of "inreg" arguments is preloadable.
    if (!InPreloadSequence || !Arg.hasInRegAttr())
      break;

    unsigned ArgIdx = Arg.getArgNo();
    // Don't preload non-original args or parts not in the current preload
    // sequence.
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
      break;

    // Process every split piece (Ins entry) belonging to this IR argument.
    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
         InIdx++) {
      // NOTE(review): this assert indexes ArgLocs by ArgIdx while the
      // location consumed below is ArgLocs[InIdx]; the two only coincide
      // while no argument has been split -- confirm the intended index.
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      const Align KernelArgBaseAlign = Align(16);
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset: ArgOffset);
      // Number of 32-bit SGPRs needed to hold this piece.
      unsigned NumAllocSGPRs =
          alignTo(Value: ArgLoc.getLocVT().getFixedSizeInBits(), Align: 32) / 32;

      // Fix alignment for hidden arguments.
      if (Arg.hasAttribute(Kind: "amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
          ImplicitArgOffset =
              alignTo(Size: LastExplicitArgOffset,
                      A: Subtarget->getAlignmentForImplicitArgPtr()) -
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        }
        ArgOffset += ImplicitArgOffset;
      }

      // Arg is preloaded into the previous SGPR.
      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Elt: Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
        continue;
      }

      // SGPRs skipped over to reach this argument's kernarg offset.
      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Value: Padding, Align: 4) / 4;
      // Check for free user SGPRs for preloading.
      if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
        InPreloadSequence = false;
        break;
      }

      // Preload this argument.
      const TargetRegisterClass *RC =
          TRI.getSGPRClassForBitWidth(BitWidth: NumAllocSGPRs * 32);
      SmallVectorImpl<MCRegister> *PreloadRegs =
          Info.addPreloadedKernArg(TRI, RC, AllocSizeDWord: NumAllocSGPRs, KernArgIdx: InIdx, PaddingSGPRs);

      // Multi-register preloads are recorded as individual 32-bit SGPRs.
      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {
        assert(Reg);
        MF.addLiveIn(PReg: Reg, RC);
        CCInfo.AllocateReg(Reg);
      }

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
    }
  }
}
3112
3113void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
3114 const SIRegisterInfo &TRI,
3115 SIMachineFunctionInfo &Info) const {
3116 // Always allocate this last since it is a synthetic preload.
3117 if (Info.hasLDSKernelId()) {
3118 Register Reg = Info.addLDSKernelId();
3119 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
3120 CCInfo.AllocateReg(Reg);
3121 }
3122}
3123
// Allocate special input registers that are initialized per-wave (workgroup
// IDs, workgroup info, and the private segment wave byte offset). Also pads
// the user SGPR count up to 16 on subtargets with the wave32 init bug.
void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
                                           SIMachineFunctionInfo &Info,
                                           CallingConv::ID CallConv,
                                           bool IsShader) const {
  bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
  if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {
    // Note: user SGPRs are handled by the front-end for graphics shaders
    // Pad up the used user SGPRs with dead inputs.

    // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
    // before enabling architected SGPRs for workgroup IDs.
    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
    // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
    // rely on it to reach 16 since if we end up having no stack usage, it will
    // not really be added.
    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    // Reserve dead user SGPRs until user + system SGPRs total at least 16.
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      Register Reg = Info.addReservedUserSGPR();
      MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
      CCInfo.AllocateReg(Reg);
    }
  }

  // With architected SGPRs the workgroup IDs arrive in fixed registers and
  // need no explicit allocation here.
  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      Register Reg = Info.addWorkGroupIDX();
      MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
      CCInfo.AllocateReg(Reg);
    }

    if (Info.hasWorkGroupIDY()) {
      Register Reg = Info.addWorkGroupIDY();
      MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
      CCInfo.AllocateReg(Reg);
    }

    if (Info.hasWorkGroupIDZ()) {
      Register Reg = Info.addWorkGroupIDZ();
      MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
      CCInfo.AllocateReg(Reg);
    }
  }

  if (Info.hasWorkGroupInfo()) {
    Register Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    // Scratch wave offset passed in system SGPR.
    unsigned PrivateSegmentWaveByteOffsetReg;

    if (IsShader) {
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      // This is true if the scratch wave byte offset doesn't have a fixed
      // location.
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      }
    } else
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PReg: PrivateSegmentWaveByteOffsetReg, RC: &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(Reg: PrivateSegmentWaveByteOffsetReg);
  }

  // Verify the padding loop above actually reached the 16-SGPR minimum.
  assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader ||
         Info.getNumPreloadedSGPRs() >= 16);
}
3202
// Decide which physical registers hold the scratch resource descriptor, stack
// pointer, and frame pointer for an entry function, based on whether any
// stack access will actually occur.
static void reservePrivateMemoryRegs(const TargetMachine &TM,
                                     MachineFunction &MF,
                                     const SIRegisterInfo &TRI,
                                     SIMachineFunctionInfo &Info) {
  // Now that we've figured out where the scratch register inputs are, see if
  // we should reserve the arguments and use them directly.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool HasStackObjects = MFI.hasStackObjects();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  // Record that we know we have non-spill stack objects so we don't need to
  // check all stack objects later.
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

  // Everything live out of a block is spilled with fast regalloc, so it's
  // almost certain that spilling will be required.
  if (TM.getOptLevel() == CodeGenOptLevel::None)
    HasStackObjects = true;

  // For now assume stack access is needed in any callee functions, so we need
  // the scratch registers to pass in.
  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.hasFlatScratchEnabled()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(F: MF.getFunction())) {
      // If we have stack objects, we unquestionably need the private buffer
      // resource. For the Code Object V2 ABI, this will be the first 4 user
      // SGPR inputs. We can reserve those and use them directly.

      Register PrivateSegmentBufferReg =
          Info.getPreloadedReg(Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
    } else {
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      // We tentatively reserve the last registers (skipping the last registers
      // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
      // we'll replace these with the ones immediately after those which were
      // really allocated. In the prologue copies will be inserted from the
      // argument to these reserved registers.

      // Without HSA, relocations are used for the scratch pointer and the
      // buffer resource setup is always inserted in the prologue. Scratch wave
      // offset is still in an input SGPR.
      Info.setScratchRSrcReg(ReservedBufferReg);
    }
  }

  MachineRegisterInfo &MRI = MF.getRegInfo();

  // For entry functions we have to set up the stack pointer if we use it,
  // whereas non-entry functions get this "for free". This means there is no
  // intrinsic advantage to using S32 over S34 in cases where we do not have
  // calls but do need a frame pointer (i.e. if we are requested to have one
  // because frame pointer elimination is disabled). To keep things simple we
  // only ever use S32 as the call ABI stack pointer, and so using it does not
  // imply we need a separate frame pointer.
  //
  // Try to use s32 as the SP, but move it if it would interfere with input
  // arguments. This won't work with calls though.
  //
  // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
  // registers.
  if (!MRI.isLiveIn(Reg: AMDGPU::SGPR32)) {
    Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
  } else {
    assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));

    if (MFI.hasCalls())
      report_fatal_error(reason: "call in graphics shader with too many input SGPRs");

    // Fall back to the first SGPR not already consumed by an input argument.
    for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
      if (!MRI.isLiveIn(Reg)) {
        Info.setStackPtrOffsetReg(Reg);
        break;
      }
    }

    // SP_REG is the "unset" sentinel; if it survived, no free SGPR was found.
    if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
      report_fatal_error(reason: "failed to find register for SP");
  }

  // hasFP should be accurate for entry functions even before the frame is
  // finalized, because it does not rely on the known stack size, only
  // properties like whether variable sized objects are present.
  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);
  }
}
3292
3293bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
3294 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3295 return !Info->isEntryFunction();
3296}
3297
3298void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {}
3299
// For each callee-saved register handled via copy, insert a copy into a fresh
// virtual register at the start of the entry block, and copy it back into the
// physical register before the terminator of every exit block.
void SITargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  // Null means this function has no CSRs to handle via copies.
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(MF: Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  // The list is terminated by a zero register.
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (AMDGPU::SReg_64RegClass.contains(Reg: *I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(Reg: *I))
      RC = &AMDGPU::SGPR_32RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RegClass: RC);
    // Create copy from CSR to a virtual register.
    Entry->addLiveIn(PhysReg: *I);
    BuildMI(BB&: *Entry, I: MBBI, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NewVR)
        .addReg(RegNo: *I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(BB&: *Exit, I: Exit->getFirstTerminator(), MIMD: DebugLoc(),
              MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: *I)
          .addReg(RegNo: NewVR);
  }
}
3334
/// Lower incoming formal arguments. Handles three distinct ABIs: kernels
/// (arguments loaded from the kernarg segment, optionally preloaded into user
/// SGPRs), graphics shaders (fixed input VGPRs/SGPRs), and regular callable
/// functions (standard register/stack calling convention).
SDValue SITargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  const Function &Fn = MF.getFunction();
  FunctionType *FType = MF.getFunction().getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  bool IsError = false;

  // Graphics calling conventions are unsupported on HSA; diagnose but keep
  // lowering (arguments become poison below) so compilation can continue.
  if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CC: CallConv)) {
    DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
    IsError = true;
  }

  SmallVector<ISD::InputArg, 16> Splits;
  SmallVector<CCValAssign, 16> ArgLocs;
  BitVector Skipped(Ins.size());
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  bool IsGraphics = AMDGPU::isGraphics(CC: CallConv);
  bool IsKernel = AMDGPU::isKernel(CC: CallConv);
  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC: CallConv);

  if (IsGraphics) {
    // Sanity-check that no compute-only inputs were requested for a
    // graphics calling convention.
    const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
    assert(!UserSGPRInfo.hasDispatchPtr() &&
           !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
    (void)UserSGPRInfo;
    if (!Subtarget->hasFlatScratchEnabled())
      assert(!UserSGPRInfo.hasFlatScratchInit());
    if ((CallConv != CallingConv::AMDGPU_CS &&
         CallConv != CallingConv::AMDGPU_Gfx &&
         CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
        !Subtarget->hasArchitectedSGPRs())
      assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
             !Info->hasWorkGroupIDZ());
  }

  bool IsWholeWaveFunc = Info->isWholeWaveFunction();

  if (CallConv == CallingConv::AMDGPU_PS) {
    processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);

    // At least one interpolation mode must be enabled or else the GPU will
    // hang.
    //
    // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
    // set PSInputAddr, the user wants to enable some bits after the compilation
    // based on run-time states. Since we can't know what the final PSInputEna
    // will look like, so we shouldn't do anything here and the user should take
    // responsibility for the correct programming.
    //
    // Otherwise, the following restrictions apply:
    // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
    // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
    //   enabled too.
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(Index: 11))) {
      CCInfo.AllocateReg(Reg: AMDGPU::VGPR0);
      CCInfo.AllocateReg(Reg: AMDGPU::VGPR1);
      Info->markPSInputAllocated(Index: 0);
      Info->markPSInputEnabled(Index: 0);
    }
    if (Subtarget->isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together.  (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the
      // frontend set up an input arg for a particular interpolation mode, but
      // nothing uses that input arg. Really we should have an earlier pass
      // that removes such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(Index: llvm::countr_zero(Val: Info->getPSInputAddr()));
    }
  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
  } else {
    // Whole-wave functions carry a synthetic first input (the original EXEC
    // mask flag) that must not be treated as a normal argument.
    Splits.append(in_start: IsWholeWaveFunc ? std::next(x: Ins.begin()) : Ins.begin(),
                  in_end: Ins.end());
  }

  if (IsKernel)
    analyzeFormalArgumentsCompute(State&: CCInfo, Ins);

  if (IsEntryFunc) {
    allocateSpecialEntryInputVGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
    allocateHSAUserSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
    if (IsKernel && Subtarget->hasKernargPreload())
      allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, TRI: *TRI, Info&: *Info);

    allocateLDSKernelId(CCInfo, MF, TRI: *TRI, Info&: *Info);
  } else if (!IsGraphics) {
    // For the fixed ABI, pass workitem IDs in the last argument register.
    allocateSpecialInputVGPRsFixed(CCInfo, MF, TRI: *TRI, Info&: *Info);

    // FIXME: Sink this into allocateSpecialInputSGPRs
    if (!Subtarget->hasFlatScratchEnabled())
      CCInfo.AllocateReg(Reg: Info->getScratchRSrcReg());

    allocateSpecialInputSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
  }

  if (!IsKernel) {
    CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg: isVarArg);
    CCInfo.AnalyzeFormalArguments(Ins: Splits, Fn: AssignFn);

    // This assumes the registers are allocated by CCInfo in ascending order
    // with no gaps.
    Info->setNumWaveDispatchSGPRs(
        CCInfo.getFirstUnallocated(Regs: AMDGPU::SGPR_32RegClass.getRegisters()));
    Info->setNumWaveDispatchVGPRs(
        CCInfo.getFirstUnallocated(Regs: AMDGPU::VGPR_32RegClass.getRegisters()));
  } else if (Info->getNumKernargPreloadedSGPRs()) {
    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
  }

  SmallVector<SDValue, 16> Chains;

  // Whole-wave functions produce an extra i1 first result from the setup
  // pseudo (the original EXEC mask state).
  if (IsWholeWaveFunc) {
    SDValue Setup = DAG.getNode(Opcode: AMDGPUISD::WHOLE_WAVE_SETUP, DL,
                                ResultTys: {MVT::i1, MVT::Other}, Ops: Chain);
    InVals.push_back(Elt: Setup.getValue(R: 0));
    Chains.push_back(Elt: Setup.getValue(R: 1));
  }

  // FIXME: This is the minimum kernel argument alignment. We should improve
  // this to the maximum alignment of the arguments.
  //
  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
  // kern arg offset.
  const Align KernelArgBaseAlign = Align(16);

  for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
       ++i) {
    const ISD::InputArg &Arg = Ins[i];
    // Skipped PS inputs and all arguments of a diagnosed-invalid function
    // become poison values.
    if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
      InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[ArgIdx++];
    MVT VT = VA.getLocVT();

    // Kernel arguments: load (or copy from preload SGPRs) out of the kernarg
    // segment.
    if (IsEntryFunc && VA.isMemLoc()) {
      VT = Ins[i].VT;
      EVT MemVT = VA.getLocVT();

      const uint64_t Offset = VA.getLocMemOffset();
      Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset);

      // byref arguments are passed as a pointer directly into the kernarg
      // segment rather than being loaded.
      if (Arg.Flags.isByRef()) {
        SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain, Offset);

        const GCNTargetMachine &TM =
            static_cast<const GCNTargetMachine &>(getTargetMachine());
        if (!TM.isNoopAddrSpaceCast(SrcAS: AMDGPUAS::CONSTANT_ADDRESS,
                                    DestAS: Arg.Flags.getPointerAddrSpace())) {
          Ptr = DAG.getAddrSpaceCast(dl: DL, VT, Ptr, SrcAS: AMDGPUAS::CONSTANT_ADDRESS,
                                     DestAS: Arg.Flags.getPointerAddrSpace());
        }

        InVals.push_back(Elt: Ptr);
        continue;
      }

      SDValue NewArg;
      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(Val: i)) {
        if (MemVT.getStoreSize() < 4 && Alignment < 4) {
          // In this case the argument is packed into the previous preload SGPR.
          int64_t AlignDownOffset = alignDown(Value: Offset, Align: 4);
          int64_t OffsetDiff = Offset - AlignDownOffset;
          EVT IntVT = MemVT.changeTypeToInteger();

          const SIMachineFunctionInfo *Info =
              MF.getInfo<SIMachineFunctionInfo>();
          MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
          Register Reg =
              Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs[0];

          assert(Reg);
          Register VReg = MRI.getLiveInVirtReg(PReg: Reg);
          SDValue Copy = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i32);

          // Shift the packed value down to bit 0 and truncate to its type.
          SDValue ShiftAmt = DAG.getConstant(Val: OffsetDiff * 8, DL, VT: MVT::i32);
          SDValue Extract = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: Copy, N2: ShiftAmt);

          SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: Extract);
          ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MemVT, Operand: ArgVal);
          NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: ArgVal,
                                  Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);

          NewArg = DAG.getMergeValues(Ops: {NewArg, Copy.getValue(R: 1)}, dl: DL);
        } else {
          const SIMachineFunctionInfo *Info =
              MF.getInfo<SIMachineFunctionInfo>();
          MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
          const SmallVectorImpl<MCRegister> &PreloadRegs =
              Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs;

          SDValue Copy;
          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PReg: PreloadRegs[0]);
            const TargetRegisterClass *RC = MRI.getRegClass(Reg: VReg);
            NewArg = DAG.getCopyFromReg(
                Chain, dl: DL, Reg: VReg,
                VT: EVT::getIntegerVT(Context&: *DAG.getContext(),
                                   BitWidth: TRI->getRegSizeInBits(RC: *RC)));

          } else {
            // If the kernarg alignment does not match the alignment of the SGPR
            // tuple RC that can accommodate this argument, it will be built up
            // via copies from the individual SGPRs that the argument was
            // preloaded to.
            SmallVector<SDValue, 4> Elts;
            for (auto Reg : PreloadRegs) {
              Register VReg = MRI.getLiveInVirtReg(PReg: Reg);
              Copy = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i32);
              Elts.push_back(Elt: Copy);
            }
            NewArg =
                DAG.getBuildVector(VT: EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
                                                     NumElements: PreloadRegs.size()),
                                   DL, Ops: Elts);
          }

          // If the argument was preloaded to multiple consecutive 32-bit
          // registers because of misalignment between addressable SGPR tuples
          // and the argument size, we can still assume that because of kernarg
          // segment alignment restrictions that NewArg's size is the same as
          // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
          // truncate since we cannot preload to less than a single SGPR and the
          // MemVT may be smaller.
          EVT MemVTInt =
              EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
          if (MemVT.bitsLT(VT: NewArg.getSimpleValueType()))
            NewArg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVTInt, Operand: NewArg);

          NewArg = DAG.getBitcast(VT: MemVT, V: NewArg);
          NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: NewArg,
                                  Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
          NewArg = DAG.getMergeValues(Ops: {NewArg, Chain}, dl: DL);
        }
      } else {
        // Hidden arguments that are in the kernel signature must be preloaded
        // to user SGPRs. Print a diagnostic error if a hidden argument is in
        // the argument list and is not preloaded.
        if (Arg.isOrigArg()) {
          Argument *OrigArg = Fn.getArg(i: Arg.getOrigArgIndex());
          if (OrigArg->hasAttribute(Kind: "amdgpu-hidden-argument")) {
            DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
                *OrigArg->getParent(),
                "hidden argument in kernel signature was not preloaded",
                DL.getDebugLoc()));
          }
        }

        NewArg =
            lowerKernargMemParameter(DAG, VT, MemVT, SL: DL, Chain, Offset,
                                     Alignment, Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
      }
      Chains.push_back(Elt: NewArg.getValue(R: 1));

      auto *ParamTy =
          dyn_cast<PointerType>(Val: FType->getParamType(i: Ins[i].getOrigArgIndex()));
      if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
          ParamTy &&
          (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
           ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
        // On SI local pointers are just offsets into LDS, so they are always
        // less than 16-bits. On CI and newer they could potentially be
        // real pointers, so we can't guarantee their size.
        NewArg = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: NewArg.getValueType(), N1: NewArg,
                             N2: DAG.getValueType(MVT::i16));
      }

      InVals.push_back(Elt: NewArg);
      continue;
    }
    // Non-entry functions: stack-passed arguments.
    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, SL: DL, Chain, Arg);
      InVals.push_back(Elt: Val);
      if (!Arg.Flags.isByVal())
        Chains.push_back(Elt: Val.getValue(R: 1));
      continue;
    }

    assert(VA.isRegLoc() && "Parameter must be in a register!");

    Register Reg = VA.getLocReg();
    const TargetRegisterClass *RC = nullptr;
    if (AMDGPU::VGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::VGPR_32RegClass;
    else if (AMDGPU::SGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::SGPR_32RegClass;
    else
      llvm_unreachable("Unexpected register class in LowerFormalArguments!");

    Reg = MF.addLiveIn(PReg: Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT);

    if (Arg.Flags.isSRet()) {
      // The return object should be reasonably addressable.

      // FIXME: This helps when the return is a real sret. If it is a
      // automatically inserted sret (i.e. CanLowerReturn returns false), an
      // extra copy is inserted in SelectionDAGBuilder which obscures this.
      unsigned NumBits =
          32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
      Val = DAG.getNode(
          Opcode: ISD::AssertZext, DL, VT, N1: Val,
          N2: DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: NumBits)));
    }

    Val = convertABITypeToValueType(DAG, Val, VA, SL: DL);
    InVals.push_back(Elt: Val);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc)
    allocateSystemSGPRs(CCInfo, MF, Info&: *Info, CallConv, IsShader: IsGraphics);

  unsigned StackArgSize = CCInfo.getStackSize();
  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
                        : DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: Chains);
}
3672
3673// TODO: If return values can't fit in registers, we should return as many as
3674// possible in registers before passing on stack.
3675bool SITargetLowering::CanLowerReturn(
3676 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3677 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3678 const Type *RetTy) const {
3679 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3680 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3681 // for shaders. Vector types should be explicitly handled by CC.
3682 if (AMDGPU::isEntryFunctionCC(CC: CallConv))
3683 return true;
3684
3685 SmallVector<CCValAssign, 16> RVLocs;
3686 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3687 if (!CCInfo.CheckReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg)))
3688 return false;
3689
3690 // We must use the stack if return would require unavailable registers.
3691 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3692 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3693 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3694 if (CCInfo.isAllocated(Reg: AMDGPU::VGPR_32RegClass.getRegister(i)))
3695 return false;
3696
3697 return true;
3698}
3699
/// Lower outgoing return values. Kernels delegate to the AMDGPU base lowering;
/// shaders and callable functions copy each value into its assigned return
/// register (readfirstlane-ing values destined for SGPRs) and emit the
/// appropriate terminator node.
SDValue
SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                              bool isVarArg,
                              const SmallVectorImpl<ISD::OutputArg> &Outs,
                              const SmallVectorImpl<SDValue> &OutVals,
                              const SDLoc &DL, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  if (AMDGPU::isKernel(CC: CallConv)) {
    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                             OutVals, DL, DAG);
  }

  bool IsShader = AMDGPU::isShader(CC: CallConv);

  Info->setIfReturnsVoid(Outs.empty());
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 48> RVLocs;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg: isVarArg));

  SDValue Glue;
  SmallVector<SDValue, 48> RetOps;
  RetOps.push_back(Elt: Chain); // Operand #0 = Chain (updated below)

  SDValue ReadFirstLane =
      DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
  // Copy the result values into the output registers.
  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");
    // TODO: Partially return in registers if return values don't fit.
    SDValue Arg = OutVals[RealRVLocIdx];

    // Copied from other backends.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }
    // Values returned in SGPRs are wrapped in readfirstlane to produce a
    // single uniform lane value.
    if (TRI->isSGPRPhysReg(Reg: VA.getLocReg()))
      Arg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Arg.getValueType(),
                        N1: ReadFirstLane, N2: Arg);
    // Chain each CopyToReg through the glue of the previous one so they stay
    // adjacent to the return.
    Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: VA.getLocReg(), N: Arg, Glue);
    Glue = Chain.getValue(R: 1);
    RetOps.push_back(Elt: DAG.getRegister(Reg: VA.getLocReg(), VT: VA.getLocVT()));
  }

  // FIXME: Does sret work properly?
  if (!Info->isEntryFunction()) {
    // Keep split-CSR registers live across the return.
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    const MCPhysReg *I =
        TRI->getCalleeSavedRegsViaCopy(MF: &DAG.getMachineFunction());
    if (I) {
      for (; *I; ++I) {
        if (AMDGPU::SReg_64RegClass.contains(Reg: *I))
          RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i64));
        else if (AMDGPU::SReg_32RegClass.contains(Reg: *I))
          RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i32));
        else
          llvm_unreachable("Unexpected register class in CSRsViaCopy!");
      }
    }
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Glue.getNode())
    RetOps.push_back(Elt: Glue);

  // Pick the terminator: ENDPGM for void shader returns, otherwise the
  // function-kind-specific return pseudo.
  unsigned Opc = AMDGPUISD::ENDPGM;
  if (!IsWaveEnd)
    Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
          : IsShader                  ? AMDGPUISD::RETURN_TO_EPILOG
                                      : AMDGPUISD::RET_GLUE;
  return DAG.getNode(Opcode: Opc, DL, VT: MVT::Other, Ops: RetOps);
}
3800
/// Copy the results of a call out of their assigned physical registers into
/// virtual values for the caller, applying any extension/bitcast implied by
/// the calling convention.
SDValue SITargetLowering::LowerCallResult(
    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
    SDValue ThisVal) const {
  CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv, IsVarArg);

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, Fn: RetCC);

  // Copy all of the result registers out of their specified physreg.
  for (CCValAssign VA : RVLocs) {
    SDValue Val;

    if (VA.isRegLoc()) {
      // Thread the glue through each copy so they stay attached to the call.
      Val =
          DAG.getCopyFromReg(Chain, dl: DL, Reg: VA.getLocReg(), VT: VA.getLocVT(), Glue: InGlue);
      Chain = Val.getValue(R: 1);
      InGlue = Val.getValue(R: 2);
    } else if (VA.isMemLoc()) {
      report_fatal_error(reason: "TODO: return values in memory");
    } else
      llvm_unreachable("unknown argument location type");

    // Undo the extension the callee applied per the calling convention,
    // asserting the known extended bits where applicable.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: VA.getLocVT(), N1: Val,
                        N2: DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(Opcode: ISD::AssertSext, DL, VT: VA.getLocVT(), N1: Val,
                        N2: DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
      break;
    case CCValAssign::AExt:
      Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    InVals.push_back(Elt: Val);
  }

  return Chain;
}
3856
// Add code to pass special inputs required depending on used features separate
// from the explicit user arguments present in the IR. These implicit ABI
// inputs (dispatch pointer, queue pointer, workgroup IDs, workitem IDs, etc.)
// are forwarded from the caller's preloaded values into the registers or
// stack slots the fixed-ABI callee expects.
void SITargetLowering::passSpecialInputs(
    CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
    SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
  // If we don't have a call site, this was a call inserted by
  // legalization. These can never use special inputs.
  if (!CLI.CB)
    return;

  SelectionDAG &DAG = CLI.DAG;
  const SDLoc &DL = CLI.DL;
  const Function &F = DAG.getMachineFunction().getFunction();

  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();

  // Callees always use the fixed ABI layout for their implicit inputs.
  const AMDGPUFunctionArgInfo &CalleeArgInfo =
      AMDGPUFunctionArgInfo::FixedABIFunctionInfo;

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
  //
  // Each entry maps an implicit input to the attribute(s) proving the callee
  // does not need it; an empty string means "no second attribute required".
  // clang-format off
  static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
                             std::array<StringLiteral, 2>> ImplicitAttrs[] = {
    {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
    {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
    {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
    {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
    {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
    {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
    {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
    {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
  };
  // clang-format on

  for (auto [InputID, Attrs] : ImplicitAttrs) {
    // If the callee does not use the attribute value, skip copying the value.
    if (all_of(Range&: Attrs, P: [&](StringRef Attr) {
          return Attr.empty() || CLI.CB->hasFnAttr(Kind: Attr);
        }))
      continue;

    const auto [OutgoingArg, ArgRC, ArgTy] =
        CalleeArgInfo.getPreloadedValue(Value: InputID);
    if (!OutgoingArg)
      continue;

    const auto [IncomingArg, IncomingArgRC, Ty] =
        CallerArgInfo.getPreloadedValue(Value: InputID);
    assert(IncomingArgRC == ArgRC);

    // All special arguments are ints for now.
    EVT ArgVT = TRI->getSpillSize(RC: *ArgRC) == 8 ? MVT::i64 : MVT::i32;
    SDValue InputReg;

    if (IncomingArg) {
      // The caller has the value preloaded; just forward it.
      InputReg = loadInputValue(DAG, RC: ArgRC, VT: ArgVT, SL: DL, Arg: *IncomingArg);
    } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
      // The implicit arg ptr is special because it doesn't have a corresponding
      // input for kernels, and is computed from the kernarg segment pointer.
      InputReg = getImplicitArgPtr(DAG, SL: DL);
    } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
      // The LDS kernel id is materialized from metadata when present.
      std::optional<uint32_t> Id =
          AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
      if (Id.has_value()) {
        InputReg = DAG.getConstant(Val: *Id, DL, VT: ArgVT);
      } else {
        InputReg = DAG.getPOISON(VT: ArgVT);
      }
    } else {
      // We may have proven the input wasn't needed, although the ABI is
      // requiring it. We just need to allocate the register appropriately.
      InputReg = DAG.getPOISON(VT: ArgVT);
    }

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
      if (!CCInfo.AllocateReg(Reg: OutgoingArg->getRegister()))
        report_fatal_error(reason: "failed to allocate implicit input argument");
    } else {
      // Implicit inputs without a register go on the stack, 4-byte aligned.
      unsigned SpecialArgOffset =
          CCInfo.AllocateStack(Size: ArgVT.getStoreSize(), Alignment: Align(4));
      SDValue ArgStore =
          storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg, Offset: SpecialArgOffset);
      MemOpChains.push_back(Elt: ArgStore);
    }
  }

  // Pack workitem IDs into a single register or pass it as is if already
  // packed.

  // Find the first workitem-ID dimension the callee actually takes.
  auto [OutgoingArg, ArgRC, Ty] =
      CalleeArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (!OutgoingArg)
    std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
        CalleeArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  if (!OutgoingArg)
    std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
        CalleeArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  if (!OutgoingArg)
    return;

  const ArgDescriptor *IncomingArgX = std::get<0>(
      t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X));
  const ArgDescriptor *IncomingArgY = std::get<0>(
      t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
  const ArgDescriptor *IncomingArgZ = std::get<0>(
      t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z));

  SDValue InputReg;
  SDLoc SL;

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-z");

  // If incoming ids are not packed we need to pack them.
  // X goes in the low bits, Y is shifted to bit 10, Z to bit 20, and the
  // pieces are OR'd together into one i32.
  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX &&
      NeedWorkItemIDX) {
    if (Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 0) != 0) {
      InputReg = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgX);
    } else {
      // A max workitem ID of 0 in this dimension means the ID is always 0.
      InputReg = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
    }
  }

  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY &&
      NeedWorkItemIDY && Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 1) != 0) {
    SDValue Y = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgY);
    Y = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Y,
                    N2: DAG.getShiftAmountConstant(Val: 10, VT: MVT::i32, DL: SL));
    InputReg = InputReg.getNode()
                   ? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: InputReg, N2: Y)
                   : Y;
  }

  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ &&
      NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 2) != 0) {
    SDValue Z = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgZ);
    Z = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Z,
                    N2: DAG.getShiftAmountConstant(Val: 20, VT: MVT::i32, DL: SL));
    InputReg = InputReg.getNode()
                   ? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: InputReg, N2: Z)
                   : Z;
  }

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
      // We're in a situation where the outgoing function requires the workitem
      // ID, but the calling function does not have it (e.g a graphics function
      // calling a C calling convention function). This is illegal, but we need
      // to produce something.
      InputReg = DAG.getPOISON(VT: MVT::i32);
    } else {
      // Workitem ids are already packed, any of present incoming arguments
      // will carry all required fields.
      ArgDescriptor IncomingArg =
          ArgDescriptor::createArg(Arg: IncomingArgX ? *IncomingArgX
                                   : IncomingArgY ? *IncomingArgY
                                                  : *IncomingArgZ,
                                   Mask: ~0u);
      InputReg = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: IncomingArg);
    }
  }

  if (OutgoingArg->isRegister()) {
    if (InputReg)
      RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);

    CCInfo.AllocateReg(Reg: OutgoingArg->getRegister());
  } else {
    // The packed workitem IDs occupy a 4-byte stack slot when not in a reg.
    unsigned SpecialArgOffset = CCInfo.AllocateStack(Size: 4, Alignment: Align(4));
    if (InputReg) {
      SDValue ArgStore =
          storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg, Offset: SpecialArgOffset);
      MemOpChains.push_back(Elt: ArgStore);
    }
  }
}
4039
// Decide whether this call can be lowered as a tail call. Checks, in order:
// the callee calling convention, divergence of the call target, the caller
// being a callable (non-entry) function, result/argument location
// compatibility, stack-argument fit, divergent SGPR arguments, and finally
// callee-saved register compatibility.
bool SITargetLowering::isEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  // Chain calls are always lowered as tail calls.
  if (AMDGPU::isChainCC(CC: CalleeCC))
    return true;

  if (!AMDGPU::mayTailCallThisCC(CC: CalleeCC))
    return false;

  // For a divergent call target, we need to do a waterfall loop over the
  // possible callees which precludes us from using a simple jump.
  if (Callee->isDivergent())
    return false;

  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  // Kernels aren't callable, and don't have a live in return address so it
  // doesn't make sense to do a tail call with entry functions.
  if (!CallerPreserved)
    return false;

  bool CCMatch = CallerCC == CalleeCC;

  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    if (AMDGPU::canGuaranteeTCO(CC: CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // TODO: Can we handle var args?
  if (IsVarArg)
    return false;

  // Conservatively reject if the caller takes any byval arguments.
  for (const Argument &Arg : CallerF.args()) {
    if (Arg.hasByValAttr())
      return false;
  }

  LLVMContext &Ctx = *DAG.getContext();

  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C&: Ctx, Ins,
                                  CalleeFn: CCAssignFnForCall(CC: CalleeCC, IsVarArg),
                                  CallerFn: CCAssignFnForCall(CC: CallerCC, IsVarArg)))
    return false;

  // The callee has to preserve all registers the caller needs to preserve.
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(mask0: CallerPreserved, mask1: CalleePreserved))
      return false;
  }

  // Nothing more to check if the callee is taking no arguments.
  if (Outs.empty())
    return true;

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  // FIXME: We are not allocating special input registers, so we will be
  // deciding based on incorrect register assignments.
  CCInfo.AnalyzeCallOperands(Outs, Fn: CCAssignFnForCall(CC: CalleeCC, IsVarArg));

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  // If the stack arguments for this call do not fit into our own save area then
  // the call cannot be made tail.
  // TODO: Is this really necessary?
  if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
    return false;

  for (const auto &[CCVA, ArgVal] : zip_equal(t&: ArgLocs, u: OutVals)) {
    // FIXME: What about inreg arguments that end up passed in memory?
    if (!CCVA.isRegLoc())
      continue;

    // If we are passing an argument in an SGPR, and the value is divergent,
    // this call requires a waterfall loop.
    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(Reg: CCVA.getLocReg())) {
      LLVM_DEBUG(
          dbgs() << "Cannot tail call due to divergent outgoing argument in "
                 << printReg(CCVA.getLocReg(), TRI) << '\n');
      return false;
    }
  }

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  return parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals);
}
4135
4136bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
4137 if (!CI->isTailCall())
4138 return false;
4139
4140 const Function *ParentFn = CI->getFunction();
4141 if (AMDGPU::isEntryFunctionCC(CC: ParentFn->getCallingConv()))
4142 return false;
4143 return true;
4144}
4145
namespace {
// Chain calls have special arguments that we need to handle. These are
// tagging along at the end of the arguments list(s), after the SGPR and VGPR
// arguments (index 0 and 1 respectively).
enum ChainCallArgIdx {
  Exec = 2,      // Value to install into EXEC before jumping to the callee.
  Flags,         // Flags constant; bit 0 enables the dynamic-VGPR path.
  NumVGPRs,      // Only present when flags bit 0 is set (dynamic VGPR mode):
  FallbackExec,  // number of VGPRs, fallback EXEC value, and fallback callee
  FallbackCallee // (see LowerCall's handling of these arguments).
};
} // anonymous namespace
4158
// The wave scratch offset register is used as the global base pointer.
//
// Lower an outgoing call. This handles, in order: the llvm.amdgcn.cs.chain
// special arguments, tail/sibling-call classification, implicit ABI inputs,
// the scratch resource descriptor, user argument copies/stores, and finally
// emits either a TC_RETURN-style node (tail calls) or an AMDGPUISD::CALL
// followed by result lowering.
SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                                    SmallVectorImpl<SDValue> &InVals) const {
  CallingConv::ID CallConv = CLI.CallConv;
  bool IsChainCallConv = AMDGPU::isChainCC(CC: CallConv);

  SelectionDAG &DAG = CLI.DAG;

  const SDLoc &DL = CLI.DL;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;

  llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
  bool UsesDynamicVGPRs = false;
  if (IsChainCallConv) {
    // The last arguments should be the value that we need to put in EXEC,
    // followed by the flags and any other arguments with special meanings.
    // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
    // we don't treat them like the "real" arguments.
    auto RequestedExecIt =
        llvm::find_if(Range&: CLI.Outs, P: [](const ISD::OutputArg &Arg) {
          return Arg.OrigArgIndex == 2;
        });
    assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");

    size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
    CLI.OutVals.erase(CS: CLI.OutVals.begin() + SpecialArgsBeginIdx,
                      CE: CLI.OutVals.end());
    CLI.Outs.erase(CS: RequestedExecIt, CE: CLI.Outs.end());

    assert(CLI.Outs.back().OrigArgIndex < 2 &&
           "Haven't popped all the special args");

    // The EXEC value must be a wavefront-sized integer.
    TargetLowering::ArgListEntry RequestedExecArg =
        CLI.Args[ChainCallArgIdx::Exec];
    if (!RequestedExecArg.Ty->isIntegerTy(Bitwidth: Subtarget->getWavefrontSize()))
      return lowerUnhandledCall(CLI, InVals, Reason: "Invalid value for EXEC");

    // Convert constants into TargetConstants, so they become immediate operands
    // instead of being selected into S_MOV.
    auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
      if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Val&: Arg.Node)) {
        ChainCallSpecialArgs.push_back(Elt: DAG.getTargetConstant(
            Val: ArgNode->getAPIntValue(), DL, VT: ArgNode->getValueType(ResNo: 0)));
      } else
        ChainCallSpecialArgs.push_back(Elt: Arg.Node);
    };

    PushNodeOrTargetConstant(RequestedExecArg);

    // Process any other special arguments depending on the value of the flags.
    TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];

    const APInt &FlagsValue = cast<ConstantSDNode>(Val&: Flags.Node)->getAPIntValue();
    if (FlagsValue.isZero()) {
      if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
        return lowerUnhandledCall(CLI, InVals,
                                  Reason: "no additional args allowed if flags == 0");
    } else if (FlagsValue.isOneBitSet(BitNo: 0)) {
      // Flags bit 0 selects the dynamic-VGPR path, which requires exactly
      // NumVGPRs, FallbackExec and FallbackCallee as extra arguments.
      if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
        return lowerUnhandledCall(CLI, InVals, Reason: "expected 3 additional args");
      }

      if (!Subtarget->isWave32()) {
        return lowerUnhandledCall(
            CLI, InVals, Reason: "dynamic VGPR mode is only supported for wave32");
      }

      UsesDynamicVGPRs = true;
      std::for_each(first: CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
                    last: CLI.Args.end(), f: PushNodeOrTargetConstant);
    }
  }

  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
  bool &IsTailCall = CLI.IsTailCall;
  bool IsVarArg = CLI.IsVarArg;
  bool IsSibCall = false;
  MachineFunction &MF = DAG.getMachineFunction();

  // A call to an undef/null target can't do anything meaningful; produce
  // poison results instead of emitting a call.
  if (Callee.isUndef() || isNullConstant(V: Callee)) {
    if (!CLI.IsTailCall) {
      for (ISD::InputArg &Arg : CLI.Ins)
        InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
    }

    return Chain;
  }

  if (IsVarArg) {
    return lowerUnhandledCall(CLI, InVals,
                              Reason: "unsupported call to variadic function ");
  }

  if (!CLI.CB)
    return lowerUnhandledCall(CLI, InVals, Reason: "unsupported libcall legalization");

  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
    return lowerUnhandledCall(CLI, InVals,
                              Reason: "unsupported required tail call to function ");
  }

  if (IsTailCall) {
    IsTailCall = isEligibleForTailCallOptimization(Callee, CalleeCC: CallConv, IsVarArg,
                                                   Outs, OutVals, Ins, DAG);
    // musttail and chain calls have no fallback; failing to tail call them
    // is a hard error.
    if (!IsTailCall &&
        ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
      report_fatal_error(reason: "failed to perform tail call elimination on a call "
                         "site marked musttail or on llvm.amdgcn.cs.chain");
    }

    bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;

    // A sibling call is one where we're under the usual C ABI and not planning
    // to change that but can still do a tail call:
    if (!TailCallOpt && IsTailCall)
      IsSibCall = true;

    if (IsTailCall)
      ++NumTailCalls;
  }

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
  CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg);

  if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CC: CallConv) &&
      CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
    // With a fixed ABI, allocate fixed registers before user arguments.
    passSpecialInputs(CLI, CCInfo, Info: *Info, RegsToPass, MemOpChains, Chain);
  }

  // Mark the scratch resource descriptor as allocated so the CC analysis
  // does not assign user arguments to these registers, matching the callee.
  if (!Subtarget->hasFlatScratchEnabled())
    CCInfo.AllocateReg(Reg: Info->getScratchRSrcReg());

  CCInfo.AnalyzeCallOperands(Outs, Fn: AssignFn);

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getStackSize();

  if (IsSibCall) {
    // Since we're not changing the ABI to make this a tail call, the memory
    // operands are already available in the caller's incoming argument space.
    NumBytes = 0;
  }

  // FPDiff is the byte offset of the call's argument area from the callee's.
  // Stores to callee stack arguments will be placed in FixedStackSlots offset
  // by this amount for a tail call. In a sibling call it must be 0 because the
  // caller will deallocate the entire stack and the callee still expects its
  // arguments to begin at SP+0. Completely unused for non-tail calls.
  int32_t FPDiff = 0;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  auto *TRI = Subtarget->getRegisterInfo();

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass
  if (!IsSibCall)
    Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL);

  if (!IsSibCall || IsChainCallConv) {
    if (!Subtarget->hasFlatScratchEnabled()) {
      SmallVector<SDValue, 4> CopyFromChains;

      // In the HSA case, this should be an identity copy.
      SDValue ScratchRSrcReg =
          DAG.getCopyFromReg(Chain, dl: DL, Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);
      RegsToPass.emplace_back(Args: IsChainCallConv
                                  ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                  : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
                              Args&: ScratchRSrcReg);
      CopyFromChains.push_back(Elt: ScratchRSrcReg.getValue(R: 1));
      Chain = DAG.getTokenFactor(DL, Vals&: CopyFromChains);
    }
  }

  // Everything pushed so far is an implicit/special input; remember the count
  // so the readfirstlane insertion below can skip these entries.
  const unsigned NumSpecialInputs = RegsToPass.size();

  MVT PtrVT = MVT::i32;

  // Walk the register/memloc assignments, inserting copies/loads.
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = OutVals[i];

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
      break;
    case CCValAssign::FPExt:
      Arg = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    if (VA.isRegLoc()) {
      RegsToPass.push_back(Elt: std::pair(VA.getLocReg(), Arg));
    } else {
      assert(VA.isMemLoc());

      SDValue DstAddr;
      MachinePointerInfo DstInfo;

      unsigned LocMemOffset = VA.getLocMemOffset();
      int32_t Offset = LocMemOffset;

      SDValue PtrOff = DAG.getConstant(Val: Offset, DL, VT: PtrVT);
      MaybeAlign Alignment;

      if (IsTailCall) {
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                                          : VA.getValVT().getStoreSize();

        // FIXME: We can have better than the minimum byval required alignment.
        Alignment =
            Flags.isByVal()
                ? Flags.getNonZeroByValAlign()
                : commonAlignment(A: Subtarget->getStackAlignment(), Offset);

        // Tail-call arguments are stored into the caller's own incoming
        // argument area, offset by FPDiff.
        Offset = Offset + FPDiff;
        int FI = MFI.CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true);

        DstAddr = DAG.getFrameIndex(FI, VT: PtrVT);
        DstInfo = MachinePointerInfo::getFixedStack(MF, FI);

        // Make sure any stack arguments overlapping with where we're storing
        // are loaded before this eventual operation. Otherwise they'll be
        // clobbered.

        // FIXME: Why is this really necessary? This seems to just result in a
        // lot of code to copy the stack and write them back to the same
        // locations, which are supposed to be immutable?
        Chain = addTokenForArgument(Chain, DAG, MFI, ClobberedFI: FI);
      } else {
        // Stores to the argument stack area are relative to the stack pointer.
        SDValue SP = DAG.getCopyFromReg(Chain, dl: DL, Reg: Info->getStackPtrOffsetReg(),
                                        VT: MVT::i32);
        DstAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SP, N2: PtrOff);
        DstInfo = MachinePointerInfo::getStack(MF, Offset: LocMemOffset);
        Alignment =
            commonAlignment(A: Subtarget->getStackAlignment(), Offset: LocMemOffset);
      }

      if (Outs[i].Flags.isByVal()) {
        // Byval arguments are copied wholesale into the outgoing area.
        SDValue SizeNode =
            DAG.getConstant(Val: Outs[i].Flags.getByValSize(), DL, VT: MVT::i32);
        SDValue Cpy =
            DAG.getMemcpy(Chain, dl: DL, Dst: DstAddr, Src: Arg, Size: SizeNode,
                          Alignment: Outs[i].Flags.getNonZeroByValAlign(),
                          /*isVol = */ false, /*AlwaysInline = */ true,
                          /*CI=*/nullptr, OverrideTailCall: std::nullopt, DstPtrInfo: DstInfo,
                          SrcPtrInfo: MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));

        MemOpChains.push_back(Elt: Cpy);
      } else {
        SDValue Store =
            DAG.getStore(Chain, dl: DL, Val: Arg, Ptr: DstAddr, PtrInfo: DstInfo, Alignment);
        MemOpChains.push_back(Elt: Store);
      }
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOpChains);

  SDValue ReadFirstLaneID =
      DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);

  SDValue TokenGlue;
  if (CLI.ConvergenceControlToken) {
    TokenGlue = DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL, VT: MVT::Glue,
                            Operand: CLI.ConvergenceControlToken);
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InGlue;

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
      // For chain calls, the inreg arguments are required to be
      // uniform. Speculatively Insert a readfirstlane in case we cannot prove
      // they are uniform.
      //
      // For other calls, if an inreg arguments is known to be uniform,
      // speculatively insert a readfirstlane in case it is in a VGPR.
      //
      // FIXME: We need to execute this in a waterfall loop if it is a divergent
      // value, so let that continue to produce invalid code.

      SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
      if (TokenGlue)
        ReadfirstlaneArgs.push_back(Elt: TokenGlue);
      Val = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Val.getValueType(),
                        Ops: ReadfirstlaneArgs);
    }

    Chain = DAG.getCopyToReg(Chain, dl: DL, Reg, N: Val, Glue: InGlue);
    InGlue = Chain.getValue(R: 1);
  }

  // We don't usually want to end the call-sequence here because we would tidy
  // the frame up *after* the call, however in the ABI-changing tail-call case
  // we've carefully laid out the parameters so that when sp is reset they'll be
  // in the correct location.
  if (IsTailCall && !IsSibCall) {
    Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: 0, Glue: InGlue, DL);
    InGlue = Chain.getValue(R: 1);
  }

  std::vector<SDValue> Ops({Chain});

  // Add a redundant copy of the callee global which will not be legalized, as
  // we need direct access to the callee later.
  if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
    const GlobalValue *GV = GSD->getGlobal();
    Ops.push_back(x: Callee);
    Ops.push_back(x: DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i64));
  } else {
    if (IsTailCall) {
      // isEligibleForTailCallOptimization considered whether the call target is
      // divergent, but we may still end up with a uniform value in a VGPR.
      // Insert a readfirstlane just in case.
      SDValue ReadFirstLaneID =
          DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);

      SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
      if (TokenGlue)
        ReadfirstlaneArgs.push_back(Elt: TokenGlue); // Wire up convergence token.
      Callee = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Callee.getValueType(),
                           Ops: ReadfirstlaneArgs);
    }

    Ops.push_back(x: Callee);
    Ops.push_back(x: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i64));
  }

  if (IsTailCall) {
    // Each tail call may have to adjust the stack by a different amount, so
    // this information must travel along with the operation for eventual
    // consumption by emitEpilogue.
    Ops.push_back(x: DAG.getTargetConstant(Val: FPDiff, DL, VT: MVT::i32));
  }

  if (IsChainCallConv)
    llvm::append_range(C&: Ops, R&: ChainCallSpecialArgs);

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (auto &[Reg, Val] : RegsToPass)
    Ops.push_back(x: DAG.getRegister(Reg, VT: Val.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(x: DAG.getRegisterMask(RegMask: Mask));

  if (SDValue Token = CLI.ConvergenceControlToken) {
    SmallVector<SDValue, 2> GlueOps;
    GlueOps.push_back(Elt: Token);
    if (InGlue)
      GlueOps.push_back(Elt: InGlue);

    InGlue = SDValue(DAG.getMachineNode(Opcode: TargetOpcode::CONVERGENCECTRL_GLUE, dl: DL,
                                        VT: MVT::Glue, Ops: GlueOps),
                     0);
  }

  if (InGlue)
    Ops.push_back(x: InGlue);

  // If we're doing a tail call, use a TC_RETURN here rather than an
  // actual call instruction.
  if (IsTailCall) {
    MFI.setHasTailCall();
    unsigned OPC = AMDGPUISD::TC_RETURN;
    switch (CallConv) {
    case CallingConv::AMDGPU_Gfx:
      OPC = AMDGPUISD::TC_RETURN_GFX;
      break;
    case CallingConv::AMDGPU_CS_Chain:
    case CallingConv::AMDGPU_CS_ChainPreserve:
      OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
                             : AMDGPUISD::TC_RETURN_CHAIN;
      break;
    }

    // If the caller is a whole wave function, we need to use a special opcode
    // so we can patch up EXEC.
    if (Info->isWholeWaveFunction())
      OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;

    return DAG.getNode(Opcode: OPC, DL, VT: MVT::Other, Ops);
  }

  // Returns a chain and a flag for retval copy to use.
  SDValue Call = DAG.getNode(Opcode: AMDGPUISD::CALL, DL, ResultTys: {MVT::Other, MVT::Glue}, Ops);
  Chain = Call.getValue(R: 0);
  InGlue = Call.getValue(R: 1);

  uint64_t CalleePopBytes = NumBytes;
  Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: CalleePopBytes, Glue: InGlue, DL);
  if (!Ins.empty())
    InGlue = Chain.getValue(R: 1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
                         InVals, /*IsThisReturn=*/false, ThisVal: SDValue());
}
4594
// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
// except for:
// 1. Stack growth direction(default: downwards, AMDGPU: upwards), and
// 2. Scale size where, scale = wave-reduction(alloca-size) * wave-size
//
// Returns two values: the start address of the allocated area (the old SP)
// and the output chain.
SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                  SelectionDAG &DAG) const {
  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  SDValue Chain = Op.getOperand(i: 0);
  Register SPReg = Info->getStackPtrOffsetReg();

  // Chain the dynamic stack allocation so that it doesn't modify the stack
  // pointer when other instructions are using the stack.
  Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL: dl);

  SDValue Size = Op.getOperand(i: 1);
  SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, Reg: SPReg, VT);
  Align Alignment = cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getAlignValue();

  const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
  assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
         "Stack grows upwards for AMDGPU");

  Chain = BaseAddr.getValue(R: 1);
  Align StackAlign = TFL->getStackAlign();
  if (Alignment > StackAlign) {
    // The SP value is in per-wave units, so the requested alignment must be
    // scaled by the wavefront size before rounding the base address up:
    // BaseAddr = (BaseAddr + Mask) & ~Mask.
    uint64_t ScaledAlignment = Alignment.value()
                               << Subtarget->getWavefrontSizeLog2();
    uint64_t StackAlignMask = ScaledAlignment - 1;
    SDValue TmpAddr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr,
                                  N2: DAG.getConstant(Val: StackAlignMask, DL: dl, VT));
    BaseAddr = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: TmpAddr,
                           N2: DAG.getSignedConstant(Val: -ScaledAlignment, DL: dl, VT));
  }

  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
  SDValue NewSP;
  if (isa<ConstantSDNode>(Val: Size)) {
    // For constant sized alloca, scale alloca size by wave-size
    SDValue ScaledSize = DAG.getNode(
        Opcode: ISD::SHL, DL: dl, VT, N1: Size,
        N2: DAG.getConstant(Val: Subtarget->getWavefrontSizeLog2(), DL: dl, VT: MVT::i32));
    NewSP = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr, N2: ScaledSize); // Value
  } else {
    // For dynamic sized alloca, perform wave-wide reduction to get max of
    // alloca size(divergent) and then scale it by wave-size
    SDValue WaveReduction =
        DAG.getTargetConstant(Val: Intrinsic::amdgcn_wave_reduce_umax, DL: dl, VT: MVT::i32);
    Size = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: MVT::i32, N1: WaveReduction,
                       N2: Size, N3: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32));
    SDValue ScaledSize = DAG.getNode(
        Opcode: ISD::SHL, DL: dl, VT, N1: Size,
        N2: DAG.getConstant(Val: Subtarget->getWavefrontSizeLog2(), DL: dl, VT: MVT::i32));
    NewSP =
        DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr, N2: ScaledSize); // Value in vgpr.
    // After the wave-wide max the value is uniform; readfirstlane moves it
    // back into an SGPR so it can be written to the stack pointer register.
    SDValue ReadFirstLaneID =
        DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: dl, VT: MVT::i32);
    NewSP = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: MVT::i32, N1: ReadFirstLaneID,
                        N2: NewSP);
  }

  Chain = DAG.getCopyToReg(Chain, dl, Reg: SPReg, N: NewSP); // Output chain
  SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: SDValue(), DL: dl);

  // Return the start of the allocation (the old, possibly realigned SP).
  return DAG.getMergeValues(Ops: {BaseAddr, CallSeqEnd}, dl);
}
4664
4665SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4666 if (Op.getValueType() != MVT::i32)
4667 return Op; // Defer to cannot select error.
4668
4669 Register SP = getStackPointerRegisterToSaveRestore();
4670 SDLoc SL(Op);
4671
4672 SDValue CopyFromSP = DAG.getCopyFromReg(Chain: Op->getOperand(Num: 0), dl: SL, Reg: SP, VT: MVT::i32);
4673
4674 // Convert from wave uniform to swizzled vector address. This should protect
4675 // from any edge cases where the stacksave result isn't directly used with
4676 // stackrestore.
4677 SDValue VectorAddress =
4678 DAG.getNode(Opcode: AMDGPUISD::WAVE_ADDRESS, DL: SL, VT: MVT::i32, Operand: CopyFromSP);
4679 return DAG.getMergeValues(Ops: {VectorAddress, CopyFromSP.getValue(R: 1)}, dl: SL);
4680}
4681
/// Lower GET_ROUNDING by reading both hardware rounding-mode fields from the
/// MODE register and translating the raw 4-bit value to the FLT_ROUNDS
/// enumeration via a constant lookup table.
SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc SL(Op);
  assert(Op.getValueType() == MVT::i32);

  // Read 4 bits of MODE starting at offset 0: both 2-bit round-mode fields.
  uint32_t BothRoundHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 4);
  SDValue GetRoundBothImm = DAG.getTargetConstant(Val: BothRoundHwReg, DL: SL, VT: MVT::i32);

  SDValue IntrinID =
      DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_getreg, DL: SL, VT: MVT::i32);
  SDValue GetReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList: Op->getVTList(),
                               N1: Op.getOperand(i: 0), N2: IntrinID, N3: GetRoundBothImm);

  // There are two rounding modes, one for f32 and one for f64/f16. We only
  // report in the standard value range if both are the same.
  //
  // The raw values also differ from the expected FLT_ROUNDS values. Nearest
  // ties away from zero is not supported, and the other values are rotated by
  // 1.
  //
  // If the two rounding modes are not the same, report a target defined value.

  // Mode register rounding mode fields:
  //
  // [1:0] Single-precision round mode.
  // [3:2] Double/Half-precision round mode.
  //
  // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
  //
  //             Hardware   Spec
  // Toward-0        3        0
  // Nearest Even    0        1
  // +Inf            1        2
  // -Inf            2        3
  // NearestAway0   N/A       4
  //
  // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
  // table we can index by the raw hardware mode.
  //
  // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf

  SDValue BitTable =
      DAG.getConstant(Val: AMDGPU::FltRoundConversionTable, DL: SL, VT: MVT::i64);

  // Each table entry is 4 bits wide, so shift the raw mode left by 2 to get
  // the bit index of the matching entry.
  SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
  SDValue RoundModeTimesNumBits =
      DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: GetReg, N2: Two);

  // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
  // knew only one mode was demanded.
  SDValue TableValue =
      DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i64, N1: BitTable, N2: RoundModeTimesNumBits);
  SDValue TruncTable = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: TableValue);

  // Keep only the 4-bit table entry.
  SDValue EntryMask = DAG.getConstant(Val: 0xf, DL: SL, VT: MVT::i32);
  SDValue TableEntry =
      DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: TruncTable, N2: EntryMask);

  // There's a gap in the 4-bit encoded table and actual enum values, so offset
  // if it's an extended value.
  SDValue Four = DAG.getConstant(Val: 4, DL: SL, VT: MVT::i32);
  SDValue IsStandardValue =
      DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: TableEntry, RHS: Four, Cond: ISD::SETULT);
  SDValue EnumOffset = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: TableEntry, N2: Four);
  SDValue Result = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i32, N1: IsStandardValue,
                               N2: TableEntry, N3: EnumOffset);

  // Return the translated value plus the chain from the getreg.
  return DAG.getMergeValues(Ops: {Result, GetReg.getValue(R: 1)}, dl: SL);
}
4752
/// Lower SET_ROUNDING by translating the FLT_ROUNDS value to the hardware
/// MODE.fp_round encoding (via constant folding, a reduced 32-bit table, or
/// the full 64-bit table) and writing it with s_setreg.
SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue NewMode = Op.getOperand(i: 1);
  assert(NewMode.getValueType() == MVT::i32);

  // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
  // hardware MODE.fp_round values.
  if (auto *ConstMode = dyn_cast<ConstantSDNode>(Val&: NewMode)) {
    // Constant input: fold the table lookup completely. Out-of-range values
    // are clamped to the largest supported extended mode.
    uint32_t ClampedVal = std::min(
        a: static_cast<uint32_t>(ConstMode->getZExtValue()),
        b: static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
    NewMode = DAG.getConstant(
        Val: AMDGPU::decodeFltRoundToHWConversionTable(FltRounds: ClampedVal), DL: SL, VT: MVT::i32);
  } else {
    // If we know the input can only be one of the supported standard modes in
    // the range 0-3, we can use a simplified mapping to hardware values.
    KnownBits KB = DAG.computeKnownBits(Op: NewMode);
    const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
    // The supported standard values are 0-3. The extended values start at 8. We
    // need to offset by 4 if the value is in the extended range.

    if (UseReducedTable) {
      // Only 4 entries x 4 bits are needed, so a 32-bit immediate table with
      // the low 16 bits of the full table suffices.
      SDValue BitTable = DAG.getConstant(
          Val: AMDGPU::FltRoundToHWConversionTable & 0xffff, DL: SL, VT: MVT::i32);

      // Each entry is 4 bits: bit index = mode * 4.
      SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
      SDValue RoundModeTimesNumBits =
          DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: NewMode, N2: Two);

      NewMode =
          DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: BitTable, N2: RoundModeTimesNumBits);

      // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
      // the table extracted bits into inline immediates.
    } else {
      // table_index = umin(value, value - 4)
      // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
      SDValue BitTable =
          DAG.getConstant(Val: AMDGPU::FltRoundToHWConversionTable, DL: SL, VT: MVT::i64);

      // The umin folds the two cases: standard values 0-3 use themselves,
      // extended values >= 8 wrap the subtraction and pick value - 4.
      SDValue Four = DAG.getConstant(Val: 4, DL: SL, VT: MVT::i32);
      SDValue OffsetEnum = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: NewMode, N2: Four);
      SDValue IndexVal =
          DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewMode, N2: OffsetEnum);

      SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
      SDValue RoundModeTimesNumBits =
          DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: IndexVal, N2: Two);

      SDValue TableValue =
          DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i64, N1: BitTable, N2: RoundModeTimesNumBits);
      SDValue TruncTable = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: TableValue);

      // No need to mask out the high bits since the setreg will ignore them
      // anyway.
      NewMode = TruncTable;
    }

    // Insert a readfirstlane in case the value is a VGPR. We could do this
    // earlier and keep more operations scalar, but that interferes with
    // combining the source.
    SDValue ReadFirstLaneID =
        DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: SL, VT: MVT::i32);
    NewMode = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
                          N1: ReadFirstLaneID, N2: NewMode);
  }

  // N.B. The setreg will be later folded into s_round_mode on supported
  // targets.
  SDValue IntrinID =
      DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_setreg, DL: SL, VT: MVT::i32);
  // Write both 2-bit round-mode fields (4 bits of MODE at offset 0) at once.
  uint32_t BothRoundHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 4);
  SDValue RoundBothImm = DAG.getTargetConstant(Val: BothRoundHwReg, DL: SL, VT: MVT::i32);

  SDValue SetReg =
      DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VTList: Op->getVTList(), N1: Op.getOperand(i: 0),
                  N2: IntrinID, N3: RoundBothImm, N4: NewMode);

  return SetReg;
}
4837
4838SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4839 if (Op->isDivergent() &&
4840 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(i: 4)))
4841 // Cannot do I$ prefetch with divergent pointer.
4842 return SDValue();
4843
4844 switch (cast<MemSDNode>(Val&: Op)->getAddressSpace()) {
4845 case AMDGPUAS::FLAT_ADDRESS:
4846 case AMDGPUAS::GLOBAL_ADDRESS:
4847 case AMDGPUAS::CONSTANT_ADDRESS:
4848 break;
4849 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4850 if (Subtarget->hasSafeSmemPrefetch())
4851 break;
4852 [[fallthrough]];
4853 default:
4854 return SDValue();
4855 }
4856
4857 // I$ prefetch
4858 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(i: 4))
4859 return SDValue();
4860
4861 return Op;
4862}
4863
4864// Work around DAG legality rules only based on the result type.
4865SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4866 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4867 SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0);
4868 EVT SrcVT = Src.getValueType();
4869
4870 if (SrcVT.getScalarType() != MVT::bf16)
4871 return Op;
4872
4873 SDLoc SL(Op);
4874 SDValue BitCast =
4875 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: SrcVT.changeTypeToInteger(), Operand: Src);
4876
4877 EVT DstVT = Op.getValueType();
4878 if (IsStrict)
4879 llvm_unreachable("Need STRICT_BF16_TO_FP");
4880
4881 return DAG.getNode(Opcode: ISD::BF16_TO_FP, DL: SL, VT: DstVT, Operand: BitCast);
4882}
4883
/// Lower GET_FPENV by packing the MODE register (low word) and the TRAPSTS
/// register (high word) into a single i64 result.
SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  // Only the i64 form is handled; defer everything else.
  if (Op.getValueType() != MVT::i64)
    return Op;

  // 23 bits of MODE starting at offset 0.
  uint32_t ModeHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);
  SDValue ModeHwRegImm = DAG.getTargetConstant(Val: ModeHwReg, DL: SL, VT: MVT::i32);
  // 5 bits of TRAPSTS starting at offset 0.
  uint32_t TrapHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
  SDValue TrapHwRegImm = DAG.getTargetConstant(Val: TrapHwReg, DL: SL, VT: MVT::i32);

  SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other);
  SDValue IntrinID =
      DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_getreg, DL: SL, VT: MVT::i32);
  // Both reads are chained to the incoming chain and joined below, so they
  // may be scheduled independently.
  SDValue GetModeReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList,
                                   N1: Op.getOperand(i: 0), N2: IntrinID, N3: ModeHwRegImm);
  SDValue GetTrapReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList,
                                   N1: Op.getOperand(i: 0), N2: IntrinID, N3: TrapHwRegImm);
  SDValue TokenReg =
      DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: GetModeReg.getValue(R: 1),
                  N2: GetTrapReg.getValue(R: 1));

  // Pack {mode, trapsts} into a v2i32 and bitcast to the i64 result.
  SDValue CvtPtr =
      DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: GetModeReg, N2: GetTrapReg);
  SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);

  return DAG.getMergeValues(Ops: {Result, TokenReg}, dl: SL);
}
4913
/// Lower SET_FPENV by splitting the i64 environment value into the MODE word
/// (element 0) and TRAPSTS word (element 1) and writing each with s_setreg.
SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  // Only the i64 form is handled; defer everything else.
  if (Op.getOperand(i: 1).getValueType() != MVT::i64)
    return Op;

  // Split the packed environment: low word = MODE, high word = TRAPSTS.
  SDValue Input = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1));
  SDValue NewModeReg = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Input,
                                   N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
  SDValue NewTrapReg = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Input,
                                   N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));

  // setreg requires scalar operands; insert readfirstlanes in case the
  // environment value lives in VGPRs.
  SDValue ReadFirstLaneID =
      DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: SL, VT: MVT::i32);
  NewModeReg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
                           N1: ReadFirstLaneID, N2: NewModeReg);
  NewTrapReg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
                           N1: ReadFirstLaneID, N2: NewTrapReg);

  // 23 bits of MODE at offset 0; 5 bits of TRAPSTS at offset 0 — matching
  // the fields read in lowerGET_FPENV.
  unsigned ModeHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);
  SDValue ModeHwRegImm = DAG.getTargetConstant(Val: ModeHwReg, DL: SL, VT: MVT::i32);
  unsigned TrapHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
  SDValue TrapHwRegImm = DAG.getTargetConstant(Val: TrapHwReg, DL: SL, VT: MVT::i32);

  SDValue IntrinID =
      DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_setreg, DL: SL, VT: MVT::i32);
  // Both writes chain off the incoming chain and are joined with a
  // TokenFactor, so they may be scheduled independently.
  SDValue SetModeReg =
      DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VT: MVT::Other, N1: Op.getOperand(i: 0),
                  N2: IntrinID, N3: ModeHwRegImm, N4: NewModeReg);
  SDValue SetTrapReg =
      DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VT: MVT::Other, N1: Op.getOperand(i: 0),
                  N2: IntrinID, N3: TrapHwRegImm, N4: NewTrapReg);
  return DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: SetTrapReg, N2: SetModeReg);
}
4949
4950Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4951 const MachineFunction &MF) const {
4952 const Function &Fn = MF.getFunction();
4953
4954 Register Reg = StringSwitch<Register>(RegName)
4955 .Case(S: "m0", Value: AMDGPU::M0)
4956 .Case(S: "exec", Value: AMDGPU::EXEC)
4957 .Case(S: "exec_lo", Value: AMDGPU::EXEC_LO)
4958 .Case(S: "exec_hi", Value: AMDGPU::EXEC_HI)
4959 .Case(S: "flat_scratch", Value: AMDGPU::FLAT_SCR)
4960 .Case(S: "flat_scratch_lo", Value: AMDGPU::FLAT_SCR_LO)
4961 .Case(S: "flat_scratch_hi", Value: AMDGPU::FLAT_SCR_HI)
4962 .Default(Value: Register());
4963 if (!Reg)
4964 return Reg;
4965
4966 if (!Subtarget->hasFlatScrRegister() &&
4967 Subtarget->getRegisterInfo()->regsOverlap(RegA: Reg, RegB: AMDGPU::FLAT_SCR)) {
4968 Fn.getContext().emitError(ErrorStr: Twine("invalid register \"" + StringRef(RegName) +
4969 "\" for subtarget."));
4970 }
4971
4972 switch (Reg) {
4973 case AMDGPU::M0:
4974 case AMDGPU::EXEC_LO:
4975 case AMDGPU::EXEC_HI:
4976 case AMDGPU::FLAT_SCR_LO:
4977 case AMDGPU::FLAT_SCR_HI:
4978 if (VT.getSizeInBits() == 32)
4979 return Reg;
4980 break;
4981 case AMDGPU::EXEC:
4982 case AMDGPU::FLAT_SCR:
4983 if (VT.getSizeInBits() == 64)
4984 return Reg;
4985 break;
4986 default:
4987 llvm_unreachable("missing register type checking");
4988 }
4989
4990 report_fatal_error(
4991 reason: Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4992}
4993
4994// If kill is not the last instruction, split the block so kill is always a
4995// proper terminator.
4996MachineBasicBlock *
4997SITargetLowering::splitKillBlock(MachineInstr &MI,
4998 MachineBasicBlock *BB) const {
4999 MachineBasicBlock *SplitBB = BB->splitAt(SplitInst&: MI, /*UpdateLiveIns=*/true);
5000 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5001 MI.setDesc(TII->getKillTerminatorFromPseudo(Opcode: MI.getOpcode()));
5002 return SplitBB;
5003}
5004
// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
// \p MI will be the only instruction in the loop body block. Otherwise, it will
// be the first instruction in the remainder block.
//
/// \returns { LoopBody, Remainder }
static std::pair<MachineBasicBlock *, MachineBasicBlock *>
splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
  MachineFunction *MF = MBB.getParent();
  MachineBasicBlock::iterator I(&MI);

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  // Insert both new blocks directly after MBB, in layout order
  // MBB -> LoopBB -> RemainderBB.
  MF->insert(MBBI, MBB: LoopBB);
  MF->insert(MBBI, MBB: RemainderBB);

  // The loop block both self-loops and falls through to the remainder.
  LoopBB->addSuccessor(Succ: LoopBB);
  LoopBB->addSuccessor(Succ: RemainderBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);

  if (InstInLoop) {
    auto Next = std::next(x: I);

    // Move instruction to loop body.
    LoopBB->splice(Where: LoopBB->begin(), Other: &MBB, From: I, To: Next);

    // Move the rest of the block.
    RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: Next, To: MBB.end());
  } else {
    RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: I, To: MBB.end());
  }

  MBB.addSuccessor(Succ: LoopBB);

  return std::pair(LoopBB, RemainderBB);
}
5047
5048/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
5049void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
5050 MachineBasicBlock *MBB = MI.getParent();
5051 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5052 auto I = MI.getIterator();
5053 auto E = std::next(x: I);
5054
5055 // clang-format off
5056 BuildMI(BB&: *MBB, I: E, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_WAITCNT))
5057 .addImm(Val: 0);
5058 // clang-format on
5059
5060 MIBundleBuilder Bundler(*MBB, I, E);
5061 finalizeBundle(MBB&: *MBB, FirstMI: Bundler.begin());
5062}
5063
/// Expand a GWS instruction into a loop that retries the operation until
/// TRAP_STS.MEM_VIOL stays clear: clear the bit, run the (bundled) op plus a
/// wait, then re-read the bit and branch back if it was set.
MachineBasicBlock *
SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
                                         MachineBasicBlock *BB) const {
  const DebugLoc &DL = MI.getDebugLoc();

  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();

  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Apparently kill flags are only valid if the def is in the same block?
  if (MachineOperand *Src = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::data0))
    Src->setIsKill(false);

  // MI becomes the sole (bundled) instruction of the loop body.
  auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB&: *BB, InstInLoop: true);

  MachineBasicBlock::iterator I = LoopBB->end();

  // Encoding for the 1-bit TRAP_STS.MEM_VIOL field.
  const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
      Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: AMDGPU::Hwreg::OFFSET_MEM_VIOL, Values: 1);

  // Clear TRAP_STS.MEM_VIOL
  BuildMI(BB&: *LoopBB, I: LoopBB->begin(), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_IMM32_B32))
      .addImm(Val: 0)
      .addImm(Val: EncodedReg);

  // Bundle MI with an S_WAITCNT 0 so the violation bit is observable before
  // the re-check below.
  bundleInstWithWaitcnt(MI);

  Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);

  // Load and check TRAP_STS.MEM_VIOL
  BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: Reg)
      .addImm(Val: EncodedReg);

  // FIXME: Do we need to use an isel pseudo that may clobber scc?
  BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
      .addReg(RegNo: Reg, Flags: RegState::Kill)
      .addImm(Val: 0);
  // clang-format off
  BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
      .addMBB(MBB: LoopBB);
  // clang-format on

  return RemainderBB;
}
5108
// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
// wavefront. If the value is uniform and just happens to be in a VGPR, this
// will only do one iteration. In the worst case, this will loop 64 times.
//
// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
static MachineBasicBlock::iterator
emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
                       MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
                       const DebugLoc &DL, const MachineOperand &Idx,
                       unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
                       unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
                       Register &SGPRIdxReg) {

  MachineFunction *MF = OrigBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
  MachineBasicBlock::iterator I = LoopBB.begin();

  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
  Register PhiExec = MRI.createVirtualRegister(RegClass: BoolRC);
  Register NewExec = MRI.createVirtualRegister(RegClass: BoolRC);
  Register CurrentIdxReg =
      MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
  Register CondReg = MRI.createVirtualRegister(RegClass: BoolRC);

  // Loop-carried value: the accumulated result across iterations.
  BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiReg)
      .addReg(RegNo: InitReg)
      .addMBB(MBB: &OrigBB)
      .addReg(RegNo: ResultReg)
      .addMBB(MBB: &LoopBB);

  // Loop-carried value: the remaining (not yet processed) exec mask.
  BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiExec)
      .addReg(RegNo: InitSaveExecReg)
      .addMBB(MBB: &OrigBB)
      .addReg(RegNo: NewExec)
      .addMBB(MBB: &LoopBB);

  // Read the next variant <- also loop target.
  BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurrentIdxReg)
      .addReg(RegNo: Idx.getReg(), Flags: getUndefRegState(B: Idx.isUndef()));

  // Compare the just read M0 value to all possible Idx values.
  BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CMP_EQ_U32_e64), DestReg: CondReg)
      .addReg(RegNo: CurrentIdxReg)
      .addReg(RegNo: Idx.getReg(), Flags: {}, SubReg: Idx.getSubReg());

  // Update EXEC, save the original EXEC value to VCC.
  BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: LMC.AndSaveExecOpc), DestReg: NewExec)
      .addReg(RegNo: CondReg, Flags: RegState::Kill);

  // Hint the allocator to coalesce NewExec with CondReg.
  MRI.setSimpleHint(VReg: NewExec, PrefReg: CondReg);

  if (UseGPRIdxMode) {
    // GPR-index mode: the index (plus constant offset, if any) is returned in
    // SGPRIdxReg for the caller to feed to a GPRIDX pseudo.
    if (Offset == 0) {
      SGPRIdxReg = CurrentIdxReg;
    } else {
      SGPRIdxReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
      BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: SGPRIdxReg)
          .addReg(RegNo: CurrentIdxReg, Flags: RegState::Kill)
          .addImm(Val: Offset);
    }
  } else {
    // Move index from VCC into M0
    if (Offset == 0) {
      BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
          .addReg(RegNo: CurrentIdxReg, Flags: RegState::Kill);
    } else {
      BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: AMDGPU::M0)
          .addReg(RegNo: CurrentIdxReg, Flags: RegState::Kill)
          .addImm(Val: Offset);
    }
  }

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  MachineInstr *InsertPt =
      BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: LMC.XorTermOpc), DestReg: LMC.ExecReg)
          .addReg(RegNo: LMC.ExecReg)
          .addReg(RegNo: NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  // clang-format off
  BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
      .addMBB(MBB: &LoopBB);
  // clang-format on

  // Return the point (at the exec-restoring xor) where the caller should
  // insert the per-iteration work.
  return InsertPt->getIterator();
}
5200
// This has slightly sub-optimal regalloc when the source vector is killed by
// the read. The register allocator does not understand that the kill is
// per-workitem, so is kept alive for the whole loop so we end up not re-using a
// subregister from it, using 1 more VGPR than necessary. This was saved when
// this was expanded after register allocation.
//
// Wraps \p MI in a waterfall loop over the (possibly divergent) idx operand:
// saves EXEC, emits the loop via emitLoadM0FromVGPRLoop, and appends a
// landing-pad block that restores EXEC before the remainder block.
static MachineBasicBlock::iterator
loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
               unsigned InitResultReg, unsigned PhiReg, int Offset,
               bool UseGPRIdxMode, Register &SGPRIdxReg) {
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
  Register DstReg = MI.getOperand(i: 0).getReg();
  Register SaveExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);

  // Initial value for the loop's exec-mask phi.
  BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: TmpExec);

  // Save the EXEC mask
  // clang-format off
  BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: SaveExec)
      .addReg(RegNo: LMC.ExecReg);
  // clang-format on

  // MI stays at the head of the remainder block; the loop body is built by
  // emitLoadM0FromVGPRLoop below.
  auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, InstInLoop: false);

  const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);

  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, OrigBB&: MBB, LoopBB&: *LoopBB, DL, Idx: *Idx,
                                      InitReg: InitResultReg, ResultReg: DstReg, PhiReg, InitSaveExecReg: TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);

  // Insert a landing pad between the loop and the remainder so EXEC is
  // restored on the loop's sole exit edge.
  MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(LoopBB);
  ++MBBI;
  MF->insert(MBBI, MBB: LandingPad);
  LoopBB->removeSuccessor(Succ: RemainderBB);
  LandingPad->addSuccessor(Succ: RemainderBB);
  LoopBB->addSuccessor(Succ: LandingPad);
  MachineBasicBlock::iterator First = LandingPad->begin();
  // clang-format off
  BuildMI(BB&: *LandingPad, I: First, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg)
      .addReg(RegNo: SaveExec);
  // clang-format on

  return InsPt;
}
5254
5255// Returns subreg index, offset
5256static std::pair<unsigned, int>
5257computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
5258 const TargetRegisterClass *SuperRC, unsigned VecReg,
5259 int Offset) {
5260 int NumElts = TRI.getRegSizeInBits(RC: *SuperRC) / 32;
5261
5262 // Skip out of bounds offsets, or else we would end up using an undefined
5263 // register.
5264 if (Offset >= NumElts || Offset < 0)
5265 return std::pair(AMDGPU::sub0, Offset);
5266
5267 return std::pair(SIRegisterInfo::getSubRegFromChannel(Channel: Offset), 0);
5268}
5269
5270static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
5271 MachineRegisterInfo &MRI, MachineInstr &MI,
5272 int Offset) {
5273 MachineBasicBlock *MBB = MI.getParent();
5274 const DebugLoc &DL = MI.getDebugLoc();
5275 MachineBasicBlock::iterator I(&MI);
5276
5277 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
5278
5279 assert(Idx->getReg() != AMDGPU::NoRegister);
5280
5281 if (Offset == 0) {
5282 // clang-format off
5283 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
5284 .add(MO: *Idx);
5285 // clang-format on
5286 } else {
5287 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: AMDGPU::M0)
5288 .add(MO: *Idx)
5289 .addImm(Val: Offset);
5290 }
5291}
5292
5293static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
5294 MachineRegisterInfo &MRI, MachineInstr &MI,
5295 int Offset) {
5296 MachineBasicBlock *MBB = MI.getParent();
5297 const DebugLoc &DL = MI.getDebugLoc();
5298 MachineBasicBlock::iterator I(&MI);
5299
5300 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
5301
5302 if (Offset == 0)
5303 return Idx->getReg();
5304
5305 Register Tmp = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5306 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: Tmp)
5307 .add(MO: *Idx)
5308 .addImm(Val: Offset);
5309 return Tmp;
5310}
5311
/// Lower an indirect vector-element read pseudo: extract one 32-bit element
/// of vector register \p src at run-time index \p idx (+ constant \p offset).
/// A uniform (SGPR) index lowers to a single indexed read in place; a
/// divergent (VGPR) index requires loadM0FromVGPR to build a loop that
/// serializes over the lanes' index values. Returns the block in which
/// lowering ended (original block, or the loop block).
static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const GCNSubtarget &ST) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  Register Dst = MI.getOperand(i: 0).getReg();
  const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
  Register SrcReg = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm();

  const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcReg);
  const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg());

  // Fold as much of the constant offset as possible into a fixed subregister
  // index; any remainder is returned in Offset and must still be added to the
  // index register at run time.
  unsigned SubReg;
  std::tie(args&: SubReg, args&: Offset) =
      computeIndirectRegAndOffset(TRI, SuperRC: VecRC, VecReg: SrcReg, Offset);

  // GPR-index mode (newer subtargets) uses a dedicated indexing register
  // instead of M0 + V_MOVRELS.
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  // Check for a SGPR index.
  if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    if (UseGPRIdxMode) {
      // TODO: Look at the uses to avoid the copy. This may require rescheduling
      // to avoid interfering with other uses, so probably requires a new
      // optimization pass.
      Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);

      const MCInstrDesc &GPRIDXDesc =
          TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: true);
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
          .addReg(RegNo: SrcReg)
          .addReg(RegNo: Idx)
          .addImm(Val: SubReg);
    } else {
      // Legacy path: place the index in M0, then V_MOVRELS reads the element
      // of SrcReg at SubReg relative to M0. The implicit SrcReg use keeps the
      // whole vector alive.
      setM0ToIndexFromSGPR(TII, MRI, MI, Offset);

      BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: Dst)
          .addReg(RegNo: SrcReg, Flags: {}, SubReg)
          .addReg(RegNo: SrcReg, Flags: RegState::Implicit);
    }

    MI.eraseFromParent();

    return &MBB;
  }

  // Control flow needs to be inserted if indexing with a VGPR.
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  // PhiReg carries the partial result across loop iterations; it starts as an
  // IMPLICIT_DEF since no element has been read yet.
  Register PhiReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);

  BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: InitReg);

  // loadM0FromVGPR creates the loop control flow and returns the insertion
  // point inside the loop body where the per-iteration read belongs. In
  // GPR-index mode it also provides the scalarized index in SGPRIdxReg.
  Register SGPRIdxReg;
  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: InitReg, PhiReg, Offset,
                              UseGPRIdxMode, SGPRIdxReg);

  MachineBasicBlock *LoopBB = InsPt->getParent();

  if (UseGPRIdxMode) {
    const MCInstrDesc &GPRIDXDesc =
        TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: true);

    BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
        .addReg(RegNo: SrcReg)
        .addReg(RegNo: SGPRIdxReg)
        .addImm(Val: SubReg);
  } else {
    // Inside the loop, M0 has been set up by loadM0FromVGPR; emit the same
    // MOVRELS read as in the uniform case.
    BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: Dst)
        .addReg(RegNo: SrcReg, Flags: {}, SubReg)
        .addReg(RegNo: SrcReg, Flags: RegState::Implicit);
  }

  MI.eraseFromParent();

  return LoopBB;
}
5397
/// Lower an indirect vector-element write pseudo: insert \p val into vector
/// register \p src at run-time index \p idx (+ constant \p offset), producing
/// the updated vector in Dst. Constant-only indices become INSERT_SUBREG; a
/// uniform (SGPR) index lowers in place; a divergent (VGPR) index requires
/// loadM0FromVGPR to build a loop over the lanes' index values. Returns the
/// block in which lowering ended.
static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const GCNSubtarget &ST) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  Register Dst = MI.getOperand(i: 0).getReg();
  const MachineOperand *SrcVec = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src);
  const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
  const MachineOperand *Val = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::val);
  int Offset = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm();
  const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcVec->getReg());
  const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg());

  // This can be an immediate, but will be folded later.
  assert(Val->getReg());

  // Fold as much of the constant offset as possible into a fixed subregister
  // index; the remainder stays in Offset for run-time addition.
  unsigned SubReg;
  std::tie(args&: SubReg, args&: Offset) =
      computeIndirectRegAndOffset(TRI, SuperRC: VecRC, VecReg: SrcVec->getReg(), Offset);
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  // No index register at all: the element position is fully known, so a
  // plain INSERT_SUBREG suffices.
  if (Idx->getReg() == AMDGPU::NoRegister) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    assert(Offset == 0);

    BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: Dst)
        .add(MO: *SrcVec)
        .add(MO: *Val)
        .addImm(Val: SubReg);

    MI.eraseFromParent();
    return &MBB;
  }

  // Check for a SGPR index.
  if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    if (UseGPRIdxMode) {
      Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);

      const MCInstrDesc &GPRIDXDesc =
          TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false);
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
          .addReg(RegNo: SrcVec->getReg())
          .add(MO: *Val)
          .addReg(RegNo: Idx)
          .addImm(Val: SubReg);
    } else {
      // Legacy path: index goes in M0, and a MOVREL-write pseudo (selected by
      // vector size) performs the indexed insert.
      setM0ToIndexFromSGPR(TII, MRI, MI, Offset);

      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          VecSize: TRI.getRegSizeInBits(RC: *VecRC), EltSize: 32, IsSGPR: false);
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: MovRelDesc, DestReg: Dst)
          .addReg(RegNo: SrcVec->getReg())
          .add(MO: *Val)
          .addImm(Val: SubReg);
    }
    MI.eraseFromParent();
    return &MBB;
  }

  // Control flow needs to be inserted if indexing with a VGPR.
  // Val is re-read on every loop iteration, so any kill flags on it would now
  // be wrong.
  if (Val->isReg())
    MRI.clearKillFlags(Reg: Val->getReg());

  const DebugLoc &DL = MI.getDebugLoc();

  // PhiReg carries the (partially updated) vector across loop iterations,
  // seeded with the original source vector.
  Register PhiReg = MRI.createVirtualRegister(RegClass: VecRC);

  // loadM0FromVGPR creates the loop control flow and returns the insertion
  // point inside the loop body; in GPR-index mode it also provides the
  // scalarized index in SGPRIdxReg.
  Register SGPRIdxReg;
  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: SrcVec->getReg(), PhiReg, Offset,
                              UseGPRIdxMode, SGPRIdxReg);
  MachineBasicBlock *LoopBB = InsPt->getParent();

  if (UseGPRIdxMode) {
    const MCInstrDesc &GPRIDXDesc =
        TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false);

    BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
        .addReg(RegNo: PhiReg)
        .add(MO: *Val)
        .addReg(RegNo: SGPRIdxReg)
        .addImm(Val: SubReg);
  } else {
    // Inside the loop, M0 has been set up by loadM0FromVGPR; write through
    // the MOVREL pseudo into the iterating PhiReg copy of the vector.
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        VecSize: TRI.getRegSizeInBits(RC: *VecRC), EltSize: 32, IsSGPR: false);
    BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: MovRelDesc, DestReg: Dst)
        .addReg(RegNo: PhiReg)
        .add(MO: *Val)
        .addImm(Val: SubReg);
  }

  MI.eraseFromParent();
  return LoopBB;
}
5500
5501static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
5502 MachineBasicBlock *BB) {
5503 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5504 // For GFX12, we emit s_add_u64 and s_sub_u64.
5505 MachineFunction *MF = BB->getParent();
5506 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5507 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5508 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5509 const DebugLoc &DL = MI.getDebugLoc();
5510 MachineOperand &Dest = MI.getOperand(i: 0);
5511 MachineOperand &Src0 = MI.getOperand(i: 1);
5512 MachineOperand &Src1 = MI.getOperand(i: 2);
5513 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5514 if (ST.hasScalarAddSub64()) {
5515 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5516 // clang-format off
5517 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest.getReg())
5518 .add(MO: Src0)
5519 .add(MO: Src1);
5520 // clang-format on
5521 } else {
5522 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5523 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5524
5525 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5526 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5527
5528 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5529 MI, MRI, SuperReg: Src0, SuperRC: BoolRC, SubIdx: AMDGPU::sub0, SubRC: &AMDGPU::SReg_32RegClass);
5530 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5531 MI, MRI, SuperReg: Src0, SuperRC: BoolRC, SubIdx: AMDGPU::sub1, SubRC: &AMDGPU::SReg_32RegClass);
5532
5533 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5534 MI, MRI, SuperReg: Src1, SuperRC: BoolRC, SubIdx: AMDGPU::sub0, SubRC: &AMDGPU::SReg_32RegClass);
5535 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5536 MI, MRI, SuperReg: Src1, SuperRC: BoolRC, SubIdx: AMDGPU::sub1, SubRC: &AMDGPU::SReg_32RegClass);
5537
5538 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5539 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5540 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: LoOpc), DestReg: DestSub0).add(MO: Src0Sub0).add(MO: Src1Sub0);
5541 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: HiOpc), DestReg: DestSub1).add(MO: Src0Sub1).add(MO: Src1Sub1);
5542 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: Dest.getReg())
5543 .addReg(RegNo: DestSub0)
5544 .addImm(Val: AMDGPU::sub0)
5545 .addReg(RegNo: DestSub1)
5546 .addImm(Val: AMDGPU::sub1);
5547 }
5548 MI.eraseFromParent();
5549 return BB;
5550}
5551
5552static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
5553 switch (Opc) {
5554 case AMDGPU::S_MIN_U32:
5555 return std::numeric_limits<uint32_t>::max();
5556 case AMDGPU::S_MIN_I32:
5557 return std::numeric_limits<int32_t>::max();
5558 case AMDGPU::S_MAX_U32:
5559 return std::numeric_limits<uint32_t>::min();
5560 case AMDGPU::S_MAX_I32:
5561 return std::numeric_limits<int32_t>::min();
5562 case AMDGPU::V_ADD_F32_e64: // -0.0
5563 return 0x80000000;
5564 case AMDGPU::V_SUB_F32_e64: // +0.0
5565 return 0x0;
5566 case AMDGPU::S_ADD_I32:
5567 case AMDGPU::S_SUB_I32:
5568 case AMDGPU::S_OR_B32:
5569 case AMDGPU::S_XOR_B32:
5570 return std::numeric_limits<uint32_t>::min();
5571 case AMDGPU::S_AND_B32:
5572 return std::numeric_limits<uint32_t>::max();
5573 case AMDGPU::V_MIN_F32_e64:
5574 case AMDGPU::V_MAX_F32_e64:
5575 return 0x7fc00000; // qNAN
5576 default:
5577 llvm_unreachable(
5578 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5579 }
5580}
5581
5582static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
5583 switch (Opc) {
5584 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5585 return std::numeric_limits<uint64_t>::max();
5586 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5587 return std::numeric_limits<int64_t>::max();
5588 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5589 return std::numeric_limits<uint64_t>::min();
5590 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5591 return std::numeric_limits<int64_t>::min();
5592 case AMDGPU::V_MIN_F64_e64:
5593 case AMDGPU::V_MAX_F64_e64:
5594 case AMDGPU::V_MIN_NUM_F64_e64:
5595 case AMDGPU::V_MAX_NUM_F64_e64:
5596 return 0x7FF8000000000000; // qNAN
5597 case AMDGPU::S_ADD_U64_PSEUDO:
5598 case AMDGPU::S_SUB_U64_PSEUDO:
5599 case AMDGPU::S_OR_B64:
5600 case AMDGPU::S_XOR_B64:
5601 return std::numeric_limits<uint64_t>::min();
5602 case AMDGPU::S_AND_B64:
5603 return std::numeric_limits<uint64_t>::max();
5604 case AMDGPU::V_ADD_F64_e64:
5605 case AMDGPU::V_ADD_F64_pseudo_e64:
5606 return 0x8000000000000000; // -0.0
5607 default:
5608 llvm_unreachable(
5609 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5610 }
5611}
5612
5613static bool is32bitWaveReduceOperation(unsigned Opc) {
5614 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5615 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5616 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5617 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5618 Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5619 Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5620 Opc == AMDGPU::V_SUB_F32_e64;
5621}
5622
5623static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
5624 return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5625 Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64 ||
5626 Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 ||
5627 Opc == AMDGPU::V_MIN_NUM_F64_e64 || Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
5628 Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64;
5629}
5630
5631static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5632 MachineBasicBlock &BB,
5633 const GCNSubtarget &ST,
5634 unsigned Opc) {
5635 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
5636 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5637 const DebugLoc &DL = MI.getDebugLoc();
5638 const SIInstrInfo *TII = ST.getInstrInfo();
5639
5640 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5641 Register SrcReg = MI.getOperand(i: 1).getReg();
5642 bool isSGPR = TRI->isSGPRClass(RC: MRI.getRegClass(Reg: SrcReg));
5643 Register DstReg = MI.getOperand(i: 0).getReg();
5644 MachineBasicBlock *RetBB = nullptr;
5645 if (isSGPR) {
5646 switch (Opc) {
5647 case AMDGPU::S_MIN_U32:
5648 case AMDGPU::S_MIN_I32:
5649 case AMDGPU::V_MIN_F32_e64:
5650 case AMDGPU::S_MAX_U32:
5651 case AMDGPU::S_MAX_I32:
5652 case AMDGPU::V_MAX_F32_e64:
5653 case AMDGPU::S_AND_B32:
5654 case AMDGPU::S_OR_B32: {
5655 // Idempotent operations.
5656 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstReg).addReg(RegNo: SrcReg);
5657 RetBB = &BB;
5658 break;
5659 }
5660 case AMDGPU::V_CMP_LT_U64_e64: // umin
5661 case AMDGPU::V_CMP_LT_I64_e64: // min
5662 case AMDGPU::V_CMP_GT_U64_e64: // umax
5663 case AMDGPU::V_CMP_GT_I64_e64: // max
5664 case AMDGPU::V_MIN_F64_e64:
5665 case AMDGPU::V_MIN_NUM_F64_e64:
5666 case AMDGPU::V_MAX_F64_e64:
5667 case AMDGPU::V_MAX_NUM_F64_e64:
5668 case AMDGPU::S_AND_B64:
5669 case AMDGPU::S_OR_B64: {
5670 // Idempotent operations.
5671 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B64), DestReg: DstReg).addReg(RegNo: SrcReg);
5672 RetBB = &BB;
5673 break;
5674 }
5675 case AMDGPU::S_XOR_B32:
5676 case AMDGPU::S_XOR_B64:
5677 case AMDGPU::S_ADD_I32:
5678 case AMDGPU::S_ADD_U64_PSEUDO:
5679 case AMDGPU::V_ADD_F32_e64:
5680 case AMDGPU::V_ADD_F64_e64:
5681 case AMDGPU::V_ADD_F64_pseudo_e64:
5682 case AMDGPU::S_SUB_I32:
5683 case AMDGPU::S_SUB_U64_PSEUDO:
5684 case AMDGPU::V_SUB_F32_e64: {
5685 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5686 const TargetRegisterClass *DstRegClass = MRI.getRegClass(Reg: DstReg);
5687 Register ExecMask = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5688 Register NumActiveLanes =
5689 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5690
5691 bool IsWave32 = ST.isWave32();
5692 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5693 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5694 unsigned BitCountOpc =
5695 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5696
5697 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: ExecMask).addReg(RegNo: ExecReg);
5698
5699 auto NewAccumulator =
5700 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: BitCountOpc), DestReg: NumActiveLanes)
5701 .addReg(RegNo: ExecMask);
5702
5703 switch (Opc) {
5704 case AMDGPU::S_XOR_B32:
5705 case AMDGPU::S_XOR_B64: {
5706 // Performing an XOR operation on a uniform value
5707 // depends on the parity of the number of active lanes.
5708 // For even parity, the result will be 0, for odd
5709 // parity the result will be the same as the input value.
5710 Register ParityRegister =
5711 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5712
5713 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_AND_B32), DestReg: ParityRegister)
5714 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg())
5715 .addImm(Val: 1)
5716 .setOperandDead(3); // Dead scc
5717 if (Opc == AMDGPU::S_XOR_B32) {
5718 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
5719 .addReg(RegNo: SrcReg)
5720 .addReg(RegNo: ParityRegister);
5721 } else {
5722 Register DestSub0 =
5723 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5724 Register DestSub1 =
5725 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5726
5727 const TargetRegisterClass *SrcRC = MRI.getRegClass(Reg: SrcReg);
5728 const TargetRegisterClass *SrcSubRC =
5729 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5730
5731 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5732 MI, MRI, SuperReg: MI.getOperand(i: 1), SuperRC: SrcRC, SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
5733 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5734 MI, MRI, SuperReg: MI.getOperand(i: 1), SuperRC: SrcRC, SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
5735
5736 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DestSub0)
5737 .add(MO: Op1L)
5738 .addReg(RegNo: ParityRegister);
5739
5740 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DestSub1)
5741 .add(MO: Op1H)
5742 .addReg(RegNo: ParityRegister);
5743
5744 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: DstReg)
5745 .addReg(RegNo: DestSub0)
5746 .addImm(Val: AMDGPU::sub0)
5747 .addReg(RegNo: DestSub1)
5748 .addImm(Val: AMDGPU::sub1);
5749 }
5750 break;
5751 }
5752 case AMDGPU::S_SUB_I32: {
5753 Register NegatedVal = MRI.createVirtualRegister(RegClass: DstRegClass);
5754
5755 // Take the negation of the source operand.
5756 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SUB_I32), DestReg: NegatedVal)
5757 .addImm(Val: 0)
5758 .addReg(RegNo: SrcReg);
5759 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
5760 .addReg(RegNo: NegatedVal)
5761 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg());
5762 break;
5763 }
5764 case AMDGPU::S_ADD_I32: {
5765 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
5766 .addReg(RegNo: SrcReg)
5767 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg());
5768 break;
5769 }
5770 case AMDGPU::S_ADD_U64_PSEUDO:
5771 case AMDGPU::S_SUB_U64_PSEUDO: {
5772 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5773 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5774 Register Op1H_Op0L_Reg =
5775 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5776 Register Op1L_Op0H_Reg =
5777 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5778 Register CarryReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5779 Register AddReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5780 Register NegatedValLo =
5781 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5782 Register NegatedValHi =
5783 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5784
5785 const TargetRegisterClass *Src1RC = MRI.getRegClass(Reg: SrcReg);
5786 const TargetRegisterClass *Src1SubRC =
5787 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5788
5789 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5790 MI, MRI, SuperReg: MI.getOperand(i: 1), SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
5791 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5792 MI, MRI, SuperReg: MI.getOperand(i: 1), SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
5793
5794 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5795 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SUB_I32), DestReg: NegatedValLo)
5796 .addImm(Val: 0)
5797 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg())
5798 .setOperandDead(3); // Dead scc
5799 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ASHR_I32), DestReg: NegatedValHi)
5800 .addReg(RegNo: NegatedValLo)
5801 .addImm(Val: 31)
5802 .setOperandDead(3); // Dead scc
5803 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: Op1L_Op0H_Reg)
5804 .add(MO: Op1L)
5805 .addReg(RegNo: NegatedValHi);
5806 }
5807 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5808 ? NegatedValLo
5809 : NewAccumulator->getOperand(i: 0).getReg();
5810 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DestSub0)
5811 .add(MO: Op1L)
5812 .addReg(RegNo: LowOpcode);
5813 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_HI_U32), DestReg: CarryReg)
5814 .add(MO: Op1L)
5815 .addReg(RegNo: LowOpcode);
5816 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: Op1H_Op0L_Reg)
5817 .add(MO: Op1H)
5818 .addReg(RegNo: LowOpcode);
5819
5820 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5821 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: HiVal)
5822 .addReg(RegNo: CarryReg)
5823 .addReg(RegNo: Op1H_Op0L_Reg)
5824 .setOperandDead(3); // Dead scc
5825
5826 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5827 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: DestSub1)
5828 .addReg(RegNo: HiVal)
5829 .addReg(RegNo: Op1L_Op0H_Reg)
5830 .setOperandDead(3); // Dead scc
5831 }
5832 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: DstReg)
5833 .addReg(RegNo: DestSub0)
5834 .addImm(Val: AMDGPU::sub0)
5835 .addReg(RegNo: DestSub1)
5836 .addImm(Val: AMDGPU::sub1);
5837 break;
5838 }
5839 case AMDGPU::V_ADD_F32_e64:
5840 case AMDGPU::V_ADD_F64_e64:
5841 case AMDGPU::V_ADD_F64_pseudo_e64:
5842 case AMDGPU::V_SUB_F32_e64: {
5843 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5844 const TargetRegisterClass *VregRC = TII->getRegClass(MCID: TII->get(Opcode: Opc), OpNum: 0);
5845 Register ActiveLanesVreg = MRI.createVirtualRegister(RegClass: VregRC);
5846 Register DstVreg = MRI.createVirtualRegister(RegClass: VregRC);
5847 // Get number of active lanes as a float val.
5848 BuildMI(BB, I&: MI, MIMD: DL,
5849 MCID: TII->get(Opcode: is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
5850 : AMDGPU::V_CVT_F64_I32_e64),
5851 DestReg: ActiveLanesVreg)
5852 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg())
5853 .addImm(Val: 0) // clamp
5854 .addImm(Val: 0); // output-modifier
5855
5856 // Take negation of input for SUB reduction
5857 unsigned srcMod =
5858 (Opc == AMDGPU::V_SUB_F32_e64 ||
5859 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)
5860 ? SISrcMods::NEG
5861 : SISrcMods::NONE;
5862 unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
5863 : ST.getGeneration() >= AMDGPUSubtarget::GFX12
5864 ? AMDGPU::V_MUL_F64_pseudo_e64
5865 : AMDGPU::V_MUL_F64_e64;
5866 auto DestVregInst = BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: MulOpc),
5867 DestReg: DstVreg)
5868 .addImm(Val: srcMod) // src0 modifier
5869 .addReg(RegNo: SrcReg)
5870 .addImm(Val: SISrcMods::NONE) // src1 modifier
5871 .addReg(RegNo: ActiveLanesVreg)
5872 .addImm(Val: SISrcMods::NONE) // clamp
5873 .addImm(Val: SISrcMods::NONE); // output-mod
5874 if (is32BitOpc) {
5875 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg)
5876 .addReg(RegNo: DstVreg);
5877 } else {
5878 Register LaneValueLoReg =
5879 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5880 Register LaneValueHiReg =
5881 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5882 const TargetRegisterClass *VregSubRC =
5883 TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
5884 MachineOperand Op1L =
5885 TII->buildExtractSubRegOrImm(MI, MRI, SuperReg: DestVregInst->getOperand(i: 0),
5886 SuperRC: VregRC, SubIdx: AMDGPU::sub0, SubRC: VregSubRC);
5887 MachineOperand Op1H =
5888 TII->buildExtractSubRegOrImm(MI, MRI, SuperReg: DestVregInst->getOperand(i: 0),
5889 SuperRC: VregRC, SubIdx: AMDGPU::sub1, SubRC: VregSubRC);
5890 // lane value input should be in an sgpr
5891 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32),
5892 DestReg: LaneValueLoReg)
5893 .add(MO: Op1L);
5894 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32),
5895 DestReg: LaneValueHiReg)
5896 .add(MO: Op1H);
5897 NewAccumulator =
5898 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: DstReg)
5899 .addReg(RegNo: LaneValueLoReg)
5900 .addImm(Val: AMDGPU::sub0)
5901 .addReg(RegNo: LaneValueHiReg)
5902 .addImm(Val: AMDGPU::sub1);
5903 }
5904 }
5905 }
5906 RetBB = &BB;
5907 }
5908 }
5909 } else {
5910 // TODO: Implement DPP Strategy and switch based on immediate strategy
5911 // operand. For now, for all the cases (default, Iterative and DPP we use
5912 // iterative approach by default.)
5913
5914 // To reduce the VGPR using iterative approach, we need to iterate
5915 // over all the active lanes. Lowering consists of ComputeLoop,
5916 // which iterate over only active lanes. We use copy of EXEC register
5917 // as induction variable and every active lane modifies it using bitset0
5918 // so that we will get the next active lane for next iteration.
5919 MachineBasicBlock::iterator I = BB.end();
5920 Register SrcReg = MI.getOperand(i: 1).getReg();
5921 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5922 bool isFPOp = isFloatingPointWaveReduceOperation(Opc);
5923
5924 // Create Control flow for loop
5925 // Split MI's Machine Basic block into For loop
5926 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, MBB&: BB, InstInLoop: true);
5927
5928 // Create virtual registers required for lowering.
5929 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5930 const TargetRegisterClass *DstRegClass = MRI.getRegClass(Reg: DstReg);
5931 Register LoopIterator = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5932 Register IdentityValReg = MRI.createVirtualRegister(RegClass: DstRegClass);
5933 Register AccumulatorReg = MRI.createVirtualRegister(RegClass: DstRegClass);
5934 Register ActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5935 Register NewActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5936 Register FF1Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5937 Register LaneValueReg = MRI.createVirtualRegister(RegClass: DstRegClass);
5938
5939 bool IsWave32 = ST.isWave32();
5940 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5941 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5942
5943 // Create initial values of induction variable from Exec, Accumulator and
5944 // insert branch instr to newly created ComputeBlock
5945 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: MovOpcForExec), DestReg: LoopIterator).addReg(RegNo: ExecReg);
5946 if (is32BitOpc) {
5947 uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
5948 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: IdentityValReg)
5949 .addImm(Val: IdentityValue);
5950 } else {
5951 uint64_t IdentityValue =
5952 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
5953 ? 0x0 // +0.0 for double sub reduction
5954 : getIdentityValueFor64BitWaveReduction(Opc);
5955 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B64_IMM_PSEUDO), DestReg: IdentityValReg)
5956 .addImm(Val: IdentityValue);
5957 }
5958 // clang-format off
5959 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_BRANCH))
5960 .addMBB(MBB: ComputeLoop);
5961 // clang-format on
5962
5963 // Start constructing ComputeLoop
5964 I = ComputeLoop->begin();
5965 auto Accumulator =
5966 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::PHI), DestReg: AccumulatorReg)
5967 .addReg(RegNo: IdentityValReg)
5968 .addMBB(MBB: &BB);
5969 auto ActiveBits =
5970 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::PHI), DestReg: ActiveBitsReg)
5971 .addReg(RegNo: LoopIterator)
5972 .addMBB(MBB: &BB);
5973
5974 I = ComputeLoop->end();
5975 MachineInstr *NewAccumulator;
5976 // Perform the computations
5977 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5978 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: SFFOpc), DestReg: FF1Reg)
5979 .addReg(RegNo: ActiveBitsReg);
5980 if (is32BitOpc) {
5981 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32),
5982 DestReg: LaneValueReg)
5983 .addReg(RegNo: SrcReg)
5984 .addReg(RegNo: FF1Reg);
5985 if (isFPOp) {
5986 Register LaneValVreg =
5987 MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: SrcReg));
5988 Register DstVreg = MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: SrcReg));
5989 // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
5990 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32),
5991 DestReg: LaneValVreg)
5992 .addReg(RegNo: LaneValueReg);
5993 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstVreg)
5994 .addImm(Val: 0) // src0 modifier
5995 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg())
5996 .addImm(Val: 0) // src1 modifier
5997 .addReg(RegNo: LaneValVreg)
5998 .addImm(Val: 0) // clamp
5999 .addImm(Val: 0); // omod
6000 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
6001 MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg)
6002 .addReg(RegNo: DstVreg);
6003 } else {
6004 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstReg)
6005 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg())
6006 .addReg(RegNo: LaneValueReg);
6007 }
6008 } else {
6009 Register LaneValueLoReg =
6010 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6011 Register LaneValueHiReg =
6012 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6013 Register LaneValReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
6014 const TargetRegisterClass *SrcRC = MRI.getRegClass(Reg: SrcReg);
6015 const TargetRegisterClass *SrcSubRC =
6016 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
6017 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
6018 MI, MRI, SuperReg: MI.getOperand(i: 1), SuperRC: SrcRC, SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
6019 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
6020 MI, MRI, SuperReg: MI.getOperand(i: 1), SuperRC: SrcRC, SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
6021 // lane value input should be in an sgpr
6022 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32),
6023 DestReg: LaneValueLoReg)
6024 .add(MO: Op1L)
6025 .addReg(RegNo: FF1Reg);
6026 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32),
6027 DestReg: LaneValueHiReg)
6028 .add(MO: Op1H)
6029 .addReg(RegNo: FF1Reg);
6030 auto LaneValue = BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
6031 MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: LaneValReg)
6032 .addReg(RegNo: LaneValueLoReg)
6033 .addImm(Val: AMDGPU::sub0)
6034 .addReg(RegNo: LaneValueHiReg)
6035 .addImm(Val: AMDGPU::sub1);
6036 switch (Opc) {
6037 case AMDGPU::S_OR_B64:
6038 case AMDGPU::S_AND_B64:
6039 case AMDGPU::S_XOR_B64: {
6040 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstReg)
6041 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg())
6042 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg())
6043 .setOperandDead(3); // Dead scc
6044 break;
6045 }
6046 case AMDGPU::V_CMP_GT_I64_e64:
6047 case AMDGPU::V_CMP_GT_U64_e64:
6048 case AMDGPU::V_CMP_LT_I64_e64:
6049 case AMDGPU::V_CMP_LT_U64_e64: {
6050 Register LaneMaskReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
6051 Register ComparisonResultReg =
6052 MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
6053 int SrcIdx =
6054 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src);
6055 const TargetRegisterClass *VregClass =
6056 TRI->getAllocatableClass(RC: TII->getRegClass(MCID: MI.getDesc(), OpNum: SrcIdx));
6057 const TargetRegisterClass *VSubRegClass =
6058 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
6059 Register AccumulatorVReg = MRI.createVirtualRegister(RegClass: VregClass);
6060 MachineOperand SrcReg0Sub0 =
6061 TII->buildExtractSubRegOrImm(MI, MRI, SuperReg: Accumulator->getOperand(i: 0),
6062 SuperRC: VregClass, SubIdx: AMDGPU::sub0, SubRC: VSubRegClass);
6063 MachineOperand SrcReg0Sub1 =
6064 TII->buildExtractSubRegOrImm(MI, MRI, SuperReg: Accumulator->getOperand(i: 0),
6065 SuperRC: VregClass, SubIdx: AMDGPU::sub1, SubRC: VSubRegClass);
6066 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE),
6067 DestReg: AccumulatorVReg)
6068 .add(MO: SrcReg0Sub0)
6069 .addImm(Val: AMDGPU::sub0)
6070 .add(MO: SrcReg0Sub1)
6071 .addImm(Val: AMDGPU::sub1);
6072 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: LaneMaskReg)
6073 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg())
6074 .addReg(RegNo: AccumulatorVReg);
6075
6076 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6077 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AndOpc), DestReg: ComparisonResultReg)
6078 .addReg(RegNo: LaneMaskReg)
6079 .addReg(RegNo: ActiveBitsReg);
6080
6081 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
6082 MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B64), DestReg: DstReg)
6083 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg())
6084 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg());
6085 break;
6086 }
6087 case AMDGPU::V_MIN_F64_e64:
6088 case AMDGPU::V_MIN_NUM_F64_e64:
6089 case AMDGPU::V_MAX_F64_e64:
6090 case AMDGPU::V_MAX_NUM_F64_e64:
6091 case AMDGPU::V_ADD_F64_e64:
6092 case AMDGPU::V_ADD_F64_pseudo_e64: {
6093 int SrcIdx =
6094 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src);
6095 const TargetRegisterClass *VregRC =
6096 TRI->getAllocatableClass(RC: TII->getRegClass(MCID: MI.getDesc(), OpNum: SrcIdx));
6097 const TargetRegisterClass *VregSubRC =
6098 TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
6099 Register AccumulatorVReg = MRI.createVirtualRegister(RegClass: VregRC);
6100 Register DstVreg = MRI.createVirtualRegister(RegClass: VregRC);
6101 Register LaneValLo =
6102 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6103 Register LaneValHi =
6104 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6105 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AccumulatorVReg)
6106 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg());
6107 unsigned Modifier =
6108 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6109 ? SISrcMods::NEG
6110 : SISrcMods::NONE;
6111 auto DstVregInst = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstVreg)
6112 .addImm(Val: Modifier) // src0 modifiers
6113 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg())
6114 .addImm(Val: SISrcMods::NONE) // src1 modifiers
6115 .addReg(RegNo: AccumulatorVReg)
6116 .addImm(Val: SISrcMods::NONE) // clamp
6117 .addImm(Val: SISrcMods::NONE); // omod
6118 auto ReadLaneLo =
6119 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32),
6120 DestReg: LaneValLo);
6121 auto ReadLaneHi =
6122 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32),
6123 DestReg: LaneValHi);
6124 MachineBasicBlock::iterator Iters = *ReadLaneLo;
6125 MachineOperand Op1L =
6126 TII->buildExtractSubRegOrImm(MI: Iters, MRI, SuperReg: DstVregInst->getOperand(i: 0),
6127 SuperRC: VregRC, SubIdx: AMDGPU::sub0, SubRC: VregSubRC);
6128 MachineOperand Op1H =
6129 TII->buildExtractSubRegOrImm(MI: Iters, MRI, SuperReg: DstVregInst->getOperand(i: 0),
6130 SuperRC: VregRC, SubIdx: AMDGPU::sub1, SubRC: VregSubRC);
6131 ReadLaneLo.add(MO: Op1L);
6132 ReadLaneHi.add(MO: Op1H);
6133 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
6134 MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: DstReg)
6135 .addReg(RegNo: LaneValLo)
6136 .addImm(Val: AMDGPU::sub0)
6137 .addReg(RegNo: LaneValHi)
6138 .addImm(Val: AMDGPU::sub1);
6139 break;
6140 }
6141 case AMDGPU::S_ADD_U64_PSEUDO:
6142 case AMDGPU::S_SUB_U64_PSEUDO: {
6143 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstReg)
6144 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg())
6145 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg());
6146 ComputeLoop = Expand64BitScalarArithmetic(MI&: *NewAccumulator, BB: ComputeLoop);
6147 break;
6148 }
6149 }
6150 }
6151 // Manipulate the iterator to get the next active lane
6152 unsigned BITSETOpc =
6153 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
6154 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: BITSETOpc), DestReg: NewActiveBitsReg)
6155 .addReg(RegNo: FF1Reg)
6156 .addReg(RegNo: ActiveBitsReg);
6157
6158 // Add phi nodes
6159 Accumulator.addReg(RegNo: DstReg).addMBB(MBB: ComputeLoop);
6160 ActiveBits.addReg(RegNo: NewActiveBitsReg).addMBB(MBB: ComputeLoop);
6161
6162 // Creating branching
6163 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
6164 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: CMPOpc))
6165 .addReg(RegNo: NewActiveBitsReg)
6166 .addImm(Val: 0);
6167 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
6168 .addMBB(MBB: ComputeLoop);
6169
6170 RetBB = ComputeEnd;
6171 }
6172 MI.eraseFromParent();
6173 return RetBB;
6174}
6175
// Expands target pseudo-instructions that were marked as needing a custom
// inserter into real machine instructions after instruction selection.
// Opcodes not handled here are forwarded to the generic AMDGPU
// implementation at the bottom of the switch.
6176MachineBasicBlock *
6177SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
6178                                              MachineBasicBlock *BB) const {
6179  MachineFunction *MF = BB->getParent();
6180  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
6181  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6182  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
6183  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
6184  MachineRegisterInfo &MRI = MF->getRegInfo();
6185  const DebugLoc &DL = MI.getDebugLoc();
6186
6187  switch (MI.getOpcode()) {
  // Wave-reduction pseudos: each maps to the scalar or vector opcode that
  // implements one combining step of the reduction; lowerWaveReduce emits
  // the surrounding per-lane loop. 64-bit min/max use VALU compares, and the
  // f64 forms pick the *_NUM_* opcodes on GFX12+.
6188  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
6189    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MIN_U32);
6190  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
6191    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_CMP_LT_U64_e64);
6192  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
6193    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MIN_I32);
6194  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6195    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_CMP_LT_I64_e64);
6196  case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6197    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_MIN_F32_e64);
6198  case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
6199    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(),
6200                           Opc: ST.getGeneration() >= AMDGPUSubtarget::GFX12
6201                                   ? AMDGPU::V_MIN_NUM_F64_e64
6202                                   : AMDGPU::V_MIN_F64_e64);
6203  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6204    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MAX_U32);
6205  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6206    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_CMP_GT_U64_e64);
6207  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6208    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MAX_I32);
6209  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6210    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_CMP_GT_I64_e64);
6211  case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6212    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_MAX_F32_e64);
6213  case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
6214    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(),
6215                           Opc: ST.getGeneration() >= AMDGPUSubtarget::GFX12
6216                                   ? AMDGPU::V_MAX_NUM_F64_e64
6217                                   : AMDGPU::V_MAX_F64_e64);
6218  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6219    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_ADD_I32);
6220  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6221    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_ADD_U64_PSEUDO);
6222  case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6223    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_ADD_F32_e64);
6224  case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
6225    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(),
6226                           Opc: ST.getGeneration() >= AMDGPUSubtarget::GFX12
6227                                   ? AMDGPU::V_ADD_F64_pseudo_e64
6228                                   : AMDGPU::V_ADD_F64_e64);
6229  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6230    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_SUB_I32);
6231  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6232    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_SUB_U64_PSEUDO);
6233  case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6234    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_SUB_F32_e64);
6235  case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
6236    // There is no S/V_SUB_F64 opcode. Double type subtraction is expanded as
6237    // fadd + neg, by setting the NEG bit in the instruction.
6238    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(),
6239                           Opc: ST.getGeneration() >= AMDGPUSubtarget::GFX12
6240                                   ? AMDGPU::V_ADD_F64_pseudo_e64
6241                                   : AMDGPU::V_ADD_F64_e64);
6242  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6243    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_AND_B32);
6244  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6245    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_AND_B64);
6246  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6247    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_OR_B32);
6248  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6249    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_OR_B64);
6250  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6251    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_XOR_B32);
6252  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6253    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_XOR_B64);
  // Scalar add/sub with overflow: emit S_ADD_U32/S_SUB_U32 for the value,
  // then materialize the SCC-based carry/borrow as an all-ones/zero mask via
  // S_CSELECT into the second result.
6254  case AMDGPU::S_UADDO_PSEUDO:
6255  case AMDGPU::S_USUBO_PSEUDO: {
6256    MachineOperand &Dest0 = MI.getOperand(i: 0);
6257    MachineOperand &Dest1 = MI.getOperand(i: 1);
6258    MachineOperand &Src0 = MI.getOperand(i: 2);
6259    MachineOperand &Src1 = MI.getOperand(i: 3);
6260
6261    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6262                       ? AMDGPU::S_ADD_U32
6263                       : AMDGPU::S_SUB_U32;
6264    // clang-format off
6265    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest0.getReg())
6266        .add(MO: Src0)
6267        .add(MO: Src1);
6268    // clang-format on
6269
6270    unsigned SelOpc =
6271        Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6272    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: SelOpc), DestReg: Dest1.getReg()).addImm(Val: -1).addImm(Val: 0);
6273
6274    MI.eraseFromParent();
6275    return BB;
6276  }
6277  case AMDGPU::S_ADD_U64_PSEUDO:
6278  case AMDGPU::S_SUB_U64_PSEUDO: {
6279    return Expand64BitScalarArithmetic(MI, BB);
6280  }
  // 64-bit VALU add/sub: use a native 64-bit instruction when the subtarget
  // has one; otherwise split into lo/hi 32-bit halves chained through a
  // carry register.
6281  case AMDGPU::V_ADD_U64_PSEUDO:
6282  case AMDGPU::V_SUB_U64_PSEUDO: {
6283    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6284
6285    MachineOperand &Dest = MI.getOperand(i: 0);
6286    MachineOperand &Src0 = MI.getOperand(i: 1);
6287    MachineOperand &Src1 = MI.getOperand(i: 2);
6288
6289    if (ST.hasAddSubU64Insts()) {
6290      auto I = BuildMI(BB&: *BB, I&: MI, MIMD: DL,
6291                       MCID: TII->get(Opcode: IsAdd ? AMDGPU::V_ADD_U64_e64
6292                                          : AMDGPU::V_SUB_U64_e64),
6293                       DestReg: Dest.getReg())
6294                   .add(MO: Src0)
6295                   .add(MO: Src1)
6296                   .addImm(Val: 0); // clamp
6297      TII->legalizeOperands(MI&: *I);
6298      MI.eraseFromParent();
6299      return BB;
6300    }
6301
      // Adds can also reuse V_LSHL_ADD_U64 with a shift amount of zero.
6302    if (IsAdd && ST.hasLshlAddU64Inst()) {
6303      auto Add = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_LSHL_ADD_U64_e64),
6304                         DestReg: Dest.getReg())
6305                     .add(MO: Src0)
6306                     .addImm(Val: 0)
6307                     .add(MO: Src1);
6308      TII->legalizeOperands(MI&: *Add);
6309      MI.eraseFromParent();
6310      return BB;
6311    }
6312
6313    const auto *CarryRC = TRI->getWaveMaskRegClass();
6314
6315    Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6316    Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6317
6318    Register CarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
6319    Register DeadCarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
6320
6321    const TargetRegisterClass *Src0RC = Src0.isReg()
6322                                            ? MRI.getRegClass(Reg: Src0.getReg())
6323                                            : &AMDGPU::VReg_64RegClass;
6324    const TargetRegisterClass *Src1RC = Src1.isReg()
6325                                            ? MRI.getRegClass(Reg: Src1.getReg())
6326                                            : &AMDGPU::VReg_64RegClass;
6327
6328    const TargetRegisterClass *Src0SubRC =
6329        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6330    const TargetRegisterClass *Src1SubRC =
6331        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6332
6333    MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6334        MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
6335    MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6336        MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
6337
6338    MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6339        MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
6340    MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6341        MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
6342
      // Low half produces the carry; high half consumes it and reassembles
      // the 64-bit result with REG_SEQUENCE.
6343    unsigned LoOpc =
6344        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6345    MachineInstr *LoHalf = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: LoOpc), DestReg: DestSub0)
6346                               .addReg(RegNo: CarryReg, Flags: RegState::Define)
6347                               .add(MO: SrcReg0Sub0)
6348                               .add(MO: SrcReg1Sub0)
6349                               .addImm(Val: 0); // clamp bit
6350
6351    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6352    MachineInstr *HiHalf =
6353        BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: HiOpc), DestReg: DestSub1)
6354            .addReg(RegNo: DeadCarryReg, Flags: RegState::Define | RegState::Dead)
6355            .add(MO: SrcReg0Sub1)
6356            .add(MO: SrcReg1Sub1)
6357            .addReg(RegNo: CarryReg, Flags: RegState::Kill)
6358            .addImm(Val: 0); // clamp bit
6359
6360    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: Dest.getReg())
6361        .addReg(RegNo: DestSub0)
6362        .addImm(Val: AMDGPU::sub0)
6363        .addReg(RegNo: DestSub1)
6364        .addImm(Val: AMDGPU::sub1);
6365    TII->legalizeOperands(MI&: *LoHalf);
6366    TII->legalizeOperands(MI&: *HiHalf);
6367    MI.eraseFromParent();
6368    return BB;
6369  }
6370  case AMDGPU::S_ADD_CO_PSEUDO:
6371  case AMDGPU::S_SUB_CO_PSEUDO: {
6372    // This pseudo has a chance to be selected
6373    // only from uniform add/subcarry node. All the VGPR operands
6374    // therefore assumed to be splat vectors.
6375    MachineBasicBlock::iterator MII = MI;
6376    MachineOperand &Dest = MI.getOperand(i: 0);
6377    MachineOperand &CarryDest = MI.getOperand(i: 1);
6378    MachineOperand &Src0 = MI.getOperand(i: 2);
6379    MachineOperand &Src1 = MI.getOperand(i: 3);
6380    MachineOperand &Src2 = MI.getOperand(i: 4);
      // Uniform (splat) VGPR operands are moved to SGPRs via readfirstlane.
6381    if (Src0.isReg() && TRI->isVectorRegister(MRI, Reg: Src0.getReg())) {
6382      Register RegOp0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6383      BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp0)
6384          .addReg(RegNo: Src0.getReg());
6385      Src0.setReg(RegOp0);
6386    }
6387    if (Src1.isReg() && TRI->isVectorRegister(MRI, Reg: Src1.getReg())) {
6388      Register RegOp1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6389      BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp1)
6390          .addReg(RegNo: Src1.getReg());
6391      Src1.setReg(RegOp1);
6392    }
6393    Register RegOp2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6394    if (TRI->isVectorRegister(MRI, Reg: Src2.getReg())) {
6395      BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp2)
6396          .addReg(RegNo: Src2.getReg());
6397      Src2.setReg(RegOp2);
6398    }
6399
      // Set SCC from the carry-in operand. Wave64 without 64-bit scalar
      // compares ORs the two halves together before a 32-bit compare.
6400    if (ST.isWave64()) {
6401      if (ST.hasScalarCompareEq64()) {
6402        BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U64))
6403            .addReg(RegNo: Src2.getReg())
6404            .addImm(Val: 0);
6405      } else {
6406        const TargetRegisterClass *Src2RC = MRI.getRegClass(Reg: Src2.getReg());
6407        const TargetRegisterClass *SubRC =
6408            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6409        MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6410            MI: MII, MRI, SuperReg: Src2, SuperRC: Src2RC, SubIdx: AMDGPU::sub0, SubRC);
6411        MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6412            MI: MII, MRI, SuperReg: Src2, SuperRC: Src2RC, SubIdx: AMDGPU::sub1, SubRC);
6413        Register Src2_32 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6414
6415        BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_OR_B32), DestReg: Src2_32)
6416            .add(MO: Src2Sub0)
6417            .add(MO: Src2Sub1);
6418
6419        BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
6420            .addReg(RegNo: Src2_32, Flags: RegState::Kill)
6421            .addImm(Val: 0);
6422      }
6423    } else {
6424      BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
6425          .addReg(RegNo: Src2.getReg())
6426          .addImm(Val: 0);
6427    }
6428
6429    unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6430                       ? AMDGPU::S_ADDC_U32
6431                       : AMDGPU::S_SUBB_U32;
6432
6433    BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest.getReg()).add(MO: Src0).add(MO: Src1);
6434
      // Carry-out: select all-ones/zero from SCC.
6435    unsigned SelOpc =
6436        ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6437
6438    BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: SelOpc), DestReg: CarryDest.getReg())
6439        .addImm(Val: -1)
6440        .addImm(Val: 0);
6441
6442    MI.eraseFromParent();
6443    return BB;
6444  }
  // Initialize M0 either via a register COPY or S_MOV_B32 of an immediate.
6445  case AMDGPU::SI_INIT_M0: {
6446    MachineOperand &M0Init = MI.getOperand(i: 0);
6447    BuildMI(BB&: *BB, I: MI.getIterator(), MIMD: MI.getDebugLoc(),
6448            MCID: TII->get(Opcode: M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6449            DestReg: AMDGPU::M0)
6450        .add(MO: M0Init);
6451    MI.eraseFromParent();
6452    return BB;
6453  }
6454  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6455    // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6456    BuildMI(BB&: *BB, I: MI.getIterator(), MIMD: MI.getDebugLoc(),
6457            MCID: TII->get(Opcode: AMDGPU::S_CMP_EQ_U32))
6458        .addImm(Val: 0)
6459        .addImm(Val: 0);
6460    return BB;
6461  }
  // Replace with an immediate move of the statically known LDS size.
6462  case AMDGPU::GET_GROUPSTATICSIZE: {
6463    assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6464           getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6465    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32))
6466        .add(MO: MI.getOperand(i: 0))
6467        .addImm(Val: MFI->getLDSSize());
6468    MI.eraseFromParent();
6469    return BB;
6470  }
6471  case AMDGPU::GET_SHADERCYCLESHILO: {
6472    assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
6473    // The algorithm is:
6474    //
6475    // hi1 = getreg(SHADER_CYCLES_HI)
6476    // lo1 = getreg(SHADER_CYCLES_LO)
6477    // hi2 = getreg(SHADER_CYCLES_HI)
6478    //
6479    // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6480    // Otherwise there was overflow and the result is hi2:0. In both cases the
6481    // result should represent the actual time at some point during the sequence
6482    // of three getregs.
6483    using namespace AMDGPU::Hwreg;
6484    Register RegHi1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6485    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegHi1)
6486        .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES_HI, Values: 0, Values: 32));
6487    Register RegLo1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6488    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegLo1)
6489        .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES, Values: 0, Values: 32));
6490    Register RegHi2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6491    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegHi2)
6492        .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES_HI, Values: 0, Values: 32));
6493    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_EQ_U32))
6494        .addReg(RegNo: RegHi1)
6495        .addReg(RegNo: RegHi2);
6496    Register RegLo = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
6497    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: RegLo)
6498        .addReg(RegNo: RegLo1)
6499        .addImm(Val: 0);
6500    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE))
6501        .add(MO: MI.getOperand(i: 0))
6502        .addReg(RegNo: RegLo)
6503        .addImm(Val: AMDGPU::sub0)
6504        .addReg(RegNo: RegHi2)
6505        .addImm(Val: AMDGPU::sub1);
6506    MI.eraseFromParent();
6507    return BB;
6508  }
6509  case AMDGPU::SI_INDIRECT_SRC_V1:
6510  case AMDGPU::SI_INDIRECT_SRC_V2:
6511  case AMDGPU::SI_INDIRECT_SRC_V3:
6512  case AMDGPU::SI_INDIRECT_SRC_V4:
6513  case AMDGPU::SI_INDIRECT_SRC_V5:
6514  case AMDGPU::SI_INDIRECT_SRC_V6:
6515  case AMDGPU::SI_INDIRECT_SRC_V7:
6516  case AMDGPU::SI_INDIRECT_SRC_V8:
6517  case AMDGPU::SI_INDIRECT_SRC_V9:
6518  case AMDGPU::SI_INDIRECT_SRC_V10:
6519  case AMDGPU::SI_INDIRECT_SRC_V11:
6520  case AMDGPU::SI_INDIRECT_SRC_V12:
6521  case AMDGPU::SI_INDIRECT_SRC_V16:
6522  case AMDGPU::SI_INDIRECT_SRC_V32:
6523    return emitIndirectSrc(MI, MBB&: *BB, ST: *getSubtarget());
6524  case AMDGPU::SI_INDIRECT_DST_V1:
6525  case AMDGPU::SI_INDIRECT_DST_V2:
6526  case AMDGPU::SI_INDIRECT_DST_V3:
6527  case AMDGPU::SI_INDIRECT_DST_V4:
6528  case AMDGPU::SI_INDIRECT_DST_V5:
6529  case AMDGPU::SI_INDIRECT_DST_V6:
6530  case AMDGPU::SI_INDIRECT_DST_V7:
6531  case AMDGPU::SI_INDIRECT_DST_V8:
6532  case AMDGPU::SI_INDIRECT_DST_V9:
6533  case AMDGPU::SI_INDIRECT_DST_V10:
6534  case AMDGPU::SI_INDIRECT_DST_V11:
6535  case AMDGPU::SI_INDIRECT_DST_V12:
6536  case AMDGPU::SI_INDIRECT_DST_V16:
6537  case AMDGPU::SI_INDIRECT_DST_V32:
6538    return emitIndirectDst(MI, MBB&: *BB, ST: *getSubtarget());
6539  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6540  case AMDGPU::SI_KILL_I1_PSEUDO:
6541    return splitKillBlock(MI, BB);
  // 64-bit conditional select: split into two 32-bit V_CNDMASK_B32 on the
  // sub0/sub1 halves and reassemble the result with REG_SEQUENCE.
6542  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6543    Register Dst = MI.getOperand(i: 0).getReg();
6544    const MachineOperand &Src0 = MI.getOperand(i: 1);
6545    const MachineOperand &Src1 = MI.getOperand(i: 2);
6546    Register SrcCond = MI.getOperand(i: 3).getReg();
6547
6548    Register DstLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6549    Register DstHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
6550    const auto *CondRC = TRI->getWaveMaskRegClass();
6551    Register SrcCondCopy = MRI.createVirtualRegister(RegClass: CondRC);
6552
6553    const TargetRegisterClass *Src0RC = Src0.isReg()
6554                                            ? MRI.getRegClass(Reg: Src0.getReg())
6555                                            : &AMDGPU::VReg_64RegClass;
6556    const TargetRegisterClass *Src1RC = Src1.isReg()
6557                                            ? MRI.getRegClass(Reg: Src1.getReg())
6558                                            : &AMDGPU::VReg_64RegClass;
6559
6560    const TargetRegisterClass *Src0SubRC =
6561        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6562    const TargetRegisterClass *Src1SubRC =
6563        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6564
6565    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6566        MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
6567    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6568        MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
6569
6570    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6571        MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
6572    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6573        MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
6574
6575    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: SrcCondCopy).addReg(RegNo: SrcCond);
6576    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstLo)
6577        .addImm(Val: 0)
6578        .add(MO: Src0Sub0)
6579        .addImm(Val: 0)
6580        .add(MO: Src1Sub0)
6581        .addReg(RegNo: SrcCondCopy);
6582    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstHi)
6583        .addImm(Val: 0)
6584        .add(MO: Src0Sub1)
6585        .addImm(Val: 0)
6586        .add(MO: Src1Sub1)
6587        .addReg(RegNo: SrcCondCopy);
6588
6589    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
6590        .addReg(RegNo: DstLo)
6591        .addImm(Val: AMDGPU::sub0)
6592        .addReg(RegNo: DstHi)
6593        .addImm(Val: AMDGPU::sub1);
6594    MI.eraseFromParent();
6595    return BB;
6596  }
6597  case AMDGPU::SI_BR_UNDEF: {
6598    MachineInstr *Br = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
6599                           .add(MO: MI.getOperand(i: 0));
6600    Br->getOperand(i: 1).setIsUndef(); // read undef SCC
6601    MI.eraseFromParent();
6602    return BB;
6603  }
  // Tag stack adjustments with an implicit def/use of the stack pointer.
6604  case AMDGPU::ADJCALLSTACKUP:
6605  case AMDGPU::ADJCALLSTACKDOWN: {
6606    const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6607    MachineInstrBuilder MIB(*MF, &MI);
6608    MIB.addReg(RegNo: Info->getStackPtrOffsetReg(), Flags: RegState::ImplicitDefine)
6609        .addReg(RegNo: Info->getStackPtrOffsetReg(), Flags: RegState::Implicit);
6610    return BB;
6611  }
  // Rewrite SI_CALL_ISEL into SI_CALL with the return-address register as
  // the destination, cloning all operands and memory references.
6612  case AMDGPU::SI_CALL_ISEL: {
6613    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(MF: *MF);
6614
6615    MachineInstrBuilder MIB;
6616    MIB = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_CALL), DestReg: ReturnAddrReg);
6617
6618    for (const MachineOperand &MO : MI.operands())
6619      MIB.add(MO);
6620
6621    MIB.cloneMemRefs(OtherMI: MI);
6622    MI.eraseFromParent();
6623    return BB;
6624  }
6625  case AMDGPU::V_ADD_CO_U32_e32:
6626  case AMDGPU::V_SUB_CO_U32_e32:
6627  case AMDGPU::V_SUBREV_CO_U32_e32: {
6628    // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6629    unsigned Opc = MI.getOpcode();
6630
      // Fall back to the VOP3 (e64) form when the e32 encoding does not
      // exist on this subtarget.
6631    bool NeedClampOperand = false;
6632    if (TII->pseudoToMCOpcode(Opcode: Opc) == -1) {
6633      Opc = AMDGPU::getVOPe64(Opcode: Opc);
6634      NeedClampOperand = true;
6635    }
6636
6637    auto I = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: MI.getOperand(i: 0).getReg());
6638    if (TII->isVOP3(MI: *I)) {
6639      I.addReg(RegNo: TRI->getVCC(), Flags: RegState::Define);
6640    }
6641    I.add(MO: MI.getOperand(i: 1)).add(MO: MI.getOperand(i: 2));
6642    if (NeedClampOperand)
6643      I.addImm(Val: 0); // clamp bit for e64 encoding
6644
6645    TII->legalizeOperands(MI&: *I);
6646
6647    MI.eraseFromParent();
6648    return BB;
6649  }
6650  case AMDGPU::V_ADDC_U32_e32:
6651  case AMDGPU::V_SUBB_U32_e32:
6652  case AMDGPU::V_SUBBREV_U32_e32:
6653    // These instructions have an implicit use of vcc which counts towards the
6654    // constant bus limit.
6655    TII->legalizeOperands(MI);
6656    return BB;
6657  case AMDGPU::DS_GWS_INIT:
6658  case AMDGPU::DS_GWS_SEMA_BR:
6659  case AMDGPU::DS_GWS_BARRIER:
6660  case AMDGPU::DS_GWS_SEMA_V:
6661  case AMDGPU::DS_GWS_SEMA_P:
6662  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6663    // A s_waitcnt 0 is required to be the instruction immediately following.
6664    if (getSubtarget()->hasGWSAutoReplay()) {
6665      bundleInstWithWaitcnt(MI);
6666      return BB;
6667    }
6668
6669    return emitGWSMemViolTestLoop(MI, BB);
6670  case AMDGPU::S_SETREG_B32: {
6671    // Try to optimize cases that only set the denormal mode or rounding mode.
6672    //
6673    // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6674    // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6675    // instead.
6676    //
6677    // FIXME: This could be predicates on the immediate, but tablegen doesn't
6678    // allow you to have a no side effect instruction in the output of a
6679    // sideeffecting pattern.
6680    auto [ID, Offset, Width] =
6681        AMDGPU::Hwreg::HwregEncoding::decode(Encoded: MI.getOperand(i: 1).getImm());
6682    if (ID != AMDGPU::Hwreg::ID_MODE)
6683      return BB;
6684
6685    const unsigned WidthMask = maskTrailingOnes<unsigned>(N: Width);
6686    const unsigned SetMask = WidthMask << Offset;
6687
6688    if (getSubtarget()->hasDenormModeInst()) {
6689      unsigned SetDenormOp = 0;
6690      unsigned SetRoundOp = 0;
6691
6692      // The dedicated instructions can only set the whole denorm or round mode
6693      // at once, not a subset of bits in either.
6694      if (SetMask ==
6695          (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
6696        // If this fully sets both the round and denorm mode, emit the two
6696        // dedicated instructions for these.
6698        SetRoundOp = AMDGPU::S_ROUND_MODE;
6699        SetDenormOp = AMDGPU::S_DENORM_MODE;
6700      } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6701        SetRoundOp = AMDGPU::S_ROUND_MODE;
6702      } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6703        SetDenormOp = AMDGPU::S_DENORM_MODE;
6704      }
6705
6706      if (SetRoundOp || SetDenormOp) {
        // Only fold when the written value is a known immediate.
6707        MachineInstr *Def = MRI.getVRegDef(Reg: MI.getOperand(i: 0).getReg());
6708        if (Def && Def->isMoveImmediate() && Def->getOperand(i: 1).isImm()) {
6709          unsigned ImmVal = Def->getOperand(i: 1).getImm();
6710          if (SetRoundOp) {
6711            BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: SetRoundOp))
6712                .addImm(Val: ImmVal & 0xf);
6713
6714            // If we also have the denorm mode, get just the denorm mode bits.
6715            ImmVal >>= 4;
6716          }
6717
6718          if (SetDenormOp) {
6719            BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: SetDenormOp))
6720                .addImm(Val: ImmVal & 0xf);
6721          }
6722
6723          MI.eraseFromParent();
6724          return BB;
6725        }
6726      }
6727    }
6728
6729    // If only FP bits are touched, used the no side effects pseudo.
6730    if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6731                    AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6732      MI.setDesc(TII->get(Opcode: AMDGPU::S_SETREG_B32_mode));
6733
6734    return BB;
6735  }
6736  case AMDGPU::S_INVERSE_BALLOT_U32:
6737  case AMDGPU::S_INVERSE_BALLOT_U64:
6738    // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6739    // necessary. After that they are equivalent to a COPY.
6740    MI.setDesc(TII->get(Opcode: AMDGPU::COPY));
6741    return BB;
6742  case AMDGPU::ENDPGM_TRAP: {
      // If this is already the last instruction of a block with no
      // successors, it can simply become S_ENDPGM in place.
6743    if (BB->succ_empty() && std::next(x: MI.getIterator()) == BB->end()) {
6744      MI.setDesc(TII->get(Opcode: AMDGPU::S_ENDPGM));
6745      MI.addOperand(Op: MachineOperand::CreateImm(Val: 0));
6746      return BB;
6747    }
6748
6749    // We need a block split to make the real endpgm a terminator. We also don't
6750    // want to break phis in successor blocks, so we can't just delete to the
6751    // end of the block.
6752
6753    MachineBasicBlock *SplitBB = BB->splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
6754    MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6755    MF->push_back(MBB: TrapBB);
6756    // clang-format off
6757    BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ENDPGM))
6758        .addImm(Val: 0);
6759    BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
6760        .addMBB(MBB: TrapBB);
6761    // clang-format on
6762
6763    BB->addSuccessor(Succ: TrapBB);
6764    MI.eraseFromParent();
6765    return SplitBB;
6766  }
6767  case AMDGPU::SIMULATED_TRAP: {
6768    assert(Subtarget->hasPrivEnabledTrap2NopBug());
6769    MachineBasicBlock *SplitBB =
6770        TII->insertSimulatedTrap(MRI, MBB&: *BB, MI, DL: MI.getDebugLoc());
6771    MI.eraseFromParent();
6772    return SplitBB;
6773  }
6774  case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6775  case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6776    assert(MFI->isWholeWaveFunction());
6777
6778    // During ISel, it's difficult to propagate the original EXEC mask to use as
6779    // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6780    MachineInstr *Setup = TII->getWholeWaveFunctionSetup(MF&: *BB->getParent());
6781    assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6782    Register OriginalExec = Setup->getOperand(i: 0).getReg();
6783    MF->getRegInfo().clearKillFlags(Reg: OriginalExec);
6784    MI.getOperand(i: 0).setReg(OriginalExec);
6785    return BB;
6786  }
6787  default:
    // Image/MUBUF loads get a result-init fixup via AddMemOpInit
    // (presumably to define result registers the memory op may leave
    // partially written — see that helper); everything else goes to the
    // generic AMDGPU custom inserter.
6788    if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6789      if (!MI.mayStore())
6790        AddMemOpInit(MI);
6791      return BB;
6792    }
6793    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, MBB: BB);
6794  }
6795}
6796
6797bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
6798 // This currently forces unfolding various combinations of fsub into fma with
6799 // free fneg'd operands. As long as we have fast FMA (controlled by
6800 // isFMAFasterThanFMulAndFAdd), we should perform these.
6801
6802 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6803 // most of these combines appear to be cycle neutral but save on instruction
6804 // count / code size.
6805 return true;
6806}
6807
6808bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
6809
6810EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
6811 EVT VT) const {
6812 if (!VT.isVector()) {
6813 return MVT::i1;
6814 }
6815 return EVT::getVectorVT(Context&: Ctx, VT: MVT::i1, NumElements: VT.getVectorNumElements());
6816}
6817
6818MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
6819 // TODO: Should i16 be used always if legal? For now it would force VALU
6820 // shifts.
6821 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6822}
6823
6824LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
6825 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6826 ? Ty.changeElementSize(NewEltSize: 16)
6827 : Ty.changeElementSize(NewEltSize: 32);
6828}
6829
6830// Answering this is somewhat tricky and depends on the specific device which
6831// have different rates for fma or all f64 operations.
6832//
6833// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6834// regardless of which device (although the number of cycles differs between
6835// devices), so it is always profitable for f64.
6836//
6837// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6838// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6839// which we can always do even without fused FP ops since it returns the same
6840// result as the separate operations and since it is always full
6841// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6842// however does not support denormals, so we do report fma as faster if we have
6843// a fast fma device and require denormals.
6844//
6845bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6846 EVT VT) const {
6847 VT = VT.getScalarType();
6848
6849 switch (VT.getSimpleVT().SimpleTy) {
6850 case MVT::f32: {
6851 // If mad is not available this depends only on if f32 fma is full rate.
6852 if (!Subtarget->hasMadMacF32Insts())
6853 return Subtarget->hasFastFMAF32();
6854
6855 // Otherwise f32 mad is always full rate and returns the same result as
6856 // the separate operations so should be preferred over fma.
6857 // However does not support denormals.
6858 if (!denormalModeIsFlushAllF32(MF))
6859 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6860
6861 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6862 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6863 }
6864 case MVT::f64:
6865 return true;
6866 case MVT::f16:
6867 case MVT::bf16:
6868 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6869 default:
6870 break;
6871 }
6872
6873 return false;
6874}
6875
6876bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6877 LLT Ty) const {
6878 switch (Ty.getScalarSizeInBits()) {
6879 case 16:
6880 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f16);
6881 case 32:
6882 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f32);
6883 case 64:
6884 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f64);
6885 default:
6886 break;
6887 }
6888
6889 return false;
6890}
6891
6892bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
6893 if (!Ty.isScalar())
6894 return false;
6895
6896 if (Ty.getScalarSizeInBits() == 16)
6897 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(MF: *MI.getMF());
6898 if (Ty.getScalarSizeInBits() == 32)
6899 return Subtarget->hasMadMacF32Insts() &&
6900 denormalModeIsFlushAllF32(MF: *MI.getMF());
6901
6902 return false;
6903}
6904
6905bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
6906 const SDNode *N) const {
6907 // TODO: Check future ftz flag
6908 // v_mad_f32/v_mac_f32 do not support denormals.
6909 EVT VT = N->getValueType(ResNo: 0);
6910 if (VT == MVT::f32)
6911 return Subtarget->hasMadMacF32Insts() &&
6912 denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
6913 if (VT == MVT::f16) {
6914 return Subtarget->hasMadF16() &&
6915 denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction());
6916 }
6917
6918 return false;
6919}
6920
6921//===----------------------------------------------------------------------===//
6922// Custom DAG Lowering Operations
6923//===----------------------------------------------------------------------===//
6924
6925// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6926// wider vector type is legal.
6927SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
6928 SelectionDAG &DAG) const {
6929 unsigned Opc = Op.getOpcode();
6930 EVT VT = Op.getValueType();
6931 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6932 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6933 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6934 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6935 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6936 VT == MVT::v32bf16);
6937
6938 auto [Lo, Hi] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
6939
6940 SDLoc SL(Op);
6941 SDValue OpLo = DAG.getNode(Opcode: Opc, DL: SL, VT: Lo.getValueType(), Operand: Lo, Flags: Op->getFlags());
6942 SDValue OpHi = DAG.getNode(Opcode: Opc, DL: SL, VT: Hi.getValueType(), Operand: Hi, Flags: Op->getFlags());
6943
6944 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
6945}
6946
// Enable lowering of ROTR for vxi32 types. This is a workaround for a
// regression whereby extra unnecessary instructions were added to codegen
// for rotr operations, caused by legalising v2i32 or. This resulted in extra
// instructions to extract the result from the vector.
6951SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
6952 [[maybe_unused]] EVT VT = Op.getValueType();
6953
6954 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6955 VT == MVT::v16i32) &&
6956 "Unexpected ValueType.");
6957
6958 return DAG.UnrollVectorOp(N: Op.getNode());
6959}
6960
6961// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6962// wider vector type is legal.
6963SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
6964 SelectionDAG &DAG) const {
6965 unsigned Opc = Op.getOpcode();
6966 EVT VT = Op.getValueType();
6967 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6968 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6969 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6970 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6971 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6972 VT == MVT::v32bf16);
6973
6974 auto [Lo0, Hi0] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
6975 auto [Lo1, Hi1] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 1);
6976
6977 SDLoc SL(Op);
6978
6979 SDValue OpLo =
6980 DAG.getNode(Opcode: Opc, DL: SL, VT: Lo0.getValueType(), N1: Lo0, N2: Lo1, Flags: Op->getFlags());
6981 SDValue OpHi =
6982 DAG.getNode(Opcode: Opc, DL: SL, VT: Hi0.getValueType(), N1: Hi0, N2: Hi1, Flags: Op->getFlags());
6983
6984 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
6985}
6986
6987SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6988 SelectionDAG &DAG) const {
6989 unsigned Opc = Op.getOpcode();
6990 EVT VT = Op.getValueType();
6991 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6992 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6993 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6994 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6995 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6996 VT == MVT::v32bf16);
6997
6998 SDValue Op0 = Op.getOperand(i: 0);
6999 auto [Lo0, Hi0] = Op0.getValueType().isVector()
7000 ? DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0)
7001 : std::pair(Op0, Op0);
7002
7003 auto [Lo1, Hi1] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 1);
7004 auto [Lo2, Hi2] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 2);
7005
7006 SDLoc SL(Op);
7007 auto ResVT = DAG.GetSplitDestVTs(VT);
7008
7009 SDValue OpLo =
7010 DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.first, N1: Lo0, N2: Lo1, N3: Lo2, Flags: Op->getFlags());
7011 SDValue OpHi =
7012 DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.second, N1: Hi0, N2: Hi1, N3: Hi2, Flags: Op->getFlags());
7013
7014 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
7015}
7016
/// Top-level custom lowering dispatch. Routes each custom/illegal operation
/// to its dedicated lowering helper; anything unhandled falls back to the
/// common AMDGPU lowering in the base class.
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::BRCOND:
    return LowerBRCOND(Op, DAG);
  case ISD::RETURNADDR:
    return LowerRETURNADDR(Op, DAG);
  case ISD::SPONENTRY:
    return LowerSPONENTRY(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Result;
  }
  case ISD::FSQRT: {
    // Only f32/f64 sqrt get custom expansions; other types are left alone.
    EVT VT = Op.getValueType();
    if (VT == MVT::f32)
      return lowerFSQRTF32(Op, DAG);
    if (VT == MVT::f64)
      return lowerFSQRTF64(Op, DAG);
    return SDValue();
  }
  case ISD::FSIN:
  case ISD::FCOS:
    return LowerTrig(Op, DAG);
  case ISD::SELECT:
    return LowerSELECT(Op, DAG);
  case ISD::FDIV:
    return LowerFDIV(Op, DAG);
  case ISD::FFREXP:
    return LowerFFREXP(Op, DAG);
  case ISD::ATOMIC_CMP_SWAP:
    return LowerATOMIC_CMP_SWAP(Op, DAG);
  case ISD::STORE:
    return LowerSTORE(Op, DAG);
  case ISD::GlobalAddress: {
    MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    return LowerGlobalAddress(MFI, Op, DAG);
  }
  case ISD::ExternalSymbol:
    return LowerExternalSymbol(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN:
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN:
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
  case ISD::INTRINSIC_VOID:
    return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::ADDRSPACECAST:
    return lowerADDRSPACECAST(Op, DAG);
  case ISD::INSERT_SUBVECTOR:
    return lowerINSERT_SUBVECTOR(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:
    return lowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT:
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::VECTOR_SHUFFLE:
    return lowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:
    return lowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::BUILD_VECTOR:
    return lowerBUILD_VECTOR(Op, DAG);
  case ISD::FP_ROUND:
  case ISD::STRICT_FP_ROUND:
    return lowerFP_ROUND(Op, DAG);
  case ISD::TRAP:
    return lowerTRAP(Op, DAG);
  case ISD::DEBUGTRAP:
    return lowerDEBUGTRAP(Op, DAG);
  // Wide-vector unary operations are split into two half-width operations.
  case ISD::ABS:
  case ISD::FABS:
  case ISD::FNEG:
  case ISD::FCANONICALIZE:
  case ISD::BSWAP:
    return splitUnaryVectorOp(Op, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
    return lowerFMINNUM_FMAXNUM(Op, DAG);
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
    return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
    return lowerFMINIMUM_FMAXIMUM(Op, DAG);
  case ISD::FLDEXP:
  case ISD::STRICT_FLDEXP:
    return lowerFLDEXP(Op, DAG);
  case ISD::FMA:
    return splitTernaryVectorOp(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&
        Op.getValueType() == MVT::i16 &&
        Op.getOperand(i: 0).getValueType() == MVT::f32) {
      // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
      return Op;
    }
    return LowerFP_TO_INT(Op, DAG);
  // Wide-vector binary operations are split into two half-width operations.
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:
  case ISD::ADD:
  case ISD::SUB:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:
  case ISD::FADD:
  case ISD::FMUL:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::UADDSAT:
  case ISD::USUBSAT:
  case ISD::SADDSAT:
  case ISD::SSUBSAT:
    return splitBinaryVectorOp(Op, DAG);
  case ISD::FCOPYSIGN:
    return lowerFCOPYSIGN(Op, DAG);
  case ISD::MUL:
    return lowerMUL(Op, DAG);
  case ISD::SMULO:
  case ISD::UMULO:
    return lowerXMULO(Op, DAG);
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return lowerXMUL_LOHI(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
    return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::STACKSAVE:
    return LowerSTACKSAVE(Op, DAG);
  case ISD::GET_ROUNDING:
    return lowerGET_ROUNDING(Op, DAG);
  case ISD::SET_ROUNDING:
    return lowerSET_ROUNDING(Op, DAG);
  case ISD::PREFETCH:
    return lowerPREFETCH(Op, DAG);
  case ISD::FP_EXTEND:
  case ISD::STRICT_FP_EXTEND:
    return lowerFP_EXTEND(Op, DAG);
  case ISD::GET_FPENV:
    return lowerGET_FPENV(Op, DAG);
  case ISD::SET_FPENV:
    return lowerSET_FPENV(Op, DAG);
  case ISD::ROTR:
    return lowerROTR(Op, DAG);
  }
  // Not reached: every case above (including default) returns. Kept to keep
  // compilers happy about falling off the end of the function.
  return SDValue();
}
7167
7168// Used for D16: Casts the result of an instruction into the right vector,
7169// packs values if loads return unpacked values.
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
                                       const SDLoc &DL, SelectionDAG &DAG,
                                       bool Unpacked) {
  // Scalar results need no repacking.
  if (!LoadVT.isVector())
    return Result;

  // Cast back to the original packed type or to a larger type that is a
  // multiple of 32 bit for D16. Widening the return type is required for
  // legalization.
  EVT FittingLoadVT = LoadVT;
  if ((LoadVT.getVectorNumElements() % 2) == 1) {
    // Odd element counts (e.g. v3f16) are widened by one element.
    FittingLoadVT =
        EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(),
                         NumElements: LoadVT.getVectorNumElements() + 1);
  }

  if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
    // Truncate to v2i16/v4i16.
    EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();

    // Workaround legalizer not scalarizing truncate after vector op
    // legalization but not creating intermediate vector trunc.
    SmallVector<SDValue, 4> Elts;
    DAG.ExtractVectorElements(Op: Result, Args&: Elts);
    for (SDValue &Elt : Elts)
      Elt = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Elt);

    // Pad illegal v1i16/v3i16 to v4i16
    if ((LoadVT.getVectorNumElements() % 2) == 1)
      Elts.push_back(Elt: DAG.getPOISON(VT: MVT::i16));

    Result = DAG.getBuildVector(VT: IntLoadVT, DL, Ops: Elts);

    // Bitcast to original type (v2f16/v4f16).
    return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result);
  }

  // Cast back to the original packed type.
  return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result);
}
7210
7211SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
7212 SelectionDAG &DAG,
7213 ArrayRef<SDValue> Ops,
7214 bool IsIntrinsic) const {
7215 SDLoc DL(M);
7216
7217 bool Unpacked = Subtarget->hasUnpackedD16VMem();
7218 EVT LoadVT = M->getValueType(ResNo: 0);
7219
7220 EVT EquivLoadVT = LoadVT;
7221 if (LoadVT.isVector()) {
7222 if (Unpacked) {
7223 EquivLoadVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
7224 NumElements: LoadVT.getVectorNumElements());
7225 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
7226 // Widen v3f16 to legal type
7227 EquivLoadVT =
7228 EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(),
7229 NumElements: LoadVT.getVectorNumElements() + 1);
7230 }
7231 }
7232
7233 // Change from v4f16/v2f16 to EquivLoadVT.
7234 SDVTList VTList = DAG.getVTList(VT1: EquivLoadVT, VT2: MVT::Other);
7235
7236 SDValue Load = DAG.getMemIntrinsicNode(
7237 Opcode: IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, dl: DL, VTList, Ops,
7238 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
7239
7240 SDValue Adjusted = adjustLoadValueTypeImpl(Result: Load, LoadVT, DL, DAG, Unpacked);
7241
7242 return DAG.getMergeValues(Ops: {Adjusted, Load.getValue(R: 1)}, dl: DL);
7243}
7244
7245SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
7246 SelectionDAG &DAG,
7247 ArrayRef<SDValue> Ops) const {
7248 SDLoc DL(M);
7249 EVT LoadVT = M->getValueType(ResNo: 0);
7250 EVT EltType = LoadVT.getScalarType();
7251 EVT IntVT = LoadVT.changeTypeToInteger();
7252
7253 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7254
7255 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7256 bool IsTFE = M->getNumValues() == 3;
7257
7258 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7259 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7260 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7261 : AMDGPUISD::BUFFER_LOAD;
7262
7263 if (IsD16) {
7264 return adjustLoadValueType(Opcode: AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
7265 }
7266
7267 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7268 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
7269 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, MMO: M->getMemOperand(),
7270 IsTFE);
7271
7272 if (isTypeLegal(VT: LoadVT)) {
7273 return getMemIntrinsicNode(Opcode: Opc, DL, VTList: M->getVTList(), Ops, MemVT: IntVT,
7274 MMO: M->getMemOperand(), DAG);
7275 }
7276
7277 EVT CastVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: LoadVT);
7278 SDVTList VTList = DAG.getVTList(VT1: CastVT, VT2: MVT::Other);
7279 SDValue MemNode = getMemIntrinsicNode(Opcode: Opc, DL, VTList, Ops, MemVT: CastVT,
7280 MMO: M->getMemOperand(), DAG);
7281 return DAG.getMergeValues(
7282 Ops: {DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: MemNode), MemNode.getValue(R: 1)},
7283 dl: DL);
7284}
7285
7286static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
7287 SelectionDAG &DAG) {
7288 EVT VT = N->getValueType(ResNo: 0);
7289 unsigned CondCode = N->getConstantOperandVal(Num: 3);
7290 if (!ICmpInst::isIntPredicate(P: static_cast<ICmpInst::Predicate>(CondCode)))
7291 return DAG.getPOISON(VT);
7292
7293 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7294
7295 SDValue LHS = N->getOperand(Num: 1);
7296 SDValue RHS = N->getOperand(Num: 2);
7297
7298 SDLoc DL(N);
7299
7300 EVT CmpVT = LHS.getValueType();
7301 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(VT: MVT::i16)) {
7302 unsigned PromoteOp =
7303 ICmpInst::isSigned(predicate: IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7304 LHS = DAG.getNode(Opcode: PromoteOp, DL, VT: MVT::i32, Operand: LHS);
7305 RHS = DAG.getNode(Opcode: PromoteOp, DL, VT: MVT::i32, Operand: RHS);
7306 }
7307
7308 ISD::CondCode CCOpcode = getICmpCondCode(Pred: IcInput);
7309
7310 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7311 EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize);
7312
7313 SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL, VT: CCVT, N1: LHS, N2: RHS,
7314 N3: DAG.getCondCode(Cond: CCOpcode));
7315 if (VT.bitsEq(VT: CCVT))
7316 return SetCC;
7317 return DAG.getZExtOrTrunc(Op: SetCC, DL, VT);
7318}
7319
7320static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
7321 SelectionDAG &DAG) {
7322 EVT VT = N->getValueType(ResNo: 0);
7323
7324 unsigned CondCode = N->getConstantOperandVal(Num: 3);
7325 if (!FCmpInst::isFPPredicate(P: static_cast<FCmpInst::Predicate>(CondCode)))
7326 return DAG.getPOISON(VT);
7327
7328 SDValue Src0 = N->getOperand(Num: 1);
7329 SDValue Src1 = N->getOperand(Num: 2);
7330 EVT CmpVT = Src0.getValueType();
7331 SDLoc SL(N);
7332
7333 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(VT: CmpVT)) {
7334 Src0 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src0);
7335 Src1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src1);
7336 }
7337
7338 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7339 ISD::CondCode CCOpcode = getFCmpCondCode(Pred: IcInput);
7340 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7341 EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize);
7342 SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT: CCVT, N1: Src0, N2: Src1,
7343 N3: DAG.getCondCode(Cond: CCOpcode));
7344 if (VT.bitsEq(VT: CCVT))
7345 return SetCC;
7346 return DAG.getZExtOrTrunc(Op: SetCC, DL: SL, VT);
7347}
7348
7349static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
7350 SelectionDAG &DAG) {
7351 EVT VT = N->getValueType(ResNo: 0);
7352 SDValue Src = N->getOperand(Num: 1);
7353 SDLoc SL(N);
7354
7355 if (Src.getOpcode() == ISD::SETCC) {
7356 SDValue Op0 = Src.getOperand(i: 0);
7357 SDValue Op1 = Src.getOperand(i: 1);
7358 // Need to expand bfloat to float for comparison (setcc).
7359 if (Op0.getValueType() == MVT::bf16) {
7360 Op0 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Op0);
7361 Op1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Op1);
7362 }
7363 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7364 return DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT, N1: Op0, N2: Op1, N3: Src.getOperand(i: 2));
7365 }
7366 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Val&: Src)) {
7367 // (ballot 0) -> 0
7368 if (Arg->isZero())
7369 return DAG.getConstant(Val: 0, DL: SL, VT);
7370
7371 // (ballot 1) -> EXEC/EXEC_LO
7372 if (Arg->isOne()) {
7373 Register Exec;
7374 if (VT.getScalarSizeInBits() == 32)
7375 Exec = AMDGPU::EXEC_LO;
7376 else if (VT.getScalarSizeInBits() == 64)
7377 Exec = AMDGPU::EXEC;
7378 else
7379 return SDValue();
7380
7381 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: Exec, VT);
7382 }
7383 }
7384
7385 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7386 // ISD::SETNE)
7387 return DAG.getNode(
7388 Opcode: AMDGPUISD::SETCC, DL: SL, VT, N1: DAG.getZExtOrTrunc(Op: Src, DL: SL, VT: MVT::i32),
7389 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), N3: DAG.getCondCode(Cond: ISD::SETNE));
7390}
7391
// Legalize lane-crossing intrinsics (readlane/writelane/readfirstlane,
// permlane16/x16/64, mov_dpp8, update_dpp, set_inactive*) whose value type
// does not match the natively supported lane-op width. Values narrower than
// 32 bits are any-extended to i32; wider values are split into 32-bit (or,
// for DPALU DPP, 64-bit) pieces that are processed independently and then
// reassembled. Returns SDValue() when the node is already legal or cannot be
// split evenly.
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
                           SelectionDAG &DAG) {
  EVT VT = N->getValueType(ResNo: 0);
  unsigned ValSize = VT.getSizeInBits();
  unsigned IID = N->getConstantOperandVal(Num: 0);
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
  SDLoc SL(N);
  MVT IntVT = MVT::getIntegerVT(BitWidth: ValSize);
  const GCNSubtarget *ST = TLI.getSubtarget();
  // Default split granularity is 32 bits; 64-bit pieces are used for
  // update_dpp when the DP ALU supports DPP and the DPP control is legal.
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
      ST->hasDPALU_DPP() &&
      AMDGPU::isLegalDPALU_DPPControl(ST: *ST, DC: N->getConstantOperandVal(Num: 3)))
    SplitSize = 64;

  // Build one lane-op intrinsic node of type ValT from the (possibly
  // adjusted) sources. Operands are collected in reverse so the switch
  // cases can share tails via fallthrough, then reversed into place.
  auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
                                          SDValue Src2, MVT ValT) -> SDValue {
    SmallVector<SDValue, 8> Operands;
    switch (IID) {
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16:
    case Intrinsic::amdgcn_update_dpp:
      Operands.push_back(Elt: N->getOperand(Num: 6));
      Operands.push_back(Elt: N->getOperand(Num: 5));
      Operands.push_back(Elt: N->getOperand(Num: 4));
      [[fallthrough]];
    case Intrinsic::amdgcn_writelane:
      Operands.push_back(Elt: Src2);
      [[fallthrough]];
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
    case Intrinsic::amdgcn_mov_dpp8:
      Operands.push_back(Elt: Src1);
      [[fallthrough]];
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      Operands.push_back(Elt: Src0);
      break;
    default:
      llvm_unreachable("unhandled lane op");
    }

    Operands.push_back(Elt: DAG.getTargetConstant(Val: IID, DL: SL, VT: MVT::i32));
    std::reverse(first: Operands.begin(), last: Operands.end());

    // Carry over any convergence-control glue onto the new node.
    if (SDNode *GL = N->getGluedNode()) {
      assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
      GL = GL->getOperand(Num: 0).getNode();
      Operands.push_back(Elt: DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL: SL, VT: MVT::Glue,
                                    Operand: SDValue(GL, 0)));
    }

    return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: ValT, Ops: Operands);
  };

  // Pull out the source operands present for this particular intrinsic.
  SDValue Src0 = N->getOperand(Num: 1);
  SDValue Src1, Src2;
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_mov_dpp8 ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = N->getOperand(Num: 2);
    if (IID == Intrinsic::amdgcn_writelane ||
        IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
      Src2 = N->getOperand(Num: 3);
  }

  if (ValSize == SplitSize) {
    // Already legal
    return SDValue();
  }

  if (ValSize < 32) {
    // Sub-32-bit value: bitcast FP to integer, any-extend to i32, perform the
    // 32-bit lane op, then truncate and bitcast back to the original type.
    bool IsFloat = VT.isFloatingPoint();
    Src0 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src0) : Src0,
                                DL: SL, VT: MVT::i32);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
      Src1 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src1) : Src1,
                                  DL: SL, VT: MVT::i32);
    }

    if (IID == Intrinsic::amdgcn_writelane) {
      Src2 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src2) : Src2,
                                  DL: SL, VT: MVT::i32);
    }

    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
    SDValue Trunc = DAG.getAnyExtOrTrunc(Op: LaneOp, DL: SL, VT: IntVT);
    return IsFloat ? DAG.getBitcast(VT, V: Trunc) : Trunc;
  }

  // Wider values must divide evenly into SplitSize-wide pieces.
  if (ValSize % SplitSize != 0)
    return SDValue();

  // Scalarize a vector-typed lane op: emit one lane op per element (re-gluing
  // convergence control onto each) and rebuild the vector result.
  auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
    EVT VT = N->getValueType(ResNo: 0);
    unsigned NE = VT.getVectorNumElements();
    EVT EltVT = VT.getVectorElementType();
    SmallVector<SDValue, 8> Scalars;
    unsigned NumOperands = N->getNumOperands();
    SmallVector<SDValue, 4> Operands(NumOperands);
    SDNode *GL = N->getGluedNode();

    // only handle convergencectrl_glue
    assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);

    for (unsigned i = 0; i != NE; ++i) {
      for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
           ++j) {
        SDValue Operand = N->getOperand(Num: j);
        EVT OperandVT = Operand.getValueType();
        if (OperandVT.isVector()) {
          // A vector operand; extract a single element.
          EVT OperandEltVT = OperandVT.getVectorElementType();
          Operands[j] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: OperandEltVT,
                                    N1: Operand, N2: DAG.getVectorIdxConstant(Val: i, DL: SL));
        } else {
          // A scalar operand; just use it as is.
          Operands[j] = Operand;
        }
      }

      if (GL)
        Operands[NumOperands - 1] =
            DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL: SL, VT: MVT::Glue,
                        Operand: SDValue(GL->getOperand(Num: 0).getNode(), 0));

      Scalars.push_back(Elt: DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: EltVT, Ops: Operands));
    }

    EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NE);
    return DAG.getBuildVector(VT: VecVT, DL: SL, Ops: Scalars);
  };

  if (VT.isVector()) {
    switch (MVT::SimpleValueType EltTy =
                VT.getVectorElementType().getSimpleVT().SimpleTy) {
    case MVT::i32:
    case MVT::f32:
      if (SplitSize == 32) {
        // 32-bit elements already match the split width; unroll per element.
        SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
        return unrollLaneOp(LaneOp.getNode());
      }
      [[fallthrough]];
    case MVT::i16:
    case MVT::f16:
    case MVT::bf16: {
      // Small elements: process one SplitSize-wide subvector at a time.
      unsigned SubVecNumElt =
          SplitSize / VT.getVectorElementType().getSizeInBits();
      MVT SubVecVT = MVT::getVectorVT(VT: EltTy, NumElements: SubVecNumElt);
      SmallVector<SDValue, 4> Pieces;
      SDValue Src0SubVec, Src1SubVec, Src2SubVec;
      for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
        Src0SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src0,
                                 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));

        if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
            IsPermLane16)
          Src1SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src1,
                                   N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));

        if (IID == Intrinsic::amdgcn_writelane)
          Src2SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src2,
                                   N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));

        Pieces.push_back(
            Elt: IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
                ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
                : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
        EltIdx += SubVecNumElt;
      }
      return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT, Ops: Pieces);
    }
    default:
      // Handle all other cases by bitcasting to i32 vectors
      break;
    }
  }

  // Generic fallback: bitcast everything to a vector of SplitSize-wide
  // integers, unroll per element, and bitcast back to the original type.
  MVT VecVT =
      MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: SplitSize), NumElements: ValSize / SplitSize);
  Src0 = DAG.getBitcast(VT: VecVT, V: Src0);

  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    Src1 = DAG.getBitcast(VT: VecVT, V: Src1);

  if (IID == Intrinsic::amdgcn_writelane)
    Src2 = DAG.getBitcast(VT: VecVT, V: Src2);

  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
  SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
  return DAG.getBitcast(VT, V: UnrolledLaneOp);
}
7589
/// Lower a 32-bit wave shuffle (each lane reads the value held by the lane
/// selected by the index operand) into target intrinsics. Types that are not
/// exactly 32 bits wide are rejected here and handled by other lowering paths.
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N,
                                SelectionDAG &DAG) {
  EVT VT = N->getValueType(ResNo: 0);

  // Only 32-bit payloads are handled by this lowering.
  if (VT.getSizeInBits() != 32)
    return SDValue();

  SDLoc SL(N);

  // Operand 0 is the intrinsic ID; 1 is the value, 2 is the source lane.
  SDValue Value = N->getOperand(Num: 1);
  SDValue Index = N->getOperand(Num: 2);

  // ds_bpermute requires index to be multiplied by 4
  SDValue ShiftAmount = DAG.getShiftAmountConstant(Val: 2, VT: MVT::i32, DL: SL);
  SDValue ShiftedIndex =
      DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: Index.getValueType(), N1: Index, N2: ShiftAmount);

  // Intrinsics will require i32 to operate on
  SDValue ValueI32 = DAG.getBitcast(VT: MVT::i32, V: Value);

  // Convenience wrapper building an INTRINSIC_WO_CHAIN node with the
  // intrinsic ID prepended to the argument list.
  auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
                                   SmallVector<SDValue> IntrinArgs) -> SDValue {
    SmallVector<SDValue> Operands(1);
    Operands[0] = DAG.getTargetConstant(Val: IID, DL: SL, VT: MVT::i32);
    Operands.append(RHS: IntrinArgs);
    return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: RetVT, Ops: Operands);
  };

  // If we can bpermute across the whole wave, then just do that
  if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
    SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
                                     {ShiftedIndex, ValueI32});
    return DAG.getBitcast(VT, V: BPermute);
  }

  // ds_bpermute only addresses within a 32-lane half, so on wave64 the two
  // halves are combined manually below.
  assert(TLI.getSubtarget()->isWave64());

  // Otherwise, we need to make use of whole wave mode
  SDValue PoisonVal = DAG.getPOISON(VT: ValueI32->getValueType(ResNo: 0));

  // Set inactive lanes to poison
  SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
                                   {ValueI32, PoisonVal});
  SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
                                   {ShiftedIndex, PoisonVal});

  // permlane64 exchanges data between the two 32-lane halves, giving each
  // half access to the other half's values.
  SDValue Swapped =
      MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});

  // Get permutation of each half, then we'll select which one to use
  SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
                                        {WWMIndex, WWMValue});
  SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
                                         MVT::i32, {WWMIndex, Swapped});
  SDValue BPermOtherHalfWWM =
      MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});

  // Select which side to take the permute from
  SDValue ThreadIDMask = DAG.getAllOnesConstant(DL: SL, VT: MVT::i32);
  // We can get away with only using mbcnt_lo here since we're only
  // trying to detect which side of 32 each lane is on, and mbcnt_lo
  // returns 32 for lanes 32-63.
  SDValue ThreadID =
      MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
                    {ThreadIDMask, DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32)});

  // Bit 5 of (ThreadID ^ Index) is clear iff the requesting lane and the
  // requested lane lie in the same 32-lane half.
  SDValue SameOrOtherHalf =
      DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32,
                  N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: ThreadID, N2: Index),
                  N2: DAG.getTargetConstant(Val: 32, DL: SL, VT: MVT::i32));
  SDValue UseSameHalf =
      DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: SameOrOtherHalf,
                   RHS: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), Cond: ISD::SETEQ);
  SDValue Result = DAG.getSelect(DL: SL, VT: MVT::i32, Cond: UseSameHalf, LHS: BPermSameHalf,
                                 RHS: BPermOtherHalfWWM);
  return DAG.getBitcast(VT, V: Result);
}
7667
/// Produce legalized replacement values for nodes whose result type is not
/// legal. Each handled case pushes its replacement value(s) onto \p Results;
/// unhandled cases fall through to the generic AMDGPU implementation.
void SITargetLowering::ReplaceNodeResults(SDNode *N,
                                          SmallVectorImpl<SDValue> &Results,
                                          SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::INSERT_VECTOR_ELT: {
    if (SDValue Res = lowerINSERT_VECTOR_ELT(Op: SDValue(N, 0), DAG))
      Results.push_back(Elt: Res);
    return;
  }
  case ISD::EXTRACT_VECTOR_ELT: {
    if (SDValue Res = lowerEXTRACT_VECTOR_ELT(Op: SDValue(N, 0), DAG))
      Results.push_back(Elt: Res);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = N->getConstantOperandVal(Num: 0);
    switch (IID) {
    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(Elt: lowerPointerAsRsrcIntrin(Op: N, DAG));
      return;
    case Intrinsic::amdgcn_cvt_pkrtz: {
      // Build the conversion with an i32 result and bitcast to the (illegal
      // here) v2f16 result type expected by the intrinsic.
      SDValue Src0 = N->getOperand(Num: 1);
      SDValue Src1 = N->getOperand(Num: 2);
      SDLoc SL(N);
      SDValue Cvt =
          DAG.getNode(Opcode: AMDGPUISD::CVT_PKRTZ_F16_F32, DL: SL, VT: MVT::i32, N1: Src0, N2: Src1);
      Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Cvt));
      return;
    }
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {
      SDValue Src0 = N->getOperand(Num: 1);
      SDValue Src1 = N->getOperand(Num: 2);
      SDLoc SL(N);
      unsigned Opcode;

      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
        Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
        Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
        Opcode = AMDGPUISD::CVT_PK_I16_I32;
      else
        Opcode = AMDGPUISD::CVT_PK_U16_U32;

      // If v2i16 is legal, emit the packed node directly; otherwise produce
      // an i32 result and bitcast it to the expected vector type.
      EVT VT = N->getValueType(ResNo: 0);
      if (isTypeLegal(VT))
        Results.push_back(Elt: DAG.getNode(Opcode, DL: SL, VT, N1: Src0, N2: Src1));
      else {
        SDValue Cvt = DAG.getNode(Opcode, DL: SL, VT: MVT::i32, N1: Src0, N2: Src1);
        Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: Cvt));
      }
      return;
    }
    case Intrinsic::amdgcn_s_buffer_load: {
      // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
      // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
      // combiner tries to merge the s_buffer_load_u8 with a sext instruction
      // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
      // s_buffer_load_i8.
      if (!Subtarget->hasScalarSubwordLoads())
        return;
      SDValue Op = SDValue(N, 0);
      SDValue Rsrc = Op.getOperand(i: 1);
      SDValue Offset = Op.getOperand(i: 2);
      SDValue CachePolicy = Op.getOperand(i: 3);
      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
      SDLoc DL(Op);
      MachineFunction &MF = DAG.getMachineFunction();
      const DataLayout &DataLayout = DAG.getDataLayout();
      Align Alignment =
          DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext()));
      MachineMemOperand *MMO = MF.getMachineMemOperand(
          PtrInfo: MachinePointerInfo(),
          F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
              MachineMemOperand::MOInvariant,
          Size: VT.getStoreSize(), BaseAlignment: Alignment);
      SDValue LoadVal;
      if (!Offset->isDivergent()) {
        // Uniform offset: use the scalar buffer load and truncate the i32
        // result down to the requested width.
        SDValue Ops[] = {Rsrc, // source register
                         Offset, CachePolicy};
        SDValue BufferLoad =
            DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD_UBYTE, dl: DL,
                                    VTList: DAG.getVTList(VT: MVT::i32), Ops, MemVT: VT, MMO);
        LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
      } else {
        // Divergent offset: fall back to a VMEM buffer load. The voffset /
        // soffset / offset slots are filled in by setBufferOffsets().
        SDValue Ops[] = {
            DAG.getEntryNode(),                           // Chain
            Rsrc,                                         // rsrc
            DAG.getConstant(Val: 0, DL, VT: MVT::i32),    // vindex
            {},                                           // voffset
            {},                                           // soffset
            {},                                           // offset
            CachePolicy,                                  // cachepolicy
            DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
        };
        setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], Alignment: Align(4));
        LoadVal = handleByteShortBufferLoads(DAG, LoadVT: VT, DL, Ops, MMO);
      }
      Results.push_back(Elt: LoadVal);
      return;
    }
    case Intrinsic::amdgcn_dead: {
      // All results of a dead intrinsic are poison.
      for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
        Results.push_back(Elt: DAG.getPOISON(VT: N->getValueType(ResNo: I)));
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    if (SDValue Res = LowerINTRINSIC_W_CHAIN(Op: SDValue(N, 0), DAG)) {
      if (Res.getOpcode() == ISD::MERGE_VALUES) {
        // FIXME: Hacky
        for (unsigned I = 0; I < Res.getNumOperands(); I++) {
          Results.push_back(Elt: Res.getOperand(i: I));
        }
      } else {
        Results.push_back(Elt: Res);
        Results.push_back(Elt: Res.getValue(R: 1));
      }
      return;
    }

    break;
  }
  case ISD::SELECT: {
    // Legalize the select by bitcasting both operands to an equivalent
    // integer type (widened to i32 if necessary), selecting, and casting
    // back to the original type.
    SDLoc SL(N);
    EVT VT = N->getValueType(ResNo: 0);
    EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT);
    SDValue LHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: 1));
    SDValue RHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: 2));

    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(VT: MVT::i32)) {
      LHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: LHS);
      RHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: RHS);
      SelectVT = MVT::i32;
    }

    SDValue NewSelect =
        DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: SelectVT, N1: N->getOperand(Num: 0), N2: LHS, N3: RHS);

    if (NewVT != SelectVT)
      NewSelect = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: NewVT, Operand: NewSelect);
    Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewSelect));
    return;
  }
  case ISD::FNEG: {
    if (N->getValueType(ResNo: 0) != MVT::v2f16)
      break;

    // v2f16 fneg: flip both sign bits in one i32 xor.
    SDLoc SL(N);
    SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: N->getOperand(Num: 0));

    SDValue Op = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: BC,
                             N2: DAG.getConstant(Val: 0x80008000, DL: SL, VT: MVT::i32));
    Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Op));
    return;
  }
  case ISD::FABS: {
    if (N->getValueType(ResNo: 0) != MVT::v2f16)
      break;

    // v2f16 fabs: clear both sign bits in one i32 and.
    SDLoc SL(N);
    SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: N->getOperand(Num: 0));

    SDValue Op = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: BC,
                             N2: DAG.getConstant(Val: 0x7fff7fff, DL: SL, VT: MVT::i32));
    Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Op));
    return;
  }
  case ISD::FSQRT: {
    if (N->getValueType(ResNo: 0) != MVT::f16)
      break;
    Results.push_back(Elt: lowerFSQRTF16(Op: SDValue(N, 0), DAG));
    break;
  }
  default:
    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    break;
  }
}
7854
7855/// Helper function for LowerBRCOND
7856static SDNode *findUser(SDValue Value, unsigned Opcode) {
7857
7858 for (SDUse &U : Value->uses()) {
7859 if (U.get() != Value)
7860 continue;
7861
7862 if (U.getUser()->getOpcode() == Opcode)
7863 return U.getUser();
7864 }
7865 return nullptr;
7866}
7867
7868unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7869 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7870 switch (Intr->getConstantOperandVal(Num: 1)) {
7871 case Intrinsic::amdgcn_if:
7872 return AMDGPUISD::IF;
7873 case Intrinsic::amdgcn_else:
7874 return AMDGPUISD::ELSE;
7875 case Intrinsic::amdgcn_loop:
7876 return AMDGPUISD::LOOP;
7877 case Intrinsic::amdgcn_end_cf:
7878 llvm_unreachable("should not occur");
7879 default:
7880 return 0;
7881 }
7882 }
7883
7884 // break, if_break, else_break are all only used as inputs to loop, not
7885 // directly as branch conditions.
7886 return 0;
7887}
7888
7889bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
7890 const Triple &TT = getTargetMachine().getTargetTriple();
7891 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7892 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7893 AMDGPU::shouldEmitConstantsToTextSection(TT);
7894}
7895
7896bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
7897 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7898 return false;
7899
7900 // FIXME: Either avoid relying on address space here or change the default
7901 // address space for functions to avoid the explicit check.
7902 return (GV->getValueType()->isFunctionTy() ||
7903 !isNonGlobalAddrSpace(AS: GV->getAddressSpace())) &&
7904 !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
7905}
7906
7907bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
7908 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7909}
7910
7911bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
7912 if (!GV->hasExternalLinkage())
7913 return true;
7914
7915 const auto OS = getTargetMachine().getTargetTriple().getOS();
7916 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7917}
7918
/// This transforms the control flow intrinsics to get the branch destination as
/// last parameter, also switches branch target with BR if the need arise
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
  SDLoc DL(BRCOND);

  SDNode *Intr = BRCOND.getOperand(i: 1).getNode();
  SDValue Target = BRCOND.getOperand(i: 2);
  SDNode *BR = nullptr;
  SDNode *SetCC = nullptr;

  // Look through a possible condition negation (setcc-against-1 or xor with
  // a nonzero constant) to find the underlying intrinsic call.
  switch (Intr->getOpcode()) {
  case ISD::SETCC: {
    // As long as we negate the condition everything is fine
    SetCC = Intr;
    Intr = SetCC->getOperand(Num: 0).getNode();
    break;
  }
  case ISD::XOR: {
    // Similar to SETCC, if we have (xor c, -1), we will be fine.
    SDValue LHS = Intr->getOperand(Num: 0);
    SDValue RHS = Intr->getOperand(Num: 1);
    if (auto *C = dyn_cast<ConstantSDNode>(Val&: RHS); C && C->getZExtValue()) {
      Intr = LHS.getNode();
      break;
    }
    [[fallthrough]];
  }
  default: {
    // Get the target from BR if we don't negate the condition
    BR = findUser(Value: BRCOND, Opcode: ISD::BR);
    assert(BR && "brcond missing unconditional branch user");
    Target = BR->getOperand(Num: 1);
  }
  }

  unsigned CFNode = isCFIntrinsic(Intr);
  if (CFNode == 0) {
    // This is a uniform branch so we don't need to legalize.
    return BRCOND;
  }

  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
                   Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;

  // If the condition came through a setcc, it must have been the negated
  // form (== 1 with SETNE) for this transform to be valid.
  assert(!SetCC ||
         (SetCC->getConstantOperandVal(1) == 1 &&
          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
              ISD::SETNE));

  // operands of the new intrinsic call
  SmallVector<SDValue, 4> Ops;
  if (HaveChain)
    Ops.push_back(Elt: BRCOND.getOperand(i: 0));

  // Skip the chain and intrinsic-ID operands of the original call; append the
  // branch target as the new last operand.
  Ops.append(in_start: Intr->op_begin() + (HaveChain ? 2 : 1), in_end: Intr->op_end());
  Ops.push_back(Elt: Target);

  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());

  // build the new intrinsic call
  SDNode *Result = DAG.getNode(Opcode: CFNode, DL, VTList: DAG.getVTList(VTs: Res), Ops).getNode();

  if (!HaveChain) {
    // Splice in the BRCOND's incoming chain so users below see a chained node.
    SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(i: 0)};

    Result = DAG.getMergeValues(Ops, dl: DL).getNode();
  }

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {BR->getOperand(Num: 0), BRCOND.getOperand(i: 2)};
    SDValue NewBR = DAG.getNode(Opcode: ISD::BR, DL, VTList: BR->getVTList(), Ops);
    DAG.ReplaceAllUsesWith(From: BR, To: NewBR.getNode());
  }

  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(Value: SDValue(Intr, i), Opcode: ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: CopyToReg->getOperand(Num: 1),
                             N: SDValue(Result, i - 1), Glue: SDValue());

    DAG.ReplaceAllUsesWith(From: SDValue(CopyToReg, 0), To: CopyToReg->getOperand(Num: 0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(From: SDValue(Intr, Intr->getNumValues() - 1),
                                To: Intr->getOperand(Num: 0));

  return Chain;
}
8014
/// Lower ISD::RETURNADDR. Only depth 0 is supported, and entry functions
/// (kernels and shaders) have no return address; both of those cases fold to
/// a constant 0. Otherwise the value is read from the return-address register.
SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);
  // Checking the depth
  if (Op.getConstantOperandVal(i: 0) != 0)
    return DAG.getConstant(Val: 0, DL, VT);

  MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // Check for kernel and shader functions
  if (Info->isEntryFunction())
    return DAG.getConstant(Val: 0, DL, VT);

  MachineFrameInfo &MFI = MF.getFrameInfo();
  // There is a call to @llvm.returnaddress in this function
  MFI.setReturnAddressIsTaken(true);

  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
  // Get the return address reg and mark it as an implicit live-in
  Register Reg = MF.addLiveIn(PReg: TRI->getReturnAddressReg(MF),
                              RC: getRegClassFor(VT, isDivergent: Op.getNode()->isDivergent()));

  return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg, VT);
}
8039
/// Lower a request for the stack pointer value on function entry.
SDValue SITargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // For functions that set up their own stack, select the GET_STACK_BASE
  // pseudo.
  if (MFI->isBottomOfStack())
    return Op;

  // For everything else, create a dummy stack object. Its frame index
  // materializes the incoming stack pointer.
  int FI = MF.getFrameInfo().CreateFixedObject(Size: 1, SPOffset: 0, /*IsImmutable=*/false);
  return DAG.getFrameIndex(FI, VT: Op.getValueType());
}
8053
8054SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
8055 const SDLoc &DL, EVT VT) const {
8056 return Op.getValueType().bitsLE(VT)
8057 ? DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT, Operand: Op)
8058 : DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Op,
8059 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
8060}
8061
/// Split a vector FP_ROUND into two rounds on the low and high halves and
/// reassemble the result. Requires a power-of-two element count greater
/// than 2.
SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
                                                SelectionDAG &DAG) const {
  EVT DstVT = Op.getValueType();
  unsigned NumElts = DstVT.getVectorNumElements();
  assert(NumElts > 2 && isPowerOf2_32(NumElts));

  auto [Lo, Hi] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);

  SDLoc DL(Op);
  unsigned Opc = Op.getOpcode();
  // Operand 1 is the FP_ROUND flags operand; forward it to both halves.
  SDValue Flags = Op.getOperand(i: 1);
  EVT HalfDstVT =
      EVT::getVectorVT(Context&: *DAG.getContext(), VT: DstVT.getScalarType(), NumElements: NumElts / 2);
  SDValue OpLo = DAG.getNode(Opcode: Opc, DL, VT: HalfDstVT, N1: Lo, N2: Flags);
  SDValue OpHi = DAG.getNode(Opcode: Opc, DL, VT: HalfDstVT, N1: Hi, N2: Flags);

  return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: DstVT, N1: OpLo, N2: OpHi);
}
8080
/// Custom lowering for FP_ROUND: handles vector f32->f16 splitting,
/// f64->f16 (safe double rounding or fast two-step round), and f64->bf16
/// (round-to-odd to f32, then hardware f32->bf16).
SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(i: 0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();

  if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
    assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
    if (SrcVT.getScalarType() != MVT::f32)
      return SDValue();
    // v2f32 maps directly to the packed convert; wider vectors are split.
    return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
  }

  if (SrcVT.getScalarType() != MVT::f64)
    return Op;

  SDLoc DL(Op);
  if (DstVT == MVT::f16) {
    // TODO: Handle strictfp
    if (Op.getOpcode() != ISD::FP_ROUND)
      return Op;

    if (!Subtarget->has16BitInsts()) {
      // Without 16-bit instructions, go through the FP_TO_FP16 libcall-style
      // node, which yields the half bits in an i32.
      SDValue FpToFp16 = DAG.getNode(Opcode: ISD::FP_TO_FP16, DL, VT: MVT::i32, Operand: Src);
      SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToFp16);
      return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f16, Operand: Trunc);
    }
    if (Op->getFlags().hasApproximateFuncs()) {
      // afn allows the potentially double-rounding f64->f32->f16 sequence.
      SDValue Flags = Op.getOperand(i: 1);
      SDValue Src32 = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f32, N1: Src, N2: Flags);
      return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: Src32, N2: Flags);
    }
    // Correctly rounded f64->f16 expansion.
    SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
    SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToFp16);
    return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f16, Operand: Trunc);
  }

  assert(DstVT.getScalarType() == MVT::bf16 &&
         "custom lower FP_ROUND for f16 or bf16");
  assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");

  // Round-inexact-to-odd f64 to f32, then do the final rounding using the
  // hardware f32 -> bf16 instruction.
  EVT F32VT = SrcVT.changeElementType(Context&: *DAG.getContext(), EltVT: MVT::f32);
  SDValue Rod = expandRoundInexactToOdd(ResultVT: F32VT, Op: Src, DL, DAG);
  return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: DstVT, N1: Rod,
                     N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
}
8128
/// Custom lowering for FMINNUM/FMAXNUM: expand in IEEE mode (where sNaN
/// quieting semantics must be honored), otherwise split wide f16/bf16
/// vectors or keep the node as-is.
SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
                                               SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  bool IsIEEEMode = Info->getMode().IEEE;

  // FIXME: Assert during selection that this is only selected for
  // ieee_mode. Currently a combine can produce the ieee version for non-ieee
  // mode functions, but this happens to be OK since it's only done in cases
  // where there is known no sNaN.
  if (IsIEEEMode)
    return expandFMINNUM_FMAXNUM(N: Op.getNode(), DAG);

  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
      VT == MVT::v16bf16)
    return splitBinaryVectorOp(Op, DAG);
  return Op;
}
8148
/// Custom lowering for FMINIMUMNUM/FMAXIMUMNUM; mirrors
/// lowerFMINNUM_FMAXNUM: expand in IEEE mode, split wide f16/bf16 vectors
/// otherwise.
SDValue
SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
                                               SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  bool IsIEEEMode = Info->getMode().IEEE;

  if (IsIEEEMode)
    return expandFMINIMUMNUM_FMAXIMUMNUM(N: Op.getNode(), DAG);

  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
      VT == MVT::v16bf16)
    return splitBinaryVectorOp(Op, DAG);
  return Op;
}
8165
/// Custom lowering for FMINIMUM/FMAXIMUM: split vectors, or widen a scalar
/// f16 operation to v2f16 on subtargets that only have the packed form.
SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  if (VT.isVector())
    return splitBinaryVectorOp(Op, DAG);

  assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
         !Subtarget->hasMinimum3Maximum3F16() &&
         Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
         "should not need to widen f16 minimum/maximum to v2f16");

  // Widen f16 operation to v2f16

  // fminimum f16:x, f16:y ->
  //   extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
  //                                (v2f16 (scalar_to_vector y))), 0
  SDLoc SL(Op);
  SDValue WideSrc0 =
      DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: SL, VT: MVT::v2f16, Operand: Op.getOperand(i: 0));
  SDValue WideSrc1 =
      DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: SL, VT: MVT::v2f16, Operand: Op.getOperand(i: 1));

  SDValue Widened =
      DAG.getNode(Opcode: Op.getOpcode(), DL: SL, VT: MVT::v2f16, N1: WideSrc0, N2: WideSrc1);

  return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::f16, N1: Widened,
                     N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
}
8194
/// Custom lowering for f16 FLDEXP / STRICT_FLDEXP: narrow the exponent
/// operand to i16, clamping it into the representable range first.
SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
  EVT VT = Op.getValueType();
  assert(VT == MVT::f16);

  // For the strict form, operand 0 is the chain, so the exponent shifts by
  // one position.
  SDValue Exp = Op.getOperand(i: IsStrict ? 2 : 1);
  EVT ExpVT = Exp.getValueType();
  if (ExpVT == MVT::i16)
    return Op;

  SDLoc DL(Op);

  // Correct the exponent type for f16 to i16.
  // Clamp the range of the exponent to the instruction's range.

  // TODO: This should be a generic narrowing legalization, and can easily be
  // for GlobalISel.

  SDValue MinExp = DAG.getSignedConstant(Val: minIntN(N: 16), DL, VT: ExpVT);
  SDValue ClampMin = DAG.getNode(Opcode: ISD::SMAX, DL, VT: ExpVT, N1: Exp, N2: MinExp);

  SDValue MaxExp = DAG.getSignedConstant(Val: maxIntN(N: 16), DL, VT: ExpVT);
  SDValue Clamp = DAG.getNode(Opcode: ISD::SMIN, DL, VT: ExpVT, N1: ClampMin, N2: MaxExp);

  // The clamp guarantees the truncation is lossless.
  SDValue TruncExp = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Clamp);

  if (IsStrict) {
    return DAG.getNode(Opcode: ISD::STRICT_FLDEXP, DL, ResultTys: {VT, MVT::Other},
                       Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1), TruncExp});
  }

  return DAG.getNode(Opcode: ISD::FLDEXP, DL, VT, N1: Op.getOperand(i: 0), N2: TruncExp);
}
8228
/// Return the extension opcode required to widen the operands of \p Op to
/// i32 while preserving its result: sign-extend for sign-sensitive ops,
/// zero-extend for unsigned-sensitive ops, any-extend where the high bits
/// cannot affect the truncated result.
static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
  switch (Op->getOpcode()) {
  case ISD::SRA:
  case ISD::SMIN:
  case ISD::SMAX:
    return ISD::SIGN_EXTEND;
  case ISD::SRL:
  case ISD::UMIN:
  case ISD::UMAX:
    return ISD::ZERO_EXTEND;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::SHL:
  case ISD::SELECT:
  case ISD::MUL:
    // operation result won't be influenced by garbage high bits.
    // TODO: are all of those cases correct, and are there more?
    return ISD::ANY_EXTEND;
  case ISD::SETCC: {
    // The comparison's signedness decides how the operands must be extended.
    ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
    return ISD::isSignedIntSetCC(Code: CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
  }
  default:
    llvm_unreachable("unexpected opcode!");
  }
}
8258
/// Promote a narrow uniform integer operation to i32 (so it can run on the
/// scalar unit), then truncate the result back to the original type. SETCC
/// results stay i1 and need no truncation.
SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
                                                DAGCombinerInfo &DCI) const {
  const unsigned Opc = Op.getOpcode();
  assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
         Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
         Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
         Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
         Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);

  // For SETCC the promoted type is driven by the compared operands, not the
  // i1 result.
  EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
                                 : Op->getOperand(Num: 0).getValueType();
  auto &DAG = DCI.DAG;
  auto ExtTy = OpTy.changeElementType(Context&: *DAG.getContext(), EltVT: MVT::i32);

  if (DCI.isBeforeLegalizeOps() ||
      isNarrowingProfitable(N: Op.getNode(), SrcVT: ExtTy, DestVT: OpTy))
    return SDValue();

  SDLoc DL(Op);
  SDValue LHS;
  SDValue RHS;
  if (Opc == ISD::SELECT) {
    // For SELECT, the condition (operand 0) stays i1; promote the two values.
    LHS = Op->getOperand(Num: 1);
    RHS = Op->getOperand(Num: 2);
  } else {
    LHS = Op->getOperand(Num: 0);
    RHS = Op->getOperand(Num: 1);
  }

  const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
  LHS = DAG.getNode(Opcode: ExtOp, DL, VT: ExtTy, Operand: {LHS});

  // Special case: for shifts, the RHS always needs a zext.
  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
    RHS = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: ExtTy, Operand: {RHS});
  else
    RHS = DAG.getNode(Opcode: ExtOp, DL, VT: ExtTy, Operand: {RHS});

  // setcc always return i1/i1 vec so no need to truncate after.
  if (Opc == ISD::SETCC) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
    return DAG.getSetCC(DL, VT: Op.getValueType(), LHS, RHS, Cond: CC);
  }

  // For other ops, we extend the operation's return type as well so we need to
  // truncate back to the original type.
  SDValue NewVal;
  if (Opc == ISD::SELECT)
    NewVal = DAG.getNode(Opcode: ISD::SELECT, DL, VT: ExtTy, Ops: {Op->getOperand(Num: 0), LHS, RHS});
  else
    NewVal = DAG.getNode(Opcode: Opc, DL, VT: ExtTy, Ops: {LHS, RHS});

  return DAG.getZExtOrTrunc(Op: NewVal, DL, VT: OpTy);
}
8313
/// Custom lowering for vector FCOPYSIGN: split wide vectors, and handle the
/// mixed-type v2f16 magnitude / v2f32 sign case by narrowing the sign vector.
SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  SDValue Mag = Op.getOperand(i: 0);
  EVT MagVT = Mag.getValueType();

  if (MagVT.getVectorNumElements() > 2)
    return splitBinaryVectorOp(Op, DAG);

  SDValue Sign = Op.getOperand(i: 1);
  EVT SignVT = Sign.getValueType();

  // Matching types need no adjustment.
  if (MagVT == SignVT)
    return Op;

  // fcopysign v2f16:mag, v2f32:sign ->
  //   fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)

  SDLoc SL(Op);
  SDValue SignAsInt32 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Sign);
  // Integer truncation keeps the high (sign-carrying) bits' lane structure
  // compact enough for the f16 copysign below.
  SDValue SignAsInt16 = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::v2i16, Operand: SignAsInt32);

  SDValue SignAsHalf16 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MagVT, Operand: SignAsInt16);

  return DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT: MagVT, N1: Mag, N2: SignAsHalf16);
}
8338
// Custom lowering for vector multiplications and s_mul_u64.
SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  // Split vector operands.
  if (VT.isVector())
    return splitBinaryVectorOp(Op, DAG);

  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");

  // There are four ways to lower s_mul_u64:
  //
  // 1. If all the operands are uniform, then we lower it as it is.
  //
  // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
  //    multiplications because there is not a vector equivalent of s_mul_u64.
  //
  // 3. If the cost model decides that it is more efficient to use vector
  //    registers, then we have to split s_mul_u64 in 32-bit multiplications.
  //    This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
  //
  // 4. If the cost model decides to use vector registers and both of the
  //    operands are zero-extended/sign-extended from 32-bits, then we split the
  //    s_mul_u64 in two 32-bit multiplications. The problem is that it is not
  //    possible to check if the operands are zero-extended or sign-extended in
  //    SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
  //    s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
  //    s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
  //    If the cost model decides that we have to use vector registers, then
  //    splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
  //    s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
  //    decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
  //    s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
  //    SIInstrInfo.cpp .

  if (Op->isDivergent())
    return SDValue();

  SDValue Op0 = Op.getOperand(i: 0);
  SDValue Op1 = Op.getOperand(i: 1);
  // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
  // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
  // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
  KnownBits Op0KnownBits = DAG.computeKnownBits(Op: Op0);
  unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
  KnownBits Op1KnownBits = DAG.computeKnownBits(Op: Op1);
  unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
  SDLoc SL(Op);
  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
    return SDValue(
        DAG.getMachineNode(Opcode: AMDGPU::S_MUL_U64_U32_PSEUDO, dl: SL, VT, Op1: Op0, Op2: Op1), 0);
  // 33 sign bits means the value fits in 32 bits when sign-extended.
  unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op0);
  unsigned Op1SignBits = DAG.ComputeNumSignBits(Op: Op1);
  if (Op0SignBits >= 33 && Op1SignBits >= 33)
    return SDValue(
        DAG.getMachineNode(Opcode: AMDGPU::S_MUL_I64_I32_PSEUDO, dl: SL, VT, Op1: Op0, Op2: Op1), 0);
  // If all the operands are uniform, then we lower s_mul_u64 as it is.
  return Op;
}
8398
/// Custom lowering for SMULO/UMULO (multiply with overflow flag). A power-of-
/// two constant RHS becomes a shift plus a round-trip check; the general case
/// compares the high half of the product against the expected sign extension.
SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(i: 0);
  SDValue RHS = Op.getOperand(i: 1);
  bool isSigned = Op.getOpcode() == ISD::SMULO;

  if (ConstantSDNode *RHSC = isConstOrConstSplat(N: RHS)) {
    const APInt &C = RHSC->getAPIntValue();
    // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
    if (C.isPowerOf2()) {
      // smulo(x, signed_min) is same as umulo(x, signed_min).
      bool UseArithShift = isSigned && !C.isMinSignedValue();
      SDValue ShiftAmt = DAG.getConstant(Val: C.logBase2(), DL: SL, VT: MVT::i32);
      SDValue Result = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: LHS, N2: ShiftAmt);
      // Overflow iff shifting back does not recover the original value.
      SDValue Overflow =
          DAG.getSetCC(DL: SL, VT: MVT::i1,
                       LHS: DAG.getNode(Opcode: UseArithShift ? ISD::SRA : ISD::SRL, DL: SL, VT,
                                       N1: Result, N2: ShiftAmt),
                       RHS: LHS, Cond: ISD::SETNE);
      return DAG.getMergeValues(Ops: {Result, Overflow}, dl: SL);
    }
  }

  SDValue Result = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT, N1: LHS, N2: RHS);
  SDValue Top =
      DAG.getNode(Opcode: isSigned ? ISD::MULHS : ISD::MULHU, DL: SL, VT, N1: LHS, N2: RHS);

  // For signed multiply, no overflow means the high half equals the sign
  // extension of the low half; for unsigned, it must be zero.
  SDValue Sign = isSigned
                     ? DAG.getNode(Opcode: ISD::SRA, DL: SL, VT, N1: Result,
                                   N2: DAG.getConstant(Val: VT.getScalarSizeInBits() - 1,
                                                        DL: SL, VT: MVT::i32))
                     : DAG.getConstant(Val: 0, DL: SL, VT);
  SDValue Overflow = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Top, RHS: Sign, Cond: ISD::SETNE);

  return DAG.getMergeValues(Ops: {Result, Overflow}, dl: SL);
}
8436
8437SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8438 if (Op->isDivergent()) {
8439 // Select to V_MAD_[IU]64_[IU]32.
8440 return Op;
8441 }
8442 if (Subtarget->hasSMulHi()) {
8443 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8444 return SDValue();
8445 }
8446 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8447 // calculate the high part, so we might as well do the whole thing with
8448 // V_MAD_[IU]64_[IU]32.
8449 return Op;
8450}
8451
8452SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8453 if (!Subtarget->hasTrapHandler() ||
8454 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8455 return lowerTrapEndpgm(Op, DAG);
8456
8457 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8458 : lowerTrapHsaQueuePtr(Op, DAG);
8459}
8460
8461SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8462 SDLoc SL(Op);
8463 SDValue Chain = Op.getOperand(i: 0);
8464 return DAG.getNode(Opcode: AMDGPUISD::ENDPGM_TRAP, DL: SL, VT: MVT::Other, Operand: Chain);
8465}
8466
8467SDValue
8468SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8469 const SDLoc &DL, Align Alignment,
8470 ImplicitParameter Param) const {
8471 MachineFunction &MF = DAG.getMachineFunction();
8472 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8473 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain: DAG.getEntryNode(), Offset);
8474 MachinePointerInfo PtrInfo =
8475 getKernargSegmentPtrInfo(MF&: DAG.getMachineFunction());
8476 return DAG.getLoad(
8477 VT, dl: DL, Chain: DAG.getEntryNode(), Ptr, PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment,
8478 MMOFlags: MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
8479}
8480
/// Lower a trap for the HSA ABI when the doorbell ID cannot be queried
/// directly: the queue pointer must be handed to the trap handler in
/// SGPR0_SGPR1.
SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(i: 0);

  SDValue QueuePtr;
  // For code object version 5, QueuePtr is passed through implicit kernarg.
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
  if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) {
    QueuePtr =
        loadImplicitKernelArgument(DAG, VT: MVT::i64, DL: SL, Alignment: Align(8), Param: QUEUE_PTR);
  } else {
    // Pre-COV5: the queue pointer arrives in a user SGPR pair.
    MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    Register UserSGPR = Info->getQueuePtrUserSGPR();

    if (UserSGPR == AMDGPU::NoRegister) {
      // We probably are in a function incorrectly marked with
      // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
      // trap, so just use a null pointer.
      QueuePtr = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i64);
    } else {
      QueuePtr = CreateLiveInRegister(DAG, RC: &AMDGPU::SReg_64RegClass, Reg: UserSGPR,
                                      VT: MVT::i64);
    }
  }

  // Copy the queue pointer into the physical pair SGPR0_SGPR1 and thread the
  // copy's glue result into the trap node so the two stay adjacent.
  SDValue SGPR01 = DAG.getRegister(Reg: AMDGPU::SGPR0_SGPR1, VT: MVT::i64);
  SDValue ToReg = DAG.getCopyToReg(Chain, dl: SL, Reg: SGPR01, N: QueuePtr, Glue: SDValue());

  uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
  SDValue Ops[] = {ToReg, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16), SGPR01,
                   ToReg.getValue(R: 1)};
  return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
}
8516
8517SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8518 SDLoc SL(Op);
8519 SDValue Chain = Op.getOperand(i: 0);
8520
8521 // We need to simulate the 's_trap 2' instruction on targets that run in
8522 // PRIV=1 (where it is treated as a nop).
8523 if (Subtarget->hasPrivEnabledTrap2NopBug())
8524 return DAG.getNode(Opcode: AMDGPUISD::SIMULATED_TRAP, DL: SL, VT: MVT::Other, Operand: Chain);
8525
8526 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8527 SDValue Ops[] = {Chain, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16)};
8528 return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
8529}
8530
/// Lower ISD::DEBUGTRAP. When no supported trap handler is present this emits
/// a warning diagnostic and drops the trap (returning just the chain) rather
/// than failing compilation.
SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(i: 0);
  MachineFunction &MF = DAG.getMachineFunction();

  if (!Subtarget->hasTrapHandler() ||
      Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
    // Warn and turn the debugtrap into a no-op.
    LLVMContext &Ctx = MF.getFunction().getContext();
    Ctx.diagnose(DI: DiagnosticInfoUnsupported(MF.getFunction(),
                                            "debugtrap handler not supported",
                                            Op.getDebugLoc(), DS_Warning));
    return Chain;
  }

  uint64_t TrapID =
      static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
  SDValue Ops[] = {Chain, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16)};
  return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
}
8550
/// Return an i32 holding the aperture (high half of the flat address range)
/// for address space \p AS, which is expected to be LOCAL or PRIVATE. The
/// source depends on the subtarget and code object version: aperture
/// registers, implicit kernargs, or a load from the queue descriptor.
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                                             SelectionDAG &DAG) const {
  if (Subtarget->hasApertureRegs()) {
    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                       ? AMDGPU::SRC_SHARED_BASE
                                       : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !Subtarget->hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");
    // Note: this feature (register) is broken. When used as a 32-bit operand,
    // it returns a wrong value (all zeroes?). The real value is in the upper 32
    // bits.
    //
    // To work around the issue, emit a 64 bit copy from this register
    // then extract the high bits. Note that this shouldn't even result in a
    // shift being emitted and simply become a pair of registers (e.g.):
    // s_mov_b64 s[6:7], src_shared_base
    // v_mov_b32_e32 v1, s7
    SDValue Copy =
        DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: ApertureRegNo, VT: MVT::v2i32);
    return DAG.getExtractVectorElt(DL, VT: MVT::i32, Vec: Copy, Idx: 1);
  }

  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
  if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) {
    ImplicitParameter Param =
        (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
    return loadImplicitKernelArgument(DAG, VT: MVT::i32, DL, Alignment: Align(4), Param);
  }

  // Pre-COV5 fallback: read the aperture out of the queue descriptor reached
  // through the queue-pointer user SGPR.
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register UserSGPR = Info->getQueuePtrUserSGPR();
  if (UserSGPR == AMDGPU::NoRegister) {
    // We probably are in a function incorrectly marked with
    // amdgpu-no-queue-ptr. This is undefined.
    return DAG.getPOISON(VT: MVT::i32);
  }

  SDValue QueuePtr =
      CreateLiveInRegister(DAG, RC: &AMDGPU::SReg_64RegClass, Reg: UserSGPR, VT: MVT::i64);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  SDValue Ptr =
      DAG.getObjectPtrOffset(SL: DL, Ptr: QueuePtr, Offset: TypeSize::getFixed(ExactSize: StructOffset));

  // TODO: Use custom target PseudoSourceValue.
  // TODO: We should use the value from the IR intrinsic call, but it might not
  // be available and how do we get it?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  return DAG.getLoad(VT: MVT::i32, dl: DL, Chain: QueuePtr.getValue(R: 1), Ptr, PtrInfo,
                     Alignment: commonAlignment(A: Align(64), Offset: StructOffset),
                     MMOFlags: MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant);
}
8611
8612/// Return true if the value is a known valid address, such that a null check is
8613/// not necessary.
8614static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
8615 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8616 if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
8617 return true;
8618
8619 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8620 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8621
8622 // TODO: Search through arithmetic, handle arguments and loads
8623 // marked nonnull.
8624 return false;
8625}
8626
/// Lower ISD::ADDRSPACECAST (and the amdgcn.addrspacecast.nonnull intrinsic)
/// between flat, local, private, and 32-bit constant address spaces. Null
/// pointers are mapped to the destination space's null value unless the
/// source is known non-null.
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc SL(Op);

  const AMDGPUTargetMachine &TM =
      static_cast<const AMDGPUTargetMachine &>(getTargetMachine());

  unsigned DestAS, SrcAS;
  SDValue Src;
  bool IsNonNull = false;
  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Val&: Op)) {
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(Num: 0);
    DestAS = ASC->getDestAddressSpace();
  } else {
    // amdgcn.addrspacecast.nonnull carries the address spaces as constant
    // operands and guarantees the source is not null.
    assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(Num: 1);
    SrcAS = Op->getConstantOperandVal(Num: 2);
    DestAS = Op->getConstantOperandVal(Num: 3);
    IsNonNull = true;
  }

  SDValue FlatNullPtr = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i64);

  // flat -> local/private
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
        DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
      // The segment pointer is the low 32 bits of the flat pointer.
      SDValue Ptr = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Src);

      if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
          Subtarget->hasGloballyAddressableScratch()) {
        // flat -> private with globally addressable scratch: subtract
        // src_flat_scratch_base_lo.
        SDValue FlatScratchBaseLo(
            DAG.getMachineNode(
                Opcode: AMDGPU::S_MOV_B32, dl: SL, VT: MVT::i32,
                Op1: DAG.getRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, VT: MVT::i32)),
            0);
        Ptr = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: Ptr, N2: FlatScratchBaseLo);
      }

      if (IsNonNull || isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS))
        return Ptr;

      // Flat null must map to the segment null value, so select on a null
      // comparison of the original flat pointer.
      unsigned NullVal = TM.getNullPointerValue(AddrSpace: DestAS);
      SDValue SegmentNullPtr = DAG.getConstant(Val: NullVal, DL: SL, VT: MVT::i32);
      SDValue NonNull = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Src, RHS: FlatNullPtr, Cond: ISD::SETNE);

      return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i32, N1: NonNull, N2: Ptr,
                         N3: SegmentNullPtr);
    }
  }

  // local/private -> flat
  if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
    if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
        SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
      SDValue CvtPtr;
      if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
          Subtarget->hasGloballyAddressableScratch()) {
        // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
        // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
        // Compute the lane id via mbcnt (lo, plus hi on wave64).
        SDValue AllOnes = DAG.getSignedTargetConstant(Val: -1, DL: SL, VT: MVT::i32);
        SDValue ThreadID = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
        ThreadID = DAG.getNode(
            Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
            N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_mbcnt_lo, DL: SL, VT: MVT::i32),
            N2: AllOnes, N3: ThreadID);
        if (Subtarget->isWave64())
          ThreadID = DAG.getNode(
              Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
              N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_mbcnt_hi, DL: SL, VT: MVT::i32),
              N2: AllOnes, N3: ThreadID);
        SDValue ShAmt = DAG.getShiftAmountConstant(
            Val: 57 - 32 - Subtarget->getWavefrontSizeLog2(), VT: MVT::i32, DL: SL);
        SDValue SrcHi = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: ThreadID, N2: ShAmt);
        CvtPtr = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: SrcHi);
        CvtPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);
        // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
        // 64-bit hi:lo value.
        SDValue FlatScratchBase = {
            DAG.getMachineNode(
                Opcode: AMDGPU::S_MOV_B64, dl: SL, VT: MVT::i64,
                Op1: DAG.getRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE, VT: MVT::i64)),
            0};
        CvtPtr = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i64, N1: CvtPtr, N2: FlatScratchBase);
      } else {
        // Combine the 32-bit segment pointer with the aperture high half to
        // form the 64-bit flat address.
        SDValue Aperture = getSegmentAperture(AS: SrcAS, DL: SL, DAG);
        CvtPtr = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: Aperture);
        CvtPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);
      }

      if (IsNonNull || isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS))
        return CvtPtr;

      // Segment null must map to flat null.
      unsigned NullVal = TM.getNullPointerValue(AddrSpace: SrcAS);
      SDValue SegmentNullPtr = DAG.getConstant(Val: NullVal, DL: SL, VT: MVT::i32);

      SDValue NonNull =
          DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Src, RHS: SegmentNullPtr, Cond: ISD::SETNE);

      return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: NonNull, N2: CvtPtr,
                         N3: FlatNullPtr);
    }
  }

  // 32-bit constant -> 64-bit: splice in the function's known high bits, or
  // zero-extend when they are known to be zero.
  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      Op.getValueType() == MVT::i64) {
    const SIMachineFunctionInfo *Info =
        DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
    if (Info->get32BitAddressHighBits() == 0)
      return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i64, Operand: Src);

    SDValue Hi = DAG.getConstant(Val: Info->get32BitAddressHighBits(), DL: SL, VT: MVT::i32);
    SDValue Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: Hi);
    return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      Src.getValueType() == MVT::i64)
    return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Src);

  // global <-> flat are no-ops and never emitted.

  // Invalid casts are poison.
  return DAG.getPOISON(VT: Op->getValueType(ResNo: 0));
}
8757
// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
// the small vector and inserting them into the big vector. That is better than
// the default expansion of doing it via a stack slot. Even though the use of
// the stack slot would be optimized away afterwards, the stack slot itself
// remains.
SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(i: 0);
  SDValue Ins = Op.getOperand(i: 1);
  SDValue Idx = Op.getOperand(i: 2);
  EVT VecVT = Vec.getValueType();
  EVT InsVT = Ins.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned InsNumElts = InsVT.getVectorNumElements();
  unsigned IdxVal = Idx->getAsZExtVal();
  SDLoc SL(Op);

  // 16-bit elements at an even offset: bitcast both vectors to i32 vectors
  // and move the subvector over one 32-bit register at a time instead of one
  // 16-bit element at a time.
  if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
    // Insert 32-bit registers at a time.
    assert(InsNumElts % 2 == 0 && "expect legal vector types");

    unsigned VecNumElts = VecVT.getVectorNumElements();
    EVT NewVecVT =
        EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: VecNumElts / 2);
    // A two-element subvector becomes a single i32 scalar rather than a
    // one-element i32 vector.
    EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                                   : EVT::getVectorVT(Context&: *DAG.getContext(),
                                                      VT: MVT::i32, NumElements: InsNumElts / 2);

    Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVecVT, Operand: Vec);
    Ins = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewInsVT, Operand: Ins);

    for (unsigned I = 0; I != InsNumElts / 2; ++I) {
      SDValue Elt;
      if (InsNumElts == 2) {
        Elt = Ins;
      } else {
        Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Ins,
                          N2: DAG.getConstant(Val: I, DL: SL, VT: MVT::i32));
      }
      Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: NewVecVT, N1: Vec, N2: Elt,
                        N3: DAG.getConstant(Val: IdxVal / 2 + I, DL: SL, VT: MVT::i32));
    }

    return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Vec);
  }

  // General case: move the subvector over element by element.
  for (unsigned I = 0; I != InsNumElts; ++I) {
    SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Ins,
                              N2: DAG.getConstant(Val: I, DL: SL, VT: MVT::i32));
    Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: VecVT, N1: Vec, N2: Elt,
                      N3: DAG.getConstant(Val: IdxVal + I, DL: SL, VT: MVT::i32));
  }
  return Vec;
}
8812
/// Lower INSERT_VECTOR_ELT. Dynamic indices are lowered to a bit-mask blend
/// (v_bfi/v_bfm pattern) instead of a stack slot; v4i16/v4f16-style vectors
/// with a constant index get a 32-bit half split.
SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(i: 0);
  SDValue InsVal = Op.getOperand(i: 1);
  SDValue Idx = Op.getOperand(i: 2);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();
  SDLoc SL(Op);

  // Specially handle the case of v4i16 with static indexing.
  unsigned NumElts = VecVT.getVectorNumElements();
  auto *KIdx = dyn_cast<ConstantSDNode>(Val&: Idx);
  if (NumElts == 4 && EltSize == 16 && KIdx) {
    // Split into two 32-bit halves, insert into the relevant v2i16 half, and
    // reassemble.
    SDValue BCVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Vec);

    SDValue LoHalf = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: BCVec,
                                 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
    SDValue HiHalf = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: BCVec,
                                 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));

    SDValue LoVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: LoHalf);
    SDValue HiVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: HiHalf);

    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;
    SDValue InsHalf = DAG.getNode(
        Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: MVT::v2i16, N1: InsertLo ? LoVec : HiVec,
        N2: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: InsVal),
        N3: DAG.getConstant(Val: InsertLo ? Idx : (Idx - 2), DL: SL, VT: MVT::i32));

    InsHalf = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: InsHalf);

    SDValue Concat =
        InsertLo ? DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {InsHalf, HiHalf})
                 : DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {LoHalf, InsHalf});

    return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Concat);
  }

  // Static indexing does not lower to stack access, and hence there is no need
  // for special custom lowering to avoid stack access.
  if (isa<ConstantSDNode>(Val: Idx))
    return SDValue();

  // Avoid stack access for dynamic indexing by custom lowering to
  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec

  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");

  MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize);

  // Convert vector index to bit-index and get the required bit mask.
  assert(isPowerOf2_32(EltSize));
  const auto EltMask = maskTrailingOnes<uint64_t>(N: EltSize);
  SDValue ScaleFactor = DAG.getConstant(Val: Log2_32(Value: EltSize), DL: SL, VT: MVT::i32);
  SDValue ScaledIdx = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Idx, N2: ScaleFactor);
  // BFM: an element-wide mask of ones positioned at the target bit index.
  SDValue BFM = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: IntVT,
                            N1: DAG.getConstant(Val: EltMask, DL: SL, VT: IntVT), N2: ScaledIdx);

  // 1. Create a congruent vector with the target value in each element.
  SDValue ExtVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT,
                               Operand: DAG.getSplatBuildVector(VT: VecVT, DL: SL, Op: InsVal));

  // 2. Mask off all other indices except the required index within (1).
  SDValue LHS = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT, N1: BFM, N2: ExtVal);

  // 3. Mask off the required index within the target vector.
  SDValue BCVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec);
  SDValue RHS =
      DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT, N1: DAG.getNOT(DL: SL, Val: BFM, VT: IntVT), N2: BCVec);

  // 4. Get (2) and (3) ORed into the target vector.
  SDValue BFI =
      DAG.getNode(Opcode: ISD::OR, DL: SL, VT: IntVT, N1: LHS, N2: RHS, Flags: SDNodeFlags::Disjoint);

  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: BFI);
}
8892
/// Lower EXTRACT_VECTOR_ELT. Wide vectors (128/256/512 bits) are recursively
/// split in half and the correct half is selected by the index; small vectors
/// are extracted via an integer shift of the bitcast value.
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc SL(Op);

  EVT ResultVT = Op.getValueType();
  SDValue Vec = Op.getOperand(i: 0);
  SDValue Idx = Op.getOperand(i: 1);
  EVT VecVT = Vec.getValueType();
  unsigned VecSize = VecVT.getSizeInBits();
  EVT EltVT = VecVT.getVectorElementType();

  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);

  // Make sure we do any optimizations that will make it easier to fold
  // source modifiers before obscuring it with bit operations.

  // XXX - Why doesn't this get called when vector_shuffle is expanded?
  if (SDValue Combined = performExtractVectorEltCombine(N: Op.getNode(), DCI))
    return Combined;

  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
    SDValue Lo, Hi;
    auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VT: VecVT);

    // Split the vector into a low and a high half via i64 pieces.
    if (VecSize == 128) {
      SDValue V2 = DAG.getBitcast(VT: MVT::v2i64, V: Vec);
      Lo = DAG.getBitcast(VT: LoVT,
                          V: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
                                      N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32)));
      Hi = DAG.getBitcast(VT: HiVT,
                          V: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
                                      N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32)));
    } else if (VecSize == 256) {
      SDValue V2 = DAG.getBitcast(VT: MVT::v4i64, V: Vec);
      SDValue Parts[4];
      for (unsigned P = 0; P < 4; ++P) {
        Parts[P] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
                               N2: DAG.getConstant(Val: P, DL: SL, VT: MVT::i32));
      }

      Lo = DAG.getBitcast(VT: LoVT, V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i64,
                                             N1: Parts[0], N2: Parts[1]));
      Hi = DAG.getBitcast(VT: HiVT, V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i64,
                                             N1: Parts[2], N2: Parts[3]));
    } else {
      assert(VecSize == 512);

      SDValue V2 = DAG.getBitcast(VT: MVT::v8i64, V: Vec);
      SDValue Parts[8];
      for (unsigned P = 0; P < 8; ++P) {
        Parts[P] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
                               N2: DAG.getConstant(Val: P, DL: SL, VT: MVT::i32));
      }

      Lo = DAG.getBitcast(VT: LoVT,
                          V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v4i64,
                                      N1: Parts[0], N2: Parts[1], N3: Parts[2], N4: Parts[3]));
      Hi = DAG.getBitcast(VT: HiVT,
                          V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v4i64,
                                      N1: Parts[4], N2: Parts[5], N3: Parts[6], N4: Parts[7]));
    }

    // Select Hi when the index addresses the upper half, then extract with
    // the index reduced modulo the half size.
    EVT IdxVT = Idx.getValueType();
    unsigned NElem = VecVT.getVectorNumElements();
    assert(isPowerOf2_32(NElem));
    SDValue IdxMask = DAG.getConstant(Val: NElem / 2 - 1, DL: SL, VT: IdxVT);
    SDValue NewIdx = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IdxVT, N1: Idx, N2: IdxMask);
    SDValue Half = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IdxMask, True: Hi, False: Lo, Cond: ISD::SETUGT);
    return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Half, N2: NewIdx);
  }

  assert(VecSize <= 64);

  MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize);

  // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
  SDValue VecBC = peekThroughBitcasts(V: Vec);
  if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
    SDValue Src = VecBC.getOperand(i: 0);
    Src = DAG.getBitcast(VT: Src.getValueType().changeTypeToInteger(), V: Src);
    Vec = DAG.getAnyExtOrTrunc(Op: Src, DL: SL, VT: IntVT);
  }

  unsigned EltSize = EltVT.getSizeInBits();
  assert(isPowerOf2_32(EltSize));

  SDValue ScaleFactor = DAG.getConstant(Val: Log2_32(Value: EltSize), DL: SL, VT: MVT::i32);

  // Convert vector index to bit-index (* EltSize)
  SDValue ScaledIdx = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Idx, N2: ScaleFactor);

  // Shift the desired element down to bit 0 of the bitcast integer.
  SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec);
  SDValue Elt = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: IntVT, N1: BC, N2: ScaledIdx);

  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
    SDValue Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i16, Operand: Elt);
    return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: ResultVT, Operand: Result);
  }

  return DAG.getAnyExtOrTrunc(Op: Elt, DL: SL, VT: ResultVT);
}
8994
8995static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8996 assert(Elt % 2 == 0);
8997 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8998}
8999
9000static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
9001 assert(Elt % 2 == 0);
9002 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
9003 !(Mask[Elt + 1] & 1);
9004}
9005
/// Lower VECTOR_SHUFFLE by breaking the result into two-element pieces, using
/// subvector extracts or small sub-shuffles where the mask allows, and
/// scalarizing the rest.
SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT ResultVT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val&: Op);
  MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
  const int NewSrcNumElts = 2;
  MVT PackVT = MVT::getVectorVT(VT: EltVT, NumElements: NewSrcNumElts);
  int SrcNumElts = Op.getOperand(i: 0).getValueType().getVectorNumElements();

  // Break up the shuffle into registers sized pieces.
  //
  // We're trying to form sub-shuffles that the register allocation pipeline
  // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
  // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
  // pair of copies into a consecutive register copy, so use the ordinary
  // extract_vector_elt lowering unless we can use the shuffle.
  //
  // TODO: This is a bit of hack, and we should probably always use
  // extract_subvector for the largest possible subvector we can (or at least
  // use it for PackVT aligned pieces). However we have worse support for
  // combines on them don't directly treat extract_subvector / insert_subvector
  // as legal. The DAG scheduler also ends up doing a worse job with the
  // extract_subvectors.
  const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;

  // vector_shuffle <0,1,6,7> lhs, rhs
  // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
  //
  // vector_shuffle <6,7,2,3> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
  //
  // vector_shuffle <6,7,0,1> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)

  // Avoid scalarizing when both halves are reading from consecutive elements.

  // If we're treating 2 element shuffles as legal, also create odd-to-even
  // shuffles of neighboring pairs.
  //
  // vector_shuffle <3,2,7,6> lhs, rhs
  // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
  //                   vector_shuffle <1, 0> (extract_subvector rhs, 2)

  SmallVector<SDValue, 16> Pieces;
  for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
    if (ShouldUseConsecutiveExtract &&
        elementPairIsContiguous(Mask: SVN->getMask(), Elt: I)) {
      // Case 1: the pair maps to two consecutive, pair-aligned elements of one
      // source; extract them as a subvector.
      const int Idx = SVN->getMaskElt(Idx: I);
      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
      SDValue SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT,
                                   N1: SVN->getOperand(Num: VecIdx),
                                   N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
      Pieces.push_back(Elt: SubVec);
    } else if (elementPairIsOddToEven(Mask: SVN->getMask(), Elt: I) &&
               isOperationLegal(Op: ISD::VECTOR_SHUFFLE, VT: PackVT)) {
      // Case 2: odd-to-even pair; express it as a small two-element shuffle of
      // pair-aligned subvectors.
      int Idx0 = SVN->getMaskElt(Idx: I);
      int Idx1 = SVN->getMaskElt(Idx: I + 1);

      SDValue SrcOp0 = SVN->getOperand(Num: 0);
      SDValue SrcOp1 = SrcOp0;
      // Fold indices referring to the second shuffle operand into its source.
      if (Idx0 >= SrcNumElts) {
        SrcOp0 = SVN->getOperand(Num: 1);
        Idx0 -= SrcNumElts;
      }

      if (Idx1 >= SrcNumElts) {
        SrcOp1 = SVN->getOperand(Num: 1);
        Idx1 -= SrcNumElts;
      }

      int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
      int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);

      // Extract nearest even aligned piece.
      SDValue SubVec0 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT, N1: SrcOp0,
                                    N2: DAG.getConstant(Val: AlignedIdx0, DL: SL, VT: MVT::i32));
      SDValue SubVec1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT, N1: SrcOp1,
                                    N2: DAG.getConstant(Val: AlignedIdx1, DL: SL, VT: MVT::i32));

      int NewMaskIdx0 = Idx0 - AlignedIdx0;
      int NewMaskIdx1 = Idx1 - AlignedIdx1;

      SDValue Result0 = SubVec0;
      SDValue Result1 = SubVec0;

      if (SubVec0 != SubVec1) {
        // Distinct pieces: the second mask index addresses the second operand.
        NewMaskIdx1 += NewSrcNumElts;
        Result1 = SubVec1;
      } else {
        // Same piece on both lanes: single-operand shuffle.
        Result1 = DAG.getPOISON(VT: PackVT);
      }

      SDValue Shuf = DAG.getVectorShuffle(VT: PackVT, dl: SL, N1: Result0, N2: Result1,
                                          Mask: {NewMaskIdx0, NewMaskIdx1});
      Pieces.push_back(Elt: Shuf);
    } else {
      // Fallback: scalarize the pair with two element extracts.
      const int Idx0 = SVN->getMaskElt(Idx: I);
      const int Idx1 = SVN->getMaskElt(Idx: I + 1);
      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;

      SDValue Vec0 = SVN->getOperand(Num: VecIdx0);
      SDValue Elt0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec0,
                                 N2: DAG.getSignedConstant(Val: EltIdx0, DL: SL, VT: MVT::i32));

      SDValue Vec1 = SVN->getOperand(Num: VecIdx1);
      SDValue Elt1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec1,
                                 N2: DAG.getSignedConstant(Val: EltIdx1, DL: SL, VT: MVT::i32));
      Pieces.push_back(Elt: DAG.getBuildVector(VT: PackVT, DL: SL, Ops: {Elt0, Elt1}));
    }
  }

  return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT: ResultVT, Ops: Pieces);
}
9124
9125SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
9126 SelectionDAG &DAG) const {
9127 SDValue SVal = Op.getOperand(i: 0);
9128 EVT ResultVT = Op.getValueType();
9129 EVT SValVT = SVal.getValueType();
9130 SDValue UndefVal = DAG.getPOISON(VT: SValVT);
9131 SDLoc SL(Op);
9132
9133 SmallVector<SDValue, 8> VElts;
9134 VElts.push_back(Elt: SVal);
9135 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
9136 VElts.push_back(Elt: UndefVal);
9137
9138 return DAG.getBuildVector(VT: ResultVT, DL: SL, Ops: VElts);
9139}
9140
// Lower BUILD_VECTOR for subtargets without packed (VOP3P) operations.
// A 2 x 16-bit vector is assembled in a 32-bit integer by zero-extending the
// halves and OR-ing the high half shifted left by 16; wider vectors are split
// into 2-element chunks which are each bitcast to a 32-bit scalar and then
// recombined.
SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT VT = Op.getValueType();

  if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
    assert(!Subtarget->hasVOP3PInsts() && "this should be legal");

    SDValue Lo = Op.getOperand(i: 0);
    SDValue Hi = Op.getOperand(i: 1);

    // Avoid adding defined bits with the zero_extend.
    if (Hi.isUndef()) {
      // Only the low lane matters: any-extend it into the 32-bit container.
      Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Lo);
      SDValue ExtLo = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Lo);
      return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ExtLo);
    }

    Hi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Hi);
    Hi = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Hi);

    // Place the high half in bits [31:16].
    SDValue ShlHi = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Hi,
                                N2: DAG.getConstant(Val: 16, DL: SL, VT: MVT::i32));
    if (Lo.isUndef())
      return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ShlHi);

    Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Lo);
    Lo = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Lo);

    // The operands cover disjoint bit ranges, so the OR is an add/disjoint.
    SDValue Or =
        DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Lo, N2: ShlHi, Flags: SDNodeFlags::Disjoint);
    return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Or);
  }

  // Split into 2-element chunks.
  const unsigned NumParts = VT.getVectorNumElements() / 2;
  EVT PartVT = MVT::getVectorVT(VT: VT.getVectorElementType().getSimpleVT(), NumElements: 2);
  MVT PartIntVT = MVT::getIntegerVT(BitWidth: PartVT.getSizeInBits());

  // Build each 2-element part, then view it as a single integer lane.
  SmallVector<SDValue> Casts;
  for (unsigned P = 0; P < NumParts; ++P) {
    SDValue Vec = DAG.getBuildVector(
        VT: PartVT, DL: SL, Ops: {Op.getOperand(i: P * 2), Op.getOperand(i: P * 2 + 1)});
    Casts.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: PartIntVT, Operand: Vec));
  }

  // Recombine the integer lanes and bitcast back to the requested type.
  SDValue Blend =
      DAG.getBuildVector(VT: MVT::getVectorVT(VT: PartIntVT, NumElements: NumParts), DL: SL, Ops: Casts);
  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Blend);
}
9191
9192bool SITargetLowering::isOffsetFoldingLegal(
9193 const GlobalAddressSDNode *GA) const {
9194 // OSes that use ELF REL relocations (instead of RELA) can only store a
9195 // 32-bit addend in the instruction, so it is not safe to allow offset folding
9196 // which can create arbitrary 64-bit addends. (This is only a problem for
9197 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
9198 // the high 32 bits of the addend.)
9199 //
9200 // This should be kept in sync with how HasRelocationAddend is initialized in
9201 // the constructor of ELFAMDGPUAsmBackend.
9202 if (!Subtarget->isAmdHsaOS())
9203 return false;
9204
9205 // We can fold offsets for anything that doesn't require a GOT relocation.
9206 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
9207 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
9208 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
9209 !shouldEmitGOTReloc(GV: GA->getGlobal());
9210}
9211
/// Build a pc-relative address for \p GV at \p Offset with relocation flags
/// \p GAFlags, returning a PC_ADD_REL_OFFSET (or the 64-bit-literal variant)
/// node of type \p PtrVT.
static SDValue
buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
                        const SDLoc &DL, int64_t Offset, EVT PtrVT,
                        unsigned GAFlags = SIInstrInfo::MO_NONE) {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
  // lowered to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
    assert(GAFlags != SIInstrInfo::MO_NONE);

    // NOTE(review): GAFlags + 2 appears to select the 64-bit variant of the
    // relocation flag — relies on the MO_* flag ordering in SIInstrInfo;
    // confirm against that enum before changing flag values.
    SDValue Ptr =
        DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i64, offset: Offset, TargetFlags: GAFlags + 2);
    return DAG.getNode(Opcode: AMDGPUISD::PC_ADD_REL_OFFSET64, DL, VT: PtrVT, Operand: Ptr);
  }

  SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: Offset, TargetFlags: GAFlags);
  SDValue PtrHi;
  if (GAFlags == SIInstrInfo::MO_NONE)
    PtrHi = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32);
  else
    // GAFlags + 1 is the matching *_HI relocation flag (same ordering
    // assumption as above).
    PtrHi = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: Offset, TargetFlags: GAFlags + 1);
  return DAG.getNode(Opcode: AMDGPUISD::PC_ADD_REL_OFFSET, DL, VT: PtrVT, N1: PtrLo, N2: PtrHi);
}
9256
// Lower a global address by address space: LDS globals become absolute LDS
// offsets, PAL/Mesa globals use absolute S_MOV materialization, and HSA
// globals use pc-relative addressing (direct, REL32, or via a GOT load).
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                             SDValue Op,
                                             SelectionDAG &DAG) const {
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Val&: Op);
  SDLoc DL(GSD);
  EVT PtrVT = Op.getValueType();

  const GlobalValue *GV = GSD->getGlobal();
  if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
       shouldUseLDSConstAddress(GV)) ||
      GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
      GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
    if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
        GV->hasExternalLinkage()) {
      const GlobalVariable &GVar = *cast<GlobalVariable>(Val: GV);
      // HIP uses an unsized array `extern __shared__ T s[]` or similar
      // zero-sized type in other languages to declare the dynamic shared
      // memory which size is not known at the compile time. They will be
      // allocated by the runtime and placed directly after the static
      // allocated ones. They all share the same offset.
      if (GVar.getGlobalSize(DL: GVar.getDataLayout()) == 0) {
        assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
        // Adjust alignment for that dynamic shared memory array.
        Function &F = DAG.getMachineFunction().getFunction();
        MFI->setDynLDSAlign(F, GV: GVar);
        MFI->setUsesDynamicLDS(true);
        // The dynamic region starts right after all static LDS allocations,
        // i.e. at the total static group size.
        return SDValue(
            DAG.getMachineNode(Opcode: AMDGPU::GET_GROUPSTATICSIZE, dl: DL, VT: PtrVT), 0);
      }
    }
    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
  }

  // Statically-allocated LDS: emit an absolute 32-bit LDS offset.
  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: GSD->getOffset(),
                                            TargetFlags: SIInstrInfo::MO_ABS32_LO);
    return DAG.getNode(Opcode: AMDGPUISD::LDS, DL, VT: MVT::i32, Operand: GA);
  }

  // PAL/Mesa: materialize the absolute address with S_MOV, either as a single
  // 64-bit literal or as a lo/hi pair of 32-bit moves.
  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
    if (Subtarget->has64BitLiterals()) {
      SDValue Addr = DAG.getTargetGlobalAddress(
          GV, DL, VT: MVT::i64, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS64);
      return SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B64, dl: DL, VT: MVT::i64, Op1: Addr),
                     0);
    }

    SDValue AddrLo = DAG.getTargetGlobalAddress(
        GV, DL, VT: MVT::i32, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS32_LO);
    AddrLo = {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: AddrLo), 0};

    SDValue AddrHi = DAG.getTargetGlobalAddress(
        GV, DL, VT: MVT::i32, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS32_HI);
    AddrHi = {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: AddrHi), 0};

    return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL, VT: MVT::i64, N1: AddrLo, N2: AddrHi);
  }

  // Direct pc-relative fixup (no relocation flags).
  if (shouldEmitFixup(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT);

  // Pc-relative relocation against the symbol itself.
  if (shouldEmitPCReloc(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT,
                                   GAFlags: SIInstrInfo::MO_REL32);

  // Otherwise go through the GOT: compute the GOT slot address pc-relatively
  // (note Offset 0 — the symbol offset cannot be folded into a GOT slot),
  // then load the pointer from that slot.
  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, Offset: 0, PtrVT,
                                            GAFlags: SIInstrInfo::MO_GOTPCREL32);
  PointerType *PtrTy =
      PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::CONSTANT_ADDRESS);
  const DataLayout &DataLayout = DAG.getDataLayout();
  Align Alignment = DataLayout.getABITypeAlign(Ty: PtrTy);
  MachinePointerInfo PtrInfo =
      MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction());

  return DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: GOTAddr, PtrInfo, Alignment,
                     MMOFlags: MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant);
}
9335
9336SDValue SITargetLowering::LowerExternalSymbol(SDValue Op,
9337 SelectionDAG &DAG) const {
9338 // TODO: Handle this. It should be mostly the same as LowerGlobalAddress.
9339 const Function &Fn = DAG.getMachineFunction().getFunction();
9340 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9341 Fn, "unsupported external symbol", Op.getDebugLoc()));
9342 return DAG.getPOISON(VT: Op.getValueType());
9343}
9344
/// Copy \p V into the m0 register, returning the resulting chain node
/// (result 0 of SI_INIT_M0; result 1 is the glue consumed by m0 users).
SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
                                   const SDLoc &DL, SDValue V) const {
  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
  // the destination register.
  //
  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
  // so we will end up with redundant moves to m0.
  //
  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.

  // A Null SDValue creates a glue result.
  SDNode *M0 = DAG.getMachineNode(Opcode: AMDGPU::SI_INIT_M0, dl: DL, VT1: MVT::Other, VT2: MVT::Glue,
                                  Op1: V, Op2: Chain);
  return SDValue(M0, 0);
}
9360
/// Load a 32-bit kernel argument at \p Offset whose meaningful bits fit in
/// \p VT, and attach an AssertZext so later passes know the bits above VT
/// are zero.
SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
                                                 MVT VT,
                                                 unsigned Offset) const {
  SDLoc SL(Op);
  // Load the full dword from the kernarg segment (unsigned, 4-byte aligned).
  SDValue Param = lowerKernargMemParameter(
      DAG, VT: MVT::i32, MemVT: MVT::i32, SL, Chain: DAG.getEntryNode(), Offset, Alignment: Align(4), Signed: false);
  // The local size values will have the hi 16-bits as zero.
  return DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: MVT::i32, N1: Param,
                     N2: DAG.getValueType(VT));
}
9371
9372static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
9373 EVT VT) {
9374 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9375 DAG.getMachineFunction().getFunction(),
9376 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
9377 return DAG.getPOISON(VT);
9378}
9379
9380static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
9381 EVT VT) {
9382 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9383 DAG.getMachineFunction().getFunction(),
9384 "intrinsic not supported on subtarget", DL.getDebugLoc()));
9385 return DAG.getPOISON(VT);
9386}
9387
9388static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
9389 ArrayRef<SDValue> Elts) {
9390 assert(!Elts.empty());
9391 MVT Type;
9392 unsigned NumElts = Elts.size();
9393
9394 if (NumElts <= 12) {
9395 Type = MVT::getVectorVT(VT: MVT::f32, NumElements: NumElts);
9396 } else {
9397 assert(Elts.size() <= 16);
9398 Type = MVT::v16f32;
9399 NumElts = 16;
9400 }
9401
9402 SmallVector<SDValue, 16> VecElts(NumElts);
9403 for (unsigned i = 0; i < Elts.size(); ++i) {
9404 SDValue Elt = Elts[i];
9405 if (Elt.getValueType() != MVT::f32)
9406 Elt = DAG.getBitcast(VT: MVT::f32, V: Elt);
9407 VecElts[i] = Elt;
9408 }
9409 for (unsigned i = Elts.size(); i < NumElts; ++i)
9410 VecElts[i] = DAG.getPOISON(VT: MVT::f32);
9411
9412 if (NumElts == 1)
9413 return VecElts[0];
9414 return DAG.getBuildVector(VT: Type, DL, Ops: VecElts);
9415}
9416
9417static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9418 SDValue Src, int ExtraElts) {
9419 EVT SrcVT = Src.getValueType();
9420
9421 SmallVector<SDValue, 8> Elts;
9422
9423 if (SrcVT.isVector())
9424 DAG.ExtractVectorElements(Op: Src, Args&: Elts);
9425 else
9426 Elts.push_back(Elt: Src);
9427
9428 SDValue Undef = DAG.getPOISON(VT: SrcVT.getScalarType());
9429 while (ExtraElts--)
9430 Elts.push_back(Elt: Undef);
9431
9432 return DAG.getBuildVector(VT: CastVT, DL, Ops: Elts);
9433}
9434
// Re-construct the required return value for a image load intrinsic.
// This is more complicated due to the optional use TexFailCtrl which means the
// required return type is an aggregate
//
// \p Result        the raw MIMG machine node
// \p ResultTypes   the original intrinsic result types (element 0 is the data)
// \p DMaskPop      number of dwords populated by the dmask
// \p NumVDataDwords total dwords produced by the instruction
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
                                 ArrayRef<EVT> ResultTypes, bool IsTexFail,
                                 bool Unpacked, bool IsD16, int DMaskPop,
                                 int NumVDataDwords, bool IsAtomicPacked16Bit,
                                 const SDLoc &DL) {
  // Determine the required return type. This is the same regardless of
  // IsTexFail flag
  EVT ReqRetVT = ResultTypes[0];
  int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
  // Packed D16 and packed-16-bit atomics store two 16-bit elements per dword.
  int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
                          ? (ReqRetNumElts + 1) / 2
                          : ReqRetNumElts;

  int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;

  MVT DataDwordVT =
      NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(VT: MVT::i32, NumElements: NumDataDwords);

  MVT MaskPopVT =
      MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(VT: MVT::i32, NumElements: MaskPopDwords);

  SDValue Data(Result, 0);
  SDValue TexFail;

  // Trim the instruction result down to the dwords actually populated by the
  // dmask (scalar extract when only one dword remains).
  if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
    SDValue ZeroIdx = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
    if (MaskPopVT.isVector()) {
      Data = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MaskPopVT,
                         N1: SDValue(Result, 0), N2: ZeroIdx);
    } else {
      Data = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MaskPopVT,
                         N1: SDValue(Result, 0), N2: ZeroIdx);
    }
  }

  // Pad back out to the full data width with poison lanes.
  if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
    Data = padEltsToUndef(DAG, DL, CastVT: DataDwordVT, Src: Data,
                          ExtraElts: NumDataDwords - MaskPopDwords);

  if (IsD16)
    Data = adjustLoadValueTypeImpl(Result: Data, LoadVT: ReqRetVT, DL, DAG, Unpacked);

  EVT LegalReqRetVT = ReqRetVT;
  if (!ReqRetVT.isVector()) {
    // Scalar result: go through an integer of matching width, then truncate.
    if (!Data.getValueType().isInteger())
      Data = DAG.getNode(Opcode: ISD::BITCAST, DL,
                         VT: Data.getValueType().changeTypeToInteger(), Operand: Data);
    Data = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ReqRetVT.changeTypeToInteger(), Operand: Data);
  } else {
    // We need to widen the return vector to a legal type
    if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
        ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
      LegalReqRetVT =
          EVT::getVectorVT(Context&: *DAG.getContext(), VT: ReqRetVT.getVectorElementType(),
                           NumElements: ReqRetVT.getVectorNumElements() + 1);
    }
  }
  Data = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LegalReqRetVT, Operand: Data);

  // The TFE status dword sits immediately after the populated data dwords.
  if (IsTexFail) {
    TexFail =
        DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: SDValue(Result, 0),
                    N2: DAG.getConstant(Val: MaskPopDwords, DL, VT: MVT::i32));

    return DAG.getMergeValues(Ops: {Data, TexFail, SDValue(Result, 1)}, dl: DL);
  }

  if (Result->getNumValues() == 1)
    return Data;

  return DAG.getMergeValues(Ops: {Data, SDValue(Result, 1)}, dl: DL);
}
9510
9511static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9512 SDValue *LWE, bool &IsTexFail) {
9513 auto *TexFailCtrlConst = cast<ConstantSDNode>(Val: TexFailCtrl.getNode());
9514
9515 uint64_t Value = TexFailCtrlConst->getZExtValue();
9516 if (Value) {
9517 IsTexFail = true;
9518 }
9519
9520 SDLoc DL(TexFailCtrlConst);
9521 *TFE = DAG.getTargetConstant(Val: (Value & 0x1) ? 1 : 0, DL, VT: MVT::i32);
9522 Value &= ~(uint64_t)0x1;
9523 *LWE = DAG.getTargetConstant(Val: (Value & 0x2) ? 1 : 0, DL, VT: MVT::i32);
9524 Value &= ~(uint64_t)0x2;
9525
9526 return Value == 0;
9527}
9528
/// Pack 16-bit image operands (addresses or gradients) in operand range
/// [\p DimIdx, \p EndIdx) of \p Op into dword-sized values appended to
/// \p PackedAddrs. Adjacent pairs are packed into one \p PackVectorVT value;
/// an element left unpaired (range end, or an odd gradient group — see
/// \p NumGradients) occupies a full dword via any-extend.
static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
                                      MVT PackVectorVT,
                                      SmallVectorImpl<SDValue> &PackedAddrs,
                                      unsigned DimIdx, unsigned EndIdx,
                                      unsigned NumGradients) {
  SDLoc DL(Op);
  for (unsigned I = DimIdx; I < EndIdx; I++) {
    SDValue Addr = Op.getOperand(i: I);

    // Gradients are packed with undef for each coordinate.
    // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
    //  1D: undef,dx/dh; undef,dx/dv
    //  2D: dy/dh,dx/dh; dy/dv,dx/dv
    //  3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
    if (((I + 1) >= EndIdx) ||
        ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
                                         I == DimIdx + NumGradients - 1))) {
      // Unpaired element: widen it to occupy a full dword on its own.
      if (Addr.getValueType() != MVT::i16)
        Addr = DAG.getBitcast(VT: MVT::i16, V: Addr);
      Addr = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Addr);
    } else {
      // Pack this element with its successor and skip the consumed operand.
      Addr = DAG.getBuildVector(VT: PackVectorVT, DL, Ops: {Addr, Op.getOperand(i: I + 1)});
      I++;
    }
    // All packed addresses are carried as f32 dwords.
    Addr = DAG.getBitcast(VT: MVT::f32, V: Addr);
    PackedAddrs.push_back(Elt: Addr);
  }
}
9557
9558SDValue SITargetLowering::lowerImage(SDValue Op,
9559 const AMDGPU::ImageDimIntrinsicInfo *Intr,
9560 SelectionDAG &DAG, bool WithChain) const {
9561 SDLoc DL(Op);
9562 MachineFunction &MF = DAG.getMachineFunction();
9563 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9564 unsigned IntrOpcode = Intr->BaseOpcode;
9565 // For image atomic: use no-return opcode if result is unused.
9566 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
9567 !Op.getNode()->hasAnyUseOfValue(Value: 0))
9568 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
9569 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9570 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: IntrOpcode);
9571 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(DimEnum: Intr->Dim);
9572 bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI: *Subtarget);
9573 bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: *Subtarget);
9574 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
9575
9576 SmallVector<EVT, 3> ResultTypes(Op->values());
9577 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9578 if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
9579 ResultTypes.erase(CI: &ResultTypes[0]);
9580
9581 bool IsD16 = false;
9582 bool IsG16 = false;
9583 bool IsA16 = false;
9584 SDValue VData;
9585 int NumVDataDwords = 0;
9586 bool AdjustRetType = false;
9587 bool IsAtomicPacked16Bit = false;
9588
9589 // Offset of intrinsic arguments
9590 const unsigned ArgOffset = WithChain ? 2 : 1;
9591
9592 unsigned DMask;
9593 unsigned DMaskLanes = 0;
9594
9595 if (BaseOpcode->Atomic) {
9596 VData = Op.getOperand(i: 2);
9597
9598 IsAtomicPacked16Bit =
9599 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9600 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9601 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9602 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9603
9604 bool Is64Bit = VData.getValueSizeInBits() == 64;
9605 if (BaseOpcode->AtomicX2) {
9606 SDValue VData2 = Op.getOperand(i: 3);
9607 VData = DAG.getBuildVector(VT: Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9608 Ops: {VData, VData2});
9609 if (Is64Bit)
9610 VData = DAG.getBitcast(VT: MVT::v4i32, V: VData);
9611
9612 if (!BaseOpcode->NoReturn)
9613 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9614
9615 DMask = Is64Bit ? 0xf : 0x3;
9616 NumVDataDwords = Is64Bit ? 4 : 2;
9617 } else {
9618 DMask = Is64Bit ? 0x3 : 0x1;
9619 NumVDataDwords = Is64Bit ? 2 : 1;
9620 }
9621 } else {
9622 DMask = Op->getConstantOperandVal(Num: ArgOffset + Intr->DMaskIndex);
9623 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(Value: DMask);
9624
9625 if (BaseOpcode->Store) {
9626 VData = Op.getOperand(i: 2);
9627
9628 MVT StoreVT = VData.getSimpleValueType();
9629 if (StoreVT.getScalarType() == MVT::f16) {
9630 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9631 return Op; // D16 is unsupported for this instruction
9632
9633 IsD16 = true;
9634 VData = handleD16VData(VData, DAG, ImageStore: true);
9635 }
9636
9637 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9638 } else if (!BaseOpcode->NoReturn) {
9639 // Work out the num dwords based on the dmask popcount and underlying type
9640 // and whether packing is supported.
9641 MVT LoadVT = ResultTypes[0].getSimpleVT();
9642 if (LoadVT.getScalarType() == MVT::f16) {
9643 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9644 return Op; // D16 is unsupported for this instruction
9645
9646 IsD16 = true;
9647 }
9648
9649 // Confirm that the return type is large enough for the dmask specified
9650 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9651 (!LoadVT.isVector() && DMaskLanes > 1))
9652 return Op;
9653
9654 // The sq block of gfx8 and gfx9 do not estimate register use correctly
9655 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9656 // instructions.
9657 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9658 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9659 NumVDataDwords = (DMaskLanes + 1) / 2;
9660 else
9661 NumVDataDwords = DMaskLanes;
9662
9663 AdjustRetType = true;
9664 }
9665 }
9666
9667 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9668 SmallVector<SDValue, 4> VAddrs;
9669
9670 // Check for 16 bit addresses or derivatives and pack if true.
9671 MVT VAddrVT =
9672 Op.getOperand(i: ArgOffset + Intr->GradientStart).getSimpleValueType();
9673 MVT VAddrScalarVT = VAddrVT.getScalarType();
9674 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9675 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9676
9677 VAddrVT = Op.getOperand(i: ArgOffset + Intr->CoordStart).getSimpleValueType();
9678 VAddrScalarVT = VAddrVT.getScalarType();
9679 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9680 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9681
9682 // Push back extra arguments.
9683 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9684 if (IsA16 && (Op.getOperand(i: ArgOffset + I).getValueType() == MVT::f16)) {
9685 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9686 // Special handling of bias when A16 is on. Bias is of type half but
9687 // occupies full 32-bit.
9688 SDValue Bias = DAG.getBuildVector(
9689 VT: MVT::v2f16, DL,
9690 Ops: {Op.getOperand(i: ArgOffset + I), DAG.getPOISON(VT: MVT::f16)});
9691 VAddrs.push_back(Elt: Bias);
9692 } else {
9693 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9694 "Bias needs to be converted to 16 bit in A16 mode");
9695 VAddrs.push_back(Elt: Op.getOperand(i: ArgOffset + I));
9696 }
9697 }
9698
9699 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9700 // 16 bit gradients are supported, but are tied to the A16 control
9701 // so both gradients and addresses must be 16 bit
9702 LLVM_DEBUG(
9703 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9704 "require 16 bit args for both gradients and addresses");
9705 return Op;
9706 }
9707
9708 if (IsA16) {
9709 if (!ST->hasA16()) {
9710 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9711 "support 16 bit addresses\n");
9712 return Op;
9713 }
9714 }
9715
9716 // We've dealt with incorrect input so we know that if IsA16, IsG16
9717 // are set then we have to compress/pack operands (either address,
9718 // gradient or both)
9719 // In the case where a16 and gradients are tied (no G16 support) then we
9720 // have already verified that both IsA16 and IsG16 are true
9721 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9722 // Activate g16
9723 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9724 AMDGPU::getMIMGG16MappingInfo(G: Intr->BaseOpcode);
9725 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9726 }
9727
9728 // Add gradients (packed or unpacked)
9729 if (IsG16) {
9730 // Pack the gradients
9731 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9732 packImage16bitOpsToDwords(DAG, Op, PackVectorVT: GradPackVectorVT, PackedAddrs&: VAddrs,
9733 DimIdx: ArgOffset + Intr->GradientStart,
9734 EndIdx: ArgOffset + Intr->CoordStart, NumGradients: Intr->NumGradients);
9735 } else {
9736 for (unsigned I = ArgOffset + Intr->GradientStart;
9737 I < ArgOffset + Intr->CoordStart; I++)
9738 VAddrs.push_back(Elt: Op.getOperand(i: I));
9739 }
9740
9741 // Add addresses (packed or unpacked)
9742 if (IsA16) {
9743 packImage16bitOpsToDwords(DAG, Op, PackVectorVT: AddrPackVectorVT, PackedAddrs&: VAddrs,
9744 DimIdx: ArgOffset + Intr->CoordStart, EndIdx: VAddrEnd,
9745 NumGradients: 0 /* No gradients */);
9746 } else {
9747 // Add uncompressed address
9748 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9749 VAddrs.push_back(Elt: Op.getOperand(i: I));
9750 }
9751
9752 // If the register allocator cannot place the address registers contiguously
9753 // without introducing moves, then using the non-sequential address encoding
9754 // is always preferable, since it saves VALU instructions and is usually a
9755 // wash in terms of code size or even better.
9756 //
9757 // However, we currently have no way of hinting to the register allocator that
9758 // MIMG addresses should be placed contiguously when it is possible to do so,
9759 // so force non-NSA for the common 2-address case as a heuristic.
9760 //
9761 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9762 // allocation when possible.
9763 //
9764 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9765 // set of the remaining addresses.
9766 const unsigned NSAMaxSize = ST->getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
9767 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9768 const bool UseNSA = ST->hasNSAEncoding() &&
9769 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9770 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9771 const bool UsePartialNSA =
9772 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9773
9774 SDValue VAddr;
9775 if (UsePartialNSA) {
9776 VAddr = getBuildDwordsVector(DAG, DL,
9777 Elts: ArrayRef(VAddrs).drop_front(N: NSAMaxSize - 1));
9778 } else if (!UseNSA) {
9779 VAddr = getBuildDwordsVector(DAG, DL, Elts: VAddrs);
9780 }
9781
9782 SDValue True = DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1);
9783 SDValue False = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1);
9784 SDValue Unorm;
9785 if (!BaseOpcode->Sampler) {
9786 Unorm = True;
9787 } else {
9788 uint64_t UnormConst =
9789 Op.getConstantOperandVal(i: ArgOffset + Intr->UnormIndex);
9790
9791 Unorm = UnormConst ? True : False;
9792 }
9793
9794 SDValue TFE;
9795 SDValue LWE;
9796 SDValue TexFail = Op.getOperand(i: ArgOffset + Intr->TexFailCtrlIndex);
9797 bool IsTexFail = false;
9798 if (!parseTexFail(TexFailCtrl: TexFail, DAG, TFE: &TFE, LWE: &LWE, IsTexFail))
9799 return Op;
9800
9801 if (IsTexFail) {
9802 if (!DMaskLanes) {
9803 // Expecting to get an error flag since TFC is on - and dmask is 0
9804 // Force dmask to be at least 1 otherwise the instruction will fail
9805 DMask = 0x1;
9806 DMaskLanes = 1;
9807 NumVDataDwords = 1;
9808 }
9809 NumVDataDwords += 1;
9810 AdjustRetType = true;
9811 }
9812
9813 // Has something earlier tagged that the return type needs adjusting
9814 // This happens if the instruction is a load or has set TexFailCtrl flags
9815 if (AdjustRetType) {
9816 // NumVDataDwords reflects the true number of dwords required in the return
9817 // type
9818 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9819 // This is a no-op load. This can be eliminated
9820 SDValue Undef = DAG.getPOISON(VT: Op.getValueType());
9821 if (isa<MemSDNode>(Val: Op))
9822 return DAG.getMergeValues(Ops: {Undef, Op.getOperand(i: 0)}, dl: DL);
9823 return Undef;
9824 }
9825
9826 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(Context&: *DAG.getContext(),
9827 VT: MVT::i32, NumElements: NumVDataDwords)
9828 : MVT::i32;
9829
9830 ResultTypes[0] = NewVT;
9831 if (ResultTypes.size() == 3) {
9832 // Original result was aggregate type used for TexFailCtrl results
9833 // The actual instruction returns as a vector type which has now been
9834 // created. Remove the aggregate result.
9835 ResultTypes.erase(CI: &ResultTypes[1]);
9836 }
9837 }
9838
9839 unsigned CPol = Op.getConstantOperandVal(i: ArgOffset + Intr->CachePolicyIndex);
9840 // Keep GLC only when the atomic's result is actually used.
9841 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
9842 CPol |= AMDGPU::CPol::GLC;
9843 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9844 AMDGPU::CPol::VOLATILE))
9845 return Op;
9846
9847 SmallVector<SDValue, 26> Ops;
9848 if (BaseOpcode->Store || BaseOpcode->Atomic)
9849 Ops.push_back(Elt: VData); // vdata
9850 if (UsePartialNSA) {
9851 append_range(C&: Ops, R: ArrayRef(VAddrs).take_front(N: NSAMaxSize - 1));
9852 Ops.push_back(Elt: VAddr);
9853 } else if (UseNSA)
9854 append_range(C&: Ops, R&: VAddrs);
9855 else
9856 Ops.push_back(Elt: VAddr);
9857 SDValue Rsrc = Op.getOperand(i: ArgOffset + Intr->RsrcIndex);
9858 EVT RsrcVT = Rsrc.getValueType();
9859 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9860 return Op;
9861 Ops.push_back(Elt: Rsrc);
9862 if (BaseOpcode->Sampler) {
9863 SDValue Samp = Op.getOperand(i: ArgOffset + Intr->SampIndex);
9864 if (Samp.getValueType() != MVT::v4i32)
9865 return Op;
9866 Ops.push_back(Elt: Samp);
9867 }
9868 Ops.push_back(Elt: DAG.getTargetConstant(Val: DMask, DL, VT: MVT::i32));
9869 if (IsGFX10Plus)
9870 Ops.push_back(Elt: DAG.getTargetConstant(Val: DimInfo->Encoding, DL, VT: MVT::i32));
9871 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9872 Ops.push_back(Elt: Unorm);
9873 Ops.push_back(Elt: DAG.getTargetConstant(Val: CPol, DL, VT: MVT::i32));
9874 Ops.push_back(Elt: IsA16 && // r128, a16 for gfx9
9875 ST->hasFeature(Feature: AMDGPU::FeatureR128A16)
9876 ? True
9877 : False);
9878 if (IsGFX10Plus)
9879 Ops.push_back(Elt: IsA16 ? True : False);
9880
9881 if (!Subtarget->hasGFX90AInsts())
9882 Ops.push_back(Elt: TFE); // tfe
9883 else if (TFE->getAsZExtVal()) {
9884 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9885 DAG.getMachineFunction().getFunction(),
9886 "TFE is not supported on this GPU", DL.getDebugLoc()));
9887 }
9888
9889 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9890 Ops.push_back(Elt: LWE); // lwe
9891 if (!IsGFX10Plus)
9892 Ops.push_back(Elt: DimInfo->DA ? True : False);
9893 if (BaseOpcode->HasD16)
9894 Ops.push_back(Elt: IsD16 ? True : False);
9895 if (isa<MemSDNode>(Val: Op))
9896 Ops.push_back(Elt: Op.getOperand(i: 0)); // chain
9897
9898 int NumVAddrDwords =
9899 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9900 int Opcode = -1;
9901
9902 if (IsGFX12Plus) {
9903 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx12,
9904 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9905 } else if (IsGFX11Plus) {
9906 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
9907 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx11NSA
9908 : AMDGPU::MIMGEncGfx11Default,
9909 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9910 } else if (IsGFX10Plus) {
9911 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
9912 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx10NSA
9913 : AMDGPU::MIMGEncGfx10Default,
9914 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9915 } else {
9916 if (Subtarget->hasGFX90AInsts()) {
9917 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx90a,
9918 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9919 if (Opcode == -1) {
9920 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9921 DAG.getMachineFunction().getFunction(),
9922 "requested image instruction is not supported on this GPU",
9923 DL.getDebugLoc()));
9924
9925 unsigned Idx = 0;
9926 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9927 for (EVT VT : OrigResultTypes) {
9928 if (VT == MVT::Other)
9929 RetValues[Idx++] = Op.getOperand(i: 0); // Chain
9930 else
9931 RetValues[Idx++] = DAG.getPOISON(VT);
9932 }
9933
9934 return DAG.getMergeValues(Ops: RetValues, dl: DL);
9935 }
9936 }
9937 if (Opcode == -1 &&
9938 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9939 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx8,
9940 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9941 if (Opcode == -1)
9942 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx6,
9943 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9944 }
9945 if (Opcode == -1)
9946 return Op;
9947
9948 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, dl: DL, ResultTys: ResultTypes, Ops);
9949 if (auto *MemOp = dyn_cast<MemSDNode>(Val&: Op)) {
9950 MachineMemOperand *MemRef = MemOp->getMemOperand();
9951 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
9952 }
9953
9954 if (BaseOpcode->NoReturn) {
9955 if (BaseOpcode->Atomic)
9956 return DAG.getMergeValues(
9957 Ops: {DAG.getPOISON(VT: OrigResultTypes[0]), SDValue(NewNode, 0)}, dl: DL);
9958
9959 return SDValue(NewNode, 0);
9960 }
9961
9962 if (BaseOpcode->AtomicX2) {
9963 SmallVector<SDValue, 1> Elt;
9964 DAG.ExtractVectorElements(Op: SDValue(NewNode, 0), Args&: Elt, Start: 0, Count: 1);
9965 return DAG.getMergeValues(Ops: {Elt[0], SDValue(NewNode, 1)}, dl: DL);
9966 }
9967
9968 return constructRetValue(DAG, Result: NewNode, ResultTypes: OrigResultTypes, IsTexFail,
9969 Unpacked: Subtarget->hasUnpackedD16VMem(), IsD16, DMaskPop: DMaskLanes,
9970 NumVDataDwords, IsAtomicPacked16Bit, DL);
9971}
9972
// Lower an s.buffer.load intrinsic. For a uniform (non-divergent) offset this
// emits a scalar SBUFFER_LOAD (with special cases for subword loads and for
// widening vec3 to vec4); for a divergent offset it falls back to MUBUF
// buffer loads, assuming an unswizzled buffer, splitting 8/16-element
// results into 4-dword pieces.
SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
                                       SDValue Offset, SDValue CachePolicy,
                                       SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();

  const DataLayout &DataLayout = DAG.getDataLayout();
  Align Alignment =
      DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext()));

  // The load is invariant and dereferenceable: s_buffer_load semantics do not
  // allow the memory to change under the instruction.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo: MachinePointerInfo(),
      F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      Size: VT.getStoreSize(), BaseAlignment: Alignment);

  if (!Offset->isDivergent()) {
    SDValue Ops[] = {Rsrc, Offset, CachePolicy};

    // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
    // s_buffer_load_u16 instruction is emitted for both signed and unsigned
    // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
    // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
    if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
      SDValue BufferLoad =
          DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD_USHORT, dl: DL,
                                  VTList: DAG.getVTList(VT: MVT::i32), Ops, MemVT: VT, MMO);
      return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
    }

    // Widen vec3 load to vec4.
    if (VT.isVector() && VT.getVectorNumElements() == 3 &&
        !Subtarget->hasScalarDwordx3Loads()) {
      EVT WidenedVT =
          EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: 4);
      auto WidenedOp = DAG.getMemIntrinsicNode(
          Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL, VTList: DAG.getVTList(VT: WidenedVT), Ops, MemVT: WidenedVT,
          MMO: MF.getMachineMemOperand(MMO, Offset: 0, Size: WidenedVT.getStoreSize()));
      auto Subvector = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: WidenedOp,
                                   N2: DAG.getVectorIdxConstant(Val: 0, DL));
      return Subvector;
    }

    return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL,
                                   VTList: DAG.getVTList(VT), Ops, MemVT: VT, MMO);
  }

  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.
  // Slots 3..5 (voffset/soffset/offset) are filled in by setBufferOffsets
  // below.
  SDValue Ops[] = {
      DAG.getEntryNode(),                   // Chain
      Rsrc,                                 // rsrc
      DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
      {},                                   // voffset
      {},                                   // soffset
      {},                                   // offset
      CachePolicy,                          // cachepolicy
      DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
  };
  if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
    setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], Alignment: Align(4));
    return handleByteShortBufferLoads(DAG, LoadVT: VT, DL, Ops, MMO);
  }

  SmallVector<SDValue, 4> Loads;
  unsigned NumLoads = 1;
  MVT LoadVT = VT.getSimpleVT();
  unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
  assert((LoadVT.getScalarType() == MVT::i32 ||
          LoadVT.getScalarType() == MVT::f32));

  // Split 8- and 16-element results into multiple 4-element MUBUF loads.
  if (NumElts == 8 || NumElts == 16) {
    NumLoads = NumElts / 4;
    LoadVT = MVT::getVectorVT(VT: LoadVT.getScalarType(), NumElements: 4);
  }

  SDVTList VTList = DAG.getVTList(VTs: {LoadVT, MVT::Other});

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3],
                   Alignment: NumLoads > 1 ? Align(16 * NumLoads) : Align(4));

  // Each split load advances the immediate offset by 16 bytes (4 dwords).
  uint64_t InstOffset = Ops[5]->getAsZExtVal();
  for (unsigned i = 0; i < NumLoads; ++i) {
    Ops[5] = DAG.getTargetConstant(Val: InstOffset + 16 * i, DL, VT: MVT::i32);
    Loads.push_back(Elt: getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
                                          MemVT: LoadVT, MMO, DAG));
  }

  if (NumElts == 8 || NumElts == 16)
    return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, Ops: Loads);

  return Loads[0];
}
10067
10068SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
10069 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
10070 if (!Subtarget->hasArchitectedSGPRs())
10071 return {};
10072 SDLoc SL(Op);
10073 MVT VT = MVT::i32;
10074 SDValue TTMP8 = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: AMDGPU::TTMP8, VT);
10075 return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT, N1: TTMP8,
10076 N2: DAG.getConstant(Val: 25, DL: SL, VT), N3: DAG.getConstant(Val: 5, DL: SL, VT));
10077}
10078
10079SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
10080 AMDGPU::Hwreg::Id HwReg,
10081 unsigned LowBit,
10082 unsigned Width) const {
10083 SDLoc SL(Op);
10084 using namespace AMDGPU::Hwreg;
10085 return {DAG.getMachineNode(
10086 Opcode: AMDGPU::S_GETREG_B32_const, dl: SL, VT: MVT::i32,
10087 Op1: DAG.getTargetConstant(Val: HwregEncoding::encode(Values: HwReg, Values: LowBit, Values: Width),
10088 DL: SL, VT: MVT::i32)),
10089 0};
10090}
10091
10092SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
10093 unsigned Dim,
10094 const ArgDescriptor &Arg) const {
10095 SDLoc SL(Op);
10096 MachineFunction &MF = DAG.getMachineFunction();
10097 unsigned MaxID = Subtarget->getMaxWorkitemID(Kernel: MF.getFunction(), Dimension: Dim);
10098 if (MaxID == 0)
10099 return DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
10100
10101 // It's undefined behavior if a function marked with the amdgpu-no-*
10102 // attributes uses the corresponding intrinsic.
10103 if (!Arg)
10104 return DAG.getPOISON(VT: Op->getValueType(ResNo: 0));
10105
10106 SDValue Val = loadInputValue(DAG, RC: &AMDGPU::VGPR_32RegClass, VT: MVT::i32,
10107 SL: SDLoc(DAG.getEntryNode()), Arg);
10108
10109 // Don't bother inserting AssertZext for packed IDs since we're emitting the
10110 // masking operations anyway.
10111 //
10112 // TODO: We could assert the top bit is 0 for the source copy.
10113 if (Arg.isMasked())
10114 return Val;
10115
10116 // Preserve the known bits after expansion to a copy.
10117 EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: llvm::bit_width(Value: MaxID));
10118 return DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: MVT::i32, N1: Val,
10119 N2: DAG.getValueType(SmallVT));
10120}
10121
// Custom-lower ISD::INTRINSIC_WO_CHAIN nodes. Each recognized AMDGPU (or
// legacy r600) intrinsic is translated into target ISD nodes, machine nodes,
// preloaded-argument reads, or a diagnostic; unhandled intrinsics either hit
// the image-dimension lowering in the default case or are returned unchanged.
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  auto *MFI = MF.getInfo<SIMachineFunctionInfo>();

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);

  // TODO: Should this propagate fast-math-flags?

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_implicit_buffer_ptr: {
    if (getSubtarget()->isAmdHsaOrMesa(F: MF.getFunction()))
      return emitNonHSAIntrinsicError(DAG, DL, VT);
    return getPreloadedValue(DAG, MFI: *MFI, VT,
                             PVID: AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  }
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_queue_ptr: {
    if (!Subtarget->isAmdHsaOrMesa(F: MF.getFunction())) {
      DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
          MF.getFunction(), "unsupported hsa intrinsic without hsa target",
          DL.getDebugLoc()));
      return DAG.getPOISON(VT);
    }

    auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
                     ? AMDGPUFunctionArgInfo::DISPATCH_PTR
                     : AMDGPUFunctionArgInfo::QUEUE_PTR;
    return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: RegID);
  }
  case Intrinsic::amdgcn_implicitarg_ptr: {
    if (MFI->isEntryFunction())
      return getImplicitArgPtr(DAG, SL: DL);
    return getPreloadedValue(DAG, MFI: *MFI, VT,
                             PVID: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr: {
    if (!AMDGPU::isKernel(F: MF.getFunction())) {
      // This only makes sense to call in a kernel, so just lower to null.
      return DAG.getConstant(Val: 0, DL, VT);
    }

    return getPreloadedValue(DAG, MFI: *MFI, VT,
                             PVID: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  }
  case Intrinsic::amdgcn_dispatch_id: {
    return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: AMDGPUFunctionArgInfo::DISPATCH_ID);
  }
  case Intrinsic::amdgcn_rcp:
    return DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT, Operand: Op.getOperand(i: 1));
  case Intrinsic::amdgcn_rsq:
    return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: 1));
  case Intrinsic::amdgcn_rsq_legacy:
    // Removed on VI and later; report the error there, otherwise leave it
    // for default handling.
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return emitRemovedIntrinsicError(DAG, DL, VT);
    return SDValue();
  case Intrinsic::amdgcn_rcp_legacy:
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return emitRemovedIntrinsicError(DAG, DL, VT);
    return DAG.getNode(Opcode: AMDGPUISD::RCP_LEGACY, DL, VT, Operand: Op.getOperand(i: 1));
  case Intrinsic::amdgcn_rsq_clamp: {
    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return DAG.getNode(Opcode: AMDGPUISD::RSQ_CLAMP, DL, VT, Operand: Op.getOperand(i: 1));

    // No dedicated instruction on VI+: emit rsq and clamp the result to the
    // largest finite values of the type.
    Type *Type = VT.getTypeForEVT(Context&: *DAG.getContext());
    APFloat Max = APFloat::getLargest(Sem: Type->getFltSemantics());
    APFloat Min = APFloat::getLargest(Sem: Type->getFltSemantics(), Negative: true);

    SDValue Rsq = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: 1));
    SDValue Tmp =
        DAG.getNode(Opcode: ISD::FMINNUM, DL, VT, N1: Rsq, N2: DAG.getConstantFP(Val: Max, DL, VT));
    return DAG.getNode(Opcode: ISD::FMAXNUM, DL, VT, N1: Tmp,
                       N2: DAG.getConstantFP(Val: Min, DL, VT));
  }
  case Intrinsic::r600_read_ngroups_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
                                    Offset: SI::KernelInputOffsets::NGROUPS_X, Alignment: Align(4),
                                    Signed: false);
  case Intrinsic::r600_read_ngroups_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
                                    Offset: SI::KernelInputOffsets::NGROUPS_Y, Alignment: Align(4),
                                    Signed: false);
  case Intrinsic::r600_read_ngroups_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
                                    Offset: SI::KernelInputOffsets::NGROUPS_Z, Alignment: Align(4),
                                    Signed: false);
  case Intrinsic::r600_read_local_size_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
                                  Offset: SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
                                  Offset: SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
                                  Offset: SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return lowerWorkGroupId(DAG, MFI: *MFI, VT,
                            WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
                            ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X,
                            ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return lowerWorkGroupId(DAG, MFI: *MFI, VT,
                            WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
                            ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y,
                            ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return lowerWorkGroupId(DAG, MFI: *MFI, VT,
                            WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
                            ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z,
                            ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
  // Cluster intrinsics are only meaningful on subtargets with clusters;
  // otherwise the result is poison.
  case Intrinsic::amdgcn_cluster_id_x:
    return Subtarget->hasClusters()
               ? getPreloadedValue(DAG, MFI: *MFI, VT,
                                   PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_X)
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_id_y:
    return Subtarget->hasClusters()
               ? getPreloadedValue(DAG, MFI: *MFI, VT,
                                   PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y)
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_id_z:
    return Subtarget->hasClusters()
               ? getPreloadedValue(DAG, MFI: *MFI, VT,
                                   PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z)
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_id_x:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, MFI: *MFI, VT,
                     PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X)
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_id_y:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, MFI: *MFI, VT,
                     PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y)
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_id_z:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, MFI: *MFI, VT,
                     PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z)
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_flat_id:
    return Subtarget->hasClusters()
               ? lowerConstHwRegRead(DAG, Op, HwReg: AMDGPU::Hwreg::ID_IB_STS2, LowBit: 21, Width: 4)
               : SDValue();
  case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, MFI: *MFI, VT,
                     PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X)
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, MFI: *MFI, VT,
                     PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y)
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, MFI: *MFI, VT,
                     PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z)
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, MFI: *MFI, VT,
                     PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID)
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_wave_id:
    return lowerWaveID(DAG, Op);
  case Intrinsic::amdgcn_lds_kernel_id: {
    if (MFI->isEntryFunction())
      return getLDSKernelId(DAG, SL: DL);
    return getPreloadedValue(DAG, MFI: *MFI, VT,
                             PVID: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  }
  case Intrinsic::amdgcn_workitem_id_x:
    return lowerWorkitemID(DAG, Op, Dim: 0, Arg: MFI->getArgInfo().WorkItemIDX);
  case Intrinsic::amdgcn_workitem_id_y:
    return lowerWorkitemID(DAG, Op, Dim: 1, Arg: MFI->getArgInfo().WorkItemIDY);
  case Intrinsic::amdgcn_workitem_id_z:
    return lowerWorkitemID(DAG, Op, Dim: 2, Arg: MFI->getArgInfo().WorkItemIDZ);
  case Intrinsic::amdgcn_wavefrontsize:
    return DAG.getConstant(Val: MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
                           DL: SDLoc(Op), VT: MVT::i32);
  case Intrinsic::amdgcn_s_buffer_load: {
    unsigned CPol = Op.getConstantOperandVal(i: 3);
    // s_buffer_load, because of how it's optimized, can't be volatile
    // so reject ones with the volatile bit set.
    if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
                     ? AMDGPU::CPol::ALL
                     : AMDGPU::CPol::ALL_pregfx12))
      return Op;
    return lowerSBuffer(VT, DL, Rsrc: Op.getOperand(i: 1), Offset: Op.getOperand(i: 2),
                        CachePolicy: Op.getOperand(i: 3), DAG);
  }
  case Intrinsic::amdgcn_fdiv_fast:
    return lowerFDIV_FAST(Op, DAG);
  case Intrinsic::amdgcn_sin:
    return DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL, VT, Operand: Op.getOperand(i: 1));

  case Intrinsic::amdgcn_cos:
    return DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL, VT, Operand: Op.getOperand(i: 1));

  case Intrinsic::amdgcn_mul_u24:
    return DAG.getNode(Opcode: AMDGPUISD::MUL_U24, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2));
  case Intrinsic::amdgcn_mul_i24:
    return DAG.getNode(Opcode: AMDGPUISD::MUL_I24, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2));

  case Intrinsic::amdgcn_log_clamp: {
    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return SDValue();

    return emitRemovedIntrinsicError(DAG, DL, VT);
  }
  case Intrinsic::amdgcn_fract:
    return DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: Op.getOperand(i: 1));

  case Intrinsic::amdgcn_class:
    return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2));
  case Intrinsic::amdgcn_div_fmas:
    return DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3), N4: Op.getOperand(i: 4));

  case Intrinsic::amdgcn_div_fixup:
    return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));

  case Intrinsic::amdgcn_div_scale: {
    const ConstantSDNode *Param = cast<ConstantSDNode>(Val: Op.getOperand(i: 3));

    // Translate to the operands expected by the machine instruction. The
    // first parameter must be the same as the first instruction.
    SDValue Numerator = Op.getOperand(i: 1);
    SDValue Denominator = Op.getOperand(i: 2);

    // Note this order is opposite of the machine instruction's operations,
    // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
    // intrinsic has the numerator as the first operand to match a normal
    // division operation.

    SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;

    return DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL, VTList: Op->getVTList(), N1: Src0,
                       N2: Denominator, N3: Numerator);
  }
  case Intrinsic::amdgcn_icmp: {
    // There is a Pat that handles this variant, so return it as-is.
    if (Op.getOperand(i: 1).getValueType() == MVT::i1 &&
        Op.getConstantOperandVal(i: 2) == 0 &&
        Op.getConstantOperandVal(i: 3) == ICmpInst::Predicate::ICMP_NE)
      return Op;
    return lowerICMPIntrinsic(TLI: *this, N: Op.getNode(), DAG);
  }
  case Intrinsic::amdgcn_fcmp: {
    return lowerFCMPIntrinsic(TLI: *this, N: Op.getNode(), DAG);
  }
  case Intrinsic::amdgcn_ballot:
    return lowerBALLOTIntrinsic(TLI: *this, N: Op.getNode(), DAG);
  case Intrinsic::amdgcn_fmed3:
    return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
  case Intrinsic::amdgcn_fdot2:
    return DAG.getNode(Opcode: AMDGPUISD::FDOT2, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3), N4: Op.getOperand(i: 4));
  case Intrinsic::amdgcn_fmul_legacy:
    return DAG.getNode(Opcode: AMDGPUISD::FMUL_LEGACY, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2));
  case Intrinsic::amdgcn_sffbh:
    return DAG.getNode(Opcode: AMDGPUISD::FFBH_I32, DL, VT, Operand: Op.getOperand(i: 1));
  case Intrinsic::amdgcn_sbfe:
    return DAG.getNode(Opcode: AMDGPUISD::BFE_I32, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
  case Intrinsic::amdgcn_ubfe:
    return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
  case Intrinsic::amdgcn_cvt_pkrtz:
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    // FIXME: Stop adding cast if v2f16/v2i16 are legal.
    EVT VT = Op.getValueType();
    unsigned Opcode;

    if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
      Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
      Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
      Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
      Opcode = AMDGPUISD::CVT_PK_I16_I32;
    else
      Opcode = AMDGPUISD::CVT_PK_U16_U32;

    if (isTypeLegal(VT))
      return DAG.getNode(Opcode, DL, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));

    // Result type is not legal: build the node as i32 and bitcast back.
    SDValue Node =
        DAG.getNode(Opcode, DL, VT: MVT::i32, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
    return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Node);
  }
  case Intrinsic::amdgcn_fmad_ftz:
    return DAG.getNode(Opcode: AMDGPUISD::FMAD_FTZ, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));

  case Intrinsic::amdgcn_if_break:
    return SDValue(DAG.getMachineNode(Opcode: AMDGPU::SI_IF_BREAK, dl: DL, VT,
                                      Op1: Op->getOperand(Num: 1), Op2: Op->getOperand(Num: 2)),
                   0);

  case Intrinsic::amdgcn_groupstaticsize: {
    Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
      return Op;

    // Otherwise emit an absolute 32-bit reference to the intrinsic's own
    // declaration, to be resolved at link time.
    const Module *M = MF.getFunction().getParent();
    const GlobalValue *GV =
        Intrinsic::getDeclarationIfExists(M, id: Intrinsic::amdgcn_groupstaticsize);
    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: 0,
                                            TargetFlags: SIInstrInfo::MO_ABS32_LO);
    return {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: GA), 0};
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    SDLoc SL(Op);
    // Compare the high 32 bits of the flat pointer against the aperture of
    // the queried address space.
    SDValue SrcVec =
        DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1));
    SDValue SrcHi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: SrcVec,
                                N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));

    unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
                      ? AMDGPUAS::LOCAL_ADDRESS
                      : AMDGPUAS::PRIVATE_ADDRESS;
    if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
        Subtarget->hasGloballyAddressableScratch()) {
      SDValue FlatScratchBaseHi(
          DAG.getMachineNode(
              Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
              Op1: DAG.getRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, VT: MVT::i32)),
          0);
      // Test bits 63..58 against the aperture address.
      return DAG.getSetCC(
          DL: SL, VT: MVT::i1,
          LHS: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: SrcHi, N2: FlatScratchBaseHi),
          RHS: DAG.getConstant(Val: 1u << 26, DL: SL, VT: MVT::i32), Cond: ISD::SETULT);
    }

    SDValue Aperture = getSegmentAperture(AS, DL: SL, DAG);
    return DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: SrcHi, RHS: Aperture, Cond: ISD::SETEQ);
  }
  case Intrinsic::amdgcn_perm:
    return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
  case Intrinsic::amdgcn_reloc_constant: {
    // Materialize the named relocation symbol as an absolute 32-bit address.
    Module *M = MF.getFunction().getParent();
    const MDNode *Metadata = cast<MDNodeSDNode>(Val: Op.getOperand(i: 1))->getMD();
    auto SymbolName = cast<MDString>(Val: Metadata->getOperand(I: 0))->getString();
    auto *RelocSymbol = cast<GlobalVariable>(
        Val: M->getOrInsertGlobal(Name: SymbolName, Ty: Type::getInt32Ty(C&: M->getContext())));
    SDValue GA = DAG.getTargetGlobalAddress(GV: RelocSymbol, DL, VT: MVT::i32, offset: 0,
                                            TargetFlags: SIInstrInfo::MO_ABS32_LO);
    return {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: GA), 0};
  }
  // The swmmac cases below normalize the index-key operand: if it already has
  // the expected type, leave the node to pattern matching; otherwise
  // extend/truncate (or bitcast) the key and re-emit the intrinsic.
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    if (Op.getOperand(i: 4).getValueType() == MVT::i32)
      return SDValue();

    SDLoc SL(Op);
    auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 4), DL: SL, VT: MVT::i32);
    return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
                       N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1), N3: Op.getOperand(i: 2),
                       N4: Op.getOperand(i: 3), N5: IndexKeyi32);
  }
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
    if (Op.getOperand(i: 4).getValueType() == MVT::i64)
      return SDValue();

    SDLoc SL(Op);
    auto IndexKeyi64 =
        Op.getOperand(i: 4).getValueType() == MVT::v2i32
            ? DAG.getBitcast(VT: MVT::i64, V: Op.getOperand(i: 4))
            : DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 4), DL: SL, VT: MVT::i64);
    return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
                       Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1), Op.getOperand(i: 2),
                            Op.getOperand(i: 3), IndexKeyi64, Op.getOperand(i: 5),
                            Op.getOperand(i: 6)});
  }
  case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
    EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
                         ? MVT::i64
                         : MVT::i32;
    if (Op.getOperand(i: 6).getValueType() == IndexKeyTy)
      return SDValue();

    SDLoc SL(Op);
    auto IndexKey =
        Op.getOperand(i: 6).getValueType().isVector()
            ? DAG.getBitcast(VT: IndexKeyTy, V: Op.getOperand(i: 6))
            : DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 6), DL: SL, VT: IndexKeyTy);
    SmallVector<SDValue> Args{
        Op.getOperand(i: 0), Op.getOperand(i: 1), Op.getOperand(i: 2),
        Op.getOperand(i: 3), Op.getOperand(i: 4), Op.getOperand(i: 5),
        IndexKey,          Op.getOperand(i: 7), Op.getOperand(i: 8)};
    if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
      Args.push_back(Elt: Op.getOperand(i: 9));
    return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(), Ops: Args);
  }
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    if (Op.getOperand(i: 6).getValueType() == MVT::i32)
      return SDValue();

    SDLoc SL(Op);
    auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 6), DL: SL, VT: MVT::i32);
    return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
                       Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1), Op.getOperand(i: 2),
                            Op.getOperand(i: 3), Op.getOperand(i: 4), Op.getOperand(i: 5),
                            IndexKeyi32, Op.getOperand(i: 7)});
  }
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return lowerADDRSPACECAST(Op, DAG);
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_update_dpp:
    return lowerLaneOp(TLI: *this, N: Op.getNode(), DAG);
  case Intrinsic::amdgcn_dead: {
    // Produce a poison value for every result this node defines.
    SmallVector<SDValue, 8> Poisons;
    for (const EVT ValTy : Op.getNode()->values())
      Poisons.push_back(Elt: DAG.getPOISON(VT: ValTy));
    return DAG.getMergeValues(Ops: Poisons, dl: SDLoc(Op));
  }
  case Intrinsic::amdgcn_wave_shuffle:
    return lowerWaveShuffle(TLI: *this, N: Op.getNode(), DAG);
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID))
      return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: false);

    return Op;
  }
}
10618
10619// On targets not supporting constant in soffset field, turn zero to
10620// SGPR_NULL to avoid generating an extra s_mov with zero.
10621static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
10622 const GCNSubtarget *Subtarget) {
10623 if (Subtarget->hasRestrictedSOffset() && isNullConstant(V: SOffset))
10624 return DAG.getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32);
10625 return SOffset;
10626}
10627
// Lower a raw-buffer atomic intrinsic into the target-specific
// BUFFER_ATOMIC_* memory intrinsic node given by \p NewOpcode.
//
// Incoming intrinsic operand layout: 0 = chain, 1 = intrinsic id,
// 2 = vdata, 3 = rsrc (vector or buffer fat pointer), 4 = voffset,
// 5 = soffset, 6 = cachepolicy. The raw form carries no vindex, so a
// constant-zero vindex and idxen = 0 are synthesized to match the common
// buffer-atomic operand order shared with the struct form.
SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
                                                     SelectionDAG &DAG,
                                                     unsigned NewOpcode) const {
  SDLoc DL(Op);

  SDValue VData = Op.getOperand(i: 2);
  // A buffer fat pointer rsrc is rewritten into the v4i32 descriptor form.
  SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
  // Split the voffset into a register part and an immediate offset field.
  auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
  auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
  SDValue Ops[] = {
      Op.getOperand(i: 0),                      // Chain
      VData,                                  // vdata
      Rsrc,                                   // rsrc
      DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
      VOffset,                                // voffset
      SOffset,                                // soffset
      Offset,                                 // offset
      Op.getOperand(i: 6),                      // cachepolicy
      DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
  };

  auto *M = cast<MemSDNode>(Val&: Op);

  // The memory VT matches the value being atomically operated on.
  EVT MemVT = VData.getValueType();
  return DAG.getMemIntrinsicNode(Opcode: NewOpcode, dl: DL, VTList: Op->getVTList(), Ops, MemVT,
                                 MMO: M->getMemOperand());
}
10655
// Lower a struct-buffer atomic intrinsic into the target-specific
// BUFFER_ATOMIC_* memory intrinsic node given by \p NewOpcode.
//
// Incoming intrinsic operand layout: 0 = chain, 1 = intrinsic id,
// 2 = vdata, 3 = rsrc (vector or buffer fat pointer), 4 = vindex,
// 5 = voffset, 6 = soffset, 7 = cachepolicy. Unlike the raw form, the
// struct form supplies its own vindex, so idxen = 1 is emitted.
SDValue
SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
                                                unsigned NewOpcode) const {
  SDLoc DL(Op);

  SDValue VData = Op.getOperand(i: 2);
  // A buffer fat pointer rsrc is rewritten into the v4i32 descriptor form.
  SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
  // Split the voffset into a register part and an immediate offset field.
  auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
  auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
  SDValue Ops[] = {
      Op.getOperand(i: 0),                      // Chain
      VData,                                  // vdata
      Rsrc,                                   // rsrc
      Op.getOperand(i: 4),                      // vindex
      VOffset,                                // voffset
      SOffset,                                // soffset
      Offset,                                 // offset
      Op.getOperand(i: 7),                      // cachepolicy
      DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
  };

  auto *M = cast<MemSDNode>(Val&: Op);

  // The memory VT matches the value being atomically operated on.
  EVT MemVT = VData.getValueType();
  return DAG.getMemIntrinsicNode(Opcode: NewOpcode, dl: DL, VTList: Op->getVTList(), Ops, MemVT,
                                 MMO: M->getMemOperand());
}
10683
// Custom-lower chained AMDGPU intrinsics (loads, atomics, BVH intersect,
// barrier state, ...) into target-specific DAG nodes. Operand 1 of the
// INTRINSIC_W_CHAIN node is the intrinsic ID; unknown IDs fall through to
// the image-dimension intrinsic table and otherwise stay unlowered.
SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                                 SelectionDAG &DAG) const {
  unsigned IntrID = Op.getConstantOperandVal(i: 1);
  SDLoc DL(Op);

  switch (IntrID) {
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    MemSDNode *M = cast<MemSDNode>(Val&: Op);
    SDValue Chain = M->getOperand(Num: 0);
    SDValue M0 = M->getOperand(Num: 2);
    SDValue Value = M->getOperand(Num: 3);
    unsigned IndexOperand = M->getConstantOperandVal(Num: 7);
    unsigned WaveRelease = M->getConstantOperandVal(Num: 8);
    unsigned WaveDone = M->getConstantOperandVal(Num: 9);

    // Bits [5:0] of the index operand select the ordered-count index; any
    // other set bits (besides the GFX10+ dword count below) are invalid.
    unsigned OrderedCountIndex = IndexOperand & 0x3f;
    IndexOperand &= ~0x3f;
    unsigned CountDw = 0;

    // GFX10+ additionally encodes a dword count (1-4) in bits [27:24].
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
      CountDw = (IndexOperand >> 24) & 0xf;
      IndexOperand &= ~(0xf << 24);

      if (CountDw < 1 || CountDw > 4) {
        const Function &Fn = DAG.getMachineFunction().getFunction();
        DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
            Fn, "ds_ordered_count: dword count must be between 1 and 4",
            DL.getDebugLoc()));
        CountDw = 1;
      }
    }

    // Any leftover bits mean the index operand was malformed.
    if (IndexOperand) {
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
          Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
    }

    if (WaveDone && !WaveRelease) {
      // TODO: Move this to IR verifier
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
          Fn, "ds_ordered_count: wave_done requires wave_release",
          DL.getDebugLoc()));
    }

    // Pack the instruction fields into the 16-bit DS offset encoding:
    // offset0 holds the dword-scaled count index, offset1 holds the
    // release/done/instruction (and, per generation, count/shader-type)
    // bits.
    unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
    unsigned ShaderType =
        SIInstrInfo::getDSShaderTypeValue(MF: DAG.getMachineFunction());
    unsigned Offset0 = OrderedCountIndex << 2;
    unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);

    if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
      Offset1 |= (CountDw - 1) << 6;

    if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
      Offset1 |= ShaderType << 2;

    unsigned Offset = Offset0 | (Offset1 << 8);

    SDValue Ops[] = {
        Chain, Value, DAG.getTargetConstant(Val: Offset, DL, VT: MVT::i16),
        copyToM0(DAG, Chain, DL, V: M0).getValue(R: 1), // Glue
    };
    return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::DS_ORDERED_COUNT, dl: DL,
                                   VTList: M->getVTList(), Ops, MemVT: M->getMemoryVT(),
                                   MMO: M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
    const bool IsFormat =
        IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
        IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;

    // Raw form: no vindex, so synthesize a zero vindex and idxen = 0.
    SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 3), DAG);
    auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 4), DAG, Subtarget);
    SDValue Ops[] = {
        Op.getOperand(i: 0),                      // Chain
        Rsrc,                                   // rsrc
        DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
        VOffset,                                // voffset
        SOffset,                                // soffset
        Offset,                                 // offset
        Op.getOperand(i: 5),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
    };

    auto *M = cast<MemSDNode>(Val&: Op);
    return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
  }
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
    const bool IsFormat =
        IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
        IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;

    // Struct form: vindex is supplied by the intrinsic, so idxen = 1.
    SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
    auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
    SDValue Ops[] = {
        Op.getOperand(i: 0),                      // Chain
        Rsrc,                                   // rsrc
        Op.getOperand(i: 3),                      // vindex
        VOffset,                                // voffset
        SOffset,                                // soffset
        Offset,                                 // offset
        Op.getOperand(i: 6),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
    };

    return lowerIntrinsicLoad(M: cast<MemSDNode>(Val&: Op), IsFormat, DAG, Ops);
  }
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
    MemSDNode *M = cast<MemSDNode>(Val&: Op);
    EVT LoadVT = Op.getValueType();
    SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 3), DAG);
    auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 4), DAG, Subtarget);

    SDValue Ops[] = {
        Op.getOperand(i: 0),                      // Chain
        Rsrc,                                   // rsrc
        DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
        VOffset,                                // voffset
        SOffset,                                // soffset
        Offset,                                 // offset
        Op.getOperand(i: 5),                      // format
        Op.getOperand(i: 6),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
    };

    // f16 results use the D16 variant and need the loaded value adjusted.
    if (LoadVT.getScalarType() == MVT::f16)
      return adjustLoadValueType(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
                                 Ops);
    return getMemIntrinsicNode(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
                               VTList: Op->getVTList(), Ops, MemVT: LoadVT, MMO: M->getMemOperand(),
                               DAG);
  }
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
    MemSDNode *M = cast<MemSDNode>(Val&: Op);
    EVT LoadVT = Op.getValueType();
    SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
    auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);

    SDValue Ops[] = {
        Op.getOperand(i: 0),                      // Chain
        Rsrc,                                   // rsrc
        Op.getOperand(i: 3),                      // vindex
        VOffset,                                // voffset
        SOffset,                                // soffset
        Offset,                                 // offset
        Op.getOperand(i: 6),                      // format
        Op.getOperand(i: 7),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
    };

    // f16 results use the D16 variant and need the loaded value adjusted.
    if (LoadVT.getScalarType() == MVT::f16)
      return adjustLoadValueType(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
                                 Ops);
    return getMemIntrinsicNode(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
                               VTList: Op->getVTList(), Ops, MemVT: LoadVT, MMO: M->getMemOperand(),
                               DAG);
  }
  // Buffer atomics: dispatch each raw/struct pair to the shared helpers
  // with the matching BUFFER_ATOMIC_* target opcode.
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD);
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD);
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP);
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD);
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB);
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND);
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR);
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR);
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC);
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC);
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP);
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD);
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB);
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND);
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR);
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR);
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC);
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC);
  case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_CSUB);
  case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         NewOpcode: AMDGPUISD::BUFFER_ATOMIC_CSUB);
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
    return lowerRawBufferAtomicIntrin(Op, DAG,
                                      NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
    // cmpswap has an extra cmp operand, so it cannot reuse the common
    // atomic helpers; build the operand list explicitly (raw: idxen = 0).
    SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 4), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
    auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
    SDValue Ops[] = {
        Op.getOperand(i: 0),                      // Chain
        Op.getOperand(i: 2),                      // src
        Op.getOperand(i: 3),                      // cmp
        Rsrc,                                   // rsrc
        DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
        VOffset,                                // voffset
        SOffset,                                // soffset
        Offset,                                 // offset
        Op.getOperand(i: 7),                      // cachepolicy
        DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Val&: Op);

    return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, dl: DL,
                                   VTList: Op->getVTList(), Ops, MemVT: VT,
                                   MMO: M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
    // Struct variant of cmpswap: carries a real vindex, so idxen = 1.
    SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op->getOperand(Num: 4), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 6), DAG);
    auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 7), DAG, Subtarget);
    SDValue Ops[] = {
        Op.getOperand(i: 0),                      // Chain
        Op.getOperand(i: 2),                      // src
        Op.getOperand(i: 3),                      // cmp
        Rsrc,                                   // rsrc
        Op.getOperand(i: 5),                      // vindex
        VOffset,                                // voffset
        SOffset,                                // soffset
        Offset,                                 // offset
        Op.getOperand(i: 8),                      // cachepolicy
        DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Val&: Op);

    return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, dl: DL,
                                   VTList: Op->getVTList(), Ops, MemVT: VT,
                                   MMO: M->getMemOperand());
  }
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
    MemSDNode *M = cast<MemSDNode>(Val&: Op);
    SDValue NodePtr = M->getOperand(Num: 2);
    SDValue RayExtent = M->getOperand(Num: 3);
    SDValue InstanceMask = M->getOperand(Num: 4);
    SDValue RayOrigin = M->getOperand(Num: 5);
    SDValue RayDir = M->getOperand(Num: 6);
    SDValue Offsets = M->getOperand(Num: 7);
    SDValue TDescr = M->getOperand(Num: 8);

    assert(NodePtr.getValueType() == MVT::i64);
    assert(RayDir.getValueType() == MVT::v3f32);

    // These instructions only exist on targets with BVH dual/BVH8 support.
    if (!Subtarget->hasBVHDualAndBVH8Insts()) {
      emitRemovedIntrinsicError(DAG, DL, VT: Op.getValueType());
      return SDValue();
    }

    bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
    const unsigned NumVDataDwords = 10;
    const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
    int Opcode = AMDGPU::getMIMGOpcode(
        BaseOpcode: IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
               : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
        MIMGEncoding: AMDGPU::MIMGEncGfx12, VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
    assert(Opcode != -1);

    SmallVector<SDValue, 7> Ops;
    Ops.push_back(Elt: NodePtr);
    // RayExtent and the extended InstanceMask share one v2i32 operand.
    Ops.push_back(Elt: DAG.getBuildVector(
        VT: MVT::v2i32, DL,
        Ops: {DAG.getBitcast(VT: MVT::i32, V: RayExtent),
             DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: InstanceMask)}));
    Ops.push_back(Elt: RayOrigin);
    Ops.push_back(Elt: RayDir);
    Ops.push_back(Elt: Offsets);
    Ops.push_back(Elt: TDescr);
    Ops.push_back(Elt: M->getChain());

    // Select the machine node directly and transfer the memory operand.
    auto *NewNode = DAG.getMachineNode(Opcode, dl: DL, VTs: M->getVTList(), Ops);
    MachineMemOperand *MemRef = M->getMemOperand();
    DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
    return SDValue(NewNode, 0);
  }
  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
    MemSDNode *M = cast<MemSDNode>(Val&: Op);
    SDValue NodePtr = M->getOperand(Num: 2);
    SDValue RayExtent = M->getOperand(Num: 3);
    SDValue RayOrigin = M->getOperand(Num: 4);
    SDValue RayDir = M->getOperand(Num: 5);
    SDValue RayInvDir = M->getOperand(Num: 6);
    SDValue TDescr = M->getOperand(Num: 7);

    assert(NodePtr.getValueType() == MVT::i32 ||
           NodePtr.getValueType() == MVT::i64);
    assert(RayDir.getValueType() == MVT::v3f16 ||
           RayDir.getValueType() == MVT::v3f32);

    if (!Subtarget->hasGFX10_AEncoding()) {
      emitRemovedIntrinsicError(DAG, DL, VT: Op.getValueType());
      return SDValue();
    }

    // Pick the MIMG opcode variant based on generation, NSA availability,
    // 32- vs 64-bit node pointer, and f16 vs f32 ray direction.
    const bool IsGFX11 = AMDGPU::isGFX11(STI: *Subtarget);
    const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: *Subtarget);
    const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
    const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
    const bool Is64 = NodePtr.getValueType() == MVT::i64;
    const unsigned NumVDataDwords = 4;
    const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
    const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
    const bool UseNSA = (Subtarget->hasNSAEncoding() &&
                         NumVAddrs <= Subtarget->getNSAMaxSize()) ||
                        IsGFX12Plus;
    const unsigned BaseOpcodes[2][2] = {
        {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
        {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
    int Opcode;
    if (UseNSA) {
      Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
                                     MIMGEncoding: IsGFX12Plus ? AMDGPU::MIMGEncGfx12
                                     : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                                                 : AMDGPU::MIMGEncGfx10NSA,
                                     VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
    } else {
      assert(!IsGFX12Plus);
      Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
                                     MIMGEncoding: IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                                             : AMDGPU::MIMGEncGfx10Default,
                                     VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
    }
    assert(Opcode != -1);

    SmallVector<SDValue, 16> Ops;

    // Append the three lanes of \p Op as dword operands. f32 lanes map
    // 1:1 to dwords; f16 lanes are packed in pairs, either aligned to a
    // fresh dword or combined with the previously pushed half-filled one.
    auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
      SmallVector<SDValue, 3> Lanes;
      DAG.ExtractVectorElements(Op, Args&: Lanes, Start: 0, Count: 3);
      if (Lanes[0].getValueSizeInBits() == 32) {
        for (unsigned I = 0; I < 3; ++I)
          Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: Lanes[I]));
      } else {
        if (IsAligned) {
          Ops.push_back(Elt: DAG.getBitcast(
              VT: MVT::i32,
              V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Lanes[0], Lanes[1]})));
          Ops.push_back(Elt: Lanes[2]);
        } else {
          SDValue Elt0 = Ops.pop_back_val();
          Ops.push_back(Elt: DAG.getBitcast(
              VT: MVT::i32, V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Elt0, Lanes[0]})));
          Ops.push_back(Elt: DAG.getBitcast(
              VT: MVT::i32,
              V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Lanes[1], Lanes[2]})));
        }
      }
    };

    if (UseNSA && IsGFX11Plus) {
      Ops.push_back(Elt: NodePtr);
      Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: RayExtent));
      Ops.push_back(Elt: RayOrigin);
      if (IsA16) {
        // Interleave dir/inv-dir half lanes into three packed dwords.
        SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
        DAG.ExtractVectorElements(Op: RayDir, Args&: DirLanes, Start: 0, Count: 3);
        DAG.ExtractVectorElements(Op: RayInvDir, Args&: InvDirLanes, Start: 0, Count: 3);
        for (unsigned I = 0; I < 3; ++I) {
          MergedLanes.push_back(Elt: DAG.getBitcast(
              VT: MVT::i32, V: DAG.getBuildVector(VT: MVT::v2f16, DL,
                                             Ops: {DirLanes[I], InvDirLanes[I]})));
        }
        Ops.push_back(Elt: DAG.getBuildVector(VT: MVT::v3i32, DL, Ops: MergedLanes));
      } else {
        Ops.push_back(Elt: RayDir);
        Ops.push_back(Elt: RayInvDir);
      }
    } else {
      if (Is64)
        DAG.ExtractVectorElements(Op: DAG.getBitcast(VT: MVT::v2i32, V: NodePtr), Args&: Ops, Start: 0,
                                  Count: 2);
      else
        Ops.push_back(Elt: NodePtr);

      Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: RayExtent));
      packLanes(RayOrigin, true);
      packLanes(RayDir, true);
      packLanes(RayInvDir, false);
    }

    if (!UseNSA) {
      // Build a single vector containing all the operands so far prepared.
      if (NumVAddrDwords > 12) {
        SDValue Undef = DAG.getPOISON(VT: MVT::i32);
        Ops.append(NumInputs: 16 - Ops.size(), Elt: Undef);
      }
      assert(Ops.size() >= 8 && Ops.size() <= 12);
      SDValue MergedOps =
          DAG.getBuildVector(VT: MVT::getVectorVT(VT: MVT::i32, NumElements: Ops.size()), DL, Ops);
      Ops.clear();
      Ops.push_back(Elt: MergedOps);
    }

    Ops.push_back(Elt: TDescr);
    Ops.push_back(Elt: DAG.getTargetConstant(Val: IsA16, DL, VT: MVT::i1));
    Ops.push_back(Elt: M->getChain());

    auto *NewNode = DAG.getMachineNode(Opcode, dl: DL, VTs: M->getVTList(), Ops);
    MachineMemOperand *MemRef = M->getMemOperand();
    DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
    return SDValue(NewNode, 0);
  }
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
    // These map onto the generic ISD atomic FP min/max nodes.
    MemSDNode *M = cast<MemSDNode>(Val&: Op);
    SDValue Ops[] = {
        M->getOperand(Num: 0), // Chain
        M->getOperand(Num: 2), // Ptr
        M->getOperand(Num: 3)  // Value
    };
    unsigned Opcode = 0;
    switch (IntrID) {
    case Intrinsic::amdgcn_global_atomic_fmin_num:
    case Intrinsic::amdgcn_flat_atomic_fmin_num: {
      Opcode = ISD::ATOMIC_LOAD_FMIN;
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fmax_num:
    case Intrinsic::amdgcn_flat_atomic_fmax_num: {
      Opcode = ISD::ATOMIC_LOAD_FMAX;
      break;
    }
    default:
      llvm_unreachable("unhandled atomic opcode");
    }
    return DAG.getAtomic(Opcode, dl: SDLoc(Op), MemVT: M->getMemoryVT(), VTList: M->getVTList(),
                         Ops, MMO: M->getMemOperand());
  }
  case Intrinsic::amdgcn_s_alloc_vgpr: {
    // The VGPR count must be uniform; force it through readfirstlane when
    // the DAG marks it divergent.
    SDValue NumVGPRs = Op.getOperand(i: 2);
    if (!NumVGPRs->isDivergent())
      return Op;

    SDValue ReadFirstLaneID =
        DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
    NumVGPRs = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::i32,
                           N1: ReadFirstLaneID, N2: NumVGPRs);

    return DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL, VTList: Op->getVTList(),
                       N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1), N3: NumVGPRs);
  }
  case Intrinsic::amdgcn_s_get_barrier_state:
  case Intrinsic::amdgcn_s_get_named_barrier_state: {
    SDValue Chain = Op->getOperand(Num: 0);
    SmallVector<SDValue, 2> Ops;
    unsigned Opc;

    if (isa<ConstantSDNode>(Val: Op->getOperand(Num: 2))) {
      // Constant barrier id: use the immediate form. Named barriers keep
      // their id in bits [9:4] of the operand.
      uint64_t BarID = cast<ConstantSDNode>(Val: Op->getOperand(Num: 2))->getZExtValue();
      if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
        BarID = (BarID >> 4) & 0x3F;
      Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
      SDValue K = DAG.getTargetConstant(Val: BarID, DL, VT: MVT::i32);
      Ops.push_back(Elt: K);
      Ops.push_back(Elt: Chain);
    } else {
      // Non-constant barrier id: pass it via M0, extracting bits [9:4]
      // for the named-barrier variant.
      Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
      if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
        SDValue M0Val;
        M0Val = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: Op->getOperand(Num: 2),
                            N2: DAG.getShiftAmountConstant(Val: 4, VT: MVT::i32, DL));
        M0Val = SDValue(
            DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: M0Val,
                               Op2: DAG.getTargetConstant(Val: 0x3F, DL, VT: MVT::i32)),
            0);
        Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: 0));
      } else
        Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: Op->getOperand(Num: 2)).getValue(R: 0));
    }

    auto *NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
    return SDValue(NewMI, 0);
  }
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
    // Lowered as a plain (non-extending) atomic load of the memory VT.
    MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Val&: Op);
    SDValue Chain = Op->getOperand(Num: 0);
    SDValue Ptr = Op->getOperand(Num: 2);
    EVT VT = Op->getValueType(ResNo: 0);
    return DAG.getAtomicLoad(ExtType: ISD::NON_EXTLOAD, dl: DL, MemVT: MII->getMemoryVT(), VT,
                             Chain, Ptr, MMO: MII->getMemOperand());
  }
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128: {
    MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Val&: Op);
    SDValue Chain = Op->getOperand(Num: 0);
    SDValue Ptr = Op->getOperand(Num: 2);
    return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::FLAT_LOAD_MONITOR, dl: DL,
                                   VTList: Op->getVTList(), Ops: {Chain, Ptr},
                                   MemVT: MII->getMemoryVT(), MMO: MII->getMemOperand());
  }
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128: {
    MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Val&: Op);
    SDValue Chain = Op->getOperand(Num: 0);
    SDValue Ptr = Op->getOperand(Num: 2);
    return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::GLOBAL_LOAD_MONITOR, dl: DL,
                                   VTList: Op->getVTList(), Ops: {Chain, Ptr},
                                   MemVT: MII->getMemoryVT(), MMO: MII->getMemOperand());
  }
  default:

    // Image-dimension intrinsics are table-driven; everything else is
    // left for generic handling.
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID))
      return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true);

    return SDValue();
  }
}
11306
// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
// dwordx4 if on SI and handle TFE loads.
//
// For TFE (VTList has 3 VTs: value, status, chain) the load is re-issued
// as an i32 vector with one extra dword for the status word, then the
// value and status are split back out. Widened results are trimmed with
// EXTRACT_SUBVECTOR so callers still see the original VT.
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
                                              SDVTList VTList,
                                              ArrayRef<SDValue> Ops, EVT MemVT,
                                              MachineMemOperand *MMO,
                                              SelectionDAG &DAG) const {
  LLVMContext &C = *DAG.getContext();
  MachineFunction &MF = DAG.getMachineFunction();
  EVT VT = VTList.VTs[0];

  assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
  bool IsTFE = VTList.NumVTs == 3;
  if (IsTFE) {
    // Recurse with an all-i32 result one dword wider than the value; the
    // extra dword holds the TFE status.
    unsigned NumValueDWords = divideCeil(Numerator: VT.getSizeInBits(), Denominator: 32);
    unsigned NumOpDWords = NumValueDWords + 1;
    EVT OpDWordsVT = EVT::getVectorVT(Context&: C, VT: MVT::i32, NumElements: NumOpDWords);
    SDVTList OpDWordsVTList = DAG.getVTList(VT1: OpDWordsVT, VT2: VTList.VTs[2]);
    MachineMemOperand *OpDWordsMMO =
        MF.getMachineMemOperand(MMO, Offset: 0, Size: NumOpDWords * 4);
    SDValue Op = getMemIntrinsicNode(Opcode, DL, VTList: OpDWordsVTList, Ops,
                                     MemVT: OpDWordsVT, MMO: OpDWordsMMO, DAG);
    // Status word lives in the last dword; the value occupies the rest.
    SDValue Status = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
                                 N2: DAG.getVectorIdxConstant(Val: NumValueDWords, DL));
    SDValue ZeroIdx = DAG.getVectorIdxConstant(Val: 0, DL);
    SDValue ValueDWords =
        NumValueDWords == 1
            ? DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op, N2: ZeroIdx)
            : DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL,
                          VT: EVT::getVectorVT(Context&: C, VT: MVT::i32, NumElements: NumValueDWords), N1: Op,
                          N2: ZeroIdx);
    SDValue Value = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: ValueDWords);
    return DAG.getMergeValues(Ops: {Value, Status, SDValue(Op.getNode(), 1)}, dl: DL);
  }

  // Targets without dwordx3 load/store instructions must widen v3 results
  // to v4 and then extract the original three elements.
  if (!Subtarget->hasDwordx3LoadStores() &&
      (VT == MVT::v3i32 || VT == MVT::v3f32)) {
    EVT WidenedVT = EVT::getVectorVT(Context&: C, VT: VT.getVectorElementType(), NumElements: 4);
    EVT WidenedMemVT = EVT::getVectorVT(Context&: C, VT: MemVT.getVectorElementType(), NumElements: 4);
    MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, Offset: 0, Size: 16);
    SDVTList WidenedVTList = DAG.getVTList(VT1: WidenedVT, VT2: VTList.VTs[1]);
    SDValue Op = DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList: WidenedVTList, Ops,
                                         MemVT: WidenedMemVT, MMO: WidenedMMO);
    SDValue Value = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Op,
                                N2: DAG.getVectorIdxConstant(Val: 0, DL));
    return DAG.getMergeValues(Ops: {Value, SDValue(Op.getNode(), 1)}, dl: DL);
  }

  return DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList, Ops, MemVT, MMO);
}
11357
// Rewrite 16-bit vector store data (D16) into the layout the subtarget's
// VMEM instructions expect. \p ImageStore enables the image-store-specific
// gfx8.1 workaround below. Scalars and already-legal vector types are
// returned unchanged.
SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
                                         bool ImageStore) const {
  EVT StoreVT = VData.getValueType();

  // No change for f16 and legal vector D16 types.
  if (!StoreVT.isVector())
    return VData;

  SDLoc DL(VData);
  unsigned NumElements = StoreVT.getVectorNumElements();

  if (Subtarget->hasUnpackedD16VMem()) {
    // We need to unpack the packed data to store: zero-extend each 16-bit
    // lane into its own i32 lane, then unroll into scalar ops.
    EVT IntStoreVT = StoreVT.changeTypeToInteger();
    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);

    EVT EquivStoreVT =
        EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
    return DAG.UnrollVectorOp(ZExt.getNode());
  }

  // The sq block of gfx8.1 does not estimate register use correctly for d16
  // image store instructions. The data operand is computed as if it were not a
  // d16 image instruction.
  if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
    // Bitcast to i16
    EVT IntStoreVT = StoreVT.changeTypeToInteger();
    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);

    // Decompose into scalars
    SmallVector<SDValue, 4> Elts;
    DAG.ExtractVectorElements(IntVData, Elts);

    // Group pairs of i16 into v2i16 and bitcast to i32
    SmallVector<SDValue, 4> PackedElts;
    for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
      SDValue Pair =
          DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
      SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
      PackedElts.push_back(IntPair);
    }
    if ((NumElements % 2) == 1) {
      // Handle v3i16: the odd final element is paired with poison.
      unsigned I = Elts.size() / 2;
      SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
                                        {Elts[I * 2], DAG.getPOISON(MVT::i16)});
      SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
      PackedElts.push_back(IntPair);
    }

    // Pad using UNDEF so the result has one i32 lane per original i16 lane,
    // matching the register count the buggy sq block expects.
    PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));

    // Build final vector
    EVT VecVT =
        EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
    return DAG.getBuildVector(VecVT, DL, PackedElts);
  }

  if (NumElements == 3) {
    // Widen odd-length (3-element) data to the next even width by
    // zero-extending the whole payload as one integer.
    EVT IntStoreVT =
        EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);

    EVT WidenedStoreVT = EVT::getVectorVT(
        *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
    EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
                                         WidenedStoreVT.getStoreSizeInBits());
    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
    return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
  }

  assert(isTypeLegal(StoreVT));
  return VData;
}
11434
11435static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
11436 switch (Intr) {
11437 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
11438 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
11439 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
11440 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
11441 case Intrinsic::amdgcn_load_async_to_lds:
11442 case Intrinsic::amdgcn_global_load_async_lds:
11443 return true;
11444 }
11445 return false;
11446}
11447
// Custom lowering for ISD::INTRINSIC_VOID nodes: chain-only AMDGPU intrinsics
// (stores, LDS DMA, barriers, prefetches, exports). Returns an empty SDValue
// to fall back to pattern selection, the incoming chain to delete the call,
// or a replacement node otherwise.
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  unsigned IntrinsicID = Op.getConstantOperandVal(1);

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp_compr: {
    if (!Subtarget->hasCompressedExport()) {
      DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
          DAG.getMachineFunction().getFunction(),
          "intrinsic not supported on subtarget", DL.getDebugLoc()));
      // Note: no early return -- lowering continues after the diagnostic.
    }
    SDValue Src0 = Op.getOperand(4);
    SDValue Src1 = Op.getOperand(5);
    // Hack around illegal type on SI by directly selecting it.
    if (isTypeLegal(Src0.getValueType()))
      return SDValue();

    const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
    SDValue Undef = DAG.getPOISON(MVT::f32);
    const SDValue Ops[] = {
        Op.getOperand(2),                              // tgt
        DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
        DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
        Undef,                                         // src2
        Undef,                                         // src3
        Op.getOperand(7),                              // vm
        DAG.getTargetConstant(1, DL, MVT::i1),         // compr
        Op.getOperand(3),                              // en
        Op.getOperand(0)                               // Chain
    };

    unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
    return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
  }

  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
    SDValue Ops[] = {
        Chain,
        VData,                                 // vdata
        Rsrc,                                  // rsrc
        Op.getOperand(4),                      // vindex
        VOffset,                               // voffset
        SOffset,                               // soffset
        Offset,                                // offset
        Op.getOperand(7),                      // format
        Op.getOperand(8),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(1, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
                         : AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }

  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
    // Same as the struct variant above but with no vindex (idxen = 0).
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
    auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
    SDValue Ops[] = {
        Chain,
        VData,                                 // vdata
        Rsrc,                                  // rsrc
        DAG.getConstant(0, DL, MVT::i32),      // vindex
        VOffset,                               // voffset
        SOffset,                               // soffset
        Offset,                                // offset
        Op.getOperand(6),                      // format
        Op.getOperand(7),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(0, DL, MVT::i1), // idxen
    };
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
                         : AMDGPUISD::TBUFFER_STORE_FORMAT;
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }

  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;

    SDValue VData = Op.getOperand(2);
    EVT VDataVT = VData.getValueType();
    EVT EltType = VDataVT.getScalarType();
    // D16 handling only applies to the *_format variants.
    bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
    if (IsD16) {
      VData = handleD16VData(VData, DAG);
      VDataVT = VData.getValueType();
    }

    if (!isTypeLegal(VDataVT)) {
      VData =
          DAG.getNode(ISD::BITCAST, DL,
                      getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
    }

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
    auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
    SDValue Ops[] = {
        Chain,
        VData,
        Rsrc,
        DAG.getConstant(0, DL, MVT::i32),      // vindex
        VOffset,                               // voffset
        SOffset,                               // soffset
        Offset,                                // offset
        Op.getOperand(6),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(0, DL, MVT::i1), // idxen
    };
    unsigned Opc =
        IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);

    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
    if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
      return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }

  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;

    SDValue VData = Op.getOperand(2);
    EVT VDataVT = VData.getValueType();
    EVT EltType = VDataVT.getScalarType();
    bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);

    if (IsD16) {
      VData = handleD16VData(VData, DAG);
      VDataVT = VData.getValueType();
    }

    if (!isTypeLegal(VDataVT)) {
      VData =
          DAG.getNode(ISD::BITCAST, DL,
                      getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
    }

    auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
    SDValue Ops[] = {
        Chain,
        VData,
        Rsrc,
        Op.getOperand(4),                      // vindex
        VOffset,                               // voffset
        SOffset,                               // soffset
        Offset,                                // offset
        Op.getOperand(7),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(1, DL, MVT::i1), // idxen
    };
    unsigned Opc =
        !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    MemSDNode *M = cast<MemSDNode>(Op);

    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
    // NOTE(review): the raw variant above passes the full VDataVT here, while
    // this one passes the scalar type; when the condition holds VDataVT is
    // scalar anyway, but confirm the two call sites were meant to differ.
    EVT VDataType = VData.getValueType().getScalarType();
    if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_buffer_load_async_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_async_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
      return SDValue();
    unsigned Opc;
    // struct variants carry an extra vindex operand; OpOffset shifts the
    // remaining operand indices accordingly.
    bool HasVIndex =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds;
    unsigned OpOffset = HasVIndex ? 1 : 0;
    SDValue VOffset = Op.getOperand(5 + OpOffset);
    bool HasVOffset = !isNullConstant(VOffset);
    unsigned Size = Op->getConstantOperandVal(4);

    // Pick the BOTHEN/IDXEN/OFFEN/OFFSET opcode flavor by which of
    // vindex/voffset are present.
    switch (Size) {
    default:
      return SDValue();
    case 1:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
            : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                         : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
      break;
    case 2:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
            : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                         : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
      break;
    case 4:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
            : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                         : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
      break;
    case 12:
      if (!Subtarget->hasLDSLoadB96_B128())
        return SDValue();
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
            : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
                         : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
      break;
    case 16:
      if (!Subtarget->hasLDSLoadB96_B128())
        return SDValue();
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
            : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
                         : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
      break;
    }

    // Operand 3 (the LDS destination) is communicated through M0.
    SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));

    SmallVector<SDValue, 8> Ops;

    if (HasVIndex && HasVOffset)
      Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
                                       {Op.getOperand(5), // VIndex
                                        VOffset}));
    else if (HasVIndex)
      Ops.push_back(Op.getOperand(5));
    else if (HasVOffset)
      Ops.push_back(VOffset);

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    Ops.push_back(Rsrc);
    Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
    Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
    bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
    unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
    // Mask the aux operand down to the cache-policy bits valid for this
    // generation.
    Ops.push_back(DAG.getTargetConstant(
        Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
        DL, MVT::i8)); // cpol
    Ops.push_back(DAG.getTargetConstant(
        Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
            ? 1
            : 0,
        DL, MVT::i8)); // swz
    Ops.push_back(
        DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
    Ops.push_back(M0Val.getValue(0)); // Chain
    Ops.push_back(M0Val.getValue(1)); // Glue

    auto *M = cast<MemSDNode>(Op);
    auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
    DAG.setNodeMemRefs(Load, M->memoperands());

    return SDValue(Load, 0);
  }
  // Buffers are handled by LowerBufferFatPointers, and we're going to go
  // for "trust me" that the remaining cases are global pointers until
  // such time as we can put two mem operands on an intrinsic.
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_load_async_to_lds:
  case Intrinsic::amdgcn_global_load_lds:
  case Intrinsic::amdgcn_global_load_async_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
      return SDValue();

    unsigned Opc;
    unsigned Size = Op->getConstantOperandVal(4);
    switch (Size) {
    default:
      return SDValue();
    case 1:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
      break;
    case 4:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
      break;
    case 12:
      if (!Subtarget->hasLDSLoadB96_B128())
        return SDValue();
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
      break;
    case 16:
      if (!Subtarget->hasLDSLoadB96_B128())
        return SDValue();
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
      break;
    }

    // Operand 3 (the LDS destination) is communicated through M0.
    SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));

    SmallVector<SDValue, 6> Ops;

    SDValue Addr = Op.getOperand(2); // Global ptr
    SDValue VOffset;
    // Try to split SAddr and VOffset. Global and LDS pointers share the same
    // immediate offset, so we cannot use a regular SelectGlobalSAddr().
    if (Addr->isDivergent() && Addr->isAnyAdd()) {
      SDValue LHS = Addr.getOperand(0);
      SDValue RHS = Addr.getOperand(1);

      if (LHS->isDivergent())
        std::swap(LHS, RHS);

      if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
          RHS.getOperand(0).getValueType() == MVT::i32) {
        // add (i64 sgpr), (zero_extend (i32 vgpr))
        Addr = LHS;
        VOffset = RHS.getOperand(0);
      }
    }

    Ops.push_back(Addr);
    if (!Addr->isDivergent()) {
      // Uniform address: switch to the SADDR form, which requires an
      // explicit VGPR offset operand (zero if none was split off above).
      Opc = AMDGPU::getGlobalSaddrOp(Opc);
      if (!VOffset)
        VOffset =
            SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
                                       DAG.getTargetConstant(0, DL, MVT::i32)),
                    0);
      Ops.push_back(VOffset);
    }

    Ops.push_back(Op.getOperand(5)); // Offset

    unsigned Aux = Op.getConstantOperandVal(6);
    Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
                                        MVT::i32)); // CPol
    Ops.push_back(
        DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));

    Ops.push_back(M0Val.getValue(0)); // Chain
    Ops.push_back(M0Val.getValue(1)); // Glue

    auto *M = cast<MemSDNode>(Op);
    auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
    DAG.setNodeMemRefs(Load, M->memoperands());

    return SDValue(Load, 0);
  }
  case Intrinsic::amdgcn_end_cf:
    return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                      Op->getOperand(2), Chain),
                   0);
  case Intrinsic::amdgcn_s_barrier_init:
  case Intrinsic::amdgcn_s_barrier_signal_var: {
    // these two intrinsics have two operands: barrier pointer and member count
    SDValue Chain = Op->getOperand(0);
    SmallVector<SDValue, 2> Ops;
    SDValue BarOp = Op->getOperand(2);
    SDValue CntOp = Op->getOperand(3);
    SDValue M0Val;
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
                       ? AMDGPU::S_BARRIER_INIT_M0
                       : AMDGPU::S_BARRIER_SIGNAL_M0;
    // extract the BarrierID from bits 4-9 of BarOp
    SDValue BarID;
    BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
                        DAG.getShiftAmountConstant(4, MVT::i32, DL));
    BarID =
        SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
                                   DAG.getTargetConstant(0x3F, DL, MVT::i32)),
                0);
    // Member count should be put into M0[ShAmt:+6]
    // Barrier ID should be put into M0[5:0]
    // NOTE(review): the masked value computed by this S_AND_B32 is dead --
    // M0Val is immediately overwritten by the SHL below, which shifts the
    // *unmasked* CntOp. Confirm whether the shift was meant to consume the
    // masked member count.
    M0Val =
        SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
                                   DAG.getTargetConstant(0x3F, DL, MVT::i32)),
                0);
    constexpr unsigned ShAmt = 16;
    M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
                        DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));

    M0Val = SDValue(
        DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);

    Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));

    auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
    return SDValue(NewMI, 0);
  }
  case Intrinsic::amdgcn_s_wakeup_barrier: {
    if (!Subtarget->hasSWakeupBarrier())
      return SDValue();
    [[fallthrough]];
  }
  case Intrinsic::amdgcn_s_barrier_join: {
    // these three intrinsics have one operand: barrier pointer
    SDValue Chain = Op->getOperand(0);
    SmallVector<SDValue, 2> Ops;
    SDValue BarOp = Op->getOperand(2);
    unsigned Opc;

    if (isa<ConstantSDNode>(BarOp)) {
      // Constant barrier pointer: fold the ID extraction and use the
      // immediate-operand instruction form.
      uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
      switch (IntrinsicID) {
      default:
        return SDValue();
      case Intrinsic::amdgcn_s_barrier_join:
        Opc = AMDGPU::S_BARRIER_JOIN_IMM;
        break;
      case Intrinsic::amdgcn_s_wakeup_barrier:
        Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
        break;
      }
      // extract the BarrierID from bits 4-9 of the immediate
      unsigned BarID = (BarVal >> 4) & 0x3F;
      SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
      Ops.push_back(K);
      Ops.push_back(Chain);
    } else {
      switch (IntrinsicID) {
      default:
        return SDValue();
      case Intrinsic::amdgcn_s_barrier_join:
        Opc = AMDGPU::S_BARRIER_JOIN_M0;
        break;
      case Intrinsic::amdgcn_s_wakeup_barrier:
        Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
        break;
      }
      // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
      SDValue M0Val;
      M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
                          DAG.getShiftAmountConstant(4, MVT::i32, DL));
      M0Val =
          SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
                                     DAG.getTargetConstant(0x3F, DL, MVT::i32)),
                  0);
      Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
    }

    auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
    return SDValue(NewMI, 0);
  }
  case Intrinsic::amdgcn_s_prefetch_data: {
    // For non-global address space preserve the chain and remove the call.
    if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
      return Op.getOperand(0);
    return Op;
  }
  case Intrinsic::amdgcn_s_buffer_prefetch_data: {
    SDValue Ops[] = {
        Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
        Op.getOperand(3), // offset
        Op.getOperand(4), // length
    };

    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
                                   Op->getVTList(), Ops, M->getMemoryVT(),
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
    // Lower to a plain atomic store node carrying the intrinsic's MMO.
    MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
    SDValue Chain = Op->getOperand(0);
    SDValue Ptr = Op->getOperand(2);
    SDValue Val = Op->getOperand(3);
    return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
                         Ptr, MII->getMemOperand());
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
      return lowerImage(Op, ImageDimIntr, DAG, /*WithChain=*/true);

    return Op;
  }
  }
}
11959
11960// Return whether the operation has NoUnsignedWrap property.
11961static bool isNoUnsignedWrap(SDValue Addr) {
11962 return (Addr.getOpcode() == ISD::ADD &&
11963 Addr->getFlags().hasNoUnsignedWrap()) ||
11964 Addr->getOpcode() == ISD::OR;
11965}
11966
// Keep pointer arithmetic intact (no reassociation) only for 64-bit pointer
// types.
bool SITargetLowering::shouldPreservePtrArith(const Function &F,
                                              EVT PtrVT) const {
  return PtrVT == MVT::i64;
}
11971
// Transformations that create transiently out-of-bounds pointer values are
// always permitted on this target, regardless of function or pointer type.
bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F,
                                                       EVT PtrVT) const {
  return true;
}
11976
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
// Returns {voffset (i32 value), immoffset (i32 target constant)}.
std::pair<SDValue, SDValue>
SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
  SDLoc DL(Offset);
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
  // N0 holds the variable part (cleared if the offset is fully constant);
  // C1 holds the constant part (null if none was found).
  SDValue N0 = Offset;
  ConstantSDNode *C1 = nullptr;

  if ((C1 = dyn_cast<ConstantSDNode>(N0)))
    N0 = SDValue();
  else if (DAG.isBaseWithConstantOffset(N0)) {
    // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
    // being added, so we can only safely match a 32-bit addition with no
    // unsigned overflow.
    bool CheckNUW = Subtarget->hasGFX1250Insts();
    if (!CheckNUW || isNoUnsignedWrap(N0)) {
      C1 = cast<ConstantSDNode>(N0.getOperand(1));
      N0 = N0.getOperand(0);
    }
  }

  if (C1) {
    unsigned ImmOffset = C1->getZExtValue();
    // If the immediate value is too big for the immoffset field, put only bits
    // that would normally fit in the immoffset field. The remaining value that
    // is copied/added for the voffset field is a large power of 2, and it
    // stands more chance of being CSEd with the copy/add for another similar
    // load/store.
    // However, do not do that rounding down if that is a negative
    // number, as it appears to be illegal to have a negative offset in the
    // vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      ImmOffset = 0;
    }
    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
    if (Overflow) {
      // Fold the overflow back into the voffset side.
      auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
      if (!N0)
        N0 = OverflowVal;
      else {
        SDValue Ops[] = {N0, OverflowVal};
        N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
      }
    }
  }
  // Canonicalize missing parts to explicit zeros.
  if (!N0)
    N0 = DAG.getConstant(0, DL, MVT::i32);
  if (!C1)
    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
  return {N0, SDValue(C1, 0)};
}
12036
// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        SelectionDAG &DAG, SDValue *Offsets,
                                        Align Alignment) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  SDLoc DL(CombinedOffset);
  // Case 1: fully constant offset -- split it entirely between soffset and
  // the instruction immediate; voffset stays zero.
  if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
      Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
      return;
    }
  }
  // Case 2: base + constant -- base goes in voffset, constant is split
  // between soffset and the immediate.
  if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
    // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
    // being added, so we can only safely match a 32-bit addition with no
    // unsigned overflow.
    bool CheckNUW = Subtarget->hasGFX1250Insts();
    SDValue N0 = CombinedOffset.getOperand(0);
    SDValue N1 = CombinedOffset.getOperand(1);
    uint32_t SOffset, ImmOffset;
    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
    if (Offset >= 0 && (!CheckNUW || isNoUnsignedWrap(CombinedOffset)) &&
        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
      Offsets[0] = N0;
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
      return;
    }
  }

  // Fallback: the whole combined offset lives in voffset; soffset is zero
  // (SGPR_NULL where the subtarget restricts the soffset field).
  SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
                            ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
                            : DAG.getConstant(0, DL, MVT::i32);

  Offsets[0] = CombinedOffset;
  Offsets[1] = SOffsetZero;
  Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
}
12081
12082SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
12083 SelectionDAG &DAG) const {
12084 if (!MaybePointer.getValueType().isScalarInteger())
12085 return MaybePointer;
12086
12087 SDValue Rsrc = DAG.getBitcast(VT: MVT::v4i32, V: MaybePointer);
12088 return Rsrc;
12089}
12090
// Wrap a global or flat pointer into a buffer intrinsic using the flags
// specified in the intrinsic.
//
// Intrinsic operands are (id, pointer, stride, num_records, flags). The
// resulting descriptor is built as a v2i64 (45-bit num_records layout) or
// v4i32 (legacy layout) and finally bitcast to the i128 rsrc pointer type.
SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
                                                   SelectionDAG &DAG) const {
  SDLoc Loc(Op);

  SDValue Pointer = Op->getOperand(Num: 1);
  SDValue Stride = Op->getOperand(Num: 2);
  SDValue NumRecords = Op->getOperand(Num: 3);
  SDValue Flags = Op->getOperand(Num: 4);

  // Stride occupies a 32-bit-or-smaller field in either layout.
  SDValue ExtStride = DAG.getAnyExtOrTrunc(Op: Stride, DL: Loc, VT: MVT::i32);
  SDValue Rsrc;

  if (Subtarget->has45BitNumRecordsBufferResource()) {
    SDValue Zero = DAG.getConstant(Val: 0, DL: Loc, VT: MVT::i32);
    // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
    // num_records.
    SDValue ExtPointer = DAG.getAnyExtOrTrunc(Op: Pointer, DL: Loc, VT: MVT::i64);
    SDValue NumRecordsLHS =
        DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i64, N1: NumRecords,
                    N2: DAG.getShiftAmountConstant(Val: 57, VT: MVT::i32, DL: Loc));
    SDValue LowHalf =
        DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i64, N1: ExtPointer, N2: NumRecordsLHS);

    // Build the higher 64-bit value, which has the higher 38-bit num_records,
    // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
    SDValue NumRecordsRHS =
        DAG.getNode(Opcode: ISD::SRL, DL: Loc, VT: MVT::i64, N1: NumRecords,
                    N2: DAG.getShiftAmountConstant(Val: 7, VT: MVT::i32, DL: Loc));
    // Stride is positioned within the upper 32-bit word (shift by 12), then
    // widened to i64 by bitcasting a {0, shifted} v2i32 pair.
    SDValue ShiftedStride =
        DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i32, N1: ExtStride,
                    N2: DAG.getShiftAmountConstant(Val: 12, VT: MVT::i32, DL: Loc));
    SDValue ExtShiftedStrideVec =
        DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v2i32, N1: Zero, N2: ShiftedStride);
    SDValue ExtShiftedStride =
        DAG.getNode(Opcode: ISD::BITCAST, DL: Loc, VT: MVT::i64, Operand: ExtShiftedStrideVec);
    // Flags land in the top 4 bits of the upper word (shift by 28), widened
    // to i64 the same way.
    SDValue ShiftedFlags =
        DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i32, N1: Flags,
                    N2: DAG.getShiftAmountConstant(Val: 28, VT: MVT::i32, DL: Loc));
    SDValue ExtShiftedFlagsVec =
        DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v2i32, N1: Zero, N2: ShiftedFlags);
    SDValue ExtShiftedFlags =
        DAG.getNode(Opcode: ISD::BITCAST, DL: Loc, VT: MVT::i64, Operand: ExtShiftedFlagsVec);
    // OR all high-half fields together.
    SDValue CombinedFields =
        DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i64, N1: NumRecordsRHS, N2: ExtShiftedStride);
    SDValue HighHalf =
        DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i64, N1: CombinedFields, N2: ExtShiftedFlags);

    Rsrc = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v2i64, N1: LowHalf, N2: HighHalf);
  } else {
    // Legacy v4i32 layout: {ptr.lo, ptr.hi[15:0] | stride << 16, num_records,
    // flags}.
    NumRecords = DAG.getAnyExtOrTrunc(Op: NumRecords, DL: Loc, VT: MVT::i32);
    auto [LowHalf, HighHalf] =
        DAG.SplitScalar(N: Pointer, DL: Loc, LoVT: MVT::i32, HiVT: MVT::i32);
    SDValue Mask = DAG.getConstant(Val: 0x0000ffff, DL: Loc, VT: MVT::i32);
    SDValue Masked = DAG.getNode(Opcode: ISD::AND, DL: Loc, VT: MVT::i32, N1: HighHalf, N2: Mask);
    SDValue ShiftedStride =
        DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i32, N1: ExtStride,
                    N2: DAG.getShiftAmountConstant(Val: 16, VT: MVT::i32, DL: Loc));
    SDValue NewHighHalf =
        DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i32, N1: Masked, N2: ShiftedStride);

    Rsrc = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v4i32, N1: LowHalf, N2: NewHighHalf,
                       N3: NumRecords, N4: Flags);
  }

  SDValue RsrcPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: Loc, VT: MVT::i128, Operand: Rsrc);
  return RsrcPtr;
}
12160
// Handle 8 bit and 16 bit buffer loads
//
// Emits BUFFER_LOAD_UBYTE/USHORT (optionally the _TFE variants) and converts
// the 32-bit hardware result back to LoadVT. The returned merge has the
// loaded value, (for TFE) the status word, and the chain.
SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
                                                     EVT LoadVT, SDLoc DL,
                                                     ArrayRef<SDValue> Ops,
                                                     MachineMemOperand *MMO,
                                                     bool IsTFE) const {
  EVT IntVT = LoadVT.changeTypeToInteger();

  if (IsTFE) {
    unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
                       ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
                       : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
    MachineFunction &MF = DAG.getMachineFunction();
    // TFE loads write a {data, status} pair, so the memory operand covers 8
    // bytes and the node produces a v2i32.
    MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, Offset: 0, Size: 8);
    SDVTList VTs = DAG.getVTList(VT1: MVT::v2i32, VT2: MVT::Other);
    SDValue Op = getMemIntrinsicNode(Opcode: Opc, DL, VTList: VTs, Ops, MemVT: MVT::v2i32, MMO: OpMMO, DAG);
    // Element 1 is the TFE status word; element 0 is the loaded data.
    SDValue Status = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
                                 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
    SDValue Data = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
                               N2: DAG.getConstant(Val: 0, DL, VT: MVT::i32));
    // Narrow the zero-extended 32-bit result and reinterpret as LoadVT (which
    // may be a floating-point type).
    SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: Data);
    SDValue Value = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: Trunc);
    return DAG.getMergeValues(Ops: {Value, Status, SDValue(Op.getNode(), 1)}, dl: DL);
  }

  unsigned Opc = LoadVT.getScalarType() == MVT::i8
                     ? AMDGPUISD::BUFFER_LOAD_UBYTE
                     : AMDGPUISD::BUFFER_LOAD_USHORT;

  SDVTList ResList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other);
  SDValue BufferLoad =
      DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: IntVT, MMO);
  // Same narrowing/bitcast dance as above for the non-TFE form.
  SDValue LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: BufferLoad);
  LoadVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: LoadVal);

  return DAG.getMergeValues(Ops: {LoadVal, BufferLoad.getValue(R: 1)}, dl: DL);
}
12198
12199// Handle 8 bit and 16 bit buffer stores
12200SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
12201 EVT VDataType, SDLoc DL,
12202 SDValue Ops[],
12203 MemSDNode *M) const {
12204 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
12205 Ops[1] = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i16, Operand: Ops[1]);
12206
12207 SDValue BufferStoreExt = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Ops[1]);
12208 Ops[1] = BufferStoreExt;
12209 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
12210 : AMDGPUISD::BUFFER_STORE_SHORT;
12211 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
12212 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: M->getVTList(), Ops: OpsRef, MemVT: VDataType,
12213 MMO: M->getMemOperand());
12214}
12215
12216static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
12217 SDValue Op, const SDLoc &SL, EVT VT) {
12218 if (VT.bitsLT(VT: Op.getValueType()))
12219 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Op);
12220
12221 switch (ExtType) {
12222 case ISD::SEXTLOAD:
12223 return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: SL, VT, Operand: Op);
12224 case ISD::ZEXTLOAD:
12225 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT, Operand: Op);
12226 case ISD::EXTLOAD:
12227 return DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT, Operand: Op);
12228 case ISD::NON_EXTLOAD:
12229 return Op;
12230 }
12231
12232 llvm_unreachable("invalid ext type");
12233}
12234
// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
//
// Only fires on dword-aligned, uniform loads from constant (or invariant
// global) address spaces. The sub-dword load is replaced with an i32 load
// plus the in-register extend/truncate needed to recover the original value.
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
                                    DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  // Widening requires dword alignment, and divergent loads won't select to
  // SMEM anyway.
  if (Ld->getAlign() < Align(4) || Ld->isDivergent())
    return SDValue();

  // FIXME: Constant loads should all be marked invariant.
  unsigned AS = Ld->getAddressSpace();
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
    return SDValue();

  // Don't do this early, since it may interfere with adjacent load merging for
  // illegal types. We can avoid losing alignment information for exotic types
  // pre-legalize.
  EVT MemVT = Ld->getMemoryVT();
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
      MemVT.getSizeInBits() >= 32)
    return SDValue();

  SDLoc SL(Ld);

  assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
         "unexpected vector extload");

  // TODO: Drop only high part of range.
  SDValue Ptr = Ld->getBasePtr();
  // Replace the sub-dword load with a full i32 load of the same address.
  // Range metadata described the narrow value and is no longer valid.
  SDValue NewLoad = DAG.getLoad(
      AM: ISD::UNINDEXED, ExtType: ISD::NON_EXTLOAD, VT: MVT::i32, dl: SL, Chain: Ld->getChain(), Ptr,
      Offset: Ld->getOffset(), PtrInfo: Ld->getPointerInfo(), MemVT: MVT::i32, Alignment: Ld->getAlign(),
      MMOFlags: Ld->getMemOperand()->getFlags(), AAInfo: Ld->getAAInfo(),
      Ranges: nullptr); // Drop ranges

  EVT TruncVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
  if (MemVT.isFloatingPoint()) {
    assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
           "unexpected fp extload");
    TruncVT = MemVT.changeTypeToInteger();
  }

  // Reproduce the original load's extension semantics on the widened value:
  // sext_inreg for SEXTLOAD, masking for ZEXTLOAD/NON_EXTLOAD, and nothing
  // for EXTLOAD (the high bits are undefined anyway).
  SDValue Cvt = NewLoad;
  if (Ld->getExtensionType() == ISD::SEXTLOAD) {
    Cvt = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL: SL, VT: MVT::i32, N1: NewLoad,
                      N2: DAG.getValueType(TruncVT));
  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
             Ld->getExtensionType() == ISD::NON_EXTLOAD) {
    Cvt = DAG.getZeroExtendInReg(Op: NewLoad, DL: SL, VT: TruncVT);
  } else {
    assert(Ld->getExtensionType() == ISD::EXTLOAD);
  }

  EVT VT = Ld->getValueType(ResNo: 0);
  EVT IntVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: VT.getSizeInBits());

  DCI.AddToWorklist(N: Cvt.getNode());

  // We may need to handle exotic cases, such as i16->i64 extloads, so insert
  // the appropriate extension from the 32-bit load.
  Cvt = getLoadExtOrTrunc(DAG, ExtType: Ld->getExtensionType(), Op: Cvt, SL, VT: IntVT);
  DCI.AddToWorklist(N: Cvt.getNode());

  // Handle conversion back to floating point if necessary.
  Cvt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Cvt);

  return DAG.getMergeValues(Ops: {Cvt, NewLoad.getValue(R: 1)}, dl: SL);
}
12304
12305static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
12306 const SIMachineFunctionInfo &Info) {
12307 // TODO: Should check if the address can definitely not access stack.
12308 if (Info.isEntryFunction())
12309 return Info.getUserSGPRInfo().hasFlatScratchInit();
12310 return true;
12311}
12312
// Custom-legalize loads: widen sub-dword non-extending loads, then split or
// widen vector loads according to the address space's maximum access size.
// Returning an empty SDValue means the load is already legal as-is.
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();
  MachineMemOperand *MMO = Load->getMemOperand();

  // Sub-32-bit non-extending loads: load 32 bits, then narrow back down.
  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    if (MemVT == MVT::i16 && isTypeLegal(VT: MVT::i16))
      return SDValue();

    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate to 1 bit.

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();

    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;

    SDValue NewLD = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl: DL, VT: MVT::i32, Chain, Ptr: BasePtr,
                                   MemVT: RealMemVT, MMO);

    if (!MemVT.isVector()) {
      SDValue Ops[] = {DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVT, Operand: NewLD),
                       NewLD.getValue(R: 1)};

      return DAG.getMergeValues(Ops, dl: DL);
    }

    // Vector of i1: unpack one element per bit of the loaded value.
    SmallVector<SDValue, 3> Elts;
    for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
      SDValue Elt = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: NewLD,
                                N2: DAG.getConstant(Val: I, DL, VT: MVT::i32));

      Elts.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Elt));
    }

    SDValue Ops[] = {DAG.getBuildVector(VT: MemVT, DL, Ops: Elts), NewLD.getValue(R: 1)};

    return DAG.getMergeValues(Ops, dl: DL);
  }

  if (!MemVT.isVector())
    return SDValue();

  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");

  Align Alignment = Load->getAlign();
  unsigned AS = Load->getAddressSpace();
  // Hardware bug workaround: split under-aligned multi-dword flat loads.
  if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
      AS == AMDGPUAS::FLAT_ADDRESS &&
      Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
    return SplitVectorLoad(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    AS = addressMayBeAccessedAsPrivate(MMO: Load->getMemOperand(), Info: *MFI)
             ? AMDGPUAS::PRIVATE_ADDRESS
             : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = MemVT.getVectorNumElements();

  // Loads eligible for scalar (SMEM) selection: constant address space, or
  // provably-uniform, invariant/unclobbered global loads.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      (AS == AMDGPUAS::GLOBAL_ADDRESS &&
       Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
       (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(N: Load)))) {
    if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
        Alignment >= Align(4) && NumElements < 32) {
      if (MemVT.isPow2VectorType() ||
          (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
        return SDValue();
      return WidenOrSplitVectorLoad(Op, DAG);
    }
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
    //
  }
  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
    if (NumElements > 4)
      return SplitVectorLoad(Op, DAG);
    // v3 loads not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return WidenOrSplitVectorLoad(Op, DAG);

    // v3 and v4 loads are supported for private and global memory.
    return SDValue();
  }
  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Depending on the setting of the private_element_size field in the
    // resource descriptor, we can only make private accesses up to a certain
    // size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4: {
      // One element at a time.
      auto [Op0, Op1] = scalarizeVectorLoad(LD: Load, DAG);
      return DAG.getMergeValues(Ops: {Op0, Op1}, dl: DL);
    }
    case 8:
      if (NumElements > 2)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    case 16:
      // Same as global/flat
      if (NumElements > 4)
        return SplitVectorLoad(Op, DAG);
      // v3 loads not supported on SI.
      if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
        return WidenOrSplitVectorLoad(Op, DAG);

      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // LDS/GDS: keep the load whole only when the target says this misaligned
    // access is fast; otherwise split.
    unsigned Fast = 0;
    auto Flags = Load->getMemOperand()->getFlags();
    if (allowsMisalignedMemoryAccessesImpl(Size: MemVT.getSizeInBits(), AddrSpace: AS,
                                           Alignment: Load->getAlign(), Flags, IsFast: &Fast) &&
        Fast > 1)
      return SDValue();

    if (MemVT.isVector())
      return SplitVectorLoad(Op, DAG);
  }

  // Fall back to a generic unaligned-load expansion when the access is not
  // allowed at this alignment.
  if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
                                      VT: MemVT, MMO: *Load->getMemOperand())) {
    auto [Op0, Op1] = expandUnalignedLoad(LD: Load, DAG);
    return DAG.getMergeValues(Ops: {Op0, Op1}, dl: DL);
  }

  return SDValue();
}
12455
12456SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
12457 EVT VT = Op.getValueType();
12458 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
12459 VT.getSizeInBits() == 512)
12460 return splitTernaryVectorOp(Op, DAG);
12461
12462 assert(VT.getSizeInBits() == 64);
12463
12464 SDLoc DL(Op);
12465 SDValue Cond = DAG.getFreeze(V: Op.getOperand(i: 0));
12466
12467 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
12468 SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
12469
12470 SDValue LHS = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1));
12471 SDValue RHS = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 2));
12472
12473 SDValue Lo0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: LHS, N2: Zero);
12474 SDValue Lo1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: RHS, N2: Zero);
12475
12476 SDValue Lo = DAG.getSelect(DL, VT: MVT::i32, Cond, LHS: Lo0, RHS: Lo1);
12477
12478 SDValue Hi0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: LHS, N2: One);
12479 SDValue Hi1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: RHS, N2: One);
12480
12481 SDValue Hi = DAG.getSelect(DL, VT: MVT::i32, Cond, LHS: Hi0, RHS: Hi1);
12482
12483 SDValue Res = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Lo, Hi});
12484 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Res);
12485}
12486
// Catch division cases where we can use shortcuts with rcp and rsq
// instructions.
//
// Returns an empty SDValue when no fast-math shortcut is permitted, in which
// case the caller falls through to the precise lowering.
SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(i: 0);
  SDValue RHS = Op.getOperand(i: 1);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();

  bool AllowInaccurateRcp = Flags.hasApproximateFuncs();

  // Special-case constant numerators +/-1.0, which lower to a bare rcp.
  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) {
    // Without !fpmath accuracy information, we can't do more because we don't
    // know exactly whether rcp is accurate enough to meet !fpmath requirement.
    // f16 is always accurate enough
    if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
      return SDValue();

    if (CLHS->isExactlyValue(V: 1.0)) {
      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
      // the CI documentation has a worst case error of 1 ulp.
      // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
      // use it as long as we aren't trying to use denormals.
      //
      // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.

      // 1.0 / sqrt(x) -> rsq(x)

      // XXX - Is afn sufficient to do this for f64? The maximum ULP
      // error seems really high at 2^29 ULP.
      // 1.0 / x -> rcp(x)
      return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS);
    }

    // Same as for 1.0, but expand the sign out of the constant.
    if (CLHS->isExactlyValue(V: -1.0)) {
      // -1.0 / x -> rcp (fneg x)
      SDValue FNegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
      return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: FNegRHS);
    }
  }

  // For f16 and bf16 require afn or arcp.
  // For f32 require afn.
  if (!AllowInaccurateRcp &&
      ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
    return SDValue();

  // Turn into multiply by the reciprocal.
  // x / y -> x * (1.0 / y)
  SDValue Recip = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS);
  return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LHS, N2: Recip, Flags);
}
12541
// Approximate 64-bit fdiv, permitted only under afn: refine the hardware
// reciprocal estimate with two Newton-Raphson iterations, then apply one
// residual-correction step to the quotient.
SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue X = Op.getOperand(i: 0);
  SDValue Y = Op.getOperand(i: 1);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();

  bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
  if (!AllowInaccurateDiv)
    return SDValue();

  SDValue NegY = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Y);
  SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);

  // r ~= 1/y (initial estimate).
  SDValue R = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: Y);
  // e0 = 1 - y*r
  SDValue Tmp0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One);

  // r = r + r*e0 (first Newton-Raphson step).
  R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp0, N2: R, N3: R);
  // e1 = 1 - y*r, then r = r + r*e1 (second step).
  SDValue Tmp1 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One);
  R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp1, N2: R, N3: R);
  // q = x*r, then correct with the residual: q + (x - y*q)*r.
  SDValue Ret = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: R);
  SDValue Tmp2 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: Ret, N3: X);
  return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp2, N2: R, N3: Ret);
}
12567
12568static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12569 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12570 SDNodeFlags Flags) {
12571 if (GlueChain->getNumValues() <= 1) {
12572 return DAG.getNode(Opcode, DL: SL, VT, N1: A, N2: B, Flags);
12573 }
12574
12575 assert(GlueChain->getNumValues() == 3);
12576
12577 SDVTList VTList = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue);
12578 switch (Opcode) {
12579 default:
12580 llvm_unreachable("no chain equivalent for opcode");
12581 case ISD::FMUL:
12582 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12583 break;
12584 }
12585
12586 return DAG.getNode(Opcode, DL: SL, VTList,
12587 Ops: {GlueChain.getValue(R: 1), A, B, GlueChain.getValue(R: 2)},
12588 Flags);
12589}
12590
12591static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12592 EVT VT, SDValue A, SDValue B, SDValue C,
12593 SDValue GlueChain, SDNodeFlags Flags) {
12594 if (GlueChain->getNumValues() <= 1) {
12595 return DAG.getNode(Opcode, DL: SL, VT, Ops: {A, B, C}, Flags);
12596 }
12597
12598 assert(GlueChain->getNumValues() == 3);
12599
12600 SDVTList VTList = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue);
12601 switch (Opcode) {
12602 default:
12603 llvm_unreachable("no chain equivalent for opcode");
12604 case ISD::FMA:
12605 Opcode = AMDGPUISD::FMA_W_CHAIN;
12606 break;
12607 }
12608
12609 return DAG.getNode(Opcode, DL: SL, VTList,
12610 Ops: {GlueChain.getValue(R: 1), A, B, C, GlueChain.getValue(R: 2)},
12611 Flags);
12612}
12613
// Lower 16-bit fdiv. BF16 simply divides in f32 and rounds back; f16 uses
// the rcp-based refinement sequence documented inline, finished with
// v_div_fixup_f16 to repair special cases (the pseudo-code below mirrors the
// emitted instruction sequence).
SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(i: 0);
  SDValue RHS = Op.getOperand(i: 1);

  // Both operands are promoted to f32 for the core computation.
  SDValue LHSExt = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: LHS);
  SDValue RHSExt = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: RHS);

  if (VT == MVT::bf16) {
    SDValue ExtDiv =
        DAG.getNode(Opcode: ISD::FDIV, DL: SL, VT: MVT::f32, N1: LHSExt, N2: RHSExt, Flags: Op->getFlags());
    return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::bf16, N1: ExtDiv,
                       N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32));
  }

  assert(VT == MVT::f16);

  // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
  // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
  // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
  // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
  // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
  // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
  // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
  // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
  // q16.u = opx(V_CVT_F16_F32, q32.u);
  // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)

  // We will use ISD::FMA on targets that don't support ISD::FMAD.
  unsigned FMADOpCode =
      isOperationLegal(Op: ISD::FMAD, VT: MVT::f32) ? ISD::FMAD : ISD::FMA;
  SDValue NegRHSExt = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: RHSExt);
  SDValue Rcp =
      DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: RHSExt, Flags: Op->getFlags());
  SDValue Quot =
      DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: LHSExt, N2: Rcp, Flags: Op->getFlags());
  SDValue Err = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: NegRHSExt, N2: Quot, N3: LHSExt,
                            Flags: Op->getFlags());
  Quot = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: Err, N2: Rcp, N3: Quot, Flags: Op->getFlags());
  Err = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: NegRHSExt, N2: Quot, N3: LHSExt,
                    Flags: Op->getFlags());
  SDValue Tmp = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: Err, N2: Rcp, Flags: Op->getFlags());
  // Mask the correction term to sign+exponent bits (0xff800000) before adding
  // it back, matching the V_AND_B32 in the sequence above.
  SDValue TmpCast = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Tmp);
  TmpCast = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: TmpCast,
                        N2: DAG.getConstant(Val: 0xff800000, DL: SL, VT: MVT::i32));
  Tmp = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: TmpCast);
  Quot = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f32, N1: Tmp, N2: Quot, Flags: Op->getFlags());
  SDValue RDst = DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::f16, N1: Quot,
                             N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32));
  // Final touch-up of the rounded quotient against the original f16 operands.
  return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f16, N1: RDst, N2: RHS, N3: LHS,
                     Flags: Op->getFlags());
}
12672
// Faster 2.5 ULP division that does not support denormals.
//
// If |RHS| exceeds 2^96, the denominator is pre-scaled by 2^-32 before the
// rcp and the result is multiplied by the same factor afterwards, keeping
// the intermediate reciprocal in range.
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
  SDNodeFlags Flags = Op->getFlags();
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(i: 1);
  SDValue RHS = Op.getOperand(i: 2);

  // TODO: The combiner should probably handle elimination of redundant fabs.
  SDValue r1 = DAG.SignBitIsZeroFP(Op: RHS)
                   ? RHS
                   : DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: MVT::f32, Operand: RHS, Flags);

  // Threshold above which the denominator needs scaling: 2^96.
  const APFloat K0Val(0x1p+96f);
  const SDValue K0 = DAG.getConstantFP(Val: K0Val, DL: SL, VT: MVT::f32);

  // Scale factor applied when above the threshold: 2^-32.
  const APFloat K1Val(0x1p-32f);
  const SDValue K1 = DAG.getConstantFP(Val: K1Val, DL: SL, VT: MVT::f32);

  const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f32);

  EVT SetCCVT =
      getSetCCResultType(DL: DAG.getDataLayout(), Ctx&: *DAG.getContext(), VT: MVT::f32);

  // r3 = (|RHS| > 2^96) ? 2^-32 : 1.0 — the chosen scale factor.
  SDValue r2 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: r1, RHS: K0, Cond: ISD::SETOGT);

  SDValue r3 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f32, N1: r2, N2: K1, N3: One, Flags);

  // Scale the denominator before taking its reciprocal.
  r1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: RHS, N2: r3, Flags);

  // rcp does not support denormals.
  SDValue r0 = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: r1, Flags);

  SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: LHS, N2: r0, Flags);

  // Undo the scaling on the quotient.
  return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: r3, N2: Mul, Flags);
}
12709
12710// Returns immediate value for setting the F32 denorm mode when using the
12711// S_DENORM_MODE instruction.
12712static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
12713 const SIMachineFunctionInfo *Info,
12714 const GCNSubtarget *ST) {
12715 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12716 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12717 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12718 return DAG.getTargetConstant(Val: Mode, DL: SDLoc(), VT: MVT::i32);
12719}
12720
12721SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12722 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12723 return FastLowered;
12724
12725 // The selection matcher assumes anything with a chain selecting to a
12726 // mayRaiseFPException machine instruction. Since we're introducing a chain
12727 // here, we need to explicitly report nofpexcept for the regular fdiv
12728 // lowering.
12729 SDNodeFlags Flags = Op->getFlags();
12730 Flags.setNoFPExcept(true);
12731
12732 SDLoc SL(Op);
12733 SDValue LHS = Op.getOperand(i: 0);
12734 SDValue RHS = Op.getOperand(i: 1);
12735
12736 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f32);
12737
12738 SDVTList ScaleVT = DAG.getVTList(VT1: MVT::f32, VT2: MVT::i1);
12739
12740 SDValue DenominatorScaled =
12741 DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, Ops: {RHS, RHS, LHS}, Flags);
12742 SDValue NumeratorScaled =
12743 DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, Ops: {LHS, RHS, LHS}, Flags);
12744
12745 // Denominator is scaled to not be denormal, so using rcp is ok.
12746 SDValue ApproxRcp =
12747 DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: DenominatorScaled, Flags);
12748 SDValue NegDivScale0 =
12749 DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: DenominatorScaled, Flags);
12750
12751 using namespace AMDGPU::Hwreg;
12752 const unsigned Denorm32Reg = HwregEncoding::encode(Values: ID_MODE, Values: 4, Values: 2);
12753 const SDValue BitField = DAG.getTargetConstant(Val: Denorm32Reg, DL: SL, VT: MVT::i32);
12754
12755 const MachineFunction &MF = DAG.getMachineFunction();
12756 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12757 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12758
12759 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12760 const bool HasDynamicDenormals =
12761 (DenormMode.Input == DenormalMode::Dynamic) ||
12762 (DenormMode.Output == DenormalMode::Dynamic);
12763
12764 SDValue SavedDenormMode;
12765
12766 if (!PreservesDenormals) {
12767 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12768 // lowering. The chain dependence is insufficient, and we need glue. We do
12769 // not need the glue variants in a strictfp function.
12770
12771 SDVTList BindParamVTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
12772
12773 SDValue Glue = DAG.getEntryNode();
12774 if (HasDynamicDenormals) {
12775 SDNode *GetReg = DAG.getMachineNode(Opcode: AMDGPU::S_GETREG_B32, dl: SL,
12776 VTs: DAG.getVTList(VT1: MVT::i32, VT2: MVT::Glue),
12777 Ops: {BitField, Glue});
12778 SavedDenormMode = SDValue(GetReg, 0);
12779
12780 Glue = DAG.getMergeValues(
12781 Ops: {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, dl: SL);
12782 }
12783
12784 SDNode *EnableDenorm;
12785 if (Subtarget->hasDenormModeInst()) {
12786 const SDValue EnableDenormValue =
12787 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, ST: Subtarget);
12788
12789 EnableDenorm = DAG.getNode(Opcode: AMDGPUISD::DENORM_MODE, DL: SL, VTList: BindParamVTs, N1: Glue,
12790 N2: EnableDenormValue)
12791 .getNode();
12792 } else {
12793 const SDValue EnableDenormValue =
12794 DAG.getConstant(FP_DENORM_FLUSH_NONE, DL: SL, VT: MVT::i32);
12795 EnableDenorm = DAG.getMachineNode(Opcode: AMDGPU::S_SETREG_B32, dl: SL, VTs: BindParamVTs,
12796 Ops: {EnableDenormValue, BitField, Glue});
12797 }
12798
12799 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12800 SDValue(EnableDenorm, 1)};
12801
12802 NegDivScale0 = DAG.getMergeValues(Ops, dl: SL);
12803 }
12804
12805 SDValue Fma0 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0,
12806 B: ApproxRcp, C: One, GlueChain: NegDivScale0, Flags);
12807
12808 SDValue Fma1 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: Fma0, B: ApproxRcp,
12809 C: ApproxRcp, GlueChain: Fma0, Flags);
12810
12811 SDValue Mul = getFPBinOp(DAG, Opcode: ISD::FMUL, SL, VT: MVT::f32, A: NumeratorScaled, B: Fma1,
12812 GlueChain: Fma1, Flags);
12813
12814 SDValue Fma2 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0, B: Mul,
12815 C: NumeratorScaled, GlueChain: Mul, Flags);
12816
12817 SDValue Fma3 =
12818 getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: Fma2, B: Fma1, C: Mul, GlueChain: Fma2, Flags);
12819
12820 SDValue Fma4 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0, B: Fma3,
12821 C: NumeratorScaled, GlueChain: Fma3, Flags);
12822
12823 if (!PreservesDenormals) {
12824 SDNode *DisableDenorm;
12825 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12826 const SDValue DisableDenormValue = getSPDenormModeValue(
12827 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, ST: Subtarget);
12828
12829 SDVTList BindParamVTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
12830 DisableDenorm =
12831 DAG.getNode(Opcode: AMDGPUISD::DENORM_MODE, DL: SL, VTList: BindParamVTs,
12832 N1: Fma4.getValue(R: 1), N2: DisableDenormValue, N3: Fma4.getValue(R: 2))
12833 .getNode();
12834 } else {
12835 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12836 const SDValue DisableDenormValue =
12837 HasDynamicDenormals
12838 ? SavedDenormMode
12839 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, DL: SL, VT: MVT::i32);
12840
12841 DisableDenorm = DAG.getMachineNode(
12842 Opcode: AMDGPU::S_SETREG_B32, dl: SL, VT: MVT::Other,
12843 Ops: {DisableDenormValue, BitField, Fma4.getValue(R: 1), Fma4.getValue(R: 2)});
12844 }
12845
12846 SDValue OutputChain = DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other,
12847 N1: SDValue(DisableDenorm, 0), N2: DAG.getRoot());
12848 DAG.setRoot(OutputChain);
12849 }
12850
12851 SDValue Scale = NumeratorScaled.getValue(R: 1);
12852 SDValue Fmas = DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL: SL, VT: MVT::f32,
12853 Ops: {Fma4, Fma1, Fma3, Scale}, Flags);
12854
12855 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f32, N1: Fmas, N2: RHS, N3: LHS, Flags);
12856}
12857
// Full-precision f64 fdiv expansion. Scales the operands with DIV_SCALE,
// refines an approximate reciprocal with a chain of FMAs, and resolves the
// final result with DIV_FMAS + DIV_FIXUP. Falls back to the cheap
// reciprocal-based lowering first when fast-math flags allow it.
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue X = Op.getOperand(i: 0); // Numerator.
  SDValue Y = Op.getOperand(i: 1); // Denominator.

  const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f64);

  // DIV_SCALE yields the scaled operand plus an i1 flag later consumed by
  // DIV_FMAS.
  SDVTList ScaleVT = DAG.getVTList(VT1: MVT::f64, VT2: MVT::i1);

  SDValue DivScale0 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: Y, N2: Y, N3: X);

  SDValue NegDivScale0 = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f64, Operand: DivScale0);

  // Newton-Raphson style refinement of the approximate reciprocal of the
  // scaled denominator: Fma0/Fma1/Fma2/Fma3 successively tighten the error.
  SDValue Rcp = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f64, Operand: DivScale0);

  SDValue Fma0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Rcp, N3: One);

  SDValue Fma1 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: Rcp, N2: Fma0, N3: Rcp);

  SDValue Fma2 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Fma1, N3: One);

  SDValue DivScale1 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: X, N2: Y, N3: X);

  SDValue Fma3 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: Fma1, N2: Fma2, N3: Fma1);
  SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f64, N1: DivScale1, N2: Fma3);

  SDValue Fma4 =
      DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Mul, N3: DivScale1);

  SDValue Scale;

  if (!Subtarget->hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    const SDValue Hi = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);

    // Figure out if the scale to use for div_fmas.
    // Recompute the flag by comparing the high (exponent-carrying) dwords of
    // the inputs with those of the div_scale results.
    SDValue NumBC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: X);
    SDValue DenBC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Y);
    SDValue Scale0BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: DivScale0);
    SDValue Scale1BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: DivScale1);

    SDValue NumHi =
        DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: NumBC, N2: Hi);
    SDValue DenHi =
        DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: DenBC, N2: Hi);

    SDValue Scale0Hi =
        DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Scale0BC, N2: Hi);
    SDValue Scale1Hi =
        DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Scale1BC, N2: Hi);

    SDValue CmpDen = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: DenHi, RHS: Scale0Hi, Cond: ISD::SETEQ);
    SDValue CmpNum = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: NumHi, RHS: Scale1Hi, Cond: ISD::SETEQ);
    Scale = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: CmpNum, N2: CmpDen);
  } else {
    // The i1 result of the second div_scale is directly usable.
    Scale = DivScale1.getValue(R: 1);
  }

  SDValue Fmas =
      DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL: SL, VT: MVT::f64, N1: Fma4, N2: Fma3, N3: Mul, N4: Scale);

  // DIV_FIXUP patches up special cases (signs, infinities, NaNs) from the
  // original operands.
  return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f64, N1: Fmas, N2: Y, N3: X);
}
12926
12927SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12928 EVT VT = Op.getValueType();
12929
12930 if (VT == MVT::f32)
12931 return LowerFDIV32(Op, DAG);
12932
12933 if (VT == MVT::f64)
12934 return LowerFDIV64(Op, DAG);
12935
12936 if (VT == MVT::f16 || VT == MVT::bf16)
12937 return LowerFDIV16(Op, DAG);
12938
12939 llvm_unreachable("Unexpected type for fdiv");
12940}
12941
// Lower llvm.frexp to the amdgcn frexp_mant / frexp_exp intrinsics.
// Result 0 is the mantissa (same FP type as the input); result 1 is the
// exponent, produced by the instruction as i16 for f16 sources and i32
// otherwise, then converted to the type the caller requested.
SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Val = Op.getOperand(i: 0);
  EVT VT = Val.getValueType();
  EVT ResultExpVT = Op->getValueType(ResNo: 1);
  EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;

  SDValue Mant = DAG.getNode(
      Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT,
      N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_frexp_mant, DL: dl, VT: MVT::i32), N2: Val);

  SDValue Exp = DAG.getNode(
      Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: InstrExpVT,
      N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_frexp_exp, DL: dl, VT: MVT::i32), N2: Val);

  if (Subtarget->hasFractBug()) {
    // NOTE(review): on subtargets with the fract bug the raw intrinsic results
    // are overridden for non-finite inputs: |Val| < inf is an ordered compare,
    // so it is false for +/-inf and NaN, and the selects then return exponent
    // 0 and pass the original value through as the mantissa.
    SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: dl, VT, Operand: Val);
    SDValue Inf =
        DAG.getConstantFP(Val: APFloat::getInf(Sem: VT.getFltSemantics()), DL: dl, VT);

    SDValue IsFinite = DAG.getSetCC(DL: dl, VT: MVT::i1, LHS: Fabs, RHS: Inf, Cond: ISD::SETOLT);
    SDValue Zero = DAG.getConstant(Val: 0, DL: dl, VT: InstrExpVT);
    Exp = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: InstrExpVT, N1: IsFinite, N2: Exp, N3: Zero);
    Mant = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT, N1: IsFinite, N2: Mant, N3: Val);
  }

  // Sign-extend or truncate the instruction's exponent type to the type the
  // node was created with.
  SDValue CastExp = DAG.getSExtOrTrunc(Op: Exp, DL: dl, VT: ResultExpVT);
  return DAG.getMergeValues(Ops: {Mant, CastExp}, dl);
}
12971
// Custom store legalization. i1 stores become truncating i32 stores; vector
// stores are split or scalarized depending on address space, alignment and
// the subtarget's per-address-space limits. Returning an empty SDValue means
// the store is legal as-is.
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1) {
    // Store i1 as a truncated i32 value.
    return DAG.getTruncStore(
        Chain: Store->getChain(), dl: DL,
        Val: DAG.getSExtOrTrunc(Op: Store->getValue(), DL, VT: MVT::i32),
        Ptr: Store->getBasePtr(), SVT: MVT::i1, MMO: Store->getMemOperand());
  }

  // Everything else reaching here is a vector of i32-sized elements.
  assert(VT.isVector() &&
         Store->getValue().getValueType().getScalarType() == MVT::i32);

  unsigned AS = Store->getAddressSpace();
  // Split wide underaligned flat stores on subtargets with the LDS
  // misaligned-access bug in WGP mode.
  if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
      AS == AMDGPUAS::FLAT_ADDRESS &&
      Store->getAlign().value() < VT.getStoreSize() &&
      VT.getSizeInBits() > 32) {
    return SplitVectorStore(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    AS = addressMayBeAccessedAsPrivate(MMO: Store->getMemOperand(), Info: *MFI)
             ? AMDGPUAS::PRIVATE_ADDRESS
             : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = VT.getVectorNumElements();
  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
    // Global/flat stores handle at most 4 dwords at a time.
    if (NumElements > 4)
      return SplitVectorStore(Op, DAG);
    // v3 stores not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return SplitVectorStore(Op, DAG);

    if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
                                        VT, MMO: *Store->getMemOperand()))
      return expandUnalignedStore(ST: Store, DAG);

    return SDValue();
  }
  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Private stores are limited by the configured max private element size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorStore(ST: Store, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    case 16:
      if (NumElements > 4 ||
          (NumElements == 3 && !Subtarget->hasFlatScratchEnabled()))
        return SplitVectorStore(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    unsigned Fast = 0;
    auto Flags = Store->getMemOperand()->getFlags();
    // Keep the store intact if the misaligned access is fast enough.
    if (allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace: AS,
                                           Alignment: Store->getAlign(), Flags, IsFast: &Fast) &&
        Fast > 1)
      return SDValue();

    if (VT.isVector())
      return SplitVectorStore(Op, DAG);

    return expandUnalignedStore(ST: Store, DAG);
  }

  // Probably an invalid store. If so we'll end up emitting a selection error.
  return SDValue();
}
13052
13053// Avoid the full correct expansion for f32 sqrt when promoting from f16.
13054SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
13055 SDLoc SL(Op);
13056 assert(!Subtarget->has16BitInsts());
13057 SDNodeFlags Flags = Op->getFlags();
13058 SDValue Ext =
13059 DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Op.getOperand(i: 0), Flags);
13060
13061 SDValue SqrtID = DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL: SL, VT: MVT::i32);
13062 SDValue Sqrt =
13063 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::f32, N1: SqrtID, N2: Ext, Flags);
13064
13065 return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::f16, N1: Sqrt,
13066 N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
13067}
13068
// Correctly rounded f32 sqrt expansion. Inputs below 2^-96 are scaled up by
// 2^32 before the sqrt and the result is scaled back down by 2^-16. With
// denormal handling enabled, the hardware sqrt result is adjusted by at most
// one ulp using residual tests; otherwise rsq plus Newton iterations are
// used. Zero and +inf inputs pass through unchanged.
SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDNodeFlags Flags = Op->getFlags();
  MVT VT = Op.getValueType().getSimpleVT();
  const SDValue X = Op.getOperand(i: 0);

  if (allowApproxFunc(DAG, Flags)) {
    // Instruction is 1ulp but ignores denormals.
    return DAG.getNode(
        Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT,
        N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL, VT: MVT::i32), N2: X, Flags);
  }

  // Scale up values below 2^-96 by 2^32 to avoid precision loss.
  SDValue ScaleThreshold = DAG.getConstantFP(Val: 0x1.0p-96f, DL, VT);
  SDValue NeedScale = DAG.getSetCC(DL, VT: MVT::i1, LHS: X, RHS: ScaleThreshold, Cond: ISD::SETOLT);

  SDValue ScaleUpFactor = DAG.getConstantFP(Val: 0x1.0p+32f, DL, VT);

  SDValue ScaledX = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: X, N2: ScaleUpFactor, Flags);

  SDValue SqrtX =
      DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledX, N3: X, Flags);

  SDValue SqrtS;
  if (needsDenormHandlingF32(DAG, Src: X, Flags)) {
    SDValue SqrtID =
        DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL, VT: MVT::i32);
    SqrtS = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: SqrtID, N2: SqrtX, Flags);

    // Compute the next representable values below and above the hardware
    // result by stepping the integer bit pattern by -1 / +1.
    SDValue SqrtSAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: SqrtS);
    SDValue SqrtSNextDownInt =
        DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SqrtSAsInt,
                    N2: DAG.getAllOnesConstant(DL, VT: MVT::i32));
    SDValue SqrtSNextDown = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextDownInt);

    SDValue NegSqrtSNextDown =
        DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextDown, Flags);

    // Residual SqrtX - SqrtSNextDown * SqrtS; its sign tells whether the
    // result should be rounded down.
    SDValue SqrtVP =
        DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextDown, N2: SqrtS, N3: SqrtX, Flags);

    SDValue SqrtSNextUpInt = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SqrtSAsInt,
                                         N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
    SDValue SqrtSNextUp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextUpInt);

    SDValue NegSqrtSNextUp = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextUp, Flags);
    SDValue SqrtVS =
        DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextUp, N2: SqrtS, N3: SqrtX, Flags);

    SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL, VT);
    SDValue SqrtVPLE0 = DAG.getSetCC(DL, VT: MVT::i1, LHS: SqrtVP, RHS: Zero, Cond: ISD::SETOLE);

    // Select the down-stepped value if the down residual is <= 0.
    SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPLE0, N2: SqrtSNextDown, N3: SqrtS,
                        Flags);

    // Select the up-stepped value if the up residual is > 0.
    SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, VT: MVT::i1, LHS: SqrtVS, RHS: Zero, Cond: ISD::SETOGT);
    SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPVSGT0, N2: SqrtSNextUp, N3: SqrtS,
                        Flags);
  } else {
    // No denormal handling needed: start from rsq and refine.
    SDValue SqrtR = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: SqrtX, Flags);

    SqrtS = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtX, N2: SqrtR, Flags);

    SDValue Half = DAG.getConstantFP(Val: 0.5f, DL, VT);
    SDValue SqrtH = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtR, N2: Half, Flags);
    SDValue NegSqrtH = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtH, Flags);

    // Newton-Raphson style correction terms.
    SDValue SqrtE = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtH, N2: SqrtS, N3: Half, Flags);
    SqrtH = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtH, N2: SqrtE, N3: SqrtH, Flags);
    SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtS, N2: SqrtE, N3: SqrtS, Flags);

    SDValue NegSqrtS = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtS, Flags);
    SDValue SqrtD =
        DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtS, N2: SqrtS, N3: SqrtX, Flags);
    SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtD, N2: SqrtH, N3: SqrtS, Flags);
  }

  // Undo the input scaling: sqrt(2^32 * x) = 2^16 * sqrt(x).
  SDValue ScaleDownFactor = DAG.getConstantFP(Val: 0x1.0p-16f, DL, VT);

  SDValue ScaledDown =
      DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtS, N2: ScaleDownFactor, Flags);

  SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledDown, N3: SqrtS, Flags);
  // Pass zeroes and +inf through unchanged.
  SDValue IsZeroOrInf =
      DAG.getNode(Opcode: ISD::IS_FPCLASS, DL, VT: MVT::i1, N1: SqrtX,
                  N2: DAG.getTargetConstant(Val: fcZero | fcPosInf, DL, VT: MVT::i32));

  return DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsZeroOrInf, N2: SqrtX, N3: SqrtS, Flags);
}
13158
// Correctly refined f64 sqrt expansion; see the algorithm description below.
// Inputs below 2^-767 are pre-scaled by 2^256 (and the result post-scaled by
// 2^-128) to keep the intermediate computation in range.
SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
  // For double type, the SQRT and RSQ instructions don't have required
  // precision, we apply Goldschmidt's algorithm to improve the result:
  //
  //   y0 = rsq(x)
  //   g0 = x * y0
  //   h0 = 0.5 * y0
  //
  //   r0 = 0.5 - h0 * g0
  //   g1 = g0 * r0 + g0
  //   h1 = h0 * r0 + h0
  //
  //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
  //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
  //   h2 = h1 * r1 + h1
  //
  //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
  //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
  //
  //   sqrt(x) = g3

  SDNodeFlags Flags = Op->getFlags();

  SDLoc DL(Op);

  SDValue X = Op.getOperand(i: 0);
  SDValue ScaleConstant = DAG.getConstantFP(Val: 0x1.0p-767, DL, VT: MVT::f64);

  SDValue Scaling = DAG.getSetCC(DL, VT: MVT::i1, LHS: X, RHS: ScaleConstant, Cond: ISD::SETOLT);

  SDValue ZeroInt = DAG.getConstant(Val: 0, DL, VT: MVT::i32);

  // Scale up input if it is too small.
  SDValue ScaleUpFactor = DAG.getConstant(Val: 256, DL, VT: MVT::i32);
  SDValue ScaleUp =
      DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::i32, N1: Scaling, N2: ScaleUpFactor, N3: ZeroInt);
  SDValue SqrtX = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: X, N2: ScaleUp, Flags);

  SDValue SqrtY = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT: MVT::f64, Operand: SqrtX);

  SDValue SqrtS0 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: SqrtX, N2: SqrtY);

  SDValue Half = DAG.getConstantFP(Val: 0.5, DL, VT: MVT::f64);
  SDValue SqrtH0 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: SqrtY, N2: Half);

  SDValue NegSqrtH0 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtH0);
  SDValue SqrtR0 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtH0, N2: SqrtS0, N3: Half);

  SDValue SqrtH1 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtH0, N2: SqrtR0, N3: SqrtH0);

  SDValue SqrtS1 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtS0, N2: SqrtR0, N3: SqrtS0);

  SDValue NegSqrtS1 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtS1);
  SDValue SqrtD0 =
      DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtS1, N2: SqrtS1, N3: SqrtX);

  SDValue SqrtS2 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtD0, N2: SqrtH1, N3: SqrtS1);

  SDValue NegSqrtS2 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtS2);
  SDValue SqrtD1 =
      DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtS2, N2: SqrtS2, N3: SqrtX);

  SDValue SqrtRet = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtD1, N2: SqrtH1, N3: SqrtS2);

  // Undo the pre-scaling: sqrt(2^256 * x) = 2^128 * sqrt(x).
  SDValue ScaleDownFactor = DAG.getSignedConstant(Val: -128, DL, VT: MVT::i32);
  SDValue ScaleDown =
      DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::i32, N1: Scaling, N2: ScaleDownFactor, N3: ZeroInt);
  SqrtRet = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: SqrtRet, N2: ScaleDown, Flags);

  // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
  // with finite only or nsz because rsq(+/-0) = +/-inf

  // TODO: Check for DAZ and expand to subnormals
  SDValue IsZeroOrInf =
      DAG.getNode(Opcode: ISD::IS_FPCLASS, DL, VT: MVT::i1, N1: SqrtX,
                  N2: DAG.getTargetConstant(Val: fcZero | fcPosInf, DL, VT: MVT::i32));

  // If x is +INF, +0, or -0, use its original value
  return DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::f64, N1: IsZeroOrInf, N2: SqrtX, N3: SqrtRet,
                     Flags);
}
13240
// Lower fsin/fcos to the hardware SIN_HW/COS_HW nodes, which operate on a
// [0, 1) turn-based argument: the input is first multiplied by 1/(2*pi), and
// on subtargets with reduced trig range the product is additionally wrapped
// into range with FRACT.
SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(i: 0);
  SDValue TrigVal;

  // Propagate fast-math flags so that the multiply we introduce can be folded
  // if Arg is already the result of a multiply by constant.
  auto Flags = Op->getFlags();

  // AMDGPUISD nodes of vector type must be unrolled here since
  // they will not be expanded elsewhere.
  auto UnrollIfVec = [&DAG](SDValue V) -> SDValue {
    if (!V.getValueType().isVector())
      return V;

    return DAG.UnrollVectorOp(N: cast<SDNode>(Val&: V));
  };

  SDValue OneOver2Pi = DAG.getConstantFP(Val: 0.5 * numbers::inv_pi, DL, VT);

  if (Subtarget->hasTrigReducedRange()) {
    // The hardware trig instruction only accepts a reduced range, so wrap the
    // scaled argument into [0, 1) first.
    SDValue MulVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags);
    TrigVal = UnrollIfVec(DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: MulVal, Flags));
  } else {
    TrigVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags);
  }

  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigVal = DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL: SDLoc(Op), VT, Operand: TrigVal, Flags);
    break;
  case ISD::FSIN:
    TrigVal = DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL: SDLoc(Op), VT, Operand: TrigVal, Flags);
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }

  return UnrollIfVec(TrigVal);
}
13282
// Custom lowering for cmpxchg in flat/global address spaces: pack the new and
// old values into a two-element vector operand for the target's
// ATOMIC_CMP_SWAP memory-intrinsic node. Local address space needs no custom
// lowering and is returned unchanged.
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
                                               SelectionDAG &DAG) const {
  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Val&: Op);
  assert(AtomicNode->isCompareAndSwap());
  unsigned AS = AtomicNode->getAddressSpace();

  // No custom lowering required for local address space
  if (!AMDGPU::isFlatGlobalAddrSpace(AS))
    return Op;

  // Non-local address space requires custom lowering for atomic compare
  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
  SDLoc DL(Op);
  SDValue ChainIn = Op.getOperand(i: 0);
  SDValue Addr = Op.getOperand(i: 1);
  SDValue Old = Op.getOperand(i: 2);
  SDValue New = Op.getOperand(i: 3);
  EVT VT = Op.getValueType();
  MVT SimpleVT = VT.getSimpleVT();
  MVT VecType = MVT::getVectorVT(VT: SimpleVT, NumElements: 2);

  // Element order is {new, old}.
  SDValue NewOld = DAG.getBuildVector(VT: VecType, DL, Ops: {New, Old});
  SDValue Ops[] = {ChainIn, Addr, NewOld};

  return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::ATOMIC_CMP_SWAP, dl: DL,
                                 VTList: Op->getVTList(), Ops, MemVT: VT,
                                 MMO: AtomicNode->getMemOperand());
}
13311
13312//===----------------------------------------------------------------------===//
13313// Custom DAG optimizations
13314//===----------------------------------------------------------------------===//
13315
// Combine an integer-to-float conversion whose source provably fits in the
// low byte into CVT_F32_UBYTE0. For f16 results, convert via f32 and round.
// Returns an empty SDValue when the combine does not apply.
SDValue
SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(ResNo: 0);
  EVT ScalarVT = VT.getScalarType();
  if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue Src = N->getOperand(Num: 0);
  EVT SrcVT = Src.getValueType();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
    // The top 24 bits must be known zero for UBYTE0 to be equivalent.
    if (DAG.MaskedValueIsZero(Op: Src, Mask: APInt::getHighBitsSet(numBits: 32, hiBitsSet: 24))) {
      SDValue Cvt = DAG.getNode(Opcode: AMDGPUISD::CVT_F32_UBYTE0, DL, VT: MVT::f32, Operand: Src);
      DCI.AddToWorklist(N: Cvt.getNode());

      // For the f16 case, fold to a cast to f32 and then cast back to f16.
      if (ScalarVT != MVT::f32) {
        Cvt = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Cvt,
                          N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
      }
      return Cvt;
    }
  }

  return SDValue();
}
13350
// fcopysign combines:
//  * f64 (or vector-of-f64) magnitude: only the high 32 bits carry the sign,
//    so split each element and do an f32 copysign on the high half.
//  * f64 sign operand: narrow it to the f32 high half, since only the top
//    bit is needed.
// Returns an empty SDValue when neither rewrite applies.
SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SDValue MagnitudeOp = N->getOperand(Num: 0);
  SDValue SignOp = N->getOperand(Num: 1);

  // The generic combine for fcopysign + fp cast is too conservative with
  // vectors, and also gets confused by the splitting we will perform here, so
  // peek through FP casts.
  if (SignOp.getOpcode() == ISD::FP_EXTEND ||
      SignOp.getOpcode() == ISD::FP_ROUND)
    SignOp = SignOp.getOperand(i: 0);

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT SignVT = SignOp.getValueType();

  // f64 fcopysign is really an f32 copysign on the high bits, so replace the
  // lower half with a copy.
  // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
  EVT MagVT = MagnitudeOp.getValueType();

  unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;

  if (MagVT.getScalarType() == MVT::f64) {
    // View the f64 elements as pairs of f32s: even index = low half, odd
    // index = high (sign-carrying) half.
    EVT F32VT = MagVT.isVector()
                    ? EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::f32, NumElements: 2 * NumElts)
                    : MVT::v2f32;

    SDValue MagAsVector = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: F32VT, Operand: MagnitudeOp);

    SmallVector<SDValue, 8> NewElts;
    for (unsigned I = 0; I != NumElts; ++I) {
      SDValue MagLo =
          DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: MagAsVector,
                      N2: DAG.getConstant(Val: 2 * I, DL, VT: MVT::i32));
      SDValue MagHi =
          DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: MagAsVector,
                      N2: DAG.getConstant(Val: 2 * I + 1, DL, VT: MVT::i32));

      // Per-element sign when the magnitude is a vector; otherwise the sign
      // operand is used directly.
      SDValue SignOpElt =
          MagVT.isVector()
              ? DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: SignVT.getScalarType(),
                            N1: SignOp, N2: DAG.getConstant(Val: I, DL, VT: MVT::i32))
              : SignOp;

      // Copysign only the high half, then reassemble the f64 element.
      SDValue HiOp =
          DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: MVT::f32, N1: MagHi, N2: SignOpElt);

      SDValue Vector =
          DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: MVT::v2f32, N1: MagLo, N2: HiOp);

      SDValue NewElt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f64, Operand: Vector);
      NewElts.push_back(Elt: NewElt);
    }

    if (NewElts.size() == 1)
      return NewElts[0];

    return DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: MagVT, Ops: NewElts);
  }

  if (SignVT.getScalarType() != MVT::f64)
    return SDValue();

  // Reduce width of sign operand, we only need the highest bit.
  //
  // fcopysign f64:x, f64:y ->
  //   fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
  // TODO: In some cases it might make sense to go all the way to f16.

  EVT F32VT = MagVT.isVector()
                  ? EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::f32, NumElements: 2 * NumElts)
                  : MVT::v2f32;

  SDValue SignAsVector = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: F32VT, Operand: SignOp);

  SmallVector<SDValue, 8> F32Signs;
  for (unsigned I = 0; I != NumElts; ++I) {
    // Take sign from odd elements of cast vector
    SDValue SignAsF32 =
        DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: SignAsVector,
                    N2: DAG.getConstant(Val: 2 * I + 1, DL, VT: MVT::i32));
    F32Signs.push_back(Elt: SignAsF32);
  }

  SDValue NewSign =
      NumElts == 1
          ? F32Signs.back()
          : DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL,
                        VT: EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::f32, NumElements: NumElts),
                        Ops: F32Signs);

  return DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 0),
                     N2: NewSign);
}
13446
13447// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
13448// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
13449// bits
13450
13451// This is a variant of
13452// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
13453//
13454// The normal DAG combiner will do this, but only if the add has one use since
13455// that would increase the number of instructions.
13456//
13457// This prevents us from seeing a constant offset that can be folded into a
13458// memory instruction's addressing mode. If we know the resulting add offset of
13459// a pointer can be folded into an addressing offset, we can replace the pointer
13460// operand with the add of new constant offset. This eliminates one of the uses,
13461// and may allow the remaining use to also be simplified.
13462//
// See the comment block above: rewrites (shl (add/or x, c1), c2) feeding a
// memory address into add (shl x, c2), (c1 << c2) when the shifted constant
// is a legal addressing-mode offset for the given address space.
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
                                               EVT MemVT,
                                               DAGCombinerInfo &DCI) const {
  SDValue N0 = N->getOperand(Num: 0); // add/or operand of the shl.
  SDValue N1 = N->getOperand(Num: 1); // Shift amount.

  // We only do this to handle cases where it's profitable when there are
  // multiple uses of the add, so defer to the standard combine.
  if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
    return SDValue();

  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Val&: N1);
  if (!CN1)
    return SDValue();

  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
  if (!CAdd)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;

  // An 'or' only acts like an add when the operands share no bits.
  if (N0->getOpcode() == ISD::OR &&
      !DAG.haveNoCommonBitsSet(A: N0.getOperand(i: 0), B: N0.getOperand(i: 1)))
    return SDValue();

  // If the resulting offset is too large, we can't fold it into the
  // addressing mode offset.
  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
  Type *Ty = MemVT.getTypeForEVT(Context&: *DCI.DAG.getContext());

  AddrMode AM;
  AM.HasBaseReg = true;
  AM.BaseOffs = Offset.getSExtValue();
  if (!isLegalAddressingMode(DL: DCI.DAG.getDataLayout(), AM, Ty, AS: AddrSpace))
    return SDValue();

  SDLoc SL(N);
  EVT VT = N->getValueType(ResNo: 0);

  SDValue ShlX = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: N0.getOperand(i: 0), N2: N1);
  SDValue COffset = DAG.getConstant(Val: Offset, DL: SL, VT);

  // nuw is preserved only when both original operations had it (an 'or' of
  // disjoint bits can never wrap).
  SDNodeFlags Flags;
  Flags.setNoUnsignedWrap(
      N->getFlags().hasNoUnsignedWrap() &&
      (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));

  // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
  // be sure that the new left operand is a proper base pointer.
  return DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ShlX, N2: COffset, Flags);
}
13514
13515/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
13516/// by the chain and intrinsic ID. Theoretically we would also need to check the
13517/// specific intrinsic, but they all place the pointer operand first.
13518static unsigned getBasePtrIndex(const MemSDNode *N) {
13519 switch (N->getOpcode()) {
13520 case ISD::STORE:
13521 case ISD::INTRINSIC_W_CHAIN:
13522 case ISD::INTRINSIC_VOID:
13523 return 2;
13524 default:
13525 return 1;
13526 }
13527}
13528
// If a memory node's address is a shl of an add/or with constants, try to
// fold the shifted constant into the addressing mode via performSHLPtrCombine
// and update the node's pointer operand in place.
SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  // The pointer operand's index depends on the node kind (see
  // getBasePtrIndex).
  unsigned PtrIdx = getBasePtrIndex(N);
  SDValue Ptr = N->getOperand(Num: PtrIdx);

  // TODO: We could also do this for multiplies.
  if (Ptr.getOpcode() == ISD::SHL) {
    SDValue NewPtr = performSHLPtrCombine(N: Ptr.getNode(), AddrSpace: N->getAddressSpace(),
                                          MemVT: N->getMemoryVT(), DCI);
    if (NewPtr) {
      // Replace only the pointer operand; everything else is unchanged.
      SmallVector<SDValue, 8> NewOps(N->ops());

      NewOps[PtrIdx] = NewPtr;
      return SDValue(DAG.UpdateNodeOperands(N, Ops: NewOps), 0);
    }
  }

  return SDValue();
}
13550
13551static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13552 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13553 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13554 (Opc == ISD::XOR && Val == 0);
13555}
13556
13557// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
13558// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
13559// integer combine opportunities since most 64-bit operations are decomposed
13560// this way. TODO: We won't want this for SALU especially if it is an inline
13561// immediate.
13562SDValue SITargetLowering::splitBinaryBitConstantOp(
13563 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13564 const ConstantSDNode *CRHS) const {
13565 uint64_t Val = CRHS->getZExtValue();
13566 uint32_t ValLo = Lo_32(Value: Val);
13567 uint32_t ValHi = Hi_32(Value: Val);
13568 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13569
13570 if ((bitOpWithConstantIsReducible(Opc, Val: ValLo) ||
13571 bitOpWithConstantIsReducible(Opc, Val: ValHi)) ||
13572 (CRHS->hasOneUse() && !TII->isInlineConstant(Imm: CRHS->getAPIntValue()))) {
13573 // We have 64-bit scalar and/or/xor, but do not have vector forms.
13574 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13575 !CRHS->user_begin()->isDivergent())
13576 return SDValue();
13577
13578 // If we need to materialize a 64-bit immediate, it will be split up later
13579 // anyway. Avoid creating the harder to understand 64-bit immediate
13580 // materialization.
13581 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13582 }
13583
13584 return SDValue();
13585}
13586
13587bool llvm::isBoolSGPR(SDValue V) {
13588 if (V.getValueType() != MVT::i1)
13589 return false;
13590 switch (V.getOpcode()) {
13591 default:
13592 break;
13593 case ISD::SETCC:
13594 case ISD::IS_FPCLASS:
13595 case AMDGPUISD::FP_CLASS:
13596 return true;
13597 case ISD::AND:
13598 case ISD::OR:
13599 case ISD::XOR:
13600 return isBoolSGPR(V: V.getOperand(i: 0)) && isBoolSGPR(V: V.getOperand(i: 1));
13601 case ISD::SADDO:
13602 case ISD::UADDO:
13603 case ISD::SSUBO:
13604 case ISD::USUBO:
13605 case ISD::SMULO:
13606 case ISD::UMULO:
13607 return V.getResNo() == 1;
13608 case ISD::INTRINSIC_WO_CHAIN: {
13609 unsigned IntrinsicID = V.getConstantOperandVal(i: 0);
13610 switch (IntrinsicID) {
13611 case Intrinsic::amdgcn_is_shared:
13612 case Intrinsic::amdgcn_is_private:
13613 return true;
13614 default:
13615 return false;
13616 }
13617
13618 return false;
13619 }
13620 }
13621 return false;
13622}
13623
// Return \p C unchanged if every byte of it is either 0x00 or 0xff, i.e. the
// constant can be modeled byte-wise by a v_perm_b32 select mask. Return 0 if
// any byte is only partially set.
static uint32_t getConstantPermuteMask(uint32_t C) {
  for (unsigned Shift = 0; Shift < 32; Shift += 8) {
    const uint32_t Byte = (C >> Shift) & 0xff;
    if (Byte != 0 && Byte != 0xff)
      return 0; // Partial byte selected.
  }
  return C;
}
13642
13643// Check if a node selects whole bytes from its operand 0 starting at a byte
13644// boundary while masking the rest. Returns select mask as in the v_perm_b32
13645// or -1 if not succeeded.
13646// Note byte select encoding:
13647// value 0-3 selects corresponding source byte;
13648// value 0xc selects zero;
13649// value 0xff selects 0xff.
13650static uint32_t getPermuteMask(SDValue V) {
13651 assert(V.getValueSizeInBits() == 32);
13652
13653 if (V.getNumOperands() != 2)
13654 return ~0;
13655
13656 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(Val: V.getOperand(i: 1));
13657 if (!N1)
13658 return ~0;
13659
13660 uint32_t C = N1->getZExtValue();
13661
13662 switch (V.getOpcode()) {
13663 default:
13664 break;
13665 case ISD::AND:
13666 if (uint32_t ConstMask = getConstantPermuteMask(C))
13667 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13668 break;
13669
13670 case ISD::OR:
13671 if (uint32_t ConstMask = getConstantPermuteMask(C))
13672 return (0x03020100 & ~ConstMask) | ConstMask;
13673 break;
13674
13675 case ISD::SHL:
13676 if (C % 8)
13677 return ~0;
13678
13679 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13680
13681 case ISD::SRL:
13682 if (C % 8)
13683 return ~0;
13684
13685 return uint32_t(0x0c0c0c0c03020100ull >> C);
13686 }
13687
13688 return ~0;
13689}
13690
// Combine (and x, y) into target forms: split 64-bit constants, fold
// shift+mask into BFE, turn ordered/class compares into v_cmp_class, fold
// sext-of-bool into a select, and merge byte selections into v_perm_b32.
SDValue SITargetLowering::performAndCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  // Split a 64-bit AND with constant into two 32-bit ops when profitable.
  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
  if (VT == MVT::i64 && CRHS) {
    if (SDValue Split =
            splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::AND, LHS, CRHS))
      return Split;
  }

  if (CRHS && VT == MVT::i32) {
    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
    // nb = number of trailing zeroes in mask
    // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
    // given that we are selecting 8 or 16 bit fields starting at byte boundary.
    uint64_t Mask = CRHS->getZExtValue();
    unsigned Bits = llvm::popcount(Value: Mask);
    if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
        (Bits == 8 || Bits == 16) && isShiftedMask_64(Value: Mask) && !(Mask & 1)) {
      if (auto *CShift = dyn_cast<ConstantSDNode>(Val: LHS->getOperand(Num: 1))) {
        unsigned Shift = CShift->getZExtValue();
        unsigned NB = CRHS->getAPIntValue().countr_zero();
        unsigned Offset = NB + Shift;
        if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
          SDLoc SL(N);
          SDValue BFE =
              DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT: MVT::i32, N1: LHS->getOperand(Num: 0),
                          N2: DAG.getConstant(Val: Offset, DL: SL, VT: MVT::i32),
                          N3: DAG.getConstant(Val: Bits, DL: SL, VT: MVT::i32));
          EVT NarrowVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: Bits);
          SDValue Ext = DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT, N1: BFE,
                                    N2: DAG.getValueType(NarrowVT));
          SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL: SDLoc(LHS), VT, N1: Ext,
                                    N2: DAG.getConstant(Val: NB, DL: SDLoc(CRHS), VT: MVT::i32));
          return Shl;
        }
      }
    }

    // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
    if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
        isa<ConstantSDNode>(Val: LHS.getOperand(i: 2))) {
      uint32_t Sel = getConstantPermuteMask(C: Mask);
      if (!Sel)
        return SDValue();

      // Select 0xc for all zero bytes
      Sel = (LHS.getConstantOperandVal(i: 2) & Sel) | (~Sel & 0x0c0c0c0c);
      SDLoc DL(N);
      return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
                         N2: LHS.getOperand(i: 1), N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
    }
  }

  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: 2))->get();
    ISD::CondCode RCC = cast<CondCodeSDNode>(Val: RHS.getOperand(i: 2))->get();

    SDValue X = LHS.getOperand(i: 0);
    SDValue Y = RHS.getOperand(i: 0);
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(i: 0) != X ||
        !isTypeLegal(VT: X.getValueType()))
      return SDValue();

    if (LCC == ISD::SETO) {
      if (X != LHS.getOperand(i: 1))
        return SDValue();

      if (RCC == ISD::SETUNE) {
        const ConstantFPSDNode *C1 =
            dyn_cast<ConstantFPSDNode>(Val: RHS.getOperand(i: 1));
        if (!C1 || !C1->isInfinity() || C1->isNegative())
          return SDValue();

        // Everything except NaNs and infinities: the set matched by the
        // combined ord + une-inf test.
        const uint32_t Mask = SIInstrFlags::N_NORMAL |
                              SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
                              SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
                              SIInstrFlags::P_NORMAL;

        static_assert(
            ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
                SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
             0x3ff) == Mask,
            "mask not equal");

        SDLoc DL(N);
        return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: X,
                           N2: DAG.getConstant(Val: Mask, DL, VT: MVT::i32));
      }
    }
  }

  // Canonicalize so the fp_class (if any) ends up on the RHS.
  if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
    std::swap(a&: LHS, b&: RHS);

  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
      RHS.hasOneUse()) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: 2))->get();
    // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
    // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
    // | n_nan)
    const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: 1));
    if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
        (RHS.getOperand(i: 0) == LHS.getOperand(i: 0) &&
         LHS.getOperand(i: 0) == LHS.getOperand(i: 1))) {
      const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
      unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
                                          : Mask->getZExtValue() & OrdMask;

      SDLoc DL(N);
      return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: RHS.getOperand(i: 0),
                         N2: DAG.getConstant(Val: NewMask, DL, VT: MVT::i32));
    }
  }

  if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
                         LHS.getOpcode() == ISD::SIGN_EXTEND)) {
    // and x, (sext cc from i1) => select cc, x, 0
    if (RHS.getOpcode() != ISD::SIGN_EXTEND)
      std::swap(a&: LHS, b&: RHS);
    if (isBoolSGPR(V: RHS.getOperand(i: 0)))
      return DAG.getSelect(DL: SDLoc(N), VT: MVT::i32, Cond: RHS.getOperand(i: 0), LHS,
                           RHS: DAG.getConstant(Val: 0, DL: SDLoc(N), VT: MVT::i32));
  }

  // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {
    uint32_t LHSMask = getPermuteMask(V: LHS);
    uint32_t RHSMask = getPermuteMask(V: RHS);
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(a&: LHSMask, b&: RHSMask);
        std::swap(a&: LHS, b&: RHS);
      }

      // Select 0xc for each lane used from source operand. Zero has 0xc mask
      // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Each byte in each mask is either selector mask 0-3, or has higher
        // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
        // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
        // mask which is not 0xff wins. By anding both masks we have a correct
        // result except that 0x0c shall be corrected to give 0x0c only.
        uint32_t Mask = LHSMask & RHSMask;
        for (unsigned I = 0; I < 32; I += 8) {
          uint32_t ByteSel = 0xff << I;
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
        }

        // Add 4 to each active LHS lane. It will not affect any existing 0xff
        // or 0x0c.
        uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
        SDLoc DL(N);

        return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
                           N2: RHS.getOperand(i: 0),
                           N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
      }
    }
  }

  return SDValue();
}
13875
13876// A key component of v_perm is a mapping between byte position of the src
13877// operands, and the byte position of the dest. To provide such, we need: 1. the
13878// node that provides x byte of the dest of the OR, and 2. the byte of the node
13879// used to provide that x byte. calculateByteProvider finds which node provides
13880// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13881// and finds an ultimate src and byte position For example: The supported
13882// LoadCombine pattern for vector loads is as follows
13883// t1
13884// or
13885// / \
13886// t2 t3
13887// zext shl
13888// | | \
13889// t4 t5 16
13890// or anyext
13891// / \ |
13892// t6 t7 t8
13893// srl shl or
13894// / | / \ / \
13895// t9 t10 t11 t12 t13 t14
13896// trunc* 8 trunc* 8 and and
13897// | | / | | \
13898// t15 t16 t17 t18 t19 t20
13899// trunc* 255 srl -256
13900// | / \
13901// t15 t15 16
13902//
13903// *In this example, the truncs are from i32->i16
13904//
13905// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13906// respectively. calculateSrcByte would find (given node) -> ultimate src &
// byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13908// After finding the mapping, we can combine the tree into vperm t15, t16,
13909// 0x05000407
13910
13911// Find the source and byte position from a node.
13912// \p DestByte is the byte position of the dest of the or that the src
13913// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13914// dest of the or byte. \p Depth tracks how many recursive iterations we have
13915// performed.
static const std::optional<ByteProvider<SDValue>>
calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
                 unsigned Depth = 0) {
  // We may need to recursively traverse a series of SRLs
  if (Depth >= 6)
    return std::nullopt;

  // A value narrower than 8 bits cannot supply a whole byte.
  if (Op.getValueSizeInBits() < 8)
    return std::nullopt;

  // Vector sources are recorded as-is; the element/offset resolution is
  // delegated to ByteProvider::getSrc.
  if (Op.getValueType().isVector())
    return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex);

  switch (Op->getOpcode()) {
  case ISD::TRUNCATE: {
    // Truncation does not move byte positions; look through it.
    return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
  }

  case ISD::ANY_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND_INREG: {
    SDValue NarrowOp = Op->getOperand(Num: 0);
    auto NarrowVT = NarrowOp.getValueType();
    if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
      auto *VTSign = cast<VTSDNode>(Val: Op->getOperand(Num: 1));
      NarrowVT = VTSign->getVT();
    }
    if (!NarrowVT.isByteSized())
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowVT.getStoreSize();

    // The requested byte must exist in the pre-extension value; extension
    // bits are not a definite source byte.
    if (SrcIndex >= NarrowByteWidth)
      return std::nullopt;
    return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
  }

  case ISD::SRA:
  case ISD::SRL: {
    auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();

    // Only byte-aligned shifts preserve whole-byte provenance.
    if (BitShift % 8 != 0)
      return std::nullopt;

    // Shifting right by k bytes means the byte comes from k bytes higher in
    // the shift source.
    SrcIndex += BitShift / 8;

    return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
  }

  default: {
    // Any other node is treated as an ultimate source.
    return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex);
  }
  }
  llvm_unreachable("fully handled switch");
}
13975
13976// For a byte position in the result of an Or, traverse the tree and find the
13977// node (and the byte of the node) which ultimately provides this {Or,
13978// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13979// the byte position of the Op that corresponds with the originally requested
13980// byte of the Or \p Depth tracks how many recursive iterations we have
13981// performed. \p StartingIndex is the originally requested byte of the Or
static const std::optional<ByteProvider<SDValue>>
calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
                      unsigned StartingIndex = 0) {
  // Finding Src tree of RHS of or typically requires at least 1 additional
  // depth
  if (Depth > 6)
    return std::nullopt;

  unsigned BitWidth = Op.getScalarValueSizeInBits();
  if (BitWidth % 8 != 0)
    return std::nullopt;
  // The requested byte must lie within Op's value.
  if (Index > BitWidth / 8 - 1)
    return std::nullopt;

  bool IsVec = Op.getValueType().isVector();
  switch (Op.getOpcode()) {
  case ISD::OR: {
    if (IsVec)
      return std::nullopt;

    auto RHS = calculateByteProvider(Op: Op.getOperand(i: 1), Index, Depth: Depth + 1,
                                     StartingIndex);
    if (!RHS)
      return std::nullopt;
    auto LHS = calculateByteProvider(Op: Op.getOperand(i: 0), Index, Depth: Depth + 1,
                                     StartingIndex);
    if (!LHS)
      return std::nullopt;
    // A well formed Or will have two ByteProviders for each byte, one of which
    // is constant zero
    if (!LHS->isConstantZero() && !RHS->isConstantZero())
      return std::nullopt;
    // NOTE(review): LHS and RHS are known non-null at this point, so only the
    // isConstantZero halves of the next two conditions can fire.
    if (!LHS || LHS->isConstantZero())
      return RHS;
    if (!RHS || RHS->isConstantZero())
      return LHS;
    return std::nullopt;
  }

  case ISD::AND: {
    if (IsVec)
      return std::nullopt;

    auto *BitMaskOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
    if (!BitMaskOp)
      return std::nullopt;

    uint32_t BitMask = BitMaskOp->getZExtValue();
    // Bits we expect for our StartingIndex
    uint32_t IndexMask = 0xFF << (Index * 8);

    if ((IndexMask & BitMask) != IndexMask) {
      // If the result of the and partially provides the byte, then it
      // is not well formatted
      if (IndexMask & BitMask)
        return std::nullopt;
      return ByteProvider<SDValue>::getConstantZero();
    }

    // The byte passes through the mask unchanged; resolve its real source.
    return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte: StartingIndex, SrcIndex: Index);
  }

  case ISD::FSHR: {
    if (IsVec)
      return std::nullopt;

    // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
    auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 2));
    if (!ShiftOp || Op.getValueType().isVector())
      return std::nullopt;

    uint64_t BitsProvided = Op.getValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getAPIntValue().urem(RHS: BitsProvided);
    if (BitShift % 8)
      return std::nullopt;

    // The conceptual concatenation X:Y is twice as wide as the result, i.e.
    // (2 * BitsProvided) / 8 == BitsProvided / 4 bytes.
    uint64_t ConcatSizeInBytes = BitsProvided / 4;
    uint64_t ByteShift = BitShift / 8;

    // Map the result byte into the X:Y concatenation, then pick the operand
    // (X for the high half, Y for the low half) that actually holds it.
    uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
    uint64_t BytesProvided = BitsProvided / 8;
    SDValue NextOp = Op.getOperand(i: NewIndex >= BytesProvided ? 0 : 1);
    NewIndex %= BytesProvided;
    return calculateByteProvider(Op: NextOp, Index: NewIndex, Depth: Depth + 1, StartingIndex);
  }

  case ISD::SRA:
  case ISD::SRL: {
    if (IsVec)
      return std::nullopt;

    auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8)
      return std::nullopt;

    auto BitsProvided = Op.getScalarValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;

    uint64_t BytesProvided = BitsProvided / 8;
    uint64_t ByteShift = BitShift / 8;
    // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
    // If the byte we are trying to provide (as tracked by index) falls in this
    // range, then the SRL provides the byte. The byte of interest of the src of
    // the SRL is Index + ByteShift
    return BytesProvided - ByteShift > Index
               ? calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte: StartingIndex,
                                  SrcIndex: Index + ByteShift)
               : ByteProvider<SDValue>::getConstantZero();
  }

  case ISD::SHL: {
    if (IsVec)
      return std::nullopt;

    auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return std::nullopt;
    uint64_t ByteShift = BitShift / 8;

    // If we are shifting by an amount greater than (or equal to)
    // the index we are trying to provide, then it provides 0s. If not,
    // then this bytes are not definitively 0s, and the corresponding byte
    // of interest is Index - ByteShift of the src
    return Index < ByteShift
               ? ByteProvider<SDValue>::getConstantZero()
               : calculateByteProvider(Op: Op.getOperand(i: 0), Index: Index - ByteShift,
                                       Depth: Depth + 1, StartingIndex);
  }
  case ISD::ANY_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND_INREG:
  case ISD::AssertZext:
  case ISD::AssertSext: {
    if (IsVec)
      return std::nullopt;

    SDValue NarrowOp = Op->getOperand(Num: 0);
    unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
    // For in-register extends / asserts the effective narrow width comes from
    // the VT operand, not from the operand's own type.
    if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
        Op->getOpcode() == ISD::AssertZext ||
        Op->getOpcode() == ISD::AssertSext) {
      auto *VTSign = cast<VTSDNode>(Val: Op->getOperand(Num: 1));
      NarrowBitWidth = VTSign->getVT().getSizeInBits();
    }
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    // Bytes beyond the narrow value are known zero only for ZERO_EXTEND.
    if (Index >= NarrowByteWidth)
      return Op.getOpcode() == ISD::ZERO_EXTEND
                 ? std::optional<ByteProvider<SDValue>>(
                       ByteProvider<SDValue>::getConstantZero())
                 : std::nullopt;
    return calculateByteProvider(Op: NarrowOp, Index, Depth: Depth + 1, StartingIndex);
  }

  case ISD::TRUNCATE: {
    if (IsVec)
      return std::nullopt;

    uint64_t NarrowByteWidth = BitWidth / 8;

    // NOTE(review): the Index bound check at function entry already
    // guarantees Index < BitWidth / 8, so this condition always holds.
    if (NarrowByteWidth >= Index) {
      return calculateByteProvider(Op: Op.getOperand(i: 0), Index, Depth: Depth + 1,
                                   StartingIndex);
    }

    return std::nullopt;
  }

  case ISD::CopyFromReg: {
    // A register copy is an ultimate source as long as the byte exists.
    if (BitWidth / 8 > Index)
      return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index);

    return std::nullopt;
  }

  case ISD::LOAD: {
    auto *L = cast<LoadSDNode>(Val: Op.getNode());

    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    // If the width of the load does not reach byte we are trying to provide for
    // and it is not a ZEXTLOAD, then the load does not provide for the byte in
    // question
    if (Index >= NarrowByteWidth) {
      return L->getExtensionType() == ISD::ZEXTLOAD
                 ? std::optional<ByteProvider<SDValue>>(
                       ByteProvider<SDValue>::getConstantZero())
                 : std::nullopt;
    }

    if (NarrowByteWidth > Index) {
      return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index);
    }

    return std::nullopt;
  }

  case ISD::BSWAP: {
    if (IsVec)
      return std::nullopt;

    // Byte swap mirrors the byte index around the value's width.
    return calculateByteProvider(Op: Op->getOperand(Num: 0), Index: BitWidth / 8 - Index - 1,
                                 Depth: Depth + 1, StartingIndex);
  }

  case ISD::EXTRACT_VECTOR_ELT: {
    auto *IdxOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
    if (!IdxOp)
      return std::nullopt;
    auto VecIdx = IdxOp->getZExtValue();
    auto ScalarSize = Op.getScalarValueSizeInBits();
    // For sub-dword elements, translate the element index into a byte index
    // into the source vector; dword-or-wider extracts are sources themselves.
    if (ScalarSize < 32)
      Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
    return calculateSrcByte(Op: ScalarSize >= 32 ? Op : Op.getOperand(i: 0),
                            DestByte: StartingIndex, SrcIndex: Index);
  }

  case AMDGPUISD::PERM: {
    if (IsVec)
      return std::nullopt;

    auto *PermMask = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 2));
    if (!PermMask)
      return std::nullopt;

    // Decode the selector byte for this index: 0-7 picks a byte from the two
    // sources, 0x0c yields zero; anything else is unsupported here.
    auto IdxMask =
        (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
    if (IdxMask > 0x07 && IdxMask != 0x0c)
      return std::nullopt;

    auto NextOp = Op.getOperand(i: IdxMask > 0x03 ? 0 : 1);
    auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;

    return IdxMask != 0x0c ? calculateSrcByte(Op: NextOp, DestByte: StartingIndex, SrcIndex: NextIndex)
                           : ByteProvider<SDValue>(
                                 ByteProvider<SDValue>::getConstantZero());
  }

  default: {
    return std::nullopt;
  }
  }

  llvm_unreachable("fully handled switch");
}
14245
14246// Returns true if the Operand is a scalar and is 16 bits
14247static bool isExtendedFrom16Bits(SDValue &Operand) {
14248
14249 switch (Operand.getOpcode()) {
14250 case ISD::ANY_EXTEND:
14251 case ISD::SIGN_EXTEND:
14252 case ISD::ZERO_EXTEND: {
14253 auto OpVT = Operand.getOperand(i: 0).getValueType();
14254 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
14255 }
14256 case ISD::LOAD: {
14257 LoadSDNode *L = cast<LoadSDNode>(Val: Operand.getNode());
14258 auto ExtType = cast<LoadSDNode>(Val: L)->getExtensionType();
14259 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
14260 ExtType == ISD::EXTLOAD) {
14261 auto MemVT = L->getMemoryVT();
14262 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
14263 }
14264 return L->getMemoryVT().getSizeInBits() == 16;
14265 }
14266 default:
14267 return false;
14268 }
14269}
14270
// Returns true if \p Mask selects two consecutive bytes (low selector first)
// whose first byte sits at an even offset, i.e. the pair can be addressed
// directly as an aligned 16-bit operand.
static bool addresses16Bits(int Mask) {
  int LoSel = Mask & 0xff;
  int HiSel = (Mask & 0xff00) >> 8;

  assert(LoSel < 8 && HiSel < 8);
  // The two selected bytes must be adjacent, in increasing-address order,
  // and the pair must start on a 16-bit boundary. A counter example is two
  // consecutive bytes starting at an odd offset: extracting that 16-bit
  // operand needs extra code, so an i8 v_perm is preferable there.
  return (HiSel - LoSel == 1) && (LoSel % 2 == 0);
}
14288
14289// Do not lower into v_perm if the operands are actually 16 bit
14290// and the selected bits (based on PermMask) correspond with two
14291// easily addressable 16 bit operands.
14292static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
14293 SDValue &OtherOp) {
14294 int Low16 = PermMask & 0xffff;
14295 int Hi16 = (PermMask & 0xffff0000) >> 16;
14296
14297 auto TempOp = peekThroughBitcasts(V: Op);
14298 auto TempOtherOp = peekThroughBitcasts(V: OtherOp);
14299
14300 auto OpIs16Bit =
14301 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(Operand&: TempOp);
14302 if (!OpIs16Bit)
14303 return true;
14304
14305 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14306 isExtendedFrom16Bits(Operand&: TempOtherOp);
14307 if (!OtherOpIs16Bit)
14308 return true;
14309
14310 // Do we cleanly address both
14311 return !addresses16Bits(Mask: Low16) || !addresses16Bits(Mask: Hi16);
14312}
14313
// Produce the 32-bit dword at \p DWordOffset (in 32-bit units) of \p Src as
// an i32, emitting extracts/shifts/bitcasts as needed.
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
                                  unsigned DWordOffset) {
  SDValue Ret;

  auto TypeSize = Src.getValueSizeInBits().getFixedValue();
  // ByteProvider must be at least 8 bits
  assert(Src.getValueSizeInBits().isKnownMultipleOf(8));

  // A value of one dword or less is used whole.
  if (TypeSize <= 32)
    return DAG.getBitcastedAnyExtOrTrunc(Op: Src, DL: SL, VT: MVT::i32);

  if (Src.getValueType().isVector()) {
    auto ScalarTySize = Src.getScalarValueSizeInBits();
    auto ScalarTy = Src.getValueType().getScalarType();
    // 32-bit elements: the dword is exactly one vector element.
    if (ScalarTySize == 32) {
      return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Src,
                         N2: DAG.getConstant(Val: DWordOffset, DL: SL, VT: MVT::i32));
    }
    // Wide elements: extract the containing element, then shift the wanted
    // dword down before truncating to i32.
    if (ScalarTySize > 32) {
      Ret = DAG.getNode(
          Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ScalarTy, N1: Src,
          N2: DAG.getConstant(Val: DWordOffset / (ScalarTySize / 32), DL: SL, VT: MVT::i32));
      auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
      if (ShiftVal)
        Ret = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: Ret.getValueType(), N1: Ret,
                          N2: DAG.getConstant(Val: ShiftVal, DL: SL, VT: MVT::i32));
      return DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
    }

    // Sub-dword elements: gather the elements covering the requested dword
    // (clamped at the vector's end) into a build_vector, then bitcast to i32.
    assert(ScalarTySize < 32);
    auto NumElements = TypeSize / ScalarTySize;
    auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
    auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
    auto NumElementsIn32 = 32 / ScalarTySize;
    auto NumAvailElements = DWordOffset < Trunc32Elements
                                ? NumElementsIn32
                                : NumElements - NormalizedTrunc;

    SmallVector<SDValue, 4> VecSrcs;
    DAG.ExtractVectorElements(Op: Src, Args&: VecSrcs, Start: DWordOffset * NumElementsIn32,
                              Count: NumAvailElements);

    Ret = DAG.getBuildVector(
        VT: MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: ScalarTySize), NumElements: NumAvailElements), DL: SL,
        Ops: VecSrcs);
    return Ret = DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
  }

  /// Scalar Type
  auto ShiftVal = 32 * DWordOffset;
  Ret = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: Src.getValueType(), N1: Src,
                    N2: DAG.getConstant(Val: ShiftVal, DL: SL, VT: MVT::i32));
  return DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
}
14368
/// Try to fold the i32 bitwise-combine node \p N into a single
/// AMDGPUISD::PERM (v_perm_b32), which selects each of the four result bytes
/// from a pair of 32-bit sources according to a per-byte selector mask.
/// Returns an empty SDValue if the result bytes cannot all be traced to at
/// most two 32-bit source dwords, or if the perm would be a no-op / would
/// just recreate \p N.
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  [[maybe_unused]] EVT VT = N->getValueType(ResNo: 0);
  SmallVector<ByteProvider<SDValue>, 8> PermNodes;

  // VT is known to be MVT::i32, so we need to provide 4 bytes.
  assert(VT == MVT::i32);
  for (int i = 0; i < 4; i++) {
    // Find the ByteProvider that provides the ith byte of the result of OR
    std::optional<ByteProvider<SDValue>> P =
        calculateByteProvider(Op: SDValue(N, 0), Index: i, Depth: 0, /*StartingIndex = */ i);
    // TODO support constantZero
    if (!P || P->isConstantZero())
      return SDValue();

    PermNodes.push_back(Elt: *P);
  }
  if (PermNodes.size() != 4)
    return SDValue();

  // Track up to two distinct sources as (provider index, dword-within-source)
  // pairs; v_perm_b32 can only combine bytes from two 32-bit inputs.
  std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
  std::optional<std::pair<unsigned, unsigned>> SecondSrc;
  uint64_t PermMask = 0x00000000;
  for (size_t i = 0; i < PermNodes.size(); i++) {
    auto PermOp = PermNodes[i];
    // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
    // by sizeof(Src2) = 4
    int SrcByteAdjust = 4;

    // If the Src uses a byte from a different DWORD, then it corresponds
    // with a different source
    if (!PermOp.hasSameSrc(Other: PermNodes[FirstSrc.first]) ||
        ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
      if (SecondSrc)
        // A third distinct source cannot be expressed with one v_perm_b32.
        if (!PermOp.hasSameSrc(Other: PermNodes[SecondSrc->first]) ||
            ((PermOp.SrcOffset / 4) != SecondSrc->second))
          return SDValue();

      // Set the index of the second distinct Src node
      SecondSrc = {i, PermNodes[i].SrcOffset / 4};
      assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
      SrcByteAdjust = 0;
    }
    assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
    assert(!DAG.getDataLayout().isBigEndian());
    // Accumulate the byte selector for result lane i into the perm mask.
    PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
  }
  SDLoc DL(N);
  SDValue Op = *PermNodes[FirstSrc.first].Src;
  Op = getDWordFromOffset(DAG, SL: DL, Src: Op, DWordOffset: FirstSrc.second);
  assert(Op.getValueSizeInBits() == 32);

  // Check that we are not just extracting the bytes in order from an op
  if (!SecondSrc) {
    int Low16 = PermMask & 0xffff;
    int Hi16 = (PermMask & 0xffff0000) >> 16;

    bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
    bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);

    // The perm op would really just produce Op. So combine into Op
    if (WellFormedLow && WellFormedHi)
      return DAG.getBitcast(VT: MVT::getIntegerVT(BitWidth: 32), V: Op);
  }

  SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;

  if (SecondSrc) {
    OtherOp = getDWordFromOffset(DAG, SL: DL, Src: OtherOp, DWordOffset: SecondSrc->second);
    assert(OtherOp.getValueSizeInBits() == 32);
  }

  // Check that we haven't just recreated the same FSHR node.
  if (N->getOpcode() == ISD::FSHR &&
      (N->getOperand(Num: 0) == Op || N->getOperand(Num: 0) == OtherOp) &&
      (N->getOperand(Num: 1) == Op || N->getOperand(Num: 1) == OtherOp))
    return SDValue();

  if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {

    assert(Op.getValueType().isByteSized() &&
           OtherOp.getValueType().isByteSized());

    // If the ultimate src is less than 32 bits, then we will only be
    // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
    // CalculateByteProvider would not have returned Op as source if we
    // used a byte that is outside its ValueType. Thus, we are free to
    // ANY_EXTEND as the extended bits are dont-cares.
    Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, VT: MVT::i32);
    OtherOp = DAG.getBitcastedAnyExtOrTrunc(Op: OtherOp, DL, VT: MVT::i32);

    return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: Op, N2: OtherOp,
                       N3: DAG.getConstant(Val: PermMask, DL, VT: MVT::i32));
  }
  return SDValue();
}
14465
/// Combine ISD::OR nodes. Handles, in order:
///  * i1: merging two fp_class tests of the same value into one,
///  * folding a constant OR into an existing AMDGPUISD::PERM selector mask,
///  * building AMDGPUISD::PERM (v_perm_b32) from divergent i32 ORs,
///  * collapsing an identity v2i32 build_vector OR back to its source,
///  * i64: splitting around a zero-extended i32 and around bit constants.
SDValue SITargetLowering::performOrCombine(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  EVT VT = N->getValueType(ResNo: 0);
  if (VT == MVT::i1) {
    // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
    if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
        RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
      SDValue Src = LHS.getOperand(i: 0);
      if (Src != RHS.getOperand(i: 0))
        return SDValue();

      const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
      const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: 1));
      if (!CLHS || !CRHS)
        return SDValue();

      // Only 10 bits are used.
      static const uint32_t MaxMask = 0x3ff;

      uint32_t NewMask =
          (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
      SDLoc DL(N);
      return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: Src,
                         N2: DAG.getConstant(Val: NewMask, DL, VT: MVT::i32));
    }

    return SDValue();
  }

  // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
  if (isa<ConstantSDNode>(Val: RHS) && LHS.hasOneUse() &&
      LHS.getOpcode() == AMDGPUISD::PERM &&
      isa<ConstantSDNode>(Val: LHS.getOperand(i: 2))) {
    uint32_t Sel = getConstantPermuteMask(C: N->getConstantOperandVal(Num: 1));
    if (!Sel)
      return SDValue();

    Sel |= LHS.getConstantOperandVal(i: 2);
    SDLoc DL(N);
    return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
                       N2: LHS.getOperand(i: 1), N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
  }

  // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {

    // If all the uses of an or need to extract the individual elements, do not
    // attempt to lower into v_perm
    auto usesCombinedOperand = [](SDNode *OrUse) {
      // If we have any non-vectorized use, then it is a candidate for v_perm
      if (OrUse->getOpcode() != ISD::BITCAST ||
          !OrUse->getValueType(ResNo: 0).isVector())
        return true;

      // If we have any non-vectorized use, then it is a candidate for v_perm
      for (auto *VUser : OrUse->users()) {
        if (!VUser->getValueType(ResNo: 0).isVector())
          return true;

        // If the use of a vector is a store, then combining via a v_perm
        // is beneficial.
        // TODO -- whitelist more uses
        for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
          if (VUser->getOpcode() == VectorwiseOp)
            return true;
      }
      return false;
    };

    if (!any_of(Range: N->users(), P: usesCombinedOperand))
      return SDValue();

    // A permute mask of ~0u means the operand is not expressible as a
    // per-byte selection of a single source.
    uint32_t LHSMask = getPermuteMask(V: LHS);
    uint32_t RHSMask = getPermuteMask(V: RHS);

    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(a&: LHSMask, b&: RHSMask);
        std::swap(a&: LHS, b&: RHS);
      }

      // Select 0xc for each lane used from source operand. Zero has 0xc mask
      // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Kill zero bytes selected by other mask. Zero value is 0xc.
        LHSMask &= ~RHSUsedLanes;
        RHSMask &= ~LHSUsedLanes;
        // Add 4 to each active LHS lane
        LHSMask |= LHSUsedLanes & 0x04040404;
        // Combine masks
        uint32_t Sel = LHSMask | RHSMask;
        SDLoc DL(N);

        return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
                           N2: RHS.getOperand(i: 0),
                           N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
      }
    }
    if (LHSMask == ~0u || RHSMask == ~0u) {
      // Fall back to tracing individual byte providers through the OR.
      if (SDValue Perm = matchPERM(N, DCI))
        return Perm;
    }
  }

  // Detect identity v2i32 OR and replace with identity source node.
  // Specifically an Or that has operands constructed from the same source node
  // via extract_vector_elt and build_vector. I.E.
  // v2i32 or(
  //   v2i32 build_vector(
  //     i32 extract_elt(%IdentitySrc, 0),
  //     i32 0
  //   ),
  //   v2i32 build_vector(
  //     i32 0,
  //     i32 extract_elt(%IdentitySrc, 1)
  //   ) )
  // =>
  // v2i32 %IdentitySrc

  if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
      RHS->getOpcode() == ISD::BUILD_VECTOR) {

    ConstantSDNode *LC = dyn_cast<ConstantSDNode>(Val: LHS->getOperand(Num: 1));
    ConstantSDNode *RC = dyn_cast<ConstantSDNode>(Val: RHS->getOperand(Num: 0));

    // Test for and normalise build vectors.
    if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {

      // Get the extract_vector_element operands.
      SDValue LEVE = LHS->getOperand(Num: 0);
      SDValue REVE = RHS->getOperand(Num: 1);

      if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
        // Check that different elements from the same vector are
        // extracted.
        if (LEVE->getOperand(Num: 0) == REVE->getOperand(Num: 0) &&
            LEVE->getOperand(Num: 1) != REVE->getOperand(Num: 1)) {
          SDValue IdentitySrc = LEVE.getOperand(i: 0);
          return IdentitySrc;
        }
      }
    }
  }

  if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
    return SDValue();

  // TODO: This could be a generic combine with a predicate for extracting the
  // high half of an integer being free.

  // (or i64:x, (zero_extend i32:y)) ->
  //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
  if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
      RHS.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(a&: LHS, b&: RHS);

  if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
    SDValue ExtSrc = RHS.getOperand(i: 0);
    EVT SrcVT = ExtSrc.getValueType();
    if (SrcVT == MVT::i32) {
      SDLoc SL(N);
      auto [LowLHS, HiBits] = split64BitValue(Op: LHS, DAG);
      SDValue LowOr = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: LowLHS, N2: ExtSrc);

      DCI.AddToWorklist(N: LowOr.getNode());
      DCI.AddToWorklist(N: HiBits.getNode());

      SDValue Vec =
          DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: LowOr, N2: HiBits);
      return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
    }
  }

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
  if (CRHS) {
    if (SDValue Split = splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::OR,
                                                 LHS: N->getOperand(Num: 0), CRHS))
      return Split;
  }

  return SDValue();
}
14664
14665SDValue SITargetLowering::performXorCombine(SDNode *N,
14666 DAGCombinerInfo &DCI) const {
14667 if (SDValue RV = reassociateScalarOps(N, DAG&: DCI.DAG))
14668 return RV;
14669
14670 SDValue LHS = N->getOperand(Num: 0);
14671 SDValue RHS = N->getOperand(Num: 1);
14672
14673 const ConstantSDNode *CRHS = isConstOrConstSplat(N: RHS);
14674 SelectionDAG &DAG = DCI.DAG;
14675
14676 EVT VT = N->getValueType(ResNo: 0);
14677 if (CRHS && VT == MVT::i64) {
14678 if (SDValue Split =
14679 splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::XOR, LHS, CRHS))
14680 return Split;
14681 }
14682
14683 // v2i32 (xor (vselect cc, x, y), K) ->
14684 // (v2i32 svelect cc, (xor x, K), (xor y, K)) This enables the xor to be
14685 // replaced with source modifiers when the select is lowered to CNDMASK.
14686 unsigned Opc = LHS.getOpcode();
14687 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14688 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14689 CRHS && CRHS->getAPIntValue().isSignMask()) {
14690 SDValue CC = LHS->getOperand(Num: 0);
14691 SDValue TRUE = LHS->getOperand(Num: 1);
14692 SDValue FALSE = LHS->getOperand(Num: 2);
14693 SDValue XTrue = DAG.getNode(Opcode: ISD::XOR, DL: SDLoc(N), VT, N1: TRUE, N2: RHS);
14694 SDValue XFalse = DAG.getNode(Opcode: ISD::XOR, DL: SDLoc(N), VT, N1: FALSE, N2: RHS);
14695 SDValue XSelect =
14696 DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT, N1: CC, N2: XTrue, N3: XFalse);
14697 return XSelect;
14698 }
14699
14700 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14701 // fneg-like xors into 64-bit select.
14702 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14703 // This looks like an fneg, try to fold as a source modifier.
14704 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14705 shouldFoldFNegIntoSrc(FNeg: N, FNegSrc: LHS)) {
14706 // xor (select c, a, b), 0x80000000 ->
14707 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14708 SDLoc DL(N);
14709 SDValue CastLHS =
14710 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: LHS->getOperand(Num: 1));
14711 SDValue CastRHS =
14712 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: LHS->getOperand(Num: 2));
14713 SDValue FNegLHS = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f32, Operand: CastLHS);
14714 SDValue FNegRHS = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f32, Operand: CastRHS);
14715 SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::f32,
14716 N1: LHS->getOperand(Num: 0), N2: FNegLHS, N3: FNegRHS);
14717 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewSelect);
14718 }
14719 }
14720
14721 return SDValue();
14722}
14723
/// Fold a zext/anyext of an i16 value to i32 into an AMDGPUISD::PERM that
/// assembles the two low result bytes directly from the dwords providing
/// them. Byte lanes selecting 0x0c in the perm mask produce constant zero.
SDValue
SITargetLowering::performZeroOrAnyExtendCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  // The fold relies on 16-bit instructions / v_perm being profitable and on
  // types already being legalized.
  if (!Subtarget->has16BitInsts() ||
      DCI.getDAGCombineLevel() < AfterLegalizeTypes)
    return SDValue();

  EVT VT = N->getValueType(ResNo: 0);
  if (VT != MVT::i32)
    return SDValue();

  SDValue Src = N->getOperand(Num: 0);
  if (Src.getValueType() != MVT::i16)
    return SDValue();

  if (!Src->hasOneUse())
    return SDValue();

  // TODO: We bail out below if SrcOffset is not in the first dword (>= 4). It's
  // possible we're missing out on some combine opportunities, but we'd need to
  // weigh the cost of extracting the byte from the upper dwords.

  // Trace where result byte 0 comes from.
  std::optional<ByteProvider<SDValue>> BP0 =
      calculateByteProvider(Op: SDValue(N, 0), Index: 0, Depth: 0, StartingIndex: 0);
  if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src)
    return SDValue();
  SDValue V0 = *BP0->Src;

  // Trace where result byte 1 comes from.
  std::optional<ByteProvider<SDValue>> BP1 =
      calculateByteProvider(Op: SDValue(N, 0), Index: 1, Depth: 0, StartingIndex: 1);
  if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src)
    return SDValue();

  SDValue V1 = *BP1->Src;

  if (V0 == V1)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  // Start with all four lanes selecting zero (0x0c), then patch in the
  // selectors for the two low bytes.
  uint32_t PermMask = 0x0c0c0c0c;
  if (V0) {
    V0 = DAG.getBitcastedAnyExtOrTrunc(Op: V0, DL, VT: MVT::i32);
    // Byte 0 comes from the second perm operand (V1 is first), hence the +4.
    PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4);
  }

  if (V1) {
    V1 = DAG.getBitcastedAnyExtOrTrunc(Op: V1, DL, VT: MVT::i32);
    PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8);
  }

  return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: V0, N2: V1,
                     N3: DAG.getConstant(Val: PermMask, DL, VT: MVT::i32));
}
14778
/// Fold sign_extend_inreg of unsigned subword buffer loads into the
/// corresponding signed-load node (s_buffer_load_{byte,short} or
/// buffer_load_{byte,short}) so no separate extend instruction is needed.
SDValue
SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue Src = N->getOperand(Num: 0);
  auto *VTSign = cast<VTSDNode>(Val: N->getOperand(Num: 1));

  // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
  // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
  if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16))) {
    assert(Subtarget->hasScalarSubwordLoads() &&
           "s_buffer_load_{u8, i8} are supported "
           "in GFX12 (or newer) architectures.");
    EVT VT = Src.getValueType();
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
                       ? AMDGPUISD::SBUFFER_LOAD_BYTE
                       : AMDGPUISD::SBUFFER_LOAD_SHORT;
    SDLoc DL(N);
    SDVTList ResList = DCI.DAG.getVTList(VT: MVT::i32);
    SDValue Ops[] = {
        Src.getOperand(i: 0), // source register
        Src.getOperand(i: 1), // offset
        Src.getOperand(i: 2)  // cachePolicy
    };
    auto *M = cast<MemSDNode>(Val&: Src);
    SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
        Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
    // The signed load produces i32; truncate back to the original type.
    SDValue LoadVal = DCI.DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
    return LoadVal;
  }
  // Same fold for the VMEM buffer-load variants. These carry a chain, so
  // the load must have a single (data) use for the replacement to be safe.
  if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16)) &&
      Src.hasOneUse()) {
    auto *M = cast<MemSDNode>(Val&: Src);
    SDValue Ops[] = {Src.getOperand(i: 0), // Chain
                     Src.getOperand(i: 1), // rsrc
                     Src.getOperand(i: 2), // vindex
                     Src.getOperand(i: 3), // voffset
                     Src.getOperand(i: 4), // soffset
                     Src.getOperand(i: 5), // offset
                     Src.getOperand(i: 6), Src.getOperand(i: 7)};
    // replace with BUFFER_LOAD_BYTE/SHORT
    SDVTList ResList =
        DCI.DAG.getVTList(VT1: MVT::i32, VT2: Src.getOperand(i: 0).getValueType());
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
                       ? AMDGPUISD::BUFFER_LOAD_BYTE
                       : AMDGPUISD::BUFFER_LOAD_SHORT;
    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
        Opcode: Opc, dl: SDLoc(N), VTList: ResList, Ops, MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
    return DCI.DAG.getMergeValues(
        Ops: {BufferLoadSignExt, BufferLoadSignExt.getValue(R: 1)}, dl: SDLoc(N));
  }
  return SDValue();
}
14837
14838SDValue SITargetLowering::performClassCombine(SDNode *N,
14839 DAGCombinerInfo &DCI) const {
14840 SelectionDAG &DAG = DCI.DAG;
14841 SDValue Mask = N->getOperand(Num: 1);
14842
14843 // fp_class x, 0 -> false
14844 if (isNullConstant(V: Mask))
14845 return DAG.getConstant(Val: 0, DL: SDLoc(N), VT: MVT::i1);
14846
14847 if (N->getOperand(Num: 0).isUndef())
14848 return DAG.getUNDEF(VT: MVT::i1);
14849
14850 return SDValue();
14851}
14852
14853SDValue SITargetLowering::performRcpCombine(SDNode *N,
14854 DAGCombinerInfo &DCI) const {
14855 EVT VT = N->getValueType(ResNo: 0);
14856 SDValue N0 = N->getOperand(Num: 0);
14857
14858 if (N0.isUndef()) {
14859 return DCI.DAG.getConstantFP(Val: APFloat::getQNaN(Sem: VT.getFltSemantics()),
14860 DL: SDLoc(N), VT);
14861 }
14862
14863 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14864 N0.getOpcode() == ISD::SINT_TO_FP)) {
14865 return DCI.DAG.getNode(Opcode: AMDGPUISD::RCP_IFLAG, DL: SDLoc(N), VT, Operand: N0,
14866 Flags: N->getFlags());
14867 }
14868
14869 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14870 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14871 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14872 return DCI.DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc(N), VT, Operand: N0.getOperand(i: 0),
14873 Flags: N->getFlags());
14874 }
14875
14876 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
14877}
14878
14879bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14880 SDNodeFlags UserFlags,
14881 unsigned MaxDepth) const {
14882 unsigned Opcode = Op.getOpcode();
14883 if (Opcode == ISD::FCANONICALIZE)
14884 return true;
14885
14886 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
14887 const auto &F = CFP->getValueAPF();
14888 if (F.isNaN() && F.isSignaling())
14889 return false;
14890 if (!F.isDenormal())
14891 return true;
14892
14893 DenormalMode Mode =
14894 DAG.getMachineFunction().getDenormalMode(FPType: F.getSemantics());
14895 return Mode == DenormalMode::getIEEE();
14896 }
14897
14898 // If source is a result of another standard FP operation it is already in
14899 // canonical form.
14900 if (MaxDepth == 0)
14901 return false;
14902
14903 switch (Opcode) {
14904 // These will flush denorms if required.
14905 case ISD::FADD:
14906 case ISD::FSUB:
14907 case ISD::FMUL:
14908 case ISD::FCEIL:
14909 case ISD::FFLOOR:
14910 case ISD::FMA:
14911 case ISD::FMAD:
14912 case ISD::FSQRT:
14913 case ISD::FDIV:
14914 case ISD::FREM:
14915 case ISD::FP_ROUND:
14916 case ISD::FP_EXTEND:
14917 case ISD::FP16_TO_FP:
14918 case ISD::FP_TO_FP16:
14919 case ISD::BF16_TO_FP:
14920 case ISD::FP_TO_BF16:
14921 case ISD::FLDEXP:
14922 case AMDGPUISD::FMUL_LEGACY:
14923 case AMDGPUISD::FMAD_FTZ:
14924 case AMDGPUISD::RCP:
14925 case AMDGPUISD::RSQ:
14926 case AMDGPUISD::RSQ_CLAMP:
14927 case AMDGPUISD::RCP_LEGACY:
14928 case AMDGPUISD::RCP_IFLAG:
14929 case AMDGPUISD::LOG:
14930 case AMDGPUISD::EXP:
14931 case AMDGPUISD::DIV_SCALE:
14932 case AMDGPUISD::DIV_FMAS:
14933 case AMDGPUISD::DIV_FIXUP:
14934 case AMDGPUISD::FRACT:
14935 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14936 case AMDGPUISD::CVT_F32_UBYTE0:
14937 case AMDGPUISD::CVT_F32_UBYTE1:
14938 case AMDGPUISD::CVT_F32_UBYTE2:
14939 case AMDGPUISD::CVT_F32_UBYTE3:
14940 case AMDGPUISD::FP_TO_FP16:
14941 case AMDGPUISD::SIN_HW:
14942 case AMDGPUISD::COS_HW:
14943 return true;
14944
14945 // It can/will be lowered or combined as a bit operation.
14946 // Need to check their input recursively to handle.
14947 case ISD::FNEG:
14948 case ISD::FABS:
14949 case ISD::FCOPYSIGN:
14950 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), UserFlags: MaxDepth - 1);
14951
14952 case ISD::AND:
14953 if (Op.getValueType() == MVT::i32) {
14954 // Be careful as we only know it is a bitcast floating point type. It
14955 // could be f32, v2f16, we have no way of knowing. Luckily the constant
14956 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14957 // is valid to optimize for all types.
14958 if (auto *RHS = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
14959 if (RHS->getZExtValue() == 0xffff0000) {
14960 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), UserFlags: MaxDepth - 1);
14961 }
14962 }
14963 }
14964 break;
14965
14966 case ISD::FSIN:
14967 case ISD::FCOS:
14968 case ISD::FSINCOS:
14969 return Op.getValueType().getScalarType() != MVT::f16;
14970
14971 case ISD::FMINNUM:
14972 case ISD::FMAXNUM:
14973 case ISD::FMINNUM_IEEE:
14974 case ISD::FMAXNUM_IEEE:
14975 case ISD::FMINIMUM:
14976 case ISD::FMAXIMUM:
14977 case ISD::FMINIMUMNUM:
14978 case ISD::FMAXIMUMNUM:
14979 case AMDGPUISD::CLAMP:
14980 case AMDGPUISD::FMED3:
14981 case AMDGPUISD::FMAX3:
14982 case AMDGPUISD::FMIN3:
14983 case AMDGPUISD::FMAXIMUM3:
14984 case AMDGPUISD::FMINIMUM3: {
14985 // FIXME: Shouldn't treat the generic operations different based these.
14986 // However, we aren't really required to flush the result from
14987 // minnum/maxnum..
14988
14989 // snans will be quieted, so we only need to worry about denormals.
14990 if (Subtarget->supportsMinMaxDenormModes() ||
14991 // FIXME: denormalsEnabledForType is broken for dynamic
14992 denormalsEnabledForType(DAG, VT: Op.getValueType()))
14993 return true;
14994
14995 // Flushing may be required.
14996 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
14997 // targets need to check their input recursively.
14998
14999 // FIXME: Does this apply with clamp? It's implemented with max.
15000 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
15001 if (!isCanonicalized(DAG, Op: Op.getOperand(i: I), UserFlags: MaxDepth - 1))
15002 return false;
15003 }
15004
15005 return true;
15006 }
15007 case ISD::SELECT: {
15008 return isCanonicalized(DAG, Op: Op.getOperand(i: 1), UserFlags: MaxDepth - 1) &&
15009 isCanonicalized(DAG, Op: Op.getOperand(i: 2), UserFlags: MaxDepth - 1);
15010 }
15011 case ISD::BUILD_VECTOR: {
15012 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
15013 SDValue SrcOp = Op.getOperand(i);
15014 if (!isCanonicalized(DAG, Op: SrcOp, UserFlags: MaxDepth - 1))
15015 return false;
15016 }
15017
15018 return true;
15019 }
15020 case ISD::EXTRACT_VECTOR_ELT:
15021 case ISD::EXTRACT_SUBVECTOR: {
15022 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), UserFlags: MaxDepth - 1);
15023 }
15024 case ISD::INSERT_VECTOR_ELT: {
15025 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), UserFlags: MaxDepth - 1) &&
15026 isCanonicalized(DAG, Op: Op.getOperand(i: 1), UserFlags: MaxDepth - 1);
15027 }
15028 case ISD::UNDEF:
15029 // Could be anything.
15030 return false;
15031
15032 case ISD::BITCAST:
15033 // TODO: This is incorrect as it loses track of the operand's type. We may
15034 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
15035 // same bits that are canonicalized in one type need not be in the other.
15036 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), UserFlags: MaxDepth - 1);
15037 case ISD::TRUNCATE: {
15038 // Hack round the mess we make when legalizing extract_vector_elt
15039 if (Op.getValueType() == MVT::i16) {
15040 SDValue TruncSrc = Op.getOperand(i: 0);
15041 if (TruncSrc.getValueType() == MVT::i32 &&
15042 TruncSrc.getOpcode() == ISD::BITCAST &&
15043 TruncSrc.getOperand(i: 0).getValueType() == MVT::v2f16) {
15044 return isCanonicalized(DAG, Op: TruncSrc.getOperand(i: 0), UserFlags: MaxDepth - 1);
15045 }
15046 }
15047 return false;
15048 }
15049 case ISD::INTRINSIC_WO_CHAIN: {
15050 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
15051 // TODO: Handle more intrinsics
15052 switch (IntrinsicID) {
15053 case Intrinsic::amdgcn_cvt_pkrtz:
15054 case Intrinsic::amdgcn_cubeid:
15055 case Intrinsic::amdgcn_frexp_mant:
15056 case Intrinsic::amdgcn_fdot2:
15057 case Intrinsic::amdgcn_rcp:
15058 case Intrinsic::amdgcn_rsq:
15059 case Intrinsic::amdgcn_rsq_clamp:
15060 case Intrinsic::amdgcn_rcp_legacy:
15061 case Intrinsic::amdgcn_rsq_legacy:
15062 case Intrinsic::amdgcn_trig_preop:
15063 case Intrinsic::amdgcn_tanh:
15064 case Intrinsic::amdgcn_log:
15065 case Intrinsic::amdgcn_exp2:
15066 case Intrinsic::amdgcn_sqrt:
15067 return true;
15068 default:
15069 break;
15070 }
15071
15072 break;
15073 }
15074 default:
15075 break;
15076 }
15077
15078 // FIXME: denormalsEnabledForType is broken for dynamic
15079 return denormalsEnabledForType(DAG, VT: Op.getValueType()) &&
15080 (UserFlags.hasNoNaNs() || DAG.isKnownNeverSNaN(Op));
15081}
15082
/// GlobalISel counterpart of the SelectionDAG isCanonicalized: returns true
/// if the value defined for \p Reg is guaranteed to already be in canonical
/// FP form, so a G_FCANONICALIZE of it would be a no-op. Recursion into
/// source operands is bounded by \p MaxDepth.
bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
                                       unsigned MaxDepth) const {
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineInstr *MI = MRI.getVRegDef(Reg);
  unsigned Opcode = MI->getOpcode();

  if (Opcode == AMDGPU::G_FCANONICALIZE)
    return true;

  std::optional<FPValueAndVReg> FCR;
  // Constant splat (can be padded with undef) or scalar constant.
  if (mi_match(R: Reg, MRI, P: MIPatternMatch::m_GFCstOrSplat(FPValReg&: FCR))) {
    if (FCR->Value.isSignaling())
      return false;
    if (!FCR->Value.isDenormal())
      return true;

    // A denormal constant is only canonical if the mode keeps denormals.
    DenormalMode Mode = MF.getDenormalMode(FPType: FCR->Value.getSemantics());
    return Mode == DenormalMode::getIEEE();
  }

  if (MaxDepth == 0)
    return false;

  switch (Opcode) {
  // These operations produce a canonical result (flushing if required).
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case AMDGPU::G_FPOW:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FLOG:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FLOG10:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
    return true;
  // Bit operations: canonical iff their input is canonical.
  case AMDGPU::G_FNEG:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FCOPYSIGN:
    return isCanonicalized(Reg: MI->getOperand(i: 1).getReg(), MF, MaxDepth: MaxDepth - 1);
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_FMINIMUMNUM:
  case AMDGPU::G_FMAXIMUMNUM: {
    if (Subtarget->supportsMinMaxDenormModes() ||
        // FIXME: denormalsEnabledForType is broken for dynamic
        denormalsEnabledForType(Ty: MRI.getType(Reg), MF))
      return true;

    // Otherwise min/max may not flush; check all sources recursively.
    [[fallthrough]];
  }
  case AMDGPU::G_BUILD_VECTOR:
    for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI->operands()))
      if (!isCanonicalized(Reg: MO.getReg(), MF, MaxDepth: MaxDepth - 1))
        return false;
    return true;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT:
    // These intrinsics lower to instructions that produce canonical results.
    switch (cast<GIntrinsic>(Val: MI)->getIntrinsicID()) {
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_div_scale:
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_tanh:
      return true;
    default:
      break;
    }

    [[fallthrough]];
  default:
    return false;
  }

  llvm_unreachable("invalid operation");
}
15201
15202// Constant fold canonicalize.
15203SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
15204 const SDLoc &SL, EVT VT,
15205 const APFloat &C) const {
15206 // Flush denormals to 0 if not enabled.
15207 if (C.isDenormal()) {
15208 DenormalMode Mode =
15209 DAG.getMachineFunction().getDenormalMode(FPType: C.getSemantics());
15210 if (Mode == DenormalMode::getPreserveSign()) {
15211 return DAG.getConstantFP(
15212 Val: APFloat::getZero(Sem: C.getSemantics(), Negative: C.isNegative()), DL: SL, VT);
15213 }
15214
15215 if (Mode != DenormalMode::getIEEE())
15216 return SDValue();
15217 }
15218
15219 if (C.isNaN()) {
15220 APFloat CanonicalQNaN = APFloat::getQNaN(Sem: C.getSemantics());
15221 if (C.isSignaling()) {
15222 // Quiet a signaling NaN.
15223 // FIXME: Is this supposed to preserve payload bits?
15224 return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT);
15225 }
15226
15227 // Make sure it is the canonical NaN bitpattern.
15228 //
15229 // TODO: Can we use -1 as the canonical NaN value since it's an inline
15230 // immediate?
15231 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
15232 return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT);
15233 }
15234
15235 // Already canonical.
15236 return DAG.getConstantFP(Val: C, DL: SL, VT);
15237}
15238
15239static bool vectorEltWillFoldAway(SDValue Op) {
15240 return Op.isUndef() || isa<ConstantFPSDNode>(Val: Op);
15241}
15242
15243SDValue
15244SITargetLowering::performFCanonicalizeCombine(SDNode *N,
15245 DAGCombinerInfo &DCI) const {
15246 SelectionDAG &DAG = DCI.DAG;
15247 SDValue N0 = N->getOperand(Num: 0);
15248 EVT VT = N->getValueType(ResNo: 0);
15249
15250 // fcanonicalize undef -> qnan
15251 if (N0.isUndef()) {
15252 APFloat QNaN = APFloat::getQNaN(Sem: VT.getFltSemantics());
15253 return DAG.getConstantFP(Val: QNaN, DL: SDLoc(N), VT);
15254 }
15255
15256 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N: N0)) {
15257 EVT VT = N->getValueType(ResNo: 0);
15258 return getCanonicalConstantFP(DAG, SL: SDLoc(N), VT, C: CFP->getValueAPF());
15259 }
15260
15261 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
15262 // (fcanonicalize k)
15263 //
15264 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
15265
15266 // TODO: This could be better with wider vectors that will be split to v2f16,
15267 // and to consider uses since there aren't that many packed operations.
15268 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
15269 isTypeLegal(VT: MVT::v2f16)) {
15270 SDLoc SL(N);
15271 SDValue NewElts[2];
15272 SDValue Lo = N0.getOperand(i: 0);
15273 SDValue Hi = N0.getOperand(i: 1);
15274 EVT EltVT = Lo.getValueType();
15275
15276 if (vectorEltWillFoldAway(Op: Lo) || vectorEltWillFoldAway(Op: Hi)) {
15277 for (unsigned I = 0; I != 2; ++I) {
15278 SDValue Op = N0.getOperand(i: I);
15279 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
15280 NewElts[I] =
15281 getCanonicalConstantFP(DAG, SL, VT: EltVT, C: CFP->getValueAPF());
15282 } else if (Op.isUndef()) {
15283 // Handled below based on what the other operand is.
15284 NewElts[I] = Op;
15285 } else {
15286 NewElts[I] = DAG.getNode(Opcode: ISD::FCANONICALIZE, DL: SL, VT: EltVT, Operand: Op);
15287 }
15288 }
15289
15290 // If one half is undef, and one is constant, prefer a splat vector rather
15291 // than the normal qNaN. If it's a register, prefer 0.0 since that's
15292 // cheaper to use and may be free with a packed operation.
15293 if (NewElts[0].isUndef()) {
15294 if (isa<ConstantFPSDNode>(Val: NewElts[1]))
15295 NewElts[0] = isa<ConstantFPSDNode>(Val: NewElts[1])
15296 ? NewElts[1]
15297 : DAG.getConstantFP(Val: 0.0f, DL: SL, VT: EltVT);
15298 }
15299
15300 if (NewElts[1].isUndef()) {
15301 NewElts[1] = isa<ConstantFPSDNode>(Val: NewElts[0])
15302 ? NewElts[0]
15303 : DAG.getConstantFP(Val: 0.0f, DL: SL, VT: EltVT);
15304 }
15305
15306 return DAG.getBuildVector(VT, DL: SL, Ops: NewElts);
15307 }
15308 }
15309
15310 return SDValue();
15311}
15312
15313static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
15314 switch (Opc) {
15315 case ISD::FMAXNUM:
15316 case ISD::FMAXNUM_IEEE:
15317 case ISD::FMAXIMUMNUM:
15318 return AMDGPUISD::FMAX3;
15319 case ISD::FMAXIMUM:
15320 return AMDGPUISD::FMAXIMUM3;
15321 case ISD::SMAX:
15322 return AMDGPUISD::SMAX3;
15323 case ISD::UMAX:
15324 return AMDGPUISD::UMAX3;
15325 case ISD::FMINNUM:
15326 case ISD::FMINNUM_IEEE:
15327 case ISD::FMINIMUMNUM:
15328 return AMDGPUISD::FMIN3;
15329 case ISD::FMINIMUM:
15330 return AMDGPUISD::FMINIMUM3;
15331 case ISD::SMIN:
15332 return AMDGPUISD::SMIN3;
15333 case ISD::UMIN:
15334 return AMDGPUISD::UMIN3;
15335 default:
15336 llvm_unreachable("Not a min/max opcode");
15337 }
15338}
15339
15340SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
15341 const SDLoc &SL, SDValue Src,
15342 SDValue MinVal,
15343 SDValue MaxVal,
15344 bool Signed) const {
15345
15346 // med3 comes from
15347 // min(max(x, K0), K1), K0 < K1
15348 // max(min(x, K0), K1), K1 < K0
15349 //
15350 // "MinVal" and "MaxVal" respectively refer to the rhs of the
15351 // min/max op.
15352 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(Val&: MinVal);
15353 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(Val&: MaxVal);
15354
15355 if (!MinK || !MaxK)
15356 return SDValue();
15357
15358 if (Signed) {
15359 if (MaxK->getAPIntValue().sge(RHS: MinK->getAPIntValue()))
15360 return SDValue();
15361 } else {
15362 if (MaxK->getAPIntValue().uge(RHS: MinK->getAPIntValue()))
15363 return SDValue();
15364 }
15365
15366 EVT VT = MinK->getValueType(ResNo: 0);
15367 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15368 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15369 return DAG.getNode(Opcode: Med3Opc, DL: SL, VT, N1: Src, N2: MaxVal, N3: MinVal);
15370
15371 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
15372 // not available, but this is unlikely to be profitable as constants
15373 // will often need to be materialized & extended, especially on
15374 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
15375 return SDValue();
15376}
15377
15378static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
15379 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op))
15380 return C;
15381
15382 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Val&: Op)) {
15383 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
15384 return C;
15385 }
15386
15387 return nullptr;
15388}
15389
// Try to fold min(max(x, K0), K1) with constant bounds into fmed3 (or clamp
// when the bounds are 0.0/1.0). \p Op0 is the inner max-style node, whose
// second operand is K0; \p Op1 is K1. Constants may be scalar or splat.
SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
                                                  const SDLoc &SL, SDValue Op0,
                                                  SDValue Op1) const {
  // Both bounds must be (splat) FP constants.
  ConstantFPSDNode *K1 = getSplatConstantFP(Op: Op1);
  if (!K1)
    return SDValue();

  ConstantFPSDNode *K0 = getSplatConstantFP(Op: Op0.getOperand(i: 1));
  if (!K0)
    return SDValue();

  // Ordered >= (although NaN inputs should have folded away by now).
  if (K0->getValueAPF() > K1->getValueAPF())
    return SDValue();

  // med3 with a nan input acts like
  // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
  //
  // So the result depends on whether the IEEE mode bit is enabled or not with a
  // signaling nan input.
  // ieee=1
  // s0 snan: yields s2
  // s1 snan: yields s2
  // s2 snan: qnan

  // s0 qnan: min(s1, s2)
  // s1 qnan: min(s0, s2)
  // s2 qnan: min(s0, s1)

  // ieee=0
  // s0 snan: min(s1, s2)
  // s1 snan: min(s0, s2)
  // s2 snan: qnan

  // s0 qnan: min(s1, s2)
  // s1 qnan: min(s0, s2)
  // s2 qnan: min(s0, s1)
  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
  // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
  // can only form if op0 is fmaxnum_ieee if IEEE=1.
  EVT VT = Op0.getValueType();
  if (Info->getMode().DX10Clamp) {
    // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
    // hardware fmed3 behavior converting to a min.
    // FIXME: Should this be allowing -0.0?
    if (K1->isExactlyValue(V: 1.0) && K0->isExactlyValue(V: 0.0))
      return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Op0.getOperand(i: 0));
  }

  // med3 for f16 is only available on gfx9+, and not available for v2f16.
  if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
    // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
    // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
    // then give the other result, which is different from med3 with a NaN
    // input.
    SDValue Var = Op0.getOperand(i: 0);
    if (!DAG.isKnownNeverSNaN(Op: Var))
      return SDValue();

    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

    // Only form fmed3 when each bound is either an inline immediate or has
    // other users anyway, so no extra literal constant is materialized.
    if ((!K0->hasOneUse() || TII->isInlineConstant(Imm: K0->getValueAPF())) &&
        (!K1->hasOneUse() || TII->isInlineConstant(Imm: K1->getValueAPF()))) {
      return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT: K0->getValueType(ResNo: 0), N1: Var,
                         N2: SDValue(K0, 0), N3: SDValue(K1, 0));
    }
  }

  return SDValue();
}
15463
15464/// \return true if the subtarget supports minimum3 and maximum3 with the given
15465/// base min/max opcode \p Opc for type \p VT.
15466static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
15467 EVT VT) {
15468 switch (Opc) {
15469 case ISD::FMINNUM:
15470 case ISD::FMAXNUM:
15471 case ISD::FMINNUM_IEEE:
15472 case ISD::FMAXNUM_IEEE:
15473 case ISD::FMINIMUMNUM:
15474 case ISD::FMAXIMUMNUM:
15475 case AMDGPUISD::FMIN_LEGACY:
15476 case AMDGPUISD::FMAX_LEGACY:
15477 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
15478 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15479 case ISD::FMINIMUM:
15480 case ISD::FMAXIMUM:
15481 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
15482 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
15483 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15484 case ISD::SMAX:
15485 case ISD::SMIN:
15486 case ISD::UMAX:
15487 case ISD::UMIN:
15488 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
15489 default:
15490 return false;
15491 }
15492
15493 llvm_unreachable("not a min/max opcode");
15494}
15495
// Combine min/max nodes: form min3/max3 where the subtarget supports them,
// fold clamp patterns into med3, and relax fminimum/fmaximum to the cheaper
// IEEE minnum/maxnum forms when NaNs are known absent.
SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(ResNo: 0);
  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(Num: 0);
  SDValue Op1 = N->getOperand(Num: 1);

  // Only do this if the inner op has one use since this will just increases
  // register pressure for no benefit.

  if (supportsMin3Max3(Subtarget: *Subtarget, Opc, VT)) {
    // max(max(a, b), c) -> max3(a, b, c)
    // min(min(a, b), c) -> min3(a, b, c)
    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc), DL, VT: N->getValueType(ResNo: 0),
                         N1: Op0.getOperand(i: 0), N2: Op0.getOperand(i: 1), N3: Op1);
    }

    // Try commuted.
    // max(a, max(b, c)) -> max3(a, b, c)
    // min(a, min(b, c)) -> min3(a, b, c)
    if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc), DL, VT: N->getValueType(ResNo: 0),
                         N1: Op0, N2: Op1.getOperand(i: 0), N3: Op1.getOperand(i: 1));
    }
  }

  // Integer clamp patterns, signed and unsigned:
  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
  // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
  if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op1, MaxVal: Op0->getOperand(Num: 1), Signed: true))
      return Med3;
  }
  if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op0->getOperand(Num: 1), MaxVal: Op1, Signed: true))
      return Med3;
  }

  if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op1, MaxVal: Op0->getOperand(Num: 1), Signed: false))
      return Med3;
  }
  if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op0->getOperand(Num: 1), MaxVal: Op1, Signed: false))
      return Med3;
  }

  // FP clamp patterns (the snan restriction is enforced inside
  // performFPMed3ImmCombine):
  // if !is_snan(x):
  //   fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
  //   fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
  //   fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
  //   fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
  if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
       (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
       (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
       (Opc == AMDGPUISD::FMIN_LEGACY &&
        Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
      (VT == MVT::f32 || VT == MVT::f64 ||
       (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
       (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
       (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
       (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
      Op0.hasOneUse()) {
    if (SDValue Res = performFPMed3ImmCombine(DAG, SL: SDLoc(N), Op0, Op1))
      return Res;
  }

  // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
  // for some types, but at a higher cost since it's implemented with a 3
  // operand form.
  const SDNodeFlags Flags = N->getFlags();
  if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) && Flags.hasNoNaNs() &&
      !Subtarget->hasIEEEMinimumMaximumInsts() &&
      isOperationLegal(Op: ISD::FMINNUM_IEEE, VT: VT.getScalarType())) {
    unsigned NewOpc =
        Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
    return DAG.getNode(Opcode: NewOpc, DL: SDLoc(N), VT, N1: Op0, N2: Op1, Flags);
  }

  return SDValue();
}
15585
15586static bool isClampZeroToOne(SDValue A, SDValue B) {
15587 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(Val&: A)) {
15588 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(Val&: B)) {
15589 // FIXME: Should this be allowing -0.0?
15590 return (CA->isExactlyValue(V: 0.0) && CB->isExactlyValue(V: 1.0)) ||
15591 (CA->isExactlyValue(V: 1.0) && CB->isExactlyValue(V: 0.0));
15592 }
15593 }
15594
15595 return false;
15596}
15597
// FIXME: Should only worry about snans for version with chain.
// Fold an fmed3 whose operands include the constants {0.0, 1.0} into a CLAMP
// node, reordering operands where dx10_clamp makes that legal.
SDValue SITargetLowering::performFMed3Combine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(ResNo: 0);
  // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
  // NaNs. With a NaN input, the order of the operands may change the result.

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue Src0 = N->getOperand(Num: 0);
  SDValue Src1 = N->getOperand(Num: 1);
  SDValue Src2 = N->getOperand(Num: 2);

  if (isClampZeroToOne(A: Src0, B: Src1)) {
    // const_a, const_b, x -> clamp is safe in all cases including signaling
    // nans.
    // FIXME: Should this be allowing -0.0?
    return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src2);
  }

  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
  // handling no dx10-clamp?
  if (Info->getMode().DX10Clamp) {
    // If NaNs is clamped to 0, we are free to reorder the inputs.

    // Bubble constant operands toward Src1/Src2 (the third swap restores the
    // invariant after the second may have moved a constant back into Src0),
    // so the (Src1, Src2) clamp pattern below can match regardless of the
    // original operand order.
    if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1))
      std::swap(a&: Src0, b&: Src1);

    if (isa<ConstantFPSDNode>(Val: Src1) && !isa<ConstantFPSDNode>(Val: Src2))
      std::swap(a&: Src1, b&: Src2);

    if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1))
      std::swap(a&: Src0, b&: Src1);

    if (isClampZeroToOne(A: Src1, B: Src2))
      return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src0);
  }

  return SDValue();
}
15642
15643SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15644 DAGCombinerInfo &DCI) const {
15645 SDValue Src0 = N->getOperand(Num: 0);
15646 SDValue Src1 = N->getOperand(Num: 1);
15647 if (Src0.isUndef() && Src1.isUndef())
15648 return DCI.DAG.getUNDEF(VT: N->getValueType(ResNo: 0));
15649 return SDValue();
15650}
15651
15652// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15653// expanded into a set of cmp/select instructions.
15654bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
15655 unsigned NumElem,
15656 bool IsDivergentIdx,
15657 const GCNSubtarget *Subtarget) {
15658 if (UseDivergentRegisterIndexing)
15659 return false;
15660
15661 unsigned VecSize = EltSize * NumElem;
15662
15663 // Sub-dword vectors of size 2 dword or less have better implementation.
15664 if (VecSize <= 64 && EltSize < 32)
15665 return false;
15666
15667 // Always expand the rest of sub-dword instructions, otherwise it will be
15668 // lowered via memory.
15669 if (EltSize < 32)
15670 return true;
15671
15672 // Always do this if var-idx is divergent, otherwise it will become a loop.
15673 if (IsDivergentIdx)
15674 return true;
15675
15676 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15677 unsigned NumInsts = NumElem /* Number of compares */ +
15678 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
15679
15680 // On some architectures (GFX9) movrel is not available and it's better
15681 // to expand.
15682 if (Subtarget->useVGPRIndexMode())
15683 return NumInsts <= 16;
15684
15685 // If movrel is available, use it instead of expanding for vector of 8
15686 // elements.
15687 if (Subtarget->hasMovrel())
15688 return NumInsts <= 15;
15689
15690 return true;
15691}
15692
15693bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
15694 SDValue Idx = N->getOperand(Num: N->getNumOperands() - 1);
15695 if (isa<ConstantSDNode>(Val: Idx))
15696 return false;
15697
15698 SDValue Vec = N->getOperand(Num: 0);
15699 EVT VecVT = Vec.getValueType();
15700 EVT EltVT = VecVT.getVectorElementType();
15701 unsigned EltSize = EltVT.getSizeInBits();
15702 unsigned NumElem = VecVT.getVectorNumElements();
15703
15704 return SITargetLowering::shouldExpandVectorDynExt(
15705 EltSize, NumElem, IsDivergentIdx: Idx->isDivergent(), Subtarget: getSubtarget());
15706}
15707
// Combine EXTRACT_VECTOR_ELT: push the extract through fneg/fabs and through
// element-wise binops, expand variable-index extracts into compare/select
// chains, and narrow sub-dword extracts of loaded vectors into a 32-bit
// extract + shift + truncate.
SDValue
SITargetLowering::performExtractVectorEltCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(Num: 0);
  SelectionDAG &DAG = DCI.DAG;

  EVT VecVT = Vec.getValueType();
  EVT VecEltVT = VecVT.getVectorElementType();
  EVT ResVT = N->getValueType(ResNo: 0);

  unsigned VecSize = VecVT.getSizeInBits();
  unsigned VecEltSize = VecEltVT.getSizeInBits();

  // extract_vector_elt (fneg/fabs v), i -> fneg/fabs (extract_vector_elt v, i)
  // Only profitable when every user can absorb the source modifier.
  if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
      allUsesHaveSourceMods(N)) {
    SDLoc SL(N);
    SDValue Idx = N->getOperand(Num: 1);
    SDValue Elt =
        DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec.getOperand(i: 0), N2: Idx);
    return DAG.getNode(Opcode: Vec.getOpcode(), DL: SL, VT: ResVT, Operand: Elt);
  }

  // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
  // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
  // There are optimisations to transform 64-bit shifts into 32-bit shifts
  // depending on the shift operand. See e.g. performSraCombine().
  // This combine ensures that the optimisation is compatible with v2i32
  // legalised AND.
  if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
      Vec->getOperand(Num: 1)->getOpcode() == ISD::BUILD_VECTOR) {

    // NOTE(review): when the mask is not the 0x1f splat this returns
    // SDValue() immediately, which also skips every later combine in this
    // function for such nodes — confirm the early-out is intentional.
    const ConstantSDNode *C = isConstOrConstSplat(N: Vec.getOperand(i: 1));
    if (!C || C->getZExtValue() != 0x1f)
      return SDValue();

    SDLoc SL(N);
    SDValue AndMask = DAG.getConstant(Val: 0x1f, DL: SL, VT: MVT::i32);
    SDValue EVE = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32,
                              N1: Vec->getOperand(Num: 0), N2: N->getOperand(Num: 1));
    SDValue A = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: EVE, N2: AndMask);
    // NOTE(review): falls through after RAUW instead of returning the new
    // node; presumably the combiner revisits the replaced uses — verify.
    DAG.ReplaceAllUsesWith(From: N, To: A.getNode());
  }

  // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
  // =>
  // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
  // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
  // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
  if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
    SDLoc SL(N);
    SDValue Idx = N->getOperand(Num: 1);
    unsigned Opc = Vec.getOpcode();

    switch (Opc) {
    default:
      break;
    // TODO: Support other binary operations.
    case ISD::FADD:
    case ISD::FSUB:
    case ISD::FMUL:
    case ISD::ADD:
    case ISD::UMIN:
    case ISD::UMAX:
    case ISD::SMIN:
    case ISD::SMAX:
    case ISD::FMAXNUM:
    case ISD::FMINNUM:
    case ISD::FMAXNUM_IEEE:
    case ISD::FMINNUM_IEEE:
    case ISD::FMAXIMUM:
    case ISD::FMINIMUM: {
      SDValue Elt0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT,
                                 N1: Vec.getOperand(i: 0), N2: Idx);
      SDValue Elt1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT,
                                 N1: Vec.getOperand(i: 1), N2: Idx);

      DCI.AddToWorklist(N: Elt0.getNode());
      DCI.AddToWorklist(N: Elt1.getNode());
      // Preserve the original node's flags (e.g. fast-math) on the scalar op.
      return DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT, N1: Elt0, N2: Elt1, Flags: Vec->getFlags());
    }
    }
  }

  // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
  if (shouldExpandVectorDynExt(N)) {
    SDLoc SL(N);
    SDValue Idx = N->getOperand(Num: 1);
    SDValue V;
    // Build a select chain: V = (Idx == I) ? elt(I) : V, seeded with elt(0).
    for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
      SDValue IC = DAG.getVectorIdxConstant(Val: I, DL: SL);
      SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec, N2: IC);
      if (I == 0)
        V = Elt;
      else
        V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Elt, False: V, Cond: ISD::SETEQ);
    }
    return V;
  }

  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
  // elements. This exposes more load reduction opportunities by replacing
  // multiple small extract_vector_elements with a single 32-bit extract.
  auto *Idx = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
  if (isa<MemSDNode>(Val: Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
      VecSize > 32 && VecSize % 32 == 0 && Idx) {
    // Reinterpret the vector as a vector of 32-bit elements.
    EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: VecVT);

    // Locate the requested element inside the 32-bit container word.
    unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
    unsigned EltIdx = BitIndex / 32;
    unsigned LeftoverBitIdx = BitIndex % 32;
    SDLoc SL(N);

    SDValue Cast = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: Vec);
    DCI.AddToWorklist(N: Cast.getNode());

    SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Cast,
                              N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
    DCI.AddToWorklist(N: Elt.getNode());
    SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: Elt,
                              N2: DAG.getConstant(Val: LeftoverBitIdx, DL: SL, VT: MVT::i32));
    DCI.AddToWorklist(N: Srl.getNode());

    EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
    SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: VecEltAsIntVT, Operand: Srl);
    DCI.AddToWorklist(N: Trunc.getNode());

    if (VecEltVT == ResVT) {
      return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecEltVT, Operand: Trunc);
    }

    assert(ResVT.isScalarInteger());
    return DAG.getAnyExtOrTrunc(Op: Trunc, DL: SL, VT: ResVT);
  }

  return SDValue();
}
15847
15848SDValue
15849SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15850 DAGCombinerInfo &DCI) const {
15851 SDValue Vec = N->getOperand(Num: 0);
15852 SDValue Idx = N->getOperand(Num: 2);
15853 EVT VecVT = Vec.getValueType();
15854 EVT EltVT = VecVT.getVectorElementType();
15855
15856 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15857 // => BUILD_VECTOR n x select (e, const-idx)
15858 if (!shouldExpandVectorDynExt(N))
15859 return SDValue();
15860
15861 SelectionDAG &DAG = DCI.DAG;
15862 SDLoc SL(N);
15863 SDValue Ins = N->getOperand(Num: 1);
15864 EVT IdxVT = Idx.getValueType();
15865
15866 SmallVector<SDValue, 16> Ops;
15867 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15868 SDValue IC = DAG.getConstant(Val: I, DL: SL, VT: IdxVT);
15869 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec, N2: IC);
15870 SDValue V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Ins, False: Elt, Cond: ISD::SETEQ);
15871 Ops.push_back(Elt: V);
15872 }
15873
15874 return DAG.getBuildVector(VT: VecVT, DL: SL, Ops);
15875}
15876
15877/// Return the source of an fp_extend from f16 to f32, or a converted FP
15878/// constant.
15879static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
15880 if (Src.getOpcode() == ISD::FP_EXTEND &&
15881 Src.getOperand(i: 0).getValueType() == MVT::f16) {
15882 return Src.getOperand(i: 0);
15883 }
15884
15885 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Src)) {
15886 APFloat Val = CFP->getValueAPF();
15887 bool LosesInfo = true;
15888 Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
15889 if (!LosesInfo)
15890 return DAG.getConstantFP(Val, DL: SDLoc(Src), VT: MVT::f16);
15891 }
15892
15893 return SDValue();
15894}
15895
15896SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15897 DAGCombinerInfo &DCI) const {
15898 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15899 "combine only useful on gfx8");
15900
15901 SDValue TruncSrc = N->getOperand(Num: 0);
15902 EVT VT = N->getValueType(ResNo: 0);
15903 if (VT != MVT::f16)
15904 return SDValue();
15905
15906 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15907 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15908 return SDValue();
15909
15910 SelectionDAG &DAG = DCI.DAG;
15911 SDLoc SL(N);
15912
15913 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15914 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15915 // casting back.
15916
15917 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15918 // fmin(fmax(a, b), fmax(fmin(a, b), c))
15919 SDValue A = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 0));
15920 if (!A)
15921 return SDValue();
15922
15923 SDValue B = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 1));
15924 if (!B)
15925 return SDValue();
15926
15927 SDValue C = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 2));
15928 if (!C)
15929 return SDValue();
15930
15931 // This changes signaling nan behavior. If an input is a signaling nan, it
15932 // would have been quieted by the fpext originally. We don't care because
15933 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15934 // we would be worse off than just doing the promotion.
15935 SDValue A1 = DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: A, N2: B);
15936 SDValue B1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A, N2: B);
15937 SDValue C1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A1, N2: C);
15938 return DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: B1, N2: C1);
15939}
15940
15941unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15942 const SDNode *N0,
15943 const SDNode *N1) const {
15944 EVT VT = N0->getValueType(ResNo: 0);
15945
15946 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15947 // support denormals ever.
15948 if (((VT == MVT::f32 &&
15949 denormalModeIsFlushAllF32(MF: DAG.getMachineFunction())) ||
15950 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15951 denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction()))) &&
15952 isOperationLegal(Op: ISD::FMAD, VT))
15953 return ISD::FMAD;
15954
15955 const TargetOptions &Options = DAG.getTarget().Options;
15956 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15957 (N0->getFlags().hasAllowContract() &&
15958 N1->getFlags().hasAllowContract())) &&
15959 isFMAFasterThanFMulAndFAdd(MF: DAG.getMachineFunction(), VT)) {
15960 return ISD::FMA;
15961 }
15962
15963 return 0;
15964}
15965
15966// For a reassociatable opcode perform:
15967// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
15968SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15969 SelectionDAG &DAG) const {
15970 EVT VT = N->getValueType(ResNo: 0);
15971 if (VT != MVT::i32 && VT != MVT::i64)
15972 return SDValue();
15973
15974 if (DAG.isBaseWithConstantOffset(Op: SDValue(N, 0)))
15975 return SDValue();
15976
15977 unsigned Opc = N->getOpcode();
15978 SDValue Op0 = N->getOperand(Num: 0);
15979 SDValue Op1 = N->getOperand(Num: 1);
15980
15981 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15982 return SDValue();
15983
15984 if (Op0->isDivergent())
15985 std::swap(a&: Op0, b&: Op1);
15986
15987 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15988 return SDValue();
15989
15990 SDValue Op2 = Op1.getOperand(i: 1);
15991 Op1 = Op1.getOperand(i: 0);
15992 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15993 return SDValue();
15994
15995 if (Op1->isDivergent())
15996 std::swap(a&: Op1, b&: Op2);
15997
15998 SDLoc SL(N);
15999 SDValue Add1 = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Op0, N2: Op1);
16000 return DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Add1, N2: Op2);
16001}
16002
16003static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
16004 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
16005 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
16006 SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT::i1);
16007 SDValue Mad = DAG.getNode(Opcode: MadOpc, DL: SL, VTList: VTs, N1: N0, N2: N1, N3: N2);
16008 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Mad);
16009}
16010
16011// Fold
16012// y = lshr i64 x, 32
16013// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
16014// with Const.hi == -1
16015// To
16016// res = mad_u64_u32 y.lo ,Const.lo, x.lo
16017static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
16018 SDValue MulLHS, SDValue MulRHS,
16019 SDValue AddRHS) {
16020 if (MulRHS.getOpcode() == ISD::SRL)
16021 std::swap(a&: MulLHS, b&: MulRHS);
16022
16023 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
16024 return SDValue();
16025
16026 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(Val: MulLHS.getOperand(i: 1));
16027 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
16028 MulLHS.getOperand(i: 0) != AddRHS)
16029 return SDValue();
16030
16031 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Val: MulRHS.getNode());
16032 if (!Const || Hi_32(Value: Const->getZExtValue()) != uint32_t(-1))
16033 return SDValue();
16034
16035 SDValue ConstMul =
16036 DAG.getConstant(Val: Lo_32(Value: Const->getZExtValue()), DL: SL, VT: MVT::i32);
16037 return getMad64_32(DAG, SL, VT: MVT::i64,
16038 N0: DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulLHS), N1: ConstMul,
16039 N2: DAG.getZeroExtendInReg(Op: AddRHS, DL: SL, VT: MVT::i32), Signed: false);
16040}
16041
// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
// multiplies, if any.
//
// Full 64-bit multiplies that feed into an addition are lowered here instead
// of using the generic expansion. The generic expansion ends up with
// a tree of ADD nodes that prevents us from using the "add" part of the
// MAD instruction. The expansion produced here results in a chain of ADDs
// instead of a tree.
//
// Returns the replacement value, or an empty SDValue if the fold does not
// apply (wrong type/width, uniform value with S_MUL_HI available, or the
// multiply has too many non-add users to be profitably absorbed).
SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  assert(N->isAnyAdd());

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  // Only scalar additions are handled here.
  if (VT.isVector())
    return SDValue();

  // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
  // result in scalar registers for uniform values.
  if (!N->isDivergent() && Subtarget->hasSMulHi())
    return SDValue();

  // The fold targets results wider than 32 bits but no wider than the 64-bit
  // MAD result.
  unsigned NumBits = VT.getScalarSizeInBits();
  if (NumBits <= 32 || NumBits > 64)
    return SDValue();

  // Canonicalize so the multiply is on the LHS; the caller guarantees one of
  // the two addends is a MUL.
  if (LHS.getOpcode() != ISD::MUL) {
    assert(RHS.getOpcode() == ISD::MUL);
    std::swap(a&: LHS, b&: RHS);
  }

  // Avoid the fold if it would unduly increase the number of multiplies due to
  // multiple uses, except on hardware with full-rate multiply-add (which is
  // part of full-rate 64-bit ops).
  if (!Subtarget->hasFullRate64Ops()) {
    unsigned NumUsers = 0;
    for (SDNode *User : LHS->users()) {
      // There is a use that does not feed into addition, so the multiply can't
      // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
      if (!User->isAnyAdd())
        return SDValue();

      // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
      // MUL + 3xADD + 3xADDC over 3xMAD.
      ++NumUsers;
      if (NumUsers >= 3)
        return SDValue();
    }
  }

  SDValue MulLHS = LHS.getOperand(i: 0);
  SDValue MulRHS = LHS.getOperand(i: 1);
  SDValue AddRHS = RHS;

  // Try the dedicated (mul (srl x, 32), const) pattern first.
  if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
    return FoldedMAD;

  // Always check whether operands are small unsigned values, since that
  // knowledge is useful in more cases. Check for small signed values only if
  // doing so can unlock a shorter code sequence.
  bool MulLHSUnsigned32 = numBitsUnsigned(Op: MulLHS, DAG) <= 32;
  bool MulRHSUnsigned32 = numBitsUnsigned(Op: MulRHS, DAG) <= 32;

  bool MulSignedLo = false;
  if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
    MulSignedLo =
        numBitsSigned(Op: MulLHS, DAG) <= 32 && numBitsSigned(Op: MulRHS, DAG) <= 32;
  }

  // The operands and final result all have the same number of bits. If
  // operands need to be extended, they can be extended with garbage. The
  // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
  // truncated away in the end.
  if (VT != MVT::i64) {
    MulLHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: MulLHS);
    MulRHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: MulRHS);
    AddRHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: AddRHS);
  }

  // The basic code generated is conceptually straightforward. Pseudo code:
  //
  // accum = mad_64_32 lhs.lo, rhs.lo, accum
  // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
  // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
  //
  // The second and third lines are optional, depending on whether the factors
  // are {sign,zero}-extended or not.
  //
  // The actual DAG is noisier than the pseudo code, but only due to
  // instructions that disassemble values into low and high parts, and
  // assemble the final result.
  SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);

  auto MulLHSLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulLHS);
  auto MulRHSLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulRHS);
  SDValue Accum =
      getMad64_32(DAG, SL, VT: MVT::i64, N0: MulLHSLo, N1: MulRHSLo, N2: AddRHS, Signed: MulSignedLo);

  if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
    auto [AccumLo, AccumHi] = DAG.SplitScalar(N: Accum, DL: SL, LoVT: MVT::i32, HiVT: MVT::i32);

    // lhs.hi may be non-zero: add its cross-product contribution to the high
    // half of the accumulator.
    if (!MulLHSUnsigned32) {
      auto MulLHSHi =
          DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: SL, VT: MVT::i32, N1: MulLHS, N2: One);
      SDValue MulHi = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT: MVT::i32, N1: MulLHSHi, N2: MulRHSLo);
      AccumHi = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: MulHi, N2: AccumHi);
    }

    // Likewise for rhs.hi.
    if (!MulRHSUnsigned32) {
      auto MulRHSHi =
          DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: SL, VT: MVT::i32, N1: MulRHS, N2: One);
      SDValue MulHi = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT: MVT::i32, N1: MulLHSLo, N2: MulRHSHi);
      AccumHi = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: MulHi, N2: AccumHi);
    }

    // Reassemble the 64-bit accumulator from its two 32-bit halves.
    Accum = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {AccumLo, AccumHi});
    Accum = DAG.getBitcast(VT: MVT::i64, V: Accum);
  }

  // Drop the garbage high bits if the original type was narrower than i64.
  if (VT != MVT::i64)
    Accum = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Accum);
  return Accum;
}
16169
// If the constant RHS of a 64-bit add/sub (or ptradd) has all-zero low 32
// bits, the operation can only affect the high half of the result. Rewrite
// it as a 32-bit op on the high half paired with the untouched low half,
// avoiding the 64-bit carry machinery. Returns an empty SDValue when RHS is
// not a constant with >= 32 trailing zeros.
SDValue
SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SDValue RHS = N->getOperand(Num: 1);
  auto *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
  if (!CRHS)
    return SDValue();

  // TODO: Worth using computeKnownBits? Maybe expensive since it's so
  // common.
  uint64_t Val = CRHS->getZExtValue();
  if (countr_zero(Val) >= 32) {
    SelectionDAG &DAG = DCI.DAG;
    SDLoc SL(N);
    SDValue LHS = N->getOperand(Num: 0);

    // Avoid carry machinery if we know the low half of the add does not
    // contribute to the final result.
    //
    // add i64:x, K if computeTrailingZeros(K) >= 32
    // => build_pair (add x.hi, K.hi), x.lo

    // Breaking the 64-bit add here with this strange constant is unlikely
    // to interfere with addressing mode patterns.

    SDValue Hi = getHiHalf64(Op: LHS, DAG);
    SDValue ConstHi32 = DAG.getConstant(Val: Hi_32(Value: Val), DL: SL, VT: MVT::i32);
    unsigned Opcode = N->getOpcode();
    // For PTRADD the high-half operation is plain integer arithmetic.
    if (Opcode == ISD::PTRADD)
      Opcode = ISD::ADD;
    SDValue AddHi =
        DAG.getNode(Opcode, DL: SL, VT: MVT::i32, N1: Hi, N2: ConstHi32, Flags: N->getFlags());

    // The low half passes through unchanged.
    SDValue Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: LHS);
    return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SL, VT: MVT::i64, N1: Lo, N2: AddHi);
  }

  return SDValue();
}
16209
// Collect the ultimate src of a mul node's operand, and confirm the operand
// is effectively a single byte (8 bits): byte 0 must have a real (non-zero)
// provider and byte 1 must be absent or known zero. Returns the ByteProvider
// for byte 0, or std::nullopt if the operand does not qualify.
static std::optional<ByteProvider<SDValue>>
handleMulOperand(const SDValue &MulOperand) {
  // Byte 0 must resolve and must not be a known constant zero — a zero
  // factor contributes nothing and defeats the dot-product match.
  auto Byte0 = calculateByteProvider(Op: MulOperand, Index: 0, Depth: 0);
  if (!Byte0 || Byte0->isConstantZero()) {
    return std::nullopt;
  }
  // Reject the operand only if byte 1 resolves to something that is not a
  // constant zero (i.e. the value is provably wider than one byte).
  auto Byte1 = calculateByteProvider(Op: MulOperand, Index: 1, Depth: 0);
  if (Byte1 && !Byte1->isConstantZero()) {
    return std::nullopt;
  }
  return Byte0;
}
16224
// Merge two v_perm byte-select masks into one. In each byte lane a mask
// either carries a real byte selector or the 0x0c "pick zero" code; the
// combined mask keeps the real selector from whichever input has one, and
// remains 0x0c only where both inputs request a zero byte.
static unsigned addPermMasks(unsigned First, unsigned Second) {
  // Bits of the 0x0c zero-code present in each lane of each input.
  const unsigned FirstCs = First & 0x0c0c0c0c;
  const unsigned SecondCs = Second & 0x0c0c0c0c;

  // Every byte lane must carry the zero code in at least one input, so the
  // two masks never contribute conflicting selectors to the same lane.
  assert((FirstCs | SecondCs) & 0xFF);
  assert((FirstCs | SecondCs) & 0xFF00);
  assert((FirstCs | SecondCs) & 0xFF0000);
  assert((FirstCs | SecondCs) & 0xFF000000);

  // Take the non-zero-code bits from either side, and keep the zero code
  // only where both sides agree on it.
  return ((First | Second) & ~0x0c0c0c0c) | (FirstCs & SecondCs);
}
16238
// One dword-sized contribution to a dot-product operand: the ultimate value
// the bytes come from, the v_perm byte-select mask that positions them (a
// lane value of 0x0c selects a zero byte), and which 32-bit word of SrcOp
// the bytes live in.
struct DotSrc {
  SDValue SrcOp;       // Ultimate source value supplying the bytes.
  int64_t PermMask;    // v_perm byte-select mask for this source.
  int64_t DWordOffset; // Index of the 32-bit word within SrcOp.
};
16244
// Record the byte providers of one (mul a, b) lane of a dot4 chain into the
// accumulated operand lists Src0s/Src1s. When the same source dword already
// appears in a list, the new lane's perm mask is merged into the existing
// entry; otherwise a fresh DotSrc is appended. Step is the lane index (0-3)
// and selects which byte of the perm mask this lane occupies.
static void placeSources(ByteProvider<SDValue> &Src0,
                         ByteProvider<SDValue> &Src1,
                         SmallVectorImpl<DotSrc> &Src0s,
                         SmallVectorImpl<DotSrc> &Src1s, int Step) {

  assert(Src0.Src.has_value() && Src1.Src.has_value());
  // Src0s and Src1s are empty, just place arbitrarily.
  if (Step == 0) {
    Src0s.push_back(Elt: {.SrcOp: *Src0.Src, .PermMask: ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
                    .DWordOffset: Src0.SrcOffset / 4});
    Src1s.push_back(Elt: {.SrcOp: *Src1.Src, .PermMask: ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
                    .DWordOffset: Src1.SrcOffset / 4});
    return;
  }

  // Try both orderings of the provider pair, since either provider may match
  // an entry in either accumulated list.
  for (int BPI = 0; BPI < 2; BPI++) {
    std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
    if (BPI == 1) {
      BPP = {Src1, Src0};
    }
    unsigned ZeroMask = 0x0c0c0c0c;
    unsigned FMask = 0xFF << (8 * (3 - Step));

    // Mask placing this provider's byte selector into lane Step while the
    // other lanes keep the 0x0c zero code.
    unsigned FirstMask =
        (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    unsigned SecondMask =
        (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    // Attempt to find Src vector which contains our SDValue, if so, add our
    // perm mask to the existing one. If we are unable to find a match for the
    // first SDValue, attempt to find match for the second.
    int FirstGroup = -1;
    for (int I = 0; I < 2; I++) {
      SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
      auto MatchesFirst = [&BPP](DotSrc &IterElt) {
        return IterElt.SrcOp == *BPP.first.Src &&
               (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
      };

      auto *Match = llvm::find_if(Range&: Srcs, P: MatchesFirst);
      if (Match != Srcs.end()) {
        Match->PermMask = addPermMasks(First: FirstMask, Second: Match->PermMask);
        FirstGroup = I;
        break;
      }
    }
    if (FirstGroup != -1) {
      // The first provider matched one list, so the second goes into the
      // other list: merged if its source dword is already there, appended
      // otherwise.
      SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
      auto MatchesSecond = [&BPP](DotSrc &IterElt) {
        return IterElt.SrcOp == *BPP.second.Src &&
               (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
      };
      auto *Match = llvm::find_if(Range&: Srcs, P: MatchesSecond);
      if (Match != Srcs.end()) {
        Match->PermMask = addPermMasks(First: SecondMask, Second: Match->PermMask);
      } else
        Srcs.push_back(Elt: {.SrcOp: *BPP.second.Src, .PermMask: SecondMask, .DWordOffset: BPP.second.SrcOffset / 4});
      return;
    }
  }

  // If we have made it here, then we could not find a match in Src0s or Src1s
  // for either Src0 or Src1, so just place them arbitrarily.

  unsigned ZeroMask = 0x0c0c0c0c;
  unsigned FMask = 0xFF << (8 * (3 - Step));

  Src0s.push_back(
      Elt: {.SrcOp: *Src0.Src,
       .PermMask: ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
       .DWordOffset: Src0.SrcOffset / 4});
  Src1s.push_back(
      Elt: {.SrcOp: *Src1.Src,
       .PermMask: ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
       .DWordOffset: Src1.SrcOffset / 4});
}
16320
// Materialize the accumulated DotSrc entries as a single i32 dot-product
// operand. A lone entry becomes one v_perm (or is returned unchanged when
// its mask is the identity 0x03020100); multiple entries are permuted
// pairwise and the partial results OR'd together. Note the IsSigned/IsAny
// parameters are not consulted in this function body.
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
                              SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
                              bool IsAny) {

  // If we just have one source, just permute it accordingly.
  if (Srcs.size() == 1) {
    auto *Elt = Srcs.begin();
    auto EltOp = getDWordFromOffset(DAG, SL, Src: Elt->SrcOp, DWordOffset: Elt->DWordOffset);

    // v_perm will produce the original value
    if (Elt->PermMask == 0x3020100)
      return EltOp;

    return DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: EltOp, N2: EltOp,
                       N3: DAG.getConstant(Val: Elt->PermMask, DL: SL, VT: MVT::i32));
  }

  auto *FirstElt = Srcs.begin();
  auto *SecondElt = std::next(x: FirstElt);

  SmallVector<SDValue, 2> Perms;

  // If we have multiple sources in the chain, combine them via perms (using
  // calculated perm mask) and Ors.
  while (true) {
    auto FirstMask = FirstElt->PermMask;
    auto SecondMask = SecondElt->PermMask;

    // Rebias the first mask's byte selectors by 4 so they refer to the
    // second v_perm source operand, leaving the 0x0c zero codes intact.
    unsigned FirstCs = FirstMask & 0x0c0c0c0c;
    unsigned FirstPlusFour = FirstMask | 0x04040404;
    // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
    // original 0x0C.
    FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;

    auto PermMask = addPermMasks(First: FirstMask, Second: SecondMask);
    auto FirstVal =
        getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
    auto SecondVal =
        getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp, DWordOffset: SecondElt->DWordOffset);

    Perms.push_back(Elt: DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: FirstVal,
                                 N2: SecondVal,
                                 N3: DAG.getConstant(Val: PermMask, DL: SL, VT: MVT::i32)));

    // Advance to the next pair of entries.
    FirstElt = std::next(x: SecondElt);
    if (FirstElt == Srcs.end())
      break;

    SecondElt = std::next(x: FirstElt);
    // If we only have a FirstElt, then just combine that into the cumulative
    // source node.
    if (SecondElt == Srcs.end()) {
      auto EltOp =
          getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);

      Perms.push_back(
          Elt: DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: EltOp, N2: EltOp,
                       N3: DAG.getConstant(Val: FirstElt->PermMask, DL: SL, VT: MVT::i32)));
      break;
    }
  }

  assert(Perms.size() == 1 || Perms.size() == 2);
  return Perms.size() == 2
             ? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Perms[0], N2: Perms[1])
             : Perms[0];
}
16388
16389static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
16390 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
16391 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
16392 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
16393 EntryMask += ZeroMask;
16394 }
16395}
16396
16397static bool isMul(const SDValue Op) {
16398 auto Opcode = Op.getOpcode();
16399
16400 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
16401 Opcode == AMDGPUISD::MUL_I24);
16402}
16403
// Decide whether a dot4 combining S0Op and S1Op must use the signed or the
// unsigned intrinsic, based on what computeKnownBits proves about each
// operand's sign bit. Returns true for signed, false for unsigned, and
// std::nullopt when the known sign bits conflict (no consistent choice).
static std::optional<bool>
checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
                       ByteProvider<SDValue> &Src1, const SDValue &S0Op,
                       const SDValue &S1Op, const SelectionDAG &DAG) {
  // If both ops are i8s (pre legalize-dag), then the signedness semantics
  // of the dot4 is irrelevant.
  if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
    return false;

  // "Unsigned" here means the MSB is known zero; "Signed" means it is known
  // one.
  auto Known0 = DAG.computeKnownBits(Op: S0Op, Depth: 0);
  bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
  bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
  auto Known1 = DAG.computeKnownBits(Op: S1Op, Depth: 0);
  bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
  bool S1IsSigned = Known1.countMinLeadingOnes() > 0;

  // An MSB cannot be simultaneously known-zero and known-one.
  assert(!(S0IsUnsigned && S0IsSigned));
  assert(!(S1IsUnsigned && S1IsSigned));

  // There are 9 possible permutations of
  // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}

  // In two permutations, the sign bits are known to be the same for both Ops,
  // so simply return Signed / Unsigned corresponding to the MSB

  if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
    return S0IsSigned;

  // In another two permutations, the sign bits are known to be opposite. In
  // this case return std::nullopt to indicate a bad match.

  if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
    return std::nullopt;

  // In the remaining five permutations, we don't know the value of the sign
  // bit for at least one Op. Since we have a valid ByteProvider, we know that
  // the upper bits must be extension bits. Thus, the only ways for the sign
  // bit to be unknown is if it was sign extended from unknown value, or if it
  // was any extended. In either case, it is correct to use the signed
  // version of the signedness semantics of dot4

  // In two of such permutations, we know the sign bit is set for
  // one op, and the other is unknown. It is okay to use the signed version
  // of dot4.
  if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
    return true;

  // In one such permutation, we don't know either of the sign bits. It is
  // okay to use the signed version of dot4.
  if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
    return true;

  // In two of such permutations, we know the sign bit is unset for
  // one op, and the other is unknown. Return std::nullopt to indicate a
  // bad match.
  if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
    return std::nullopt;

  llvm_unreachable("Fully covered condition");
}
16466
// DAG combine for ADD. Tries, in order: folding a wide multiply-add into
// mad_[iu]64_[iu]32, scalar reassociation, the 64-bit low-zero-constant
// fold, matching a chain of byte multiplies into a v_dot4 intrinsic, and
// finally folds of extended setcc / carry operands into carry operations.
SDValue SITargetLowering::performAddCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
    if (Subtarget->hasMad64_32()) {
      if (SDValue Folded = tryFoldToMad64_32(N, DCI))
        return Folded;
    }
  }

  if (SDValue V = reassociateScalarOps(N, DAG)) {
    return V;
  }

  if (VT == MVT::i64) {
    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
      return Folded;
  }

  if ((isMul(Op: LHS) || isMul(Op: RHS)) && Subtarget->hasDot7Insts() &&
      (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
    SDValue TempNode(N, 0);
    std::optional<bool> IsSigned;
    SmallVector<DotSrc, 4> Src0s;
    SmallVector<DotSrc, 4> Src1s;
    SmallVector<SDValue, 4> Src2s;

    // Match the v_dot4 tree, while collecting src nodes.
    // Walk down the add chain lane by lane (up to 4 lanes); each step must
    // expose a byte multiply whose signedness agrees with the lanes already
    // accepted.
    int ChainLength = 0;
    for (int I = 0; I < 4; I++) {
      auto MulIdx = isMul(Op: LHS) ? 0 : isMul(Op: RHS) ? 1 : -1;
      if (MulIdx == -1)
        break;
      auto Src0 = handleMulOperand(MulOperand: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 0));
      if (!Src0)
        break;
      auto Src1 = handleMulOperand(MulOperand: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 1));
      if (!Src1)
        break;

      auto IterIsSigned = checkDot4MulSignedness(
          N: TempNode->getOperand(Num: MulIdx), Src0&: *Src0, Src1&: *Src1,
          S0Op: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 0),
          S1Op: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 1), DAG);
      if (!IterIsSigned)
        break;
      // The first lane fixes the signedness for the whole chain.
      if (!IsSigned)
        IsSigned = *IterIsSigned;
      if (*IterIsSigned != *IsSigned)
        break;
      placeSources(Src0&: *Src0, Src1&: *Src1, Src0s, Src1s, Step: I);
      auto AddIdx = 1 - MulIdx;
      // Allow the special case where add (add (mul24, 0), mul24) became ->
      // add (mul24, mul24).
      if (I == 2 && isMul(Op: TempNode->getOperand(Num: AddIdx))) {
        Src2s.push_back(Elt: TempNode->getOperand(Num: AddIdx));
        auto Src0 =
            handleMulOperand(MulOperand: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 0));
        if (!Src0)
          break;
        auto Src1 =
            handleMulOperand(MulOperand: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 1));
        if (!Src1)
          break;
        auto IterIsSigned = checkDot4MulSignedness(
            N: TempNode->getOperand(Num: AddIdx), Src0&: *Src0, Src1&: *Src1,
            S0Op: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 0),
            S1Op: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 1), DAG);
        if (!IterIsSigned)
          break;
        assert(IsSigned);
        if (*IterIsSigned != *IsSigned)
          break;
        placeSources(Src0&: *Src0, Src1&: *Src1, Src0s, Src1s, Step: I + 1);
        // The final accumulator for this special case is the zero constant.
        Src2s.push_back(Elt: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
        ChainLength = I + 2;
        break;
      }

      TempNode = TempNode->getOperand(Num: AddIdx);
      Src2s.push_back(Elt: TempNode);
      ChainLength = I + 1;
      if (TempNode->getNumOperands() < 2)
        break;
      LHS = TempNode->getOperand(Num: 0);
      RHS = TempNode->getOperand(Num: 1);
    }

    if (ChainLength < 2)
      return SDValue();

    // Masks were constructed with assumption that we would find a chain of
    // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
    // 0x0c) so they do not affect dot calculation.
    if (ChainLength < 4) {
      fixMasks(Srcs&: Src0s, ChainLength);
      fixMasks(Srcs&: Src1s, ChainLength);
    }

    SDValue Src0, Src1;

    // If we are just using a single source for both, and have permuted the
    // bytes consistently, we can just use the sources without permuting
    // (commutation).
    bool UseOriginalSrc = false;
    if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
        Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
        Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
        Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
      // The shortcut is only sound if every lane of the shared mask selects
      // a distinct byte.
      SmallVector<unsigned, 4> SrcBytes;
      auto Src0Mask = Src0s.begin()->PermMask;
      SrcBytes.push_back(Elt: Src0Mask & 0xFF000000);
      bool UniqueEntries = true;
      for (auto I = 1; I < 4; I++) {
        auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));

        if (is_contained(Range&: SrcBytes, Element: NextByte)) {
          UniqueEntries = false;
          break;
        }
        SrcBytes.push_back(Elt: NextByte);
      }

      if (UniqueEntries) {
        UseOriginalSrc = true;

        auto *FirstElt = Src0s.begin();
        auto FirstEltOp =
            getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);

        auto *SecondElt = Src1s.begin();
        auto SecondEltOp = getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp,
                                              DWordOffset: SecondElt->DWordOffset);

        Src0 = DAG.getBitcastedAnyExtOrTrunc(Op: FirstEltOp, DL: SL,
                                             VT: MVT::getIntegerVT(BitWidth: 32));
        Src1 = DAG.getBitcastedAnyExtOrTrunc(Op: SecondEltOp, DL: SL,
                                             VT: MVT::getIntegerVT(BitWidth: 32));
      }
    }

    // Otherwise materialize each operand through v_perm/or sequences.
    if (!UseOriginalSrc) {
      Src0 = resolveSources(DAG, SL, Srcs&: Src0s, IsSigned: false, IsAny: true);
      Src1 = resolveSources(DAG, SL, Srcs&: Src1s, IsSigned: false, IsAny: true);
    }

    assert(IsSigned);
    // The accumulator is the value left at the tail of the matched chain.
    SDValue Src2 =
        DAG.getExtOrTrunc(IsSigned: *IsSigned, Op: Src2s[ChainLength - 1], DL: SL, VT: MVT::i32);

    SDValue IID = DAG.getTargetConstant(Val: *IsSigned ? Intrinsic::amdgcn_sdot4
                                                   : Intrinsic::amdgcn_udot4,
                                        DL: SL, VT: MVT::i64);

    assert(!VT.isVector());
    auto Dot = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32, N1: IID, N2: Src0,
                           N3: Src1, N4: Src2, N5: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i1));

    return DAG.getExtOrTrunc(IsSigned: *IsSigned, Op: Dot, DL: SL, VT);
  }

  if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
    return SDValue();

  // add x, zext (setcc) => uaddo_carry x, 0, setcc
  // add x, sext (setcc) => usubo_carry x, 0, setcc
  // Canonicalize the candidate extend/carry onto RHS.
  unsigned Opc = LHS.getOpcode();
  if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
      Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
    std::swap(a&: RHS, b&: LHS);

  Opc = RHS.getOpcode();
  switch (Opc) {
  default:
    break;
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND: {
    auto Cond = RHS.getOperand(i: 0);
    // If this won't be a real VOPC output, we would still need to insert an
    // extra instruction anyway.
    if (!isBoolSGPR(V: Cond))
      break;
    SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1);
    SDValue Args[] = {LHS, DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), Cond};
    // A sign-extended setcc contributes 0 or -1, i.e. subtracting the carry.
    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
    return DAG.getNode(Opcode: Opc, DL: SL, VTList, Ops: Args);
  }
  case ISD::UADDO_CARRY: {
    // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
    if (!isNullConstant(V: RHS.getOperand(i: 1)))
      break;
    SDValue Args[] = {LHS, RHS.getOperand(i: 0), RHS.getOperand(i: 2)};
    return DAG.getNode(Opcode: ISD::UADDO_CARRY, DL: SDLoc(N), VTList: RHS->getVTList(), Ops: Args);
  }
  }
  return SDValue();
}
16670
// DAG combine for PTRADD: converts PTRADDs that would not become immediate
// offsets into plain arithmetic, reuses the ADD-combine mad/high-half folds,
// and reassociates uniform operands upward so they can be handled with
// scalar operations.
SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(ResNo: 0);
  SDValue N0 = N->getOperand(Num: 0);
  SDValue N1 = N->getOperand(Num: 1);

  // The following folds transform PTRADDs into regular arithmetic in cases
  // where the PTRADD wouldn't be folded as an immediate offset into memory
  // instructions anyway. They are target-specific in that other targets might
  // prefer to not lose information about the pointer arithmetic.

  // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
  // Adapted from DAGCombiner::visitADDLikeCommutative.
  SDValue V, K;
  if (sd_match(N: N1, P: m_Shl(L: m_Neg(V: m_Value(N&: V)), R: m_Value(N&: K)))) {
    SDNodeFlags ShlFlags = N1->getFlags();
    // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
    // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
    // preserved.
    SDNodeFlags NewShlFlags =
        ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
            ? SDNodeFlags::NoSignedWrap
            : SDNodeFlags();
    SDValue Inner = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: V, N2: K, Flags: NewShlFlags);
    DCI.AddToWorklist(N: Inner.getNode());
    return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: N0, N2: Inner);
  }

  // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
  // performAddCombine.
  if (N1.getOpcode() == ISD::MUL) {
    if (Subtarget->hasMad64_32()) {
      if (SDValue Folded = tryFoldToMad64_32(N, DCI))
        return Folded;
    }
  }

  // If the 32 low bits of the constant are all zero, there is nothing to fold
  // into an immediate offset, so it's better to eliminate the unnecessary
  // addition for the lower 32 bits than to preserve the PTRADD.
  // Analogous to a fold in performAddCombine.
  if (VT == MVT::i64) {
    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
      return Folded;
  }

  // The remaining fold reassociates (ptradd x, (add y, z)); it requires sole
  // use of the inner add so the original add can be discarded.
  if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
    return SDValue();

  SDValue X = N0;
  SDValue Y = N1.getOperand(i: 0);
  SDValue Z = N1.getOperand(i: 1);
  bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(N: Y);
  bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(N: Z);

  if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
      Y->isDivergent() != Z->isDivergent()) {
    // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
    // y are uniform and z isn't.
    // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
    // z are uniform and y isn't.
    // The goal is to push uniform operands up in the computation, so that they
    // can be handled with scalar operations. We can't use reassociateScalarOps
    // for this since it requires two identical commutative operations to
    // reassociate.
    if (Y->isDivergent())
      std::swap(a&: Y, b&: Z);
    // If both additions in the original were NUW, reassociation preserves that.
    SDNodeFlags ReassocFlags =
        (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
    SDValue UniformInner = DAG.getMemBasePlusOffset(Base: X, Offset: Y, DL, Flags: ReassocFlags);
    DCI.AddToWorklist(N: UniformInner.getNode());
    return DAG.getMemBasePlusOffset(Base: UniformInner, Offset: Z, DL, Flags: ReassocFlags);
  }

  return SDValue();
}
16750
// DAG combine for SUB: applies the shared 64-bit low-zero-constant fold,
// then folds extended setcc operands and chained usubo_carry nodes into
// carry operations.
SDValue SITargetLowering::performSubCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);

  if (VT == MVT::i64) {
    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
      return Folded;
  }

  // The carry folds below only apply to i32.
  if (VT != MVT::i32)
    return SDValue();

  SDLoc SL(N);
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  // sub x, zext (setcc) => usubo_carry x, 0, setcc
  // sub x, sext (setcc) => uaddo_carry x, 0, setcc
  unsigned Opc = RHS.getOpcode();
  switch (Opc) {
  default:
    break;
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND: {
    auto Cond = RHS.getOperand(i: 0);
    // If this won't be a real VOPC output, we would still need to insert an
    // extra instruction anyway.
    if (!isBoolSGPR(V: Cond))
      break;
    SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1);
    SDValue Args[] = {LHS, DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), Cond};
    // Subtracting a sign-extended setcc (0 or -1) adds the carry bit, hence
    // UADDO_CARRY for the SIGN_EXTEND case.
    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
    return DAG.getNode(Opcode: Opc, DL: SL, VTList, Ops: Args);
  }
  }

  if (LHS.getOpcode() == ISD::USUBO_CARRY) {
    // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
    if (!isNullConstant(V: LHS.getOperand(i: 1)))
      return SDValue();
    SDValue Args[] = {LHS.getOperand(i: 0), RHS, LHS.getOperand(i: 2)};
    return DAG.getNode(Opcode: ISD::USUBO_CARRY, DL: SDLoc(N), VTList: LHS->getVTList(), Ops: Args);
  }
  return SDValue();
}
16798
// DAG combine for UADDO_CARRY/USUBO_CARRY: when the second addend is the
// zero constant, absorb a plain inner add/sub into the carry-producing node.
SDValue
SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {

  if (N->getValueType(ResNo: 0) != MVT::i32)
    return SDValue();

  // The fold requires the carry op's second operand to be zero so the inner
  // add/sub supplies both real addends.
  if (!isNullConstant(V: N->getOperand(Num: 1)))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(Num: 0);

  // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
  // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
  unsigned LHSOpc = LHS.getOpcode();
  unsigned Opc = N->getOpcode();
  if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
      (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
    SDValue Args[] = {LHS.getOperand(i: 0), LHS.getOperand(i: 1), N->getOperand(Num: 2)};
    return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VTList: N->getVTList(), Ops: Args);
  }
  return SDValue();
}
16823
// DAG combine for FADD: folds (fadd (fadd a, a), b) and the commuted form
// into <fused op> 2.0, a, b when getFusedOpcode selects a fused
// multiply-add for this node. Only runs after DAG legalization.
SDValue SITargetLowering::performFAddCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);

  SDLoc SL(N);
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  // These should really be instruction patterns, but writing patterns with
  // source modifiers is a pain.

  // fadd (fadd (a, a), b) -> mad 2.0, a, b
  if (LHS.getOpcode() == ISD::FADD) {
    SDValue A = LHS.getOperand(i: 0);
    if (A == LHS.getOperand(i: 1)) {
      // getFusedOpcode returning 0 means no fused opcode applies here.
      unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
        return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: RHS);
      }
    }
  }

  // fadd (b, fadd (a, a)) -> mad 2.0, a, b
  if (RHS.getOpcode() == ISD::FADD) {
    SDValue A = RHS.getOperand(i: 0);
    if (A == RHS.getOperand(i: 1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
        return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: LHS);
      }
    }
  }

  return SDValue();
}
16865
// DAG combine for FSUB: folds (fsub (fadd a, a), c) -> <fused op> 2.0, a,
// (fneg c) and (fsub c, (fadd a, a)) -> <fused op> -2.0, a, c, pushing the
// negation into a source modifier. Only runs after DAG legalization.
SDValue SITargetLowering::performFSubCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  EVT VT = N->getValueType(ResNo: 0);
  assert(!VT.isVector());

  // Try to get the fneg to fold into the source modifier. This undoes generic
  // DAG combines and folds them into the mad.
  //
  // Only do this if we are not trying to support denormals. v_mad_f32 does
  // not support denormals ever.
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);
  if (LHS.getOpcode() == ISD::FADD) {
    // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
    SDValue A = LHS.getOperand(i: 0);
    if (A == LHS.getOperand(i: 1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
        SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);

        return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: NegRHS);
      }
    }
  }

  if (RHS.getOpcode() == ISD::FADD) {
    // (fsub c, (fadd a, a)) -> mad -2.0, a, c

    SDValue A = RHS.getOperand(i: 0);
    if (A == RHS.getOperand(i: 1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode());
      if (FusedOp != 0) {
        // Negating the constant factor absorbs the subtraction directly.
        const SDValue NegTwo = DAG.getConstantFP(Val: -2.0, DL: SL, VT);
        return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: NegTwo, N3: LHS);
      }
    }
  }

  return SDValue();
}
16912
16913SDValue SITargetLowering::performFDivCombine(SDNode *N,
16914 DAGCombinerInfo &DCI) const {
16915 SelectionDAG &DAG = DCI.DAG;
16916 SDLoc SL(N);
16917 EVT VT = N->getValueType(ResNo: 0);
16918
16919 // fsqrt legality correlates to rsq availability.
16920 if ((VT != MVT::f16 && VT != MVT::bf16) || !isOperationLegal(Op: ISD::FSQRT, VT))
16921 return SDValue();
16922
16923 SDValue LHS = N->getOperand(Num: 0);
16924 SDValue RHS = N->getOperand(Num: 1);
16925
16926 SDNodeFlags Flags = N->getFlags();
16927 SDNodeFlags RHSFlags = RHS->getFlags();
16928 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16929 !RHS->hasOneUse())
16930 return SDValue();
16931
16932 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) {
16933 bool IsNegative = false;
16934 if (CLHS->isExactlyValue(V: 1.0) ||
16935 (IsNegative = CLHS->isExactlyValue(V: -1.0))) {
16936 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16937 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16938 if (RHS.getOpcode() == ISD::FSQRT) {
16939 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16940 SDValue Rsq =
16941 DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SL, VT, Operand: RHS.getOperand(i: 0), Flags);
16942 return IsNegative ? DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Rsq, Flags) : Rsq;
16943 }
16944 }
16945 }
16946
16947 return SDValue();
16948}
16949
/// Combine for ISD::FMUL.
///
/// Rewrites fmul x, (select y, A, B) as ldexp when both select arms are
/// same-signed powers of two, so the exponents can be materialized as cheap
/// i32 constants instead of FP immediates.
SDValue SITargetLowering::performFMulCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);
  EVT ScalarVT = VT.getScalarType();
  // Integer type with the same element count as VT; feeds the exponent
  // select passed to ldexp.
  EVT IntVT = VT.changeElementType(Context&: *DAG.getContext(), EltVT: MVT::i32);

  if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
      (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
    // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
    return SDValue();
  }

  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  // It is cheaper to realize i32 inline constants as compared against
  // materializing f16 or f64 (or even non-inline f32) values,
  // possible via ldexp usage, as shown below :
  //
  // Given : A = 2^a & B = 2^b ; where a and b are integers.
  // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
  // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
  if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
      (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
    const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(N: RHS.getOperand(i: 1));
    if (!TrueNode)
      return SDValue();
    const ConstantFPSDNode *FalseNode =
        isConstOrConstSplatFP(N: RHS.getOperand(i: 2));
    if (!FalseNode)
      return SDValue();

    // Both arms must share a sign so a single fneg on x covers both cases.
    if (TrueNode->isNegative() != FalseNode->isNegative())
      return SDValue();

    // For f32, only non-inline constants should be transformed.
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    if (ScalarVT == MVT::f32 &&
        TII->isInlineConstant(Imm: TrueNode->getValueAPF()) &&
        TII->isInlineConstant(Imm: FalseNode->getValueAPF()))
      return SDValue();

    // getExactLog2Abs() signals "not an exact power of two" with INT_MIN,
    // in which case ldexp cannot express the multiply.
    int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
    if (TrueNodeExpVal == INT_MIN)
      return SDValue();
    int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
    if (FalseNodeExpVal == INT_MIN)
      return SDValue();

    SDLoc SL(N);
    SDValue SelectNode =
        DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: IntVT, N1: RHS.getOperand(i: 0),
                    N2: DAG.getSignedConstant(Val: TrueNodeExpVal, DL: SL, VT: IntVT),
                    N3: DAG.getSignedConstant(Val: FalseNodeExpVal, DL: SL, VT: IntVT));

    // Negative arms: fold the shared sign into x with an fneg.
    LHS = TrueNode->isNegative()
              ? DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS, Flags: LHS->getFlags())
              : LHS;

    return DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT, N1: LHS, N2: SelectNode, Flags: N->getFlags());
  }

  return SDValue();
}
17015
/// Combine for ISD::FMA.
///
/// Matches a chain of two f16->f32 extending element-wise multiplies
/// accumulated into a scalar and rewrites it as an FDOT2 node when the
/// subtarget's dot10 instructions are available.
SDValue SITargetLowering::performFMACombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);
  SDLoc SL(N);

  // FDOT2 requires the dot10 feature and produces f32 only.
  if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
    return SDValue();

  // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
  // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
  SDValue Op1 = N->getOperand(Num: 0);
  SDValue Op2 = N->getOperand(Num: 1);
  SDValue FMA = N->getOperand(Num: 2);

  if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
      Op2.getOpcode() != ISD::FP_EXTEND)
    return SDValue();

  // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
  // regardless of the denorm mode setting. Therefore,
  // fp-contract is sufficient to allow generating fdot2.
  const TargetOptions &Options = DAG.getTarget().Options;
  if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
      (N->getFlags().hasAllowContract() &&
       FMA->getFlags().hasAllowContract())) {
    // Peel the fp_extends; both outer multiplicands must be element extracts.
    Op1 = Op1.getOperand(i: 0);
    Op2 = Op2.getOperand(i: 0);
    if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    SDValue Vec1 = Op1.getOperand(i: 0);
    SDValue Idx1 = Op1.getOperand(i: 1);
    SDValue Vec2 = Op2.getOperand(i: 0);

    SDValue FMAOp1 = FMA.getOperand(i: 0);
    SDValue FMAOp2 = FMA.getOperand(i: 1);
    SDValue FMAAcc = FMA.getOperand(i: 2);

    if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
        FMAOp2.getOpcode() != ISD::FP_EXTEND)
      return SDValue();

    // The inner FMA's multiplicands must have the same extract shape.
    FMAOp1 = FMAOp1.getOperand(i: 0);
    FMAOp2 = FMAOp2.getOperand(i: 0);
    if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    SDValue Vec3 = FMAOp1.getOperand(i: 0);
    SDValue Vec4 = FMAOp2.getOperand(i: 0);
    SDValue Idx2 = FMAOp1.getOperand(i: 1);

    // The outer multiply reads lane Idx1 of both vectors, the inner one lane
    // Idx2 of both vectors, and the two lanes must differ.
    if (Idx1 != Op2.getOperand(i: 1) || Idx2 != FMAOp2.getOperand(i: 1) ||
        // Idx1 and Idx2 cannot be the same.
        Idx1 == Idx2)
      return SDValue();

    if (Vec1 == Vec2 || Vec3 == Vec4)
      return SDValue();

    if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
      return SDValue();

    // The inner pair must be the same two vectors as the outer pair (in
    // either order), i.e. S0.x*S1.x + S0.y*S1.y + z.
    if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
      return DAG.getNode(Opcode: AMDGPUISD::FDOT2, DL: SL, VT: MVT::f32, N1: Vec1, N2: Vec2, N3: FMAAcc,
                         N4: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i1));
    }
  }
  return SDValue();
}
17088
/// Combine for ISD::SETCC.
///
/// Performs several independent folds, in order: boolean/select constant
/// simplifications, narrowing 64-bit compares to their high halves using
/// known bits of the low halves, reusing the carry-out of a 64-bit add/sub,
/// and matching isinf/isfinite patterns into FP_CLASS.
SDValue SITargetLowering::performSetCCCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);
  EVT VT = LHS.getValueType();
  ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();

  // Canonicalize a constant operand to the RHS, swapping the condition.
  auto *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
  if (!CRHS) {
    CRHS = dyn_cast<ConstantSDNode>(Val&: LHS);
    if (CRHS) {
      std::swap(a&: LHS, b&: RHS);
      CC = getSetCCSwappedOperands(Operation: CC);
    }
  }

  if (CRHS) {
    if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
        isBoolSGPR(V: LHS.getOperand(i: 0))) {
      // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
      // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
      // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
      // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
      if ((CRHS->isAllOnes() &&
           (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
          (CRHS->isZero() &&
           (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
        return DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: 0),
                           N2: DAG.getAllOnesConstant(DL: SL, VT: MVT::i1));
      if ((CRHS->isAllOnes() &&
           (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
          (CRHS->isZero() &&
           (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
        return LHS.getOperand(i: 0);
    }

    // Comparing a boolean select of two distinct constants against one of
    // those constants collapses to the select condition (possibly inverted).
    const APInt &CRHSVal = CRHS->getAPIntValue();
    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        LHS.getOpcode() == ISD::SELECT &&
        isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) &&
        isa<ConstantSDNode>(Val: LHS.getOperand(i: 2)) &&
        isBoolSGPR(V: LHS.getOperand(i: 0))) {
      // Given CT != FT:
      // setcc (select cc, CT, CF), CF, eq => xor cc, -1
      // setcc (select cc, CT, CF), CF, ne => cc
      // setcc (select cc, CT, CF), CT, ne => xor cc, -1
      // setcc (select cc, CT, CF), CT, eq => cc
      const APInt &CT = LHS.getConstantOperandAPInt(i: 1);
      const APInt &CF = LHS.getConstantOperandAPInt(i: 2);

      if (CT != CF) {
        if ((CF == CRHSVal && CC == ISD::SETEQ) ||
            (CT == CRHSVal && CC == ISD::SETNE))
          return DAG.getNOT(DL: SL, Val: LHS.getOperand(i: 0), VT: MVT::i1);
        if ((CF == CRHSVal && CC == ISD::SETNE) ||
            (CT == CRHSVal && CC == ISD::SETEQ))
          return LHS.getOperand(i: 0);
      }
    }
  }

  // Truncate 64-bit setcc to test only upper 32-bits of its operands in the
  // following cases where information about the lower 32-bits of its operands
  // is known:
  //
  // If LHS.lo32 == RHS.lo32:
  //   setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, eq/ne
  // If LHS.lo32 != RHS.lo32:
  //   setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, false/true
  // If LHS.lo32 >= RHS.lo32 (unsigned):
  //   setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]lt/ge
  // If LHS.lo32 > RHS.lo32 (unsigned):
  //   setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]lt/ge
  // If LHS.lo32 <= RHS.lo32 (unsigned):
  //   setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]le/gt
  // If LHS.lo32 < RHS.lo32 (unsigned):
  //   setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]le/gt
  if (VT == MVT::i64) {
    const KnownBits LHSKnownLo32 = DAG.computeKnownBits(Op: LHS).trunc(BitWidth: 32);
    const KnownBits RHSKnownLo32 = DAG.computeKnownBits(Op: RHS).trunc(BitWidth: 32);

    // NewCC is valid iff we can truncate the setcc to only test the upper 32
    // bits
    ISD::CondCode NewCC = ISD::SETCC_INVALID;

    switch (CC) {
    default:
      break;
    case ISD::SETEQ: {
      const std::optional<bool> KnownEq =
          KnownBits::eq(LHS: LHSKnownLo32, RHS: RHSKnownLo32);
      if (KnownEq)
        NewCC = *KnownEq ? ISD::SETEQ : ISD::SETFALSE;

      break;
    }
    case ISD::SETNE: {
      const std::optional<bool> KnownEq =
          KnownBits::eq(LHS: LHSKnownLo32, RHS: RHSKnownLo32);
      if (KnownEq)
        NewCC = *KnownEq ? ISD::SETNE : ISD::SETTRUE;

      break;
    }
    case ISD::SETULT:
    case ISD::SETUGE:
    case ISD::SETLT:
    case ISD::SETGE: {
      const std::optional<bool> KnownUge =
          KnownBits::uge(LHS: LHSKnownLo32, RHS: RHSKnownLo32);
      if (KnownUge) {
        if (*KnownUge) {
          // LHS.lo32 uge RHS.lo32, so LHS >= RHS iff LHS.hi32 >= RHS.hi32
          NewCC = CC;
        } else {
          // LHS.lo32 ult RHS.lo32, so LHS >= RHS iff LHS.hi32 > RHS.hi32
          NewCC = CC == ISD::SETULT   ? ISD::SETULE
                  : CC == ISD::SETUGE ? ISD::SETUGT
                  : CC == ISD::SETLT  ? ISD::SETLE
                                      : ISD::SETGT;
        }
      }
      break;
    }
    case ISD::SETULE:
    case ISD::SETUGT:
    case ISD::SETLE:
    case ISD::SETGT: {
      const std::optional<bool> KnownUle =
          KnownBits::ule(LHS: LHSKnownLo32, RHS: RHSKnownLo32);
      if (KnownUle) {
        if (*KnownUle) {
          // LHS.lo32 ule RHS.lo32, so LHS <= RHS iff LHS.hi32 <= RHS.hi32
          NewCC = CC;
        } else {
          // LHS.lo32 ugt RHS.lo32, so LHS <= RHS iff LHS.hi32 < RHS.hi32
          NewCC = CC == ISD::SETULE   ? ISD::SETULT
                  : CC == ISD::SETUGT ? ISD::SETUGE
                  : CC == ISD::SETLE  ? ISD::SETLT
                                      : ISD::SETGE;
        }
      }
      break;
    }
    }

    if (NewCC != ISD::SETCC_INVALID)
      return DAG.getSetCC(DL: SL, VT: N->getValueType(ResNo: 0), LHS: getHiHalf64(Op: LHS, DAG),
                          RHS: getHiHalf64(Op: RHS, DAG), Cond: NewCC);
  }

  // Eliminate setcc by using carryout from add/sub instruction

  // LHS = ADD i64 RHS, Z          LHSlo = UADDO       i32 RHSlo, Zlo
  // setcc LHS ult RHS     ->      LHSHi = UADDO_CARRY i32 RHShi, Zhi
  // similarly for subtraction

  // LHS = ADD i64 Y, 1            LHSlo = UADDO       i32 Ylo, 1
  // setcc LHS eq 0        ->      LHSHi = UADDO_CARRY i32 Yhi, 0

  if (VT == MVT::i64 && ((CC == ISD::SETULT &&
                          sd_match(N: LHS, P: m_Add(L: m_Specific(N: RHS), R: m_Value()))) ||
                         (CC == ISD::SETUGT &&
                          sd_match(N: LHS, P: m_Sub(L: m_Specific(N: RHS), R: m_Value()))) ||
                         (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
                          sd_match(N: LHS, P: m_Add(L: m_Value(), R: m_One()))))) {
    bool IsAdd = LHS.getOpcode() == ISD::ADD;

    // Split the 64-bit add/sub into 32-bit halves connected by carry.
    SDValue Op0 = LHS.getOperand(i: 0);
    SDValue Op1 = LHS.getOperand(i: 1);

    SDValue Op0Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Op0);
    SDValue Op1Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Op1);

    SDValue Op0Hi = getHiHalf64(Op: Op0, DAG);
    SDValue Op1Hi = getHiHalf64(Op: Op1, DAG);

    SDValue NodeLo =
        DAG.getNode(Opcode: IsAdd ? ISD::UADDO : ISD::USUBO, DL: SL,
                    VTList: DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1), Ops: {Op0Lo, Op1Lo});

    SDValue CarryInHi = NodeLo.getValue(R: 1);
    SDValue NodeHi = DAG.getNode(Opcode: IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
                                 DL: SL, VTList: DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1),
                                 Ops: {Op0Hi, Op1Hi, CarryInHi});

    SDValue ResultLo = NodeLo.getValue(R: 0);
    SDValue ResultHi = NodeHi.getValue(R: 0);

    SDValue JoinedResult =
        DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {ResultLo, ResultHi});

    SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: JoinedResult);
    SDValue Overflow = NodeHi.getValue(R: 1);
    // Replace the original 64-bit add/sub with the split form; the setcc
    // result becomes the high half's carry-out.
    DCI.CombineTo(N: LHS.getNode(), Res: Result);
    return Overflow;
  }

  if (VT != MVT::f32 && VT != MVT::f64 &&
      (!Subtarget->has16BitInsts() || VT != MVT::f16))
    return SDValue();

  // Match isinf/isfinite pattern
  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
  // (fcmp one (fabs x), inf) -> (fp_class x,
  // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
  if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
      LHS.getOpcode() == ISD::FABS) {
    const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
    if (!CRHS)
      return SDValue();

    const APFloat &APF = CRHS->getValueAPF();
    if (APF.isInfinity() && !APF.isNegative()) {
      const unsigned IsInfMask =
          SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
      const unsigned IsFiniteMask =
          SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
          SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
          SIInstrFlags::P_SUBNORMAL;
      unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
      return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: 0),
                         N2: DAG.getConstant(Val: Mask, DL: SL, VT: MVT::i32));
    }
  }

  return SDValue();
}
17320
/// Combine for AMDGPUISD::CVT_F32_UBYTE[0-3].
///
/// Folds shifts of the source into the byte index of the conversion, and
/// simplifies the source using the demanded-bits of the selected byte.
SDValue
SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  // Byte index (0-3) selected by this CVT_F32_UBYTEn opcode.
  unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;

  SDValue Src = N->getOperand(Num: 0);
  // Shift starts as a copy of Src and is peeled through zext/shift below;
  // Src itself is kept for the demanded-bits simplification at the end.
  SDValue Shift = N->getOperand(Num: 0);

  // TODO: Extend type shouldn't matter (assuming legal types).
  if (Shift.getOpcode() == ISD::ZERO_EXTEND)
    Shift = Shift.getOperand(i: 0);

  if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
    // cvt_f32_ubyte1 (shl x,  8) -> cvt_f32_ubyte0 x
    // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
    // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
    // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
    // cvt_f32_ubyte0 (srl x,  8) -> cvt_f32_ubyte1 x
    if (auto *C = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1))) {
      SDValue Shifted = DAG.getZExtOrTrunc(
          Op: Shift.getOperand(i: 0), DL: SDLoc(Shift.getOperand(i: 0)), VT: MVT::i32);

      // Fold the shift amount into the byte index: shl moves bytes up (so the
      // selected byte comes from a lower index), srl moves them down.
      unsigned ShiftOffset = 8 * Offset;
      if (Shift.getOpcode() == ISD::SHL)
        ShiftOffset -= C->getZExtValue();
      else
        ShiftOffset += C->getZExtValue();

      // Only fold when the adjusted index still names a whole byte in range.
      if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
        return DAG.getNode(Opcode: AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, DL: SL,
                           VT: MVT::f32, Operand: Shifted);
      }
    }
  }

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // Only the selected byte of the source is demanded.
  APInt DemandedBits = APInt::getBitsSet(numBits: 32, loBit: 8 * Offset, hiBit: 8 * Offset + 8);
  if (TLI.SimplifyDemandedBits(Op: Src, DemandedBits, DCI)) {
    // We simplified Src. If this node is not dead, visit it again so it is
    // folded properly.
    if (N->getOpcode() != ISD::DELETED_NODE)
      DCI.AddToWorklist(N);
    return SDValue(N, 0);
  }

  // Handle (or x, (srl y, 8)) pattern when known bits are zero.
  if (SDValue DemandedSrc =
          TLI.SimplifyMultipleUseDemandedBits(Op: Src, DemandedBits, DAG))
    return DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: MVT::f32, Operand: DemandedSrc);

  return SDValue();
}
17375
17376SDValue SITargetLowering::performClampCombine(SDNode *N,
17377 DAGCombinerInfo &DCI) const {
17378 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: 0));
17379 if (!CSrc)
17380 return SDValue();
17381
17382 const MachineFunction &MF = DCI.DAG.getMachineFunction();
17383 const APFloat &F = CSrc->getValueAPF();
17384 APFloat Zero = APFloat::getZero(Sem: F.getSemantics());
17385 if (F < Zero ||
17386 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
17387 return DCI.DAG.getConstantFP(Val: Zero, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
17388 }
17389
17390 APFloat One(F.getSemantics(), "1.0");
17391 if (F > One)
17392 return DCI.DAG.getConstantFP(Val: One, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
17393
17394 return SDValue(CSrc, 0);
17395}
17396
17397SDValue SITargetLowering::performSelectCombine(SDNode *N,
17398 DAGCombinerInfo &DCI) const {
17399
17400 // Try to fold CMP + SELECT patterns with shared constants (both FP and
17401 // integer).
17402 // Detect when CMP and SELECT use the same constant and fold them to avoid
17403 // loading the constant twice. Specifically handles patterns like:
17404 // %cmp = icmp eq i32 %val, 4242
17405 // %sel = select i1 %cmp, i32 4242, i32 %other
17406 // It can be optimized to reuse %val instead of 4242 in select.
17407 SDValue Cond = N->getOperand(Num: 0);
17408 SDValue TrueVal = N->getOperand(Num: 1);
17409 SDValue FalseVal = N->getOperand(Num: 2);
17410
17411 // Check if condition is a comparison.
17412 if (Cond.getOpcode() != ISD::SETCC)
17413 return SDValue();
17414
17415 SDValue LHS = Cond.getOperand(i: 0);
17416 SDValue RHS = Cond.getOperand(i: 1);
17417 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Cond.getOperand(i: 2))->get();
17418
17419 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
17420 bool isInteger = LHS.getValueType().isInteger();
17421
17422 // Handle simple floating-point and integer types only.
17423 if (!isFloatingPoint && !isInteger)
17424 return SDValue();
17425
17426 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
17427 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
17428 if (!isEquality && !isNonEquality)
17429 return SDValue();
17430
17431 SDValue ArgVal, ConstVal;
17432 if ((isFloatingPoint && isa<ConstantFPSDNode>(Val: RHS)) ||
17433 (isInteger && isa<ConstantSDNode>(Val: RHS))) {
17434 ConstVal = RHS;
17435 ArgVal = LHS;
17436 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(Val: LHS)) ||
17437 (isInteger && isa<ConstantSDNode>(Val: LHS))) {
17438 ConstVal = LHS;
17439 ArgVal = RHS;
17440 } else {
17441 return SDValue();
17442 }
17443
17444 // Skip optimization for inlinable immediates.
17445 if (isFloatingPoint) {
17446 const APFloat &Val = cast<ConstantFPSDNode>(Val&: ConstVal)->getValueAPF();
17447 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Imm: Val))
17448 return SDValue();
17449 } else {
17450 if (AMDGPU::isInlinableIntLiteral(
17451 Literal: cast<ConstantSDNode>(Val&: ConstVal)->getSExtValue()))
17452 return SDValue();
17453 }
17454
17455 // For equality and non-equality comparisons, patterns:
17456 // select (setcc x, const), const, y -> select (setcc x, const), x, y
17457 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
17458 if (!(isEquality && TrueVal == ConstVal) &&
17459 !(isNonEquality && FalseVal == ConstVal))
17460 return SDValue();
17461
17462 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
17463 SDValue SelectRHS =
17464 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
17465 return DCI.DAG.getNode(Opcode: ISD::SELECT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: Cond,
17466 N2: SelectLHS, N3: SelectRHS);
17467}
17468
/// Top-level DAG combine dispatch for the SI target.
///
/// Runs in two phases: uniform-op i32 promotion (which applies even at -O0,
/// before the opt-level check), then the per-opcode combines, falling back to
/// the AMDGPU common combines at the end.
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  // Phase 1: promote uniform operations to i32 where profitable. Note this
  // runs before the CodeGenOptLevel::None early-out below.
  switch (N->getOpcode()) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::MUL:
  case ISD::SETCC:
  case ISD::SELECT:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:
    if (auto Res = promoteUniformOpToI32(Op: SDValue(N, 0), DCI))
      return Res;
    break;
  default:
    break;
  }

  if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
    return SDValue();

  // Phase 2: per-opcode combines.
  switch (N->getOpcode()) {
  case ISD::ADD:
    return performAddCombine(N, DCI);
  case ISD::PTRADD:
    return performPtrAddCombine(N, DCI);
  case ISD::SUB:
    return performSubCombine(N, DCI);
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    return performAddCarrySubCarryCombine(N, DCI);
  case ISD::FADD:
    return performFAddCombine(N, DCI);
  case ISD::FSUB:
    return performFSubCombine(N, DCI);
  case ISD::FDIV:
    return performFDivCombine(N, DCI);
  case ISD::FMUL:
    return performFMulCombine(N, DCI);
  case ISD::SETCC:
    return performSetCCCombine(N, DCI);
  case ISD::SELECT:
    if (auto Res = performSelectCombine(N, DCI))
      return Res;
    break;
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUMNUM:
  case ISD::FMINIMUMNUM:
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
    return performMinMaxCombine(N, DCI);
  case ISD::FMA:
    return performFMACombine(N, DCI);
  case ISD::AND:
    return performAndCombine(N, DCI);
  case ISD::OR:
    return performOrCombine(N, DCI);
  case ISD::FSHR: {
    // A divergent i32 funnel shift may match v_perm_b32 byte permutes.
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    if (N->getValueType(ResNo: 0) == MVT::i32 && N->isDivergent() &&
        TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {
      return matchPERM(N, DCI);
    }
    break;
  }
  case ISD::XOR:
    return performXorCombine(N, DCI);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND:
    return performZeroOrAnyExtendCombine(N, DCI);
  case ISD::SIGN_EXTEND_INREG:
    return performSignExtendInRegCombine(N, DCI);
  case AMDGPUISD::FP_CLASS:
    return performClassCombine(N, DCI);
  case ISD::FCANONICALIZE:
    return performFCanonicalizeCombine(N, DCI);
  case AMDGPUISD::RCP:
    return performRcpCombine(N, DCI);
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::RSQ_CLAMP: {
    // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
    SDValue Src = N->getOperand(Num: 0);
    if (Src.isUndef())
      return Src;
    break;
  }
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return performUCharToFloatCombine(N, DCI);
  case ISD::FCOPYSIGN:
    return performFCopySignCombine(N, DCI);
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return performCvtF32UByteNCombine(N, DCI);
  case AMDGPUISD::FMED3:
    return performFMed3Combine(N, DCI);
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
    return performCvtPkRTZCombine(N, DCI);
  case AMDGPUISD::CLAMP:
    return performClampCombine(N, DCI);
  case ISD::SCALAR_TO_VECTOR: {
    SelectionDAG &DAG = DCI.DAG;
    EVT VT = N->getValueType(ResNo: 0);

    // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
    if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
      SDLoc SL(N);
      SDValue Src = N->getOperand(Num: 0);
      EVT EltVT = Src.getValueType();
      if (EltVT != MVT::i16)
        Src = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Src);

      SDValue Ext = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Src);
      return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Ext);
    }

    break;
  }
  case ISD::EXTRACT_VECTOR_ELT:
    return performExtractVectorEltCombine(N, DCI);
  case ISD::INSERT_VECTOR_ELT:
    return performInsertVectorEltCombine(N, DCI);
  case ISD::FP_ROUND:
    return performFPRoundCombine(N, DCI);
  case ISD::LOAD: {
    if (SDValue Widened = widenLoad(Ld: cast<LoadSDNode>(Val: N), DCI))
      return Widened;
    [[fallthrough]];
  }
  default: {
    // Generic memory-node combine for anything not handled above.
    if (!DCI.isBeforeLegalize()) {
      if (MemSDNode *MemNode = dyn_cast<MemSDNode>(Val: N))
        return performMemSDNodeCombine(N: MemNode, DCI);
    }

    break;
  }
  }

  // Fall back to the combines shared across AMDGPU targets.
  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
17632
17633/// Helper function for adjustWritemask
17634static unsigned SubIdx2Lane(unsigned Idx) {
17635 switch (Idx) {
17636 default:
17637 return ~0u;
17638 case AMDGPU::sub0:
17639 return 0;
17640 case AMDGPU::sub1:
17641 return 1;
17642 case AMDGPU::sub2:
17643 return 2;
17644 case AMDGPU::sub3:
17645 return 3;
17646 case AMDGPU::sub4:
17647 return 4; // Possible with TFE/LWE
17648 }
17649}
17650
17651/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
17652SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
17653 SelectionDAG &DAG) const {
17654 unsigned Opcode = Node->getMachineOpcode();
17655
17656 // Subtract 1 because the vdata output is not a MachineSDNode operand.
17657 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::d16) - 1;
17658 if (D16Idx >= 0 && Node->getConstantOperandVal(Num: D16Idx))
17659 return Node; // not implemented for D16
17660
17661 SDNode *Users[5] = {nullptr};
17662 unsigned Lane = 0;
17663 unsigned DmaskIdx =
17664 AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::dmask) - 1;
17665 unsigned OldDmask = Node->getConstantOperandVal(Num: DmaskIdx);
17666 unsigned NewDmask = 0;
17667 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::tfe) - 1;
17668 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::lwe) - 1;
17669 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(Num: TFEIdx)) ||
17670 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(Num: LWEIdx));
17671 unsigned TFCLane = 0;
17672 bool HasChain = Node->getNumValues() > 1;
17673
17674 if (OldDmask == 0) {
17675 // These are folded out, but on the chance it happens don't assert.
17676 return Node;
17677 }
17678
17679 unsigned OldBitsSet = llvm::popcount(Value: OldDmask);
17680 // Work out which is the TFE/LWE lane if that is enabled.
17681 if (UsesTFC) {
17682 TFCLane = OldBitsSet;
17683 }
17684
17685 // Try to figure out the used register components
17686 for (SDUse &Use : Node->uses()) {
17687
17688 // Don't look at users of the chain.
17689 if (Use.getResNo() != 0)
17690 continue;
17691
17692 SDNode *User = Use.getUser();
17693
17694 // Abort if we can't understand the usage
17695 if (!User->isMachineOpcode() ||
17696 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17697 return Node;
17698
17699 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
17700 // Note that subregs are packed, i.e. Lane==0 is the first bit set
17701 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
17702 // set, etc.
17703 Lane = SubIdx2Lane(Idx: User->getConstantOperandVal(Num: 1));
17704 if (Lane == ~0u)
17705 return Node;
17706
17707 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
17708 if (UsesTFC && Lane == TFCLane) {
17709 Users[Lane] = User;
17710 } else {
17711 // Set which texture component corresponds to the lane.
17712 unsigned Comp;
17713 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17714 Comp = llvm::countr_zero(Val: Dmask);
17715 Dmask &= ~(1 << Comp);
17716 }
17717
17718 // Abort if we have more than one user per component.
17719 if (Users[Lane])
17720 return Node;
17721
17722 Users[Lane] = User;
17723 NewDmask |= 1 << Comp;
17724 }
17725 }
17726
17727 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17728 bool NoChannels = !NewDmask;
17729 if (NoChannels) {
17730 if (!UsesTFC) {
17731 // No uses of the result and not using TFC. Then do nothing.
17732 return Node;
17733 }
17734 // If the original dmask has one channel - then nothing to do
17735 if (OldBitsSet == 1)
17736 return Node;
17737 // Use an arbitrary dmask - required for the instruction to work
17738 NewDmask = 1;
17739 }
17740 // Abort if there's no change
17741 if (NewDmask == OldDmask)
17742 return Node;
17743
17744 unsigned BitsSet = llvm::popcount(Value: NewDmask);
17745
17746 // Check for TFE or LWE - increase the number of channels by one to account
17747 // for the extra return value
17748 // This will need adjustment for D16 if this is also included in
17749 // adjustWriteMask (this function) but at present D16 are excluded.
17750 unsigned NewChannels = BitsSet + UsesTFC;
17751
17752 int NewOpcode =
17753 AMDGPU::getMaskedMIMGOp(Opc: Node->getMachineOpcode(), NewChannels);
17754 assert(NewOpcode != -1 &&
17755 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17756 "failed to find equivalent MIMG op");
17757
17758 // Adjust the writemask in the node
17759 SmallVector<SDValue, 12> Ops;
17760 llvm::append_range(C&: Ops, R: Node->ops().take_front(N: DmaskIdx));
17761 Ops.push_back(Elt: DAG.getTargetConstant(Val: NewDmask, DL: SDLoc(Node), VT: MVT::i32));
17762 llvm::append_range(C&: Ops, R: Node->ops().drop_front(N: DmaskIdx + 1));
17763
17764 MVT SVT = Node->getValueType(ResNo: 0).getVectorElementType().getSimpleVT();
17765
17766 MVT ResultVT = NewChannels == 1
17767 ? SVT
17768 : MVT::getVectorVT(VT: SVT, NumElements: NewChannels == 3 ? 4
17769 : NewChannels == 5 ? 8
17770 : NewChannels);
17771 SDVTList NewVTList =
17772 HasChain ? DAG.getVTList(VT1: ResultVT, VT2: MVT::Other) : DAG.getVTList(VT: ResultVT);
17773
17774 MachineSDNode *NewNode =
17775 DAG.getMachineNode(Opcode: NewOpcode, dl: SDLoc(Node), VTs: NewVTList, Ops);
17776
17777 if (HasChain) {
17778 // Update chain.
17779 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: Node->memoperands());
17780 DAG.ReplaceAllUsesOfValueWith(From: SDValue(Node, 1), To: SDValue(NewNode, 1));
17781 }
17782
17783 if (NewChannels == 1) {
17784 assert(Node->hasNUsesOfValue(1, 0));
17785 SDNode *Copy =
17786 DAG.getMachineNode(Opcode: TargetOpcode::COPY, dl: SDLoc(Node),
17787 VT: Users[Lane]->getValueType(ResNo: 0), Op1: SDValue(NewNode, 0));
17788 DAG.ReplaceAllUsesWith(From: Users[Lane], To: Copy);
17789 return nullptr;
17790 }
17791
17792 // Update the users of the node with the new indices
17793 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17794 SDNode *User = Users[i];
17795 if (!User) {
17796 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17797 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17798 if (i || !NoChannels)
17799 continue;
17800 } else {
17801 SDValue Op = DAG.getTargetConstant(Val: Idx, DL: SDLoc(User), VT: MVT::i32);
17802 SDNode *NewUser = DAG.UpdateNodeOperands(N: User, Op1: SDValue(NewNode, 0), Op2: Op);
17803 if (NewUser != User) {
17804 DAG.ReplaceAllUsesWith(From: SDValue(User, 0), To: SDValue(NewUser, 0));
17805 DAG.RemoveDeadNode(N: User);
17806 }
17807 }
17808
17809 switch (Idx) {
17810 default:
17811 break;
17812 case AMDGPU::sub0:
17813 Idx = AMDGPU::sub1;
17814 break;
17815 case AMDGPU::sub1:
17816 Idx = AMDGPU::sub2;
17817 break;
17818 case AMDGPU::sub2:
17819 Idx = AMDGPU::sub3;
17820 break;
17821 case AMDGPU::sub3:
17822 Idx = AMDGPU::sub4;
17823 break;
17824 }
17825 }
17826
17827 DAG.RemoveDeadNode(N: Node);
17828 return nullptr;
17829}
17830
17831static bool isFrameIndexOp(SDValue Op) {
17832 if (Op.getOpcode() == ISD::AssertZext)
17833 Op = Op.getOperand(i: 0);
17834
17835 return isa<FrameIndexSDNode>(Val: Op);
17836}
17837
/// Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers, so any
/// frame index operand is rewritten into an S_MOV_B32 of that index.
SDNode *
SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
                                                SelectionDAG &DAG) const {
  if (Node->getOpcode() == ISD::CopyToReg) {
    RegisterSDNode *DestReg = cast<RegisterSDNode>(Val: Node->getOperand(Num: 1));
    SDValue SrcVal = Node->getOperand(Num: 2);

    // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
    // to try understanding copies to physical registers.
    if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
      SDLoc SL(Node);
      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
      SDValue VReg = DAG.getRegister(
          Reg: MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_1RegClass), VT: MVT::i1);

      // Thread any incoming glue from the original node through the first of
      // the two replacement copies (src -> vreg -> physical dest).
      SDNode *Glued = Node->getGluedNode();
      SDValue ToVReg = DAG.getCopyToReg(
          Chain: Node->getOperand(Num: 0), dl: SL, Reg: VReg, N: SrcVal,
          Glue: SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
      SDValue ToResultReg = DAG.getCopyToReg(Chain: ToVReg, dl: SL, Reg: SDValue(DestReg, 0),
                                             N: VReg, Glue: ToVReg.getValue(R: 1));
      DAG.ReplaceAllUsesWith(From: Node, To: ToResultReg.getNode());
      DAG.RemoveDeadNode(N: Node);
      return ToResultReg.getNode();
    }
  }

  // Replace every frame-index operand with an S_MOV_B32 of the index so the
  // node only sees register-like inputs; all other operands pass through.
  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    if (!isFrameIndexOp(Op: Node->getOperand(Num: i))) {
      Ops.push_back(Elt: Node->getOperand(Num: i));
      continue;
    }

    SDLoc DL(Node);
    Ops.push_back(Elt: SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL,
                                             VT: Node->getOperand(Num: i).getValueType(),
                                             Op1: Node->getOperand(Num: i)),
                           0));
  }

  return DAG.UpdateNodeOperands(N: Node, Ops);
}
17884
/// Fold the instructions after selecting them.
/// Returns null if users were already updated.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  unsigned Opcode = Node->getMachineOpcode();

  // Image loads (not stores, not gather4) carrying a dmask operand can have
  // unused result channels trimmed from the write mask.
  if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
      !TII->isGather4(Opcode) &&
      AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::dmask)) {
    return adjustWritemask(Node, DAG);
  }

  // Target-independent nodes may still carry frame index operands that must
  // be materialized into registers.
  if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
    legalizeTargetIndependentNode(Node, DAG);
    return Node;
  }

  switch (Opcode) {
  case AMDGPU::V_DIV_SCALE_F32_e64:
  case AMDGPU::V_DIV_SCALE_F64_e64: {
    // Satisfy the operand register constraint when one of the inputs is
    // undefined. Ordinarily each undef value will have its own implicit_def of
    // a vreg, so force these to use a single register.
    SDValue Src0 = Node->getOperand(Num: 1);
    SDValue Src1 = Node->getOperand(Num: 3);
    SDValue Src2 = Node->getOperand(Num: 5);

    // Nothing to do if src0 is a defined value that already matches src1 or
    // src2.
    if ((Src0.isMachineOpcode() &&
         Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
        (Src0 == Src1 || Src0 == Src2))
      break;

    MVT VT = Src0.getValueType().getSimpleVT();
    const TargetRegisterClass *RC =
        getRegClassFor(VT, isDivergent: Src0.getNode()->isDivergent());

    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    SDValue UndefReg = DAG.getRegister(Reg: MRI.createVirtualRegister(RegClass: RC), VT);

    // Copy src0's (possibly undef) value into the shared vreg; the copy's
    // glue result is appended to the rebuilt node below.
    SDValue ImpDef = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: SDLoc(Node), Reg: UndefReg,
                                      N: Src0, Glue: SDValue());

    // src0 must be the same register as src1 or src2, even if the value is
    // undefined, so make sure we don't violate this constraint.
    if (Src0.isMachineOpcode() &&
        Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
      if (Src1.isMachineOpcode() &&
          Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src1;
      else if (Src2.isMachineOpcode() &&
               Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src2;
      else {
        // Every source is undef; point src0 and src1 at the same fresh vreg.
        assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
        Src0 = UndefReg;
        Src1 = UndefReg;
      }
    } else
      break;

    // Rebuild the node with the adjusted sources and the glued copy appended.
    SmallVector<SDValue, 9> Ops(Node->ops());
    Ops[1] = Src0;
    Ops[3] = Src1;
    Ops[5] = Src2;
    Ops.push_back(Elt: ImpDef.getValue(R: 1));
    return DAG.getMachineNode(Opcode, dl: SDLoc(Node), VTs: Node->getVTList(), Ops);
  }
  default:
    break;
  }

  return Node;
}
17959
// Any MIMG instructions that use tfe or lwe require an initialization of the
// result register that will be written in the case of a memory access failure.
// The required code is also added to tie this init code to the result of the
// img instruction.
void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  MachineBasicBlock &MBB = *MI.getParent();

  int DstIdx =
      AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::vdata);
  // Number of result dwords that need initialization; stays 0 when there is
  // nothing to do.
  unsigned InitIdx = 0;

  if (TII->isImage(MI)) {
    MachineOperand *TFE = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe);
    MachineOperand *LWE = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::lwe);
    MachineOperand *D16 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::d16);

    if (!TFE && !LWE) // intersect_ray
      return;

    unsigned TFEVal = TFE ? TFE->getImm() : 0;
    unsigned LWEVal = LWE ? LWE->getImm() : 0;
    unsigned D16Val = D16 ? D16->getImm() : 0;

    if (!TFEVal && !LWEVal)
      return;

    // At least one of TFE or LWE are non-zero
    // We have to insert a suitable initialization of the result value and
    // tie this to the dest of the image instruction.

    // Calculate which dword we have to initialize to 0.
    MachineOperand *MO_Dmask = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dmask);

    // check that dmask operand is found.
    assert(MO_Dmask && "Expected dmask operand in instruction");

    unsigned dmask = MO_Dmask->getImm();
    // Determine the number of active lanes taking into account the
    // Gather4 special case
    unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(Value: dmask);

    bool Packed = !Subtarget->hasUnpackedD16VMem();

    // Packed D16 results occupy half as many dwords (rounded up), plus one
    // extra dword for the TFE/LWE error indication.
    InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;

    // Abandon attempt if the dst size isn't large enough
    // - this is in fact an error but this is picked up elsewhere and
    // reported correctly.
    const TargetRegisterClass *DstRC = TII->getRegClass(MCID: MI.getDesc(), OpNum: DstIdx);

    uint32_t DstSize = TRI.getRegSizeInBits(RC: *DstRC) / 32;
    if (DstSize < InitIdx)
      return;
  } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(Opc: MI.getOpcode())) {
    // MUBUF with TFE: initialize the entire destination register.
    const TargetRegisterClass *DstRC = TII->getRegClass(MCID: MI.getDesc(), OpNum: DstIdx);
    InitIdx = TRI.getRegSizeInBits(RC: *DstRC) / 32;
  } else {
    return;
  }

  const DebugLoc &DL = MI.getDebugLoc();

  // Create a register for the initialization value.
  Register PrevDst = MRI.cloneVirtualRegister(VReg: MI.getOperand(i: DstIdx).getReg());
  unsigned NewDst = 0; // Final initialized value will be in here

  // If PRTStrictNull feature is enabled (the default) then initialize
  // all the result registers to 0, otherwise just the error indication
  // register (VGPRn+1)
  unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
  unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);

  // Build a chain of INSERT_SUBREGs zeroing one dword at a time, starting
  // from an IMPLICIT_DEF of the full-width register.
  BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: PrevDst);
  for (; SizeLeft; SizeLeft--, CurrIdx++) {
    NewDst = MRI.createVirtualRegister(RegClass: TII->getOpRegClass(MI, OpNo: DstIdx));
    // Initialize dword
    Register SubReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
    // clang-format off
    BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: SubReg)
        .addImm(Val: 0);
    // clang-format on
    // Insert into the super-reg
    BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: NewDst)
        .addReg(RegNo: PrevDst)
        .addReg(RegNo: SubReg)
        .addImm(Val: SIRegisterInfo::getSubRegFromChannel(Channel: CurrIdx));

    PrevDst = NewDst;
  }

  // Add as an implicit operand
  MI.addOperand(Op: MachineOperand::CreateReg(Reg: NewDst, isDef: false, isImp: true));

  // Tie the just added implicit operand to the dst
  MI.tieOperands(DefIdx: DstIdx, UseIdx: MI.getNumOperands() - 1);
}
18059
/// Fix up an instruction immediately after instruction selection: legalize
/// constant-bus usage for VOP3 (including the appended scale operands of
/// scaled MFMA) instructions, and enforce register-class alignment for image
/// vaddr operands.
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                     SDNode *Node) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  MachineFunction *MF = MI.getMF();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  if (TII->isVOP3(Opcode: MI.getOpcode())) {
    // Make sure constant bus requirements are respected.
    TII->legalizeOperandsVOP3(MRI, MI);

    if (TII->isMAI(MI)) {
      // The ordinary src0, src1, src2 were legalized above.
      //
      // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
      // as a separate instruction.
      int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
                                               Name: AMDGPU::OpName::scale_src0);
      if (Src0Idx != -1) {
        int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
                                                 Name: AMDGPU::OpName::scale_src1);
        // Only one of the two scale operands may use the constant bus; move
        // the second one into a register if both would.
        if (TII->usesConstantBus(MRI, MI, OpIdx: Src0Idx) &&
            TII->usesConstantBus(MRI, MI, OpIdx: Src1Idx))
          TII->legalizeOpWithMove(MI, OpIdx: Src1Idx);
      }
    }

    return;
  }

  if (TII->isImage(MI))
    TII->enforceOperandRCAlignment(MI, OpName: AMDGPU::OpName::vaddr);
}
18095
18096static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
18097 uint64_t Val) {
18098 SDValue K = DAG.getTargetConstant(Val, DL, VT: MVT::i32);
18099 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: K), 0);
18100}
18101
/// Wrap a 64-bit pointer into a 128-bit resource descriptor whose high half
/// is {0, getDefaultRsrcDataFormat() >> 32}.
MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
                                                const SDLoc &DL,
                                                SDValue Ptr) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  const SDValue Ops0[] = {
      DAG.getTargetConstant(Val: AMDGPU::SGPR_64RegClassID, DL, VT: MVT::i32),
      buildSMovImm32(DAG, DL, Val: 0),
      DAG.getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
      buildSMovImm32(DAG, DL, Val: TII->getDefaultRsrcDataFormat() >> 32),
      DAG.getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32)};

  SDValue SubRegHi = SDValue(
      DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v2i32, Ops: Ops0), 0);

  // Combine the constants and the pointer: the pointer occupies sub0_sub1 and
  // the constant pair occupies sub2_sub3.
  const SDValue Ops1[] = {
      DAG.getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32), Ptr,
      DAG.getTargetConstant(Val: AMDGPU::sub0_sub1, DL, VT: MVT::i32), SubRegHi,
      DAG.getTargetConstant(Val: AMDGPU::sub2_sub3, DL, VT: MVT::i32)};

  return DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v4i32, Ops: Ops1);
}
18128
/// Return a resource descriptor with the 'Add TID' bit enabled
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to
/// the resource pointer.
///
/// \p RsrcDword1 is OR'ed into the high dword of \p Ptr (dword 1 of the
/// descriptor); \p RsrcDword2And3 supplies dwords 2 and 3 verbatim.
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
                                           SDValue Ptr, uint32_t RsrcDword1,
                                           uint64_t RsrcDword2And3) const {
  SDValue PtrLo = DAG.getTargetExtractSubreg(SRIdx: AMDGPU::sub0, DL, VT: MVT::i32, Operand: Ptr);
  SDValue PtrHi = DAG.getTargetExtractSubreg(SRIdx: AMDGPU::sub1, DL, VT: MVT::i32, Operand: Ptr);
  if (RsrcDword1) {
    PtrHi =
        SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_OR_B32, dl: DL, VT: MVT::i32, Op1: PtrHi,
                                   Op2: DAG.getConstant(Val: RsrcDword1, DL, VT: MVT::i32)),
                0);
  }

  // Materialize the constant low and high halves of dwords 2-3.
  SDValue DataLo =
      buildSMovImm32(DAG, DL, Val: RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
  SDValue DataHi = buildSMovImm32(DAG, DL, Val: RsrcDword2And3 >> 32);

  // Assemble the four dwords into a 128-bit SGPR tuple.
  const SDValue Ops[] = {
      DAG.getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32),
      PtrLo,
      DAG.getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
      PtrHi,
      DAG.getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32),
      DataLo,
      DAG.getTargetConstant(Val: AMDGPU::sub2, DL, VT: MVT::i32),
      DataHi,
      DAG.getTargetConstant(Val: AMDGPU::sub3, DL, VT: MVT::i32)};

  return DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v4i32, Ops);
}
18162
18163//===----------------------------------------------------------------------===//
18164// SI Inline Assembly Support
18165//===----------------------------------------------------------------------===//
18166
/// Resolve an inline-asm register constraint to a (register, register class)
/// pair. Handles the target's 's'/'v'/'a' class constraints, the "VA" super
/// class constraint, and explicit physical-register constraints; everything
/// else is deferred to the generic TargetLowering implementation.
std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
                                               StringRef Constraint,
                                               MVT VT) const {
  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);

  const TargetRegisterClass *RC = nullptr;
  if (Constraint.size() == 1) {
    // Check if we cannot determine the bit size of the given value type. This
    // can happen, for example, in this situation where we have an empty struct
    // (size 0): `call void asm "", "v"({} poison)`-
    if (VT == MVT::Other)
      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    const unsigned BitWidth = VT.getSizeInBits();
    switch (Constraint[0]) {
    default:
      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    case 's': // Scalar (SGPR) register classes; 'r' is treated the same.
    case 'r':
      switch (BitWidth) {
      case 16:
        RC = &AMDGPU::SReg_32RegClass;
        break;
      case 64:
        RC = &AMDGPU::SGPR_64RegClass;
        break;
      default:
        RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth);
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    case 'v': // Vector (VGPR) register classes.
      switch (BitWidth) {
      case 1:
        return std::pair(0U, nullptr);
      case 16:
        // With true 16-bit instructions, 16-bit values live in VGPR_16.
        RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
                                             : &AMDGPU::VGPR_32_Lo256RegClass;
        break;
      default:
        RC = Subtarget->has1024AddressableVGPRs()
                 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
                 : TRI->getVGPRClassForBitWidth(BitWidth);
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    case 'a': // Accumulator (AGPR) register classes; requires MAI support.
      if (!Subtarget->hasMAIInsts())
        break;
      switch (BitWidth) {
      case 1:
        return std::pair(0U, nullptr);
      case 16:
        RC = &AMDGPU::AGPR_32RegClass;
        break;
      default:
        RC = TRI->getAGPRClassForBitWidth(BitWidth);
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    }
  } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
    // "VA": either a VGPR or an AGPR (AV super class).
    const unsigned BitWidth = VT.getSizeInBits();
    switch (BitWidth) {
    case 16:
      RC = &AMDGPU::AV_32RegClass;
      break;
    default:
      RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
      if (!RC)
        return std::pair(0U, nullptr);
      break;
    }
  }

  // We actually support i128, i16 and f16 as inline parameters
  // even if they are not reported as legal
  if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
             VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
    return std::pair(0U, RC);

  // Explicit physical-register constraints, e.g. "{v0}" or a register range.
  auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
  if (Kind != '\0') {
    if (Kind == 'v') {
      RC = &AMDGPU::VGPR_32_Lo256RegClass;
    } else if (Kind == 's') {
      RC = &AMDGPU::SGPR_32RegClass;
    } else if (Kind == 'a') {
      RC = &AMDGPU::AGPR_32RegClass;
    }

    if (RC) {
      if (NumRegs > 1) {
        // A register range: map it onto the wider class of matching width and
        // find the super-register starting at Idx.
        if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
          return std::pair(0U, nullptr);

        uint32_t Width = NumRegs * 32;
        // Prohibit constraints for register ranges with a width that does not
        // match the required type.
        if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
          return std::pair(0U, nullptr);

        MCRegister Reg = RC->getRegister(i: Idx);
        if (SIRegisterInfo::isVGPRClass(RC))
          RC = TRI->getVGPRClassForBitWidth(BitWidth: Width);
        else if (SIRegisterInfo::isSGPRClass(RC))
          RC = TRI->getSGPRClassForBitWidth(BitWidth: Width);
        else if (SIRegisterInfo::isAGPRClass(RC))
          RC = TRI->getAGPRClassForBitWidth(BitWidth: Width);
        if (RC) {
          Reg = TRI->getMatchingSuperReg(Reg, SubIdx: AMDGPU::sub0, RC);
          if (!Reg) {
            // The register class does not contain the requested register,
            // e.g., because it is an SGPR pair that would violate alignment
            // requirements.
            return std::pair(0U, nullptr);
          }
          return std::pair(Reg, RC);
        }
      }

      // Check for lossy scalar/vector conversions.
      if (VT.isVector() && VT.getSizeInBits() != 32)
        return std::pair(0U, nullptr);
      if (Idx < RC->getNumRegs())
        return std::pair(RC->getRegister(i: Idx), RC);
      return std::pair(0U, nullptr);
    }
  }

  // Fall back to the generic handling and recover the base class of any
  // physical register it picked.
  auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
  if (Ret.first)
    Ret.second = TRI->getPhysRegBaseClass(Reg: Ret.first);

  return Ret;
}
18309
18310static bool isImmConstraint(StringRef Constraint) {
18311 if (Constraint.size() == 1) {
18312 switch (Constraint[0]) {
18313 default:
18314 break;
18315 case 'I':
18316 case 'J':
18317 case 'A':
18318 case 'B':
18319 case 'C':
18320 return true;
18321 }
18322 } else if (Constraint == "DA" || Constraint == "DB") {
18323 return true;
18324 }
18325 return false;
18326}
18327
18328SITargetLowering::ConstraintType
18329SITargetLowering::getConstraintType(StringRef Constraint) const {
18330 if (Constraint.size() == 1) {
18331 switch (Constraint[0]) {
18332 default:
18333 break;
18334 case 's':
18335 case 'v':
18336 case 'a':
18337 return C_RegisterClass;
18338 }
18339 } else if (Constraint.size() == 2) {
18340 if (Constraint == "VA")
18341 return C_RegisterClass;
18342 }
18343 if (isImmConstraint(Constraint)) {
18344 return C_Other;
18345 }
18346 return TargetLowering::getConstraintType(Constraint);
18347}
18348
18349static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
18350 if (!AMDGPU::isInlinableIntLiteral(Literal: Val)) {
18351 Val = Val & maskTrailingOnes<uint64_t>(N: Size);
18352 }
18353 return Val;
18354}
18355
18356void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
18357 StringRef Constraint,
18358 std::vector<SDValue> &Ops,
18359 SelectionDAG &DAG) const {
18360 if (isImmConstraint(Constraint)) {
18361 uint64_t Val;
18362 if (getAsmOperandConstVal(Op, Val) &&
18363 checkAsmConstraintVal(Op, Constraint, Val)) {
18364 Val = clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits());
18365 Ops.push_back(x: DAG.getTargetConstant(Val, DL: SDLoc(Op), VT: MVT::i64));
18366 }
18367 } else {
18368 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
18369 }
18370}
18371
18372bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
18373 unsigned Size = Op.getScalarValueSizeInBits();
18374 if (Size > 64)
18375 return false;
18376
18377 if (Size == 16 && !Subtarget->has16BitInsts())
18378 return false;
18379
18380 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op)) {
18381 Val = C->getSExtValue();
18382 return true;
18383 }
18384 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
18385 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
18386 return true;
18387 }
18388 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Val&: Op)) {
18389 if (Size != 16 || Op.getNumOperands() != 2)
18390 return false;
18391 if (Op.getOperand(i: 0).isUndef() || Op.getOperand(i: 1).isUndef())
18392 return false;
18393 if (ConstantSDNode *C = V->getConstantSplatNode()) {
18394 Val = C->getSExtValue();
18395 return true;
18396 }
18397 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
18398 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
18399 return true;
18400 }
18401 }
18402
18403 return false;
18404}
18405
18406bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
18407 uint64_t Val) const {
18408 if (Constraint.size() == 1) {
18409 switch (Constraint[0]) {
18410 case 'I':
18411 return AMDGPU::isInlinableIntLiteral(Literal: Val);
18412 case 'J':
18413 return isInt<16>(x: Val);
18414 case 'A':
18415 return checkAsmConstraintValA(Op, Val);
18416 case 'B':
18417 return isInt<32>(x: Val);
18418 case 'C':
18419 return isUInt<32>(x: clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits())) ||
18420 AMDGPU::isInlinableIntLiteral(Literal: Val);
18421 default:
18422 break;
18423 }
18424 } else if (Constraint.size() == 2) {
18425 if (Constraint == "DA") {
18426 int64_t HiBits = static_cast<int32_t>(Val >> 32);
18427 int64_t LoBits = static_cast<int32_t>(Val);
18428 return checkAsmConstraintValA(Op, Val: HiBits, MaxSize: 32) &&
18429 checkAsmConstraintValA(Op, Val: LoBits, MaxSize: 32);
18430 }
18431 if (Constraint == "DB") {
18432 return true;
18433 }
18434 }
18435 llvm_unreachable("Invalid asm constraint");
18436}
18437
18438bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
18439 unsigned MaxSize) const {
18440 unsigned Size = std::min<unsigned>(a: Op.getScalarValueSizeInBits(), b: MaxSize);
18441 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
18442 if (Size == 16) {
18443 MVT VT = Op.getSimpleValueType();
18444 switch (VT.SimpleTy) {
18445 default:
18446 return false;
18447 case MVT::i16:
18448 return AMDGPU::isInlinableLiteralI16(Literal: Val, HasInv2Pi);
18449 case MVT::f16:
18450 return AMDGPU::isInlinableLiteralFP16(Literal: Val, HasInv2Pi);
18451 case MVT::bf16:
18452 return AMDGPU::isInlinableLiteralBF16(Literal: Val, HasInv2Pi);
18453 case MVT::v2i16:
18454 return AMDGPU::getInlineEncodingV2I16(Literal: Val).has_value();
18455 case MVT::v2f16:
18456 return AMDGPU::getInlineEncodingV2F16(Literal: Val).has_value();
18457 case MVT::v2bf16:
18458 return AMDGPU::getInlineEncodingV2BF16(Literal: Val).has_value();
18459 }
18460 }
18461 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Literal: Val, HasInv2Pi)) ||
18462 (Size == 64 && AMDGPU::isInlinableLiteral64(Literal: Val, HasInv2Pi)))
18463 return true;
18464 return false;
18465}
18466
18467static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
18468 switch (UnalignedClassID) {
18469 case AMDGPU::VReg_64RegClassID:
18470 return AMDGPU::VReg_64_Align2RegClassID;
18471 case AMDGPU::VReg_96RegClassID:
18472 return AMDGPU::VReg_96_Align2RegClassID;
18473 case AMDGPU::VReg_128RegClassID:
18474 return AMDGPU::VReg_128_Align2RegClassID;
18475 case AMDGPU::VReg_160RegClassID:
18476 return AMDGPU::VReg_160_Align2RegClassID;
18477 case AMDGPU::VReg_192RegClassID:
18478 return AMDGPU::VReg_192_Align2RegClassID;
18479 case AMDGPU::VReg_224RegClassID:
18480 return AMDGPU::VReg_224_Align2RegClassID;
18481 case AMDGPU::VReg_256RegClassID:
18482 return AMDGPU::VReg_256_Align2RegClassID;
18483 case AMDGPU::VReg_288RegClassID:
18484 return AMDGPU::VReg_288_Align2RegClassID;
18485 case AMDGPU::VReg_320RegClassID:
18486 return AMDGPU::VReg_320_Align2RegClassID;
18487 case AMDGPU::VReg_352RegClassID:
18488 return AMDGPU::VReg_352_Align2RegClassID;
18489 case AMDGPU::VReg_384RegClassID:
18490 return AMDGPU::VReg_384_Align2RegClassID;
18491 case AMDGPU::VReg_512RegClassID:
18492 return AMDGPU::VReg_512_Align2RegClassID;
18493 case AMDGPU::VReg_1024RegClassID:
18494 return AMDGPU::VReg_1024_Align2RegClassID;
18495 case AMDGPU::AReg_64RegClassID:
18496 return AMDGPU::AReg_64_Align2RegClassID;
18497 case AMDGPU::AReg_96RegClassID:
18498 return AMDGPU::AReg_96_Align2RegClassID;
18499 case AMDGPU::AReg_128RegClassID:
18500 return AMDGPU::AReg_128_Align2RegClassID;
18501 case AMDGPU::AReg_160RegClassID:
18502 return AMDGPU::AReg_160_Align2RegClassID;
18503 case AMDGPU::AReg_192RegClassID:
18504 return AMDGPU::AReg_192_Align2RegClassID;
18505 case AMDGPU::AReg_256RegClassID:
18506 return AMDGPU::AReg_256_Align2RegClassID;
18507 case AMDGPU::AReg_512RegClassID:
18508 return AMDGPU::AReg_512_Align2RegClassID;
18509 case AMDGPU::AReg_1024RegClassID:
18510 return AMDGPU::AReg_1024_Align2RegClassID;
18511 default:
18512 return -1;
18513 }
18514}
18515
// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  if (Info->isEntryFunction()) {
    // Callable functions have fixed registers used for stack access.
    reservePrivateMemoryRegs(TM: getTargetMachine(), MF, TRI: *TRI, Info&: *Info);
  }

  // TODO: Move this logic to getReservedRegs()
  // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
  // Wave32 uses the highest numbered SGPR; wave64 needs an aligned pair.
  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  Register SReg = ST.isWave32()
                      ? AMDGPU::SGPR_32RegClass.getRegister(i: MaxNumSGPRs - 1)
                      : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
                                                     RC: &AMDGPU::SGPR_64RegClass);
  Info->setSGPRForEXECCopy(SReg);

  // Replace the placeholder stack registers with the ones chosen for this
  // function.
  assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
                             Info->getStackPtrOffsetReg()));
  if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
    MRI.replaceRegWith(FromReg: AMDGPU::SP_REG, ToReg: Info->getStackPtrOffsetReg());

  // We need to worry about replacing the default register with itself in case
  // of MIR testcases missing the MFI.
  if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
    MRI.replaceRegWith(FromReg: AMDGPU::PRIVATE_RSRC_REG, ToReg: Info->getScratchRSrcReg());

  if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
    MRI.replaceRegWith(FromReg: AMDGPU::FP_REG, ToReg: Info->getFrameOffsetReg());

  Info->limitOccupancy(MF);

  // On wave32, implicit exec-mask operands must be rewritten for every
  // instruction.
  if (ST.isWave32() && !MF.empty()) {
    for (auto &MBB : MF) {
      for (auto &MI : MBB) {
        TII->fixImplicitOperands(MI);
      }
    }
  }

  // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
  // classes if required. Ideally the register class constraints would differ
  // per-subtarget, but there's no easy way to achieve that right now. This is
  // not a problem for VGPRs because the correctly aligned VGPR class is implied
  // from using them as the register class for legal types.
  if (ST.needsAlignedVGPRs()) {
    for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
      const Register Reg = Register::index2VirtReg(Index: I);
      const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
      if (!RC)
        continue;
      int NewClassID = getAlignedAGPRClassID(UnalignedClassID: RC->getID());
      if (NewClassID != -1)
        MRI.setRegClass(Reg, RC: TRI->getRegClass(i: NewClassID));
    }
  }

  TargetLoweringBase::finalizeLowering(MF);
}
18582
// Target-specific known-bits computation for SelectionDAG nodes. Currently
// only the mbcnt intrinsics are handled; everything else falls through to
// the AMDGPU base implementation.
void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                     KnownBits &Known,
                                                     const APInt &DemandedElts,
                                                     const SelectionDAG &DAG,
                                                     unsigned Depth) const {
  Known.resetAll();
  unsigned Opc = Op.getOpcode();
  switch (Opc) {
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = Op.getConstantOperandVal(i: 0);
    switch (IID) {
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
      const GCNSubtarget &ST =
          DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
      // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
      // most 31 + src1.
      Known.Zero.setBitsFrom(
          IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
      // Model the result as (bounded mask count) + src1 (operand 2).
      KnownBits Known2 = DAG.computeKnownBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
      Known = KnownBits::add(LHS: Known, RHS: Known2);
      return;
    }
    }
    break;
  }
  }
  return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
      Op, Known, DemandedElts, DAG, Depth);
}
18613
// Refine the generic frame-index known bits with the subtarget's bound on
// the per-wave scratch size.
void SITargetLowering::computeKnownBitsForFrameIndex(
    const int FI, KnownBits &Known, const MachineFunction &MF) const {
  TargetLowering::computeKnownBitsForFrameIndex(FIOp: FI, Known, MF);

  // Set the high bits to zero based on the maximum allowed scratch size per
  // wave. We can't use vaddr in MUBUF instructions if we don't know the address
  // calculation won't overflow, so assume the sign bit is never set.
  Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
}
18623
18624static void knownBitsForWorkitemID(const GCNSubtarget &ST,
18625 GISelValueTracking &VT, KnownBits &Known,
18626 unsigned Dim) {
18627 unsigned MaxValue =
18628 ST.getMaxWorkitemID(Kernel: VT.getMachineFunction().getFunction(), Dimension: Dim);
18629 Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue));
18630}
18631
// Compute known bits for the S_BFE (scalar bitfield extract) instructions.
// \p BFEWidth is the operation width (32 or 64); \p SExt selects sign- vs
// zero-extension of the extracted field. If the offset/width operand is not
// a known constant, Known is left untouched (fully unknown).
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
                             KnownBits &Known, const APInt &DemandedElts,
                             unsigned BFEWidth, bool SExt, unsigned Depth) {
  const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo();
  const MachineOperand &Src1 = MI.getOperand(i: 2);

  // Src1 packs the (offset, width) control value. It may be an inline
  // immediate or a register holding a known constant.
  unsigned Src1Cst = 0;
  if (Src1.isImm()) {
    Src1Cst = Src1.getImm();
  } else if (Src1.isReg()) {
    auto Cst = getIConstantVRegValWithLookThrough(VReg: Src1.getReg(), MRI);
    if (!Cst)
      return;
    Src1Cst = Cst->Value.getZExtValue();
  } else {
    return;
  }

  // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
  // Width is always [22:16].
  const unsigned Offset =
      Src1Cst & maskTrailingOnes<unsigned>(N: (BFEWidth == 32) ? 5 : 6);
  const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(N: 6);

  if (Width >= BFEWidth) // Ill-formed.
    return;

  // Known bits of the source register, restricted to the extracted field,
  // then extended back to the full operation width.
  // NOTE(review): assumes Offset + Width stays within BFEWidth for the
  // constants that reach here — confirm against how these instructions are
  // formed.
  VT.computeKnownBitsImpl(R: MI.getOperand(i: 1).getReg(), Known, DemandedElts,
                          Depth: Depth + 1);

  Known = Known.extractBits(NumBits: Width, BitPosition: Offset);

  if (SExt)
    Known = Known.sext(BitWidth: BFEWidth);
  else
    Known = Known.zext(BitWidth: BFEWidth);
}
18669
// GlobalISel known-bits hook for target-specific instructions and
// intrinsics: S_BFE, workitem ids, mbcnt, groupstaticsize, zero-extending
// buffer loads, SCC/VCC copies and med3.
void SITargetLowering::computeKnownBitsForTargetInstr(
    GISelValueTracking &VT, Register R, KnownBits &Known,
    const APInt &DemandedElts, const MachineRegisterInfo &MRI,
    unsigned Depth) const {
  Known.resetAll();
  const MachineInstr *MI = MRI.getVRegDef(Reg: R);
  switch (MI->getOpcode()) {
  case AMDGPU::S_BFE_I32:
    return knownBitsForSBFE(MI: *MI, VT, Known, DemandedElts, /*Width=*/BFEWidth: 32,
                            /*SExt=*/true, Depth);
  case AMDGPU::S_BFE_U32:
    return knownBitsForSBFE(MI: *MI, VT, Known, DemandedElts, /*Width=*/BFEWidth: 32,
                            /*SExt=*/false, Depth);
  case AMDGPU::S_BFE_I64:
    return knownBitsForSBFE(MI: *MI, VT, Known, DemandedElts, /*Width=*/BFEWidth: 64,
                            /*SExt=*/true, Depth);
  case AMDGPU::S_BFE_U64:
    return knownBitsForSBFE(MI: *MI, VT, Known, DemandedElts, /*Width=*/BFEWidth: 64,
                            /*SExt=*/false, Depth);
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IID = cast<GIntrinsic>(Val: MI)->getIntrinsicID();
    switch (IID) {
    case Intrinsic::amdgcn_workitem_id_x:
      knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: 0);
      break;
    case Intrinsic::amdgcn_workitem_id_y:
      knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: 1);
      break;
    case Intrinsic::amdgcn_workitem_id_z:
      knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: 2);
      break;
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
      // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
      // most 31 + src1.
      Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
                                 ? getSubtarget()->getWavefrontSizeLog2()
                                 : 5);
      // Add in the known bits of the src1 addend (operand 3).
      KnownBits Known2;
      VT.computeKnownBitsImpl(R: MI->getOperand(i: 3).getReg(), Known&: Known2, DemandedElts,
                              Depth: Depth + 1);
      Known = KnownBits::add(LHS: Known, RHS: Known2);
      break;
    }
    case Intrinsic::amdgcn_groupstaticsize: {
      // We can report everything over the maximum size as 0. We can't report
      // based on the actual size because we don't know if it's accurate or not
      // at any given point.
      Known.Zero.setHighBits(
          llvm::countl_zero(Val: getSubtarget()->getAddressableLocalMemorySize()));
      break;
    }
    }
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    // Unsigned byte load: the top 24 bits are zero.
    Known.Zero.setHighBits(24);
    break;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    // Unsigned short load: the top 16 bits are zero.
    Known.Zero.setHighBits(16);
    break;
  case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
    // G_AMDGPU_COPY_SCC_VCC converts a uniform boolean in VCC to SGPR s32,
    // producing exactly 0 or 1.
    Known.Zero.setHighBits(Known.getBitWidth() - 1);
    break;
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();

    // med3 returns one of its three operands, so a bit is known iff it has
    // the same known value in all three. Bail out early once any source is
    // completely unknown.
    KnownBits Known2;
    VT.computeKnownBitsImpl(R: Src2, Known&: Known2, DemandedElts, Depth: Depth + 1);
    if (Known2.isUnknown())
      break;

    KnownBits Known1;
    VT.computeKnownBitsImpl(R: Src1, Known&: Known1, DemandedElts, Depth: Depth + 1);
    if (Known1.isUnknown())
      break;

    KnownBits Known0;
    VT.computeKnownBitsImpl(R: Src0, Known&: Known0, DemandedElts, Depth: Depth + 1);
    if (Known0.isUnknown())
      break;

    // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
    Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
    Known.One = Known0.One & Known1.One & Known2.One;
    break;
  }
  }
}
18763
// Known-alignment hook for GlobalISel: for intrinsic calls, use the return
// alignment recorded in the intrinsic's attribute list; otherwise report no
// known alignment (Align(1)).
Align SITargetLowering::computeKnownAlignForTargetInstr(
    GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
    unsigned Depth) const {
  const MachineInstr *MI = MRI.getVRegDef(Reg: R);
  if (auto *GI = dyn_cast<GIntrinsic>(Val: MI)) {
    // FIXME: Can this move to generic code? What about the case where the call
    // site specifies a lower alignment?
    Intrinsic::ID IID = GI->getIntrinsicID();
    LLVMContext &Ctx = VT.getMachineFunction().getFunction().getContext();
    AttributeList Attrs =
        Intrinsic::getAttributes(C&: Ctx, id: IID, FT: Intrinsic::getType(Context&: Ctx, id: IID));
    if (MaybeAlign RetAlign = Attrs.getRetAlignment())
      return *RetAlign;
  }
  return Align(1);
}
18780
// Choose the preferred alignment for loop headers. On GFX950 this avoids
// splitting the first loop-header instruction across a fetch-window
// boundary; on GFX10-class targets it aligns loops to instruction-cache
// lines and inserts S_INST_PREFETCH instructions around suitable loops.
// Note: this may mutate the function by inserting prefetch instructions.
Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
  const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
  const Align CacheLineAlign = Align(64);

  // GFX950: Prevent an 8-byte instruction at loop header from being split by
  // the 32-byte instruction fetch window boundary. This avoids a significant
  // fetch delay after backward branch. We use 32-byte alignment with max
  // padding of 4 bytes (one s_nop), see getMaxPermittedBytesForAlignment().
  if (ML && !DisableLoopAlignment &&
      getSubtarget()->hasLoopHeadInstSplitSensitivity()) {
    const MachineBasicBlock *Header = ML->getHeader();
    // Respect user-specified or previously set alignment.
    if (Header->getAlignment() != PrefAlign)
      return Header->getAlignment();
    if (needsFetchWindowAlignment(MBB: *Header))
      return Align(32);
  }

  // Pre-GFX10 target did not benefit from loop alignment
  if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
      getSubtarget()->hasInstFwdPrefetchBug())
    return PrefAlign;

  // On GFX10 I$ is 4 x 64 bytes cache lines.
  // By default prefetcher keeps one cache line behind and reads two ahead.
  // We can modify it with S_INST_PREFETCH for larger loops to have two lines
  // behind and one ahead.
  // Therefore we can benefit from aligning loop headers if loop fits 192 bytes.
  // If loop fits 64 bytes it always spans no more than two cache lines and
  // does not need an alignment.
  // Else if loop is less or equal 128 bytes we do not need to modify prefetch,
  // Else if loop is less or equal 192 bytes we need two lines behind.

  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  const MachineBasicBlock *Header = ML->getHeader();
  if (Header->getAlignment() != PrefAlign)
    return Header->getAlignment(); // Already processed.

  // Estimate the total code size of the loop, giving up as soon as it
  // exceeds the 192-byte threshold above.
  unsigned LoopSize = 0;
  for (const MachineBasicBlock *MBB : ML->blocks()) {
    // If inner loop block is aligned assume in average half of the alignment
    // size to be added as nops.
    if (MBB != Header)
      LoopSize += MBB->getAlignment().value() / 2;

    for (const MachineInstr &MI : *MBB) {
      LoopSize += TII->getInstSizeInBytes(MI);
      if (LoopSize > 192)
        return PrefAlign;
    }
  }

  if (LoopSize <= 64)
    return PrefAlign;

  if (LoopSize <= 128)
    return CacheLineAlign;

  // If any of parent loops is surrounded by prefetch instructions do not
  // insert new for inner loop, which would reset parent's settings.
  for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
    if (MachineBasicBlock *Exit = P->getExitBlock()) {
      auto I = Exit->getFirstNonDebugInstr();
      if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
        return CacheLineAlign;
    }
  }

  // Surround the loop with S_INST_PREFETCH: switch to two-lines-behind mode
  // in the preheader and restore the default in the exit block.
  MachineBasicBlock *Pre = ML->getLoopPreheader();
  MachineBasicBlock *Exit = ML->getExitBlock();

  if (Pre && Exit) {
    auto PreTerm = Pre->getFirstTerminator();
    if (PreTerm == Pre->begin() ||
        std::prev(x: PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
      BuildMI(BB&: *Pre, I: PreTerm, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_INST_PREFETCH))
          .addImm(Val: 1); // prefetch 2 lines behind PC

    auto ExitHead = Exit->getFirstNonDebugInstr();
    if (ExitHead == Exit->end() ||
        ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
      BuildMI(BB&: *Exit, I: ExitHead, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_INST_PREFETCH))
          .addImm(Val: 2); // prefetch 1 line behind PC
  }

  return CacheLineAlign;
}
18868
18869unsigned SITargetLowering::getMaxPermittedBytesForAlignment(
18870 MachineBasicBlock *MBB) const {
18871 // GFX950: Limit padding to 4 bytes (one s_nop) for blocks where an 8-byte
18872 // instruction could be split by the 32-byte fetch window boundary.
18873 // See getPrefLoopAlignment() for context.
18874 if (needsFetchWindowAlignment(MBB: *MBB))
18875 return 4;
18876 return TargetLowering::getMaxPermittedBytesForAlignment(MBB);
18877}
18878
18879bool SITargetLowering::needsFetchWindowAlignment(
18880 const MachineBasicBlock &MBB) const {
18881 if (!getSubtarget()->hasLoopHeadInstSplitSensitivity())
18882 return false;
18883 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18884 for (const MachineInstr &MI : MBB) {
18885 if (MI.isMetaInstruction())
18886 continue;
18887 // Instructions larger than 4 bytes can be split by a 32-byte boundary.
18888 return TII->getInstSizeInBytes(MI) > 4;
18889 }
18890 return false;
18891}
18892
18893[[maybe_unused]]
18894static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18895 assert(N->getOpcode() == ISD::CopyFromReg);
18896 do {
18897 // Follow the chain until we find an INLINEASM node.
18898 N = N->getOperand(Num: 0).getNode();
18899 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18900 return true;
18901 } while (N->getOpcode() == ISD::CopyFromReg);
18902 return false;
18903}
18904
// Decide whether a DAG node must be treated as a source of divergence:
// VGPR copies, loads that may touch private memory, divergent intrinsics
// and read-modify-write atomics.
bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
                                                  FunctionLoweringInfo *FLI,
                                                  UniformityInfo *UA) const {
  switch (N->getOpcode()) {
  case ISD::CopyFromReg: {
    const RegisterSDNode *R = cast<RegisterSDNode>(Val: N->getOperand(Num: 1));
    const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    Register Reg = R->getReg();

    // FIXME: Why does this need to consider isLiveIn?
    if (Reg.isPhysical() || MRI.isLiveIn(Reg))
      return !TRI->isSGPRReg(MRI, Reg);

    // Virtual registers mapped back to IR values defer to uniformity
    // analysis.
    if (const Value *V = FLI->getValueFromVirtualReg(Vreg: R->getReg()))
      return UA->isDivergent(V);

    assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
    return !TRI->isSGPRReg(MRI, Reg);
  }
  case ISD::LOAD: {
    const LoadSDNode *L = cast<LoadSDNode>(Val: N);
    unsigned AS = L->getAddressSpace();
    // A flat load may access private memory.
    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
  }
  case ISD::CALLSEQ_END:
    // Conservatively treat call results as divergent.
    return true;
  case ISD::INTRINSIC_WO_CHAIN:
    return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: 0));
  case ISD::INTRINSIC_W_CHAIN:
    return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: 1));
  case AMDGPUISD::ATOMIC_CMP_SWAP:
  case AMDGPUISD::BUFFER_ATOMIC_SWAP:
  case AMDGPUISD::BUFFER_ATOMIC_ADD:
  case AMDGPUISD::BUFFER_ATOMIC_SUB:
  case AMDGPUISD::BUFFER_ATOMIC_SMIN:
  case AMDGPUISD::BUFFER_ATOMIC_UMIN:
  case AMDGPUISD::BUFFER_ATOMIC_SMAX:
  case AMDGPUISD::BUFFER_ATOMIC_UMAX:
  case AMDGPUISD::BUFFER_ATOMIC_AND:
  case AMDGPUISD::BUFFER_ATOMIC_OR:
  case AMDGPUISD::BUFFER_ATOMIC_XOR:
  case AMDGPUISD::BUFFER_ATOMIC_INC:
  case AMDGPUISD::BUFFER_ATOMIC_DEC:
  case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
  case AMDGPUISD::BUFFER_ATOMIC_FADD:
  case AMDGPUISD::BUFFER_ATOMIC_FMIN:
  case AMDGPUISD::BUFFER_ATOMIC_FMAX:
    // Target-specific read-modify-write atomics are sources of divergence.
    return true;
  default:
    if (auto *A = dyn_cast<AtomicSDNode>(Val: N)) {
      // Generic read-modify-write atomics are sources of divergence.
      return A->readMem() && A->writeMem();
    }
    return false;
  }
}
18964
18965bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
18966 EVT VT) const {
18967 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18968 case MVT::f32:
18969 return !denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
18970 case MVT::f64:
18971 case MVT::f16:
18972 return !denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction());
18973 default:
18974 return false;
18975 }
18976}
18977
18978bool SITargetLowering::denormalsEnabledForType(
18979 LLT Ty, const MachineFunction &MF) const {
18980 switch (Ty.getScalarSizeInBits()) {
18981 case 32:
18982 return !denormalModeIsFlushAllF32(MF);
18983 case 64:
18984 case 16:
18985 return !denormalModeIsFlushAllF64F16(MF);
18986 default:
18987 return false;
18988 }
18989}
18990
18991bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
18992 const APInt &DemandedElts,
18993 const SelectionDAG &DAG,
18994 bool SNaN,
18995 unsigned Depth) const {
18996 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18997 const MachineFunction &MF = DAG.getMachineFunction();
18998 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
18999
19000 if (Info->getMode().DX10Clamp)
19001 return true; // Clamped to 0.
19002 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1);
19003 }
19004
19005 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
19006 DAG, SNaN, Depth);
19007}
19008
19009// On older subtargets, global FP atomic instructions have a hardcoded FP mode
19010// and do not support FP32 denormals, and only support v2f16/f64 denormals.
19011static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
19012 if (RMW->hasMetadata(Kind: "amdgpu.ignore.denormal.mode"))
19013 return true;
19014
19015 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
19016 auto DenormMode = RMW->getFunction()->getDenormalMode(FPType: Flt);
19017 if (DenormMode == DenormalMode::getPreserveSign())
19018 return true;
19019
19020 // TODO: Remove this.
19021 return RMW->getFunction()
19022 ->getFnAttribute(Kind: "amdgpu-unsafe-fp-atomics")
19023 .getValueAsBool();
19024}
19025
19026static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
19027 LLVMContext &Ctx = RMW->getContext();
19028 StringRef MemScope =
19029 Ctx.getSyncScopeName(Id: RMW->getSyncScopeID()).value_or(u: "system");
19030
19031 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
19032 << "Hardware instruction generated for atomic "
19033 << RMW->getOperationName(Op: RMW->getOperation())
19034 << " operation at memory scope " << MemScope;
19035}
19036
19037static bool isV2F16OrV2BF16(Type *Ty) {
19038 if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
19039 Type *EltTy = VT->getElementType();
19040 return VT->getNumElements() == 2 &&
19041 (EltTy->isHalfTy() || EltTy->isBFloatTy());
19042 }
19043
19044 return false;
19045}
19046
19047static bool isV2F16(Type *Ty) {
19048 FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty);
19049 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
19050}
19051
19052static bool isV2BF16(Type *Ty) {
19053 FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty);
19054 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
19055}
19056
19057/// \return true if atomicrmw integer ops work for the type.
19058static bool isAtomicRMWLegalIntTy(Type *Ty) {
19059 if (auto *IT = dyn_cast<IntegerType>(Val: Ty)) {
19060 unsigned BW = IT->getBitWidth();
19061 return BW == 32 || BW == 64;
19062 }
19063
19064 return false;
19065}
19066
19067/// \return true if this atomicrmw xchg type can be selected.
19068static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
19069 Type *Ty = RMW->getType();
19070 if (isAtomicRMWLegalIntTy(Ty))
19071 return true;
19072
19073 if (PointerType *PT = dyn_cast<PointerType>(Val: Ty)) {
19074 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
19075 unsigned BW = DL.getPointerSizeInBits(AS: PT->getAddressSpace());
19076 return BW == 32 || BW == 64;
19077 }
19078
19079 if (Ty->isFloatTy() || Ty->isDoubleTy())
19080 return true;
19081
19082 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
19083 return VT->getNumElements() == 2 &&
19084 VT->getElementType()->getPrimitiveSizeInBits() == 16;
19085 }
19086
19087 return false;
19088}
19089
/// \returns true if it's valid to emit a native instruction for \p RMW, based
/// on the properties of the target memory.
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
                                        const AtomicRMWInst *RMW,
                                        bool HasSystemScope) {
  // The remote/fine-grained access logic is different from the integer
  // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
  // fine-grained access does not work, even for a device local allocation.
  //
  // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
  // allocations work.
  if (HasSystemScope) {
    // System scope: legal only if remote memory is ruled out (with agent
    // scope fine-grained support) or system scope is emulated.
    if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
        RMW->hasMetadata(Kind: "amdgpu.no.remote.memory"))
      return true;
    if (Subtarget.hasEmulatedSystemScopeAtomics())
      return true;
  } else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
    return true;

  // Fall back to requiring the no-fine-grained-memory guarantee.
  return RMW->hasMetadata(Kind: "amdgpu.no.fine.grained.memory");
}
19112
19113/// \return Action to perform on AtomicRMWInsts for integer operations.
19114static TargetLowering::AtomicExpansionKind
19115atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
19116 return isAtomicRMWLegalIntTy(Ty: RMW->getType())
19117 ? TargetLowering::AtomicExpansionKind::None
19118 : TargetLowering::AtomicExpansionKind::CmpXChg;
19119}
19120
19121/// Return if a flat address space atomicrmw can access private memory.
19122static bool flatInstrMayAccessPrivate(const Instruction *I) {
19123 const MDNode *MD = I->getMetadata(KindID: LLVMContext::MD_noalias_addrspace);
19124 return !MD ||
19125 !AMDGPU::hasValueInRangeLikeMetadata(MD: *MD, Val: AMDGPUAS::PRIVATE_ADDRESS);
19126}
19127
19128static TargetLowering::AtomicExpansionKind
19129getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
19130 // For GAS, lower to flat atomic.
19131 return STI.hasGloballyAddressableScratch()
19132 ? TargetLowering::AtomicExpansionKind::CustomExpand
19133 : TargetLowering::AtomicExpansionKind::NotAtomic;
19134}
19135
19136TargetLowering::AtomicExpansionKind
19137SITargetLowering::shouldExpandAtomicRMWInIR(const AtomicRMWInst *RMW) const {
19138 unsigned AS = RMW->getPointerAddressSpace();
19139 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
19140 return getPrivateAtomicExpansionKind(STI: *getSubtarget());
19141
19142 // 64-bit flat atomics that dynamically reside in private memory will silently
19143 // be dropped.
19144 //
19145 // Note that we will emit a new copy of the original atomic in the expansion,
19146 // which will be incrementally relegalized.
19147 const DataLayout &DL = RMW->getFunction()->getDataLayout();
19148 if (AS == AMDGPUAS::FLAT_ADDRESS &&
19149 DL.getTypeSizeInBits(Ty: RMW->getType()) == 64 &&
19150 flatInstrMayAccessPrivate(I: RMW))
19151 return AtomicExpansionKind::CustomExpand;
19152
19153 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
19154 OptimizationRemarkEmitter ORE(RMW->getFunction());
19155 ORE.emit(RemarkBuilder: [=]() {
19156 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
19157 });
19158 return Kind;
19159 };
19160
19161 auto SSID = RMW->getSyncScopeID();
19162 bool HasSystemScope =
19163 SSID == SyncScope::System ||
19164 SSID == RMW->getContext().getOrInsertSyncScopeID(SSN: "one-as");
19165
19166 auto Op = RMW->getOperation();
19167 switch (Op) {
19168 case AtomicRMWInst::Xchg:
19169 // PCIe supports add and xchg for system atomics.
19170 return isAtomicRMWLegalXChgTy(RMW)
19171 ? TargetLowering::AtomicExpansionKind::None
19172 : TargetLowering::AtomicExpansionKind::CmpXChg;
19173 case AtomicRMWInst::Add:
19174 // PCIe supports add and xchg for system atomics.
19175 return atomicSupportedIfLegalIntType(RMW);
19176 case AtomicRMWInst::Sub:
19177 case AtomicRMWInst::And:
19178 case AtomicRMWInst::Or:
19179 case AtomicRMWInst::Xor:
19180 case AtomicRMWInst::Max:
19181 case AtomicRMWInst::Min:
19182 case AtomicRMWInst::UMax:
19183 case AtomicRMWInst::UMin:
19184 case AtomicRMWInst::UIncWrap:
19185 case AtomicRMWInst::UDecWrap:
19186 case AtomicRMWInst::USubCond:
19187 case AtomicRMWInst::USubSat: {
19188 if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
19189 return AtomicExpansionKind::CmpXChg;
19190 if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
19191 return AtomicExpansionKind::CmpXChg;
19192 if (Op == AtomicRMWInst::USubCond || Op == AtomicRMWInst::USubSat) {
19193 auto *IT = dyn_cast<IntegerType>(Val: RMW->getType());
19194 if (!IT || IT->getBitWidth() != 32)
19195 return AtomicExpansionKind::CmpXChg;
19196 }
19197
19198 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
19199 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
19200 if (Subtarget->hasEmulatedSystemScopeAtomics())
19201 return atomicSupportedIfLegalIntType(RMW);
19202
19203 // On most subtargets, for atomicrmw operations other than add/xchg,
19204 // whether or not the instructions will behave correctly depends on where
19205 // the address physically resides and what interconnect is used in the
19206 // system configuration. On some some targets the instruction will nop,
19207 // and in others synchronization will only occur at degraded device scope.
19208 //
19209 // If the allocation is known local to the device, the instructions should
19210 // work correctly.
19211 if (RMW->hasMetadata(Kind: "amdgpu.no.remote.memory"))
19212 return atomicSupportedIfLegalIntType(RMW);
19213
19214 // If fine-grained remote memory works at device scope, we don't need to
19215 // do anything.
19216 if (!HasSystemScope &&
19217 Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
19218 return atomicSupportedIfLegalIntType(RMW);
19219
19220 // If we are targeting a remote allocated address, it depends what kind of
19221 // allocation the address belongs to.
19222 //
19223 // If the allocation is fine-grained (in host memory, or in PCIe peer
19224 // device memory), the operation will fail depending on the target.
19225 //
19226 // Note fine-grained host memory access does work on APUs or if XGMI is
19227 // used, but we do not know if we are targeting an APU or the system
19228 // configuration from the ISA version/target-cpu.
19229 if (RMW->hasMetadata(Kind: "amdgpu.no.fine.grained.memory"))
19230 return atomicSupportedIfLegalIntType(RMW);
19231
19232 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
19233 Op == AtomicRMWInst::Xor) {
19234 // Atomic sub/or/xor do not work over PCI express, but atomic add
19235 // does. InstCombine transforms these with 0 to or, so undo that.
19236 if (const Constant *ConstVal = dyn_cast<Constant>(Val: RMW->getValOperand());
19237 ConstVal && ConstVal->isNullValue())
19238 return AtomicExpansionKind::CustomExpand;
19239 }
19240
19241 // If the allocation could be in remote, fine-grained memory, the rmw
19242 // instructions may fail. cmpxchg should work, so emit that. On some
19243 // system configurations, PCIe atomics aren't supported so cmpxchg won't
19244 // even work, so you're out of luck anyway.
19245
19246 // In summary:
19247 //
19248 // Cases that may fail:
19249 // - fine-grained pinned host memory
19250 // - fine-grained migratable host memory
19251 // - fine-grained PCIe peer device
19252 //
19253 // Cases that should work, but may be treated overly conservatively.
19254 // - fine-grained host memory on an APU
19255 // - fine-grained XGMI peer device
19256 return AtomicExpansionKind::CmpXChg;
19257 }
19258
19259 return atomicSupportedIfLegalIntType(RMW);
19260 }
19261 case AtomicRMWInst::FAdd: {
19262 Type *Ty = RMW->getType();
19263
19264 // TODO: Handle REGION_ADDRESS
19265 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
19266 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
19267 // is fixed to round-to-nearest-even.
19268 //
19269 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
19270 // round-to-nearest-even.
19271 //
19272 // We ignore the rounding mode problem, even in strictfp. The C++ standard
19273 // suggests it is OK if the floating-point mode may not match the calling
19274 // thread.
19275 if (Ty->isFloatTy()) {
19276 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
19277 : AtomicExpansionKind::CmpXChg;
19278 }
19279
19280 if (Ty->isDoubleTy()) {
19281 // Ignores denormal mode, but we don't consider flushing mandatory.
19282 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
19283 : AtomicExpansionKind::CmpXChg;
19284 }
19285
19286 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
19287 return AtomicExpansionKind::None;
19288
19289 return AtomicExpansionKind::CmpXChg;
19290 }
19291
19292 // LDS atomics respect the denormal mode from the mode register.
19293 //
19294 // Traditionally f32 global/buffer memory atomics would unconditionally
19295 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
19296 // flush.
19297 //
19298 // On targets with flat atomic fadd, denormals would flush depending on
19299 // whether the target address resides in LDS or global memory. We consider
19300 // this flat-maybe-flush as will-flush.
19301 if (Ty->isFloatTy() &&
19302 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
19303 !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
19304 return AtomicExpansionKind::CmpXChg;
19305
19306 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
19307 // safe. The message phrasing also should be better.
19308 if (globalMemoryFPAtomicIsLegal(Subtarget: *Subtarget, RMW, HasSystemScope)) {
19309 if (AS == AMDGPUAS::FLAT_ADDRESS) {
19310 // gfx942, gfx12
19311 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
19312 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19313 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
19314 // gfx90a, gfx942, gfx12
19315 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
19316 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19317
19318 // gfx942, gfx12
19319 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
19320 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19321 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
19322 // gfx90a, gfx942, gfx12
19323 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
19324 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19325
19326 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
19327 // buffer. gfx12 does have the buffer version.
19328 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
19329 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19330 }
19331
19332 // global and flat atomic fadd f64: gfx90a, gfx942.
19333 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
19334 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19335
19336 if (AS != AMDGPUAS::FLAT_ADDRESS) {
19337 if (Ty->isFloatTy()) {
19338 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
19339 // gfx11+.
19340 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19341 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19342 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
19343 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19344 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19345 } else {
19346 // gfx908
19347 if (RMW->use_empty() &&
19348 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
19349 isV2F16(Ty))
19350 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19351 }
19352 }
19353
19354 // flat atomic fadd f32: gfx942, gfx11+.
19355 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
19356 if (Subtarget->hasFlatAtomicFaddF32Inst())
19357 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19358
19359 // If it is in flat address space, and the type is float, we will try to
19360 // expand it, if the target supports global and lds atomic fadd. The
19361 // reason we need that is, in the expansion, we emit the check of
19362 // address space. If it is in global address space, we emit the global
19363 // atomic fadd; if it is in shared address space, we emit the LDS atomic
19364 // fadd.
19365 if (Subtarget->hasLDSFPAtomicAddF32()) {
19366 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19367 return AtomicExpansionKind::CustomExpand;
19368 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19369 return AtomicExpansionKind::CustomExpand;
19370 }
19371 }
19372 }
19373
19374 return AtomicExpansionKind::CmpXChg;
19375 }
19376 case AtomicRMWInst::FMin:
19377 case AtomicRMWInst::FMax: {
19378 Type *Ty = RMW->getType();
19379
19380 // LDS float and double fmin/fmax were always supported.
19381 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
19382 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
19383 : AtomicExpansionKind::CmpXChg;
19384 }
19385
19386 if (globalMemoryFPAtomicIsLegal(Subtarget: *Subtarget, RMW, HasSystemScope)) {
19387 // For flat and global cases:
19388 // float, double in gfx7. Manual claims denormal support.
19389 // Removed in gfx8.
19390 // float, double restored in gfx10.
19391 // double removed again in gfx11, so only f32 for gfx11/gfx12.
19392 //
19393 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
19394 // no f32.
19395 if (AS == AMDGPUAS::FLAT_ADDRESS) {
19396 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
19397 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19398 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
19399 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19400 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
19401 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
19402 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
19403 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19404 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
19405 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19406 }
19407 }
19408
19409 return AtomicExpansionKind::CmpXChg;
19410 }
19411 case AtomicRMWInst::Nand:
19412 case AtomicRMWInst::FSub:
19413 default:
19414 return AtomicExpansionKind::CmpXChg;
19415 }
19416
19417 llvm_unreachable("covered atomicrmw op switch");
19418}
19419
19420TargetLowering::AtomicExpansionKind
19421SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
19422 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
19423 ? getPrivateAtomicExpansionKind(STI: *getSubtarget())
19424 : AtomicExpansionKind::None;
19425}
19426
19427TargetLowering::AtomicExpansionKind
19428SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
19429 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
19430 ? getPrivateAtomicExpansionKind(STI: *getSubtarget())
19431 : AtomicExpansionKind::None;
19432}
19433
19434TargetLowering::AtomicExpansionKind
19435SITargetLowering::shouldExpandAtomicCmpXchgInIR(
19436 const AtomicCmpXchgInst *CmpX) const {
19437 unsigned AddrSpace = CmpX->getPointerAddressSpace();
19438 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
19439 return getPrivateAtomicExpansionKind(STI: *getSubtarget());
19440
19441 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(I: CmpX))
19442 return AtomicExpansionKind::None;
19443
19444 const DataLayout &DL = CmpX->getDataLayout();
19445
19446 Type *ValTy = CmpX->getNewValOperand()->getType();
19447
19448 // If a 64-bit flat atomic may alias private, we need to avoid using the
19449 // atomic in the private case.
19450 return DL.getTypeSizeInBits(Ty: ValTy) == 64 ? AtomicExpansionKind::CustomExpand
19451 : AtomicExpansionKind::None;
19452}
19453
19454const TargetRegisterClass *
19455SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
19456 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, isDivergent: false);
19457 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
19458 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
19459 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
19460 : &AMDGPU::SReg_32RegClass;
19461 if (!TRI->isSGPRClass(RC) && !isDivergent)
19462 return TRI->getEquivalentSGPRClass(VRC: RC);
19463 if (TRI->isSGPRClass(RC) && isDivergent) {
19464 if (Subtarget->hasGFX90AInsts())
19465 return TRI->getEquivalentAVClass(SRC: RC);
19466 return TRI->getEquivalentVGPRClass(SRC: RC);
19467 }
19468
19469 return RC;
19470}
19471
19472// FIXME: This is a workaround for DivergenceAnalysis not understanding always
19473// uniform values (as produced by the mask results of control flow intrinsics)
19474// used outside of divergent blocks. The phi users need to also be treated as
19475// always uniform.
19476//
19477// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
19478static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
19479 unsigned WaveSize) {
19480 // FIXME: We assume we never cast the mask results of a control flow
19481 // intrinsic.
19482 // Early exit if the type won't be consistent as a compile time hack.
19483 IntegerType *IT = dyn_cast<IntegerType>(Val: V->getType());
19484 if (!IT || IT->getBitWidth() != WaveSize)
19485 return false;
19486
19487 if (!isa<Instruction>(Val: V))
19488 return false;
19489 if (!Visited.insert(Ptr: V).second)
19490 return false;
19491 bool Result = false;
19492 for (const auto *U : V->users()) {
19493 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: U)) {
19494 if (V == U->getOperand(i: 1)) {
19495 switch (Intrinsic->getIntrinsicID()) {
19496 default:
19497 Result = false;
19498 break;
19499 case Intrinsic::amdgcn_if_break:
19500 case Intrinsic::amdgcn_if:
19501 case Intrinsic::amdgcn_else:
19502 Result = true;
19503 break;
19504 }
19505 }
19506 if (V == U->getOperand(i: 0)) {
19507 switch (Intrinsic->getIntrinsicID()) {
19508 default:
19509 Result = false;
19510 break;
19511 case Intrinsic::amdgcn_end_cf:
19512 case Intrinsic::amdgcn_loop:
19513 Result = true;
19514 break;
19515 }
19516 }
19517 } else {
19518 Result = hasCFUser(V: U, Visited, WaveSize);
19519 }
19520 if (Result)
19521 break;
19522 }
19523 return Result;
19524}
19525
19526bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
19527 const Value *V) const {
19528 if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
19529 if (CI->isInlineAsm()) {
19530 // FIXME: This cannot give a correct answer. This should only trigger in
19531 // the case where inline asm returns mixed SGPR and VGPR results, used
19532 // outside the defining block. We don't have a specific result to
19533 // consider, so this assumes if any value is SGPR, the overall register
19534 // also needs to be SGPR.
19535 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
19536 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
19537 DL: MF.getDataLayout(), TRI: Subtarget->getRegisterInfo(), Call: *CI);
19538 for (auto &TC : TargetConstraints) {
19539 if (TC.Type == InlineAsm::isOutput) {
19540 ComputeConstraintToUse(OpInfo&: TC, Op: SDValue());
19541 const TargetRegisterClass *RC =
19542 getRegForInlineAsmConstraint(TRI_: SIRI, Constraint: TC.ConstraintCode,
19543 VT: TC.ConstraintVT)
19544 .second;
19545 if (RC && SIRI->isSGPRClass(RC))
19546 return true;
19547 }
19548 }
19549 }
19550 }
19551 SmallPtrSet<const Value *, 16> Visited;
19552 return hasCFUser(V, Visited, WaveSize: Subtarget->getWavefrontSize());
19553}
19554
19555bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
19556 for (SDUse &Use : N->uses()) {
19557 if (MemSDNode *M = dyn_cast<MemSDNode>(Val: Use.getUser())) {
19558 if (getBasePtrIndex(N: M) == Use.getOperandNo())
19559 return true;
19560 }
19561 }
19562 return false;
19563}
19564
19565bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
19566 SDValue N1) const {
19567 if (!N0.hasOneUse())
19568 return false;
19569 // Take care of the opportunity to keep N0 uniform
19570 if (N0->isDivergent() || !N1->isDivergent())
19571 return true;
19572 // Check if we have a good chance to form the memory access pattern with the
19573 // base and offset
19574 return (DAG.isBaseWithConstantOffset(Op: N0) &&
19575 hasMemSDNodeUser(N: *N0->user_begin()));
19576}
19577
// GlobalISel variant: reassociation is profitable only when N0 has a single
// non-debug use, so its computation is not duplicated.
bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
                                           Register N0, Register N1) const {
  return MRI.hasOneNonDBGUse(RegNo: N0); // FIXME: handle regbanks
}
19582
19583MachineMemOperand::Flags
19584SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
19585 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
19586 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
19587 if (I.getMetadata(Kind: "amdgpu.noclobber"))
19588 Flags |= MONoClobber;
19589 if (I.getMetadata(Kind: "amdgpu.last.use"))
19590 Flags |= MOLastUse;
19591 return Flags;
19592}
19593
void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
    Instruction *AI) const {
  // Given: atomicrmw fadd ptr %addr, float %val ordering
  //
  // With this expansion we produce the following code:
  // [...]
  // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
  // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
  //
  // atomicrmw.shared:
  // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
  // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
  // float %val ordering
  // br label %atomicrmw.phi
  //
  // atomicrmw.check.private:
  // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
  // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
  //
  // atomicrmw.private:
  // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
  // %loaded.private = load float, ptr addrspace(5) %cast.private
  // %val.new = fadd float %loaded.private, %val
  // store float %val.new, ptr addrspace(5) %cast.private
  // br label %atomicrmw.phi
  //
  // atomicrmw.global:
  // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
  // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
  // float %val ordering
  // br label %atomicrmw.phi
  //
  // atomicrmw.phi:
  // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
  // [ %loaded.private, %atomicrmw.private ],
  // [ %loaded.global, %atomicrmw.global ]
  // br label %atomicrmw.end
  //
  // atomicrmw.end:
  // [...]
  //
  //
  // For 64-bit atomics which may reside in private memory, we perform a simpler
  // version that only inserts the private check, and uses the flat operation.

  IRBuilder<> Builder(AI);
  LLVMContext &Ctx = Builder.getContext();

  // AI is either an AtomicRMWInst or an AtomicCmpXchgInst; the pointer
  // operand index differs between the two.
  auto *RMW = dyn_cast<AtomicRMWInst>(Val: AI);
  const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
                                : AtomicCmpXchgInst::getPointerOperandIndex();
  Value *Addr = AI->getOperand(i: PtrOpIdx);

  /// TODO: Only need to check private, then emit flat-known-not private (no
  /// need for shared block, or cast to global).
  AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(Val: AI);

  Align Alignment;
  if (RMW)
    Alignment = RMW->getAlign();
  else if (CX)
    Alignment = CX->getAlign();
  else
    llvm_unreachable("unhandled atomic operation");

  // FullFlatEmulation is true if we need to issue the private, shared, and
  // global cases.
  //
  // If this is false, we are only dealing with the flat-targeting-private case,
  // where we only insert a check for private and still use the flat instruction
  // for global and shared.

  bool FullFlatEmulation =
      RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
      ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
       (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
        RMW->getType()->isDoubleTy()));

  // If the return value isn't used, do not introduce a false use in the phi.
  bool ReturnValueIsUsed = !AI->use_empty();

  // Split the current block at AI; everything after it moves to ExitBB, and
  // the new intermediate blocks are created between BB and ExitBB.
  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();
  BasicBlock *ExitBB =
      BB->splitBasicBlock(I: Builder.GetInsertPoint(), BBName: "atomicrmw.end");
  BasicBlock *SharedBB = nullptr;

  BasicBlock *CheckPrivateBB = BB;
  if (FullFlatEmulation) {
    SharedBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.shared", Parent: F, InsertBefore: ExitBB);
    CheckPrivateBB =
        BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.check.private", Parent: F, InsertBefore: ExitBB);
  }

  BasicBlock *PrivateBB =
      BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.private", Parent: F, InsertBefore: ExitBB);
  BasicBlock *GlobalBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.global", Parent: F, InsertBefore: ExitBB);
  BasicBlock *PhiBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.phi", Parent: F, InsertBefore: ExitBB);

  // Remove the unconditional branch splitBasicBlock inserted; we emit our own
  // terminators below.
  std::prev(x: BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);

  Value *LoadedShared = nullptr;
  if (FullFlatEmulation) {
    // Shared (LDS) case: clone the original atomic with the pointer cast to
    // the local address space.
    CallInst *IsShared = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_is_shared,
                                                 Args: {Addr}, FMFSource: nullptr, Name: "is.shared");
    Builder.CreateCondBr(Cond: IsShared, True: SharedBB, False: CheckPrivateBB);
    Builder.SetInsertPoint(SharedBB);
    Value *CastToLocal = Builder.CreateAddrSpaceCast(
        V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS));

    Instruction *Clone = AI->clone();
    Clone->insertInto(ParentBB: SharedBB, It: SharedBB->end());
    Clone->getOperandUse(i: PtrOpIdx).set(CastToLocal);
    LoadedShared = Clone;

    Builder.CreateBr(Dest: PhiBB);
    Builder.SetInsertPoint(CheckPrivateBB);
  }

  CallInst *IsPrivate = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_is_private,
                                                Args: {Addr}, FMFSource: nullptr, Name: "is.private");
  Builder.CreateCondBr(Cond: IsPrivate, True: PrivateBB, False: GlobalBB);

  Builder.SetInsertPoint(PrivateBB);

  // Private case: no atomicity needed, so emit a plain load/modify/store (for
  // RMW) or load/compare/select/store (for cmpxchg).
  Value *CastToPrivate = Builder.CreateAddrSpaceCast(
      V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::PRIVATE_ADDRESS));

  Value *LoadedPrivate;
  if (RMW) {
    LoadedPrivate = Builder.CreateAlignedLoad(
        Ty: RMW->getType(), Ptr: CastToPrivate, Align: RMW->getAlign(), Name: "loaded.private");

    Value *NewVal = buildAtomicRMWValue(Op: RMW->getOperation(), Builder,
                                        Loaded: LoadedPrivate, Val: RMW->getValOperand());

    Builder.CreateAlignedStore(Val: NewVal, Ptr: CastToPrivate, Align: RMW->getAlign());
  } else {
    // Reassemble cmpxchg's {value, success} aggregate result from the
    // non-atomic expansion.
    auto [ResultLoad, Equal] =
        buildCmpXchgValue(Builder, Ptr: CastToPrivate, Cmp: CX->getCompareOperand(),
                          Val: CX->getNewValOperand(), Alignment: CX->getAlign());

    Value *Insert = Builder.CreateInsertValue(Agg: PoisonValue::get(T: CX->getType()),
                                              Val: ResultLoad, Idxs: 0);
    LoadedPrivate = Builder.CreateInsertValue(Agg: Insert, Val: Equal, Idxs: 1);
  }

  Builder.CreateBr(Dest: PhiBB);

  Builder.SetInsertPoint(GlobalBB);

  // Continue using a flat instruction if we only emitted the check for private.
  Instruction *LoadedGlobal = AI;
  if (FullFlatEmulation) {
    Value *CastToGlobal = Builder.CreateAddrSpaceCast(
        V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
    AI->getOperandUse(i: PtrOpIdx).set(CastToGlobal);
  }

  // Move the original atomic into the global block; it handles everything the
  // explicit blocks above did not.
  AI->removeFromParent();
  AI->insertInto(ParentBB: GlobalBB, It: GlobalBB->end());

  // The new atomicrmw may go through another round of legalization later.
  if (!FullFlatEmulation) {
    // We inserted the runtime check already, make sure we do not try to
    // re-expand this.
    // TODO: Should union with any existing metadata.
    MDBuilder MDB(F->getContext());
    MDNode *RangeNotPrivate =
        MDB.createRange(Lo: APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
                        Hi: APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
    LoadedGlobal->setMetadata(KindID: LLVMContext::MD_noalias_addrspace,
                              Node: RangeNotPrivate);
  }

  Builder.CreateBr(Dest: PhiBB);

  Builder.SetInsertPoint(PhiBB);

  if (ReturnValueIsUsed) {
    PHINode *Loaded = Builder.CreatePHI(Ty: AI->getType(), NumReservedValues: 3);
    AI->replaceAllUsesWith(V: Loaded);
    if (FullFlatEmulation)
      Loaded->addIncoming(V: LoadedShared, BB: SharedBB);
    Loaded->addIncoming(V: LoadedPrivate, BB: PrivateBB);
    Loaded->addIncoming(V: LoadedGlobal, BB: GlobalBB);
    Loaded->takeName(V: AI);
  }

  Builder.CreateBr(Dest: ExitBB);
}
19786
19787static void convertScratchAtomicToFlatAtomic(Instruction *I,
19788 unsigned PtrOpIdx) {
19789 Value *PtrOp = I->getOperand(i: PtrOpIdx);
19790 assert(PtrOp->getType()->getPointerAddressSpace() ==
19791 AMDGPUAS::PRIVATE_ADDRESS);
19792
19793 Type *FlatPtr = PointerType::get(C&: I->getContext(), AddressSpace: AMDGPUAS::FLAT_ADDRESS);
19794 Value *ASCast = CastInst::CreatePointerCast(S: PtrOp, Ty: FlatPtr, Name: "scratch.ascast",
19795 InsertBefore: I->getIterator());
19796 I->setOperand(i: PtrOpIdx, Val: ASCast);
19797}
19798
19799void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
19800 AtomicRMWInst::BinOp Op = AI->getOperation();
19801
19802 if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19803 return convertScratchAtomicToFlatAtomic(I: AI, PtrOpIdx: AI->getPointerOperandIndex());
19804
19805 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
19806 Op == AtomicRMWInst::Xor) {
19807 if (const auto *ConstVal = dyn_cast<Constant>(Val: AI->getValOperand());
19808 ConstVal && ConstVal->isNullValue()) {
19809 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19810 AI->setOperation(AtomicRMWInst::Add);
19811
19812 // We may still need the private-alias-flat handling below.
19813
19814 // TODO: Skip this for cases where we cannot access remote memory.
19815 }
19816 }
19817
19818 // The non-flat expansions should only perform the de-canonicalization of
19819 // identity values.
19820 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
19821 return;
19822
19823 emitExpandAtomicAddrSpacePredicate(AI);
19824}
19825
19826void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
19827 if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19828 return convertScratchAtomicToFlatAtomic(I: CI, PtrOpIdx: CI->getPointerOperandIndex());
19829
19830 emitExpandAtomicAddrSpacePredicate(AI: CI);
19831}
19832
19833void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
19834 if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19835 return convertScratchAtomicToFlatAtomic(I: LI, PtrOpIdx: LI->getPointerOperandIndex());
19836
19837 llvm_unreachable(
19838 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19839}
19840
19841void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
19842 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19843 return convertScratchAtomicToFlatAtomic(I: SI, PtrOpIdx: SI->getPointerOperandIndex());
19844
19845 llvm_unreachable(
19846 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19847}
19848
19849LoadInst *
19850SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19851 IRBuilder<> Builder(AI);
19852 auto Order = AI->getOrdering();
19853
19854 // The optimization removes store aspect of the atomicrmw. Therefore, cache
19855 // must be flushed if the atomic ordering had a release semantics. This is
19856 // not necessary a fence, a release fence just coincides to do that flush.
19857 // Avoid replacing of an atomicrmw with a release semantics.
19858 if (isReleaseOrStronger(AO: Order))
19859 return nullptr;
19860
19861 LoadInst *LI = Builder.CreateAlignedLoad(
19862 Ty: AI->getType(), Ptr: AI->getPointerOperand(), Align: AI->getAlign());
19863 LI->setAtomic(Ordering: Order, SSID: AI->getSyncScopeID());
19864 LI->copyMetadata(SrcInst: *AI);
19865 LI->takeName(V: AI);
19866 AI->replaceAllUsesWith(V: LI);
19867 AI->eraseFromParent();
19868 return LI;
19869}
19870