1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "AMDGPUSelectionDAGInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "GCNSubtarget.h"
21#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22#include "SIMachineFunctionInfo.h"
23#include "SIRegisterInfo.h"
24#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/FloatingPointMode.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/Analysis/OptimizationRemarkEmitter.h"
28#include "llvm/Analysis/UniformityAnalysis.h"
29#include "llvm/CodeGen/Analysis.h"
30#include "llvm/CodeGen/ByteProvider.h"
31#include "llvm/CodeGen/FunctionLoweringInfo.h"
32#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
33#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
34#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
35#include "llvm/CodeGen/MachineFrameInfo.h"
36#include "llvm/CodeGen/MachineFunction.h"
37#include "llvm/CodeGen/MachineLoopInfo.h"
38#include "llvm/CodeGen/PseudoSourceValueManager.h"
39#include "llvm/CodeGen/SDPatternMatch.h"
40#include "llvm/IR/DiagnosticInfo.h"
41#include "llvm/IR/IRBuilder.h"
42#include "llvm/IR/IntrinsicInst.h"
43#include "llvm/IR/IntrinsicsAMDGPU.h"
44#include "llvm/IR/IntrinsicsR600.h"
45#include "llvm/IR/MDBuilder.h"
46#include "llvm/Support/CommandLine.h"
47#include "llvm/Support/KnownBits.h"
48#include "llvm/Support/ModRef.h"
49#include "llvm/Transforms/Utils/LowerAtomic.h"
50#include <optional>
51
52using namespace llvm;
53using namespace llvm::SDPatternMatch;
54
55#define DEBUG_TYPE "si-lower"
56
57STATISTIC(NumTailCalls, "Number of tail calls");
58
59static cl::opt<bool>
60 DisableLoopAlignment("amdgpu-disable-loop-alignment",
61 cl::desc("Do not align and prefetch loops"),
62 cl::init(Val: false));
63
64static cl::opt<bool> UseDivergentRegisterIndexing(
65 "amdgpu-use-divergent-register-indexing", cl::Hidden,
66 cl::desc("Use indirect register addressing for divergent indexes"),
67 cl::init(Val: false));
68
69static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
70 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
71 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
72}
73
74static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
75 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
76 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
77}
78
79static unsigned findFirstFreeSGPR(CCState &CCInfo) {
80 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
81 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
82 if (!CCInfo.isAllocated(Reg: AMDGPU::SGPR0 + Reg)) {
83 return AMDGPU::SGPR0 + Reg;
84 }
85 }
86 llvm_unreachable("Cannot allocate sgpr");
87}
88
89SITargetLowering::SITargetLowering(const TargetMachine &TM,
90 const GCNSubtarget &STI)
91 : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
92 addRegisterClass(VT: MVT::i1, RC: &AMDGPU::VReg_1RegClass);
93 addRegisterClass(VT: MVT::i64, RC: &AMDGPU::SReg_64RegClass);
94
95 addRegisterClass(VT: MVT::i32, RC: &AMDGPU::SReg_32RegClass);
96
97 const SIRegisterInfo *TRI = STI.getRegisterInfo();
98 const TargetRegisterClass *V32RegClass =
99 TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 32);
100 addRegisterClass(VT: MVT::f32, RC: V32RegClass);
101
102 addRegisterClass(VT: MVT::v2i32, RC: &AMDGPU::SReg_64RegClass);
103
104 const TargetRegisterClass *V64RegClass =
105 TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 64);
106
107 addRegisterClass(VT: MVT::f64, RC: V64RegClass);
108 addRegisterClass(VT: MVT::v2f32, RC: V64RegClass);
109 addRegisterClass(VT: MVT::Untyped, RC: V64RegClass);
110
111 addRegisterClass(VT: MVT::v3i32, RC: &AMDGPU::SGPR_96RegClass);
112 addRegisterClass(VT: MVT::v3f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 96));
113
114 addRegisterClass(VT: MVT::v2i64, RC: &AMDGPU::SGPR_128RegClass);
115 addRegisterClass(VT: MVT::v2f64, RC: &AMDGPU::SGPR_128RegClass);
116
117 addRegisterClass(VT: MVT::v4i32, RC: &AMDGPU::SGPR_128RegClass);
118 addRegisterClass(VT: MVT::v4f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 128));
119
120 addRegisterClass(VT: MVT::v5i32, RC: &AMDGPU::SGPR_160RegClass);
121 addRegisterClass(VT: MVT::v5f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 160));
122
123 addRegisterClass(VT: MVT::v6i32, RC: &AMDGPU::SGPR_192RegClass);
124 addRegisterClass(VT: MVT::v6f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 192));
125
126 addRegisterClass(VT: MVT::v3i64, RC: &AMDGPU::SGPR_192RegClass);
127 addRegisterClass(VT: MVT::v3f64, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 192));
128
129 addRegisterClass(VT: MVT::v7i32, RC: &AMDGPU::SGPR_224RegClass);
130 addRegisterClass(VT: MVT::v7f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 224));
131
132 addRegisterClass(VT: MVT::v8i32, RC: &AMDGPU::SGPR_256RegClass);
133 addRegisterClass(VT: MVT::v8f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 256));
134
135 addRegisterClass(VT: MVT::v4i64, RC: &AMDGPU::SGPR_256RegClass);
136 addRegisterClass(VT: MVT::v4f64, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 256));
137
138 addRegisterClass(VT: MVT::v9i32, RC: &AMDGPU::SGPR_288RegClass);
139 addRegisterClass(VT: MVT::v9f32, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 288));
140
141 addRegisterClass(VT: MVT::v10i32, RC: &AMDGPU::SGPR_320RegClass);
142 addRegisterClass(VT: MVT::v10f32,
143 RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 320));
144
145 addRegisterClass(VT: MVT::v11i32, RC: &AMDGPU::SGPR_352RegClass);
146 addRegisterClass(VT: MVT::v11f32,
147 RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 352));
148
149 addRegisterClass(VT: MVT::v12i32, RC: &AMDGPU::SGPR_384RegClass);
150 addRegisterClass(VT: MVT::v12f32,
151 RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 384));
152
153 addRegisterClass(VT: MVT::v16i32, RC: &AMDGPU::SGPR_512RegClass);
154 addRegisterClass(VT: MVT::v16f32,
155 RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 512));
156
157 addRegisterClass(VT: MVT::v8i64, RC: &AMDGPU::SGPR_512RegClass);
158 addRegisterClass(VT: MVT::v8f64, RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 512));
159
160 addRegisterClass(VT: MVT::v16i64, RC: &AMDGPU::SGPR_1024RegClass);
161 addRegisterClass(VT: MVT::v16f64,
162 RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 1024));
163
164 if (Subtarget->has16BitInsts()) {
165 if (Subtarget->useRealTrue16Insts()) {
166 addRegisterClass(VT: MVT::i16, RC: &AMDGPU::VGPR_16RegClass);
167 addRegisterClass(VT: MVT::f16, RC: &AMDGPU::VGPR_16RegClass);
168 addRegisterClass(VT: MVT::bf16, RC: &AMDGPU::VGPR_16RegClass);
169 } else {
170 addRegisterClass(VT: MVT::i16, RC: &AMDGPU::SReg_32RegClass);
171 addRegisterClass(VT: MVT::f16, RC: &AMDGPU::SReg_32RegClass);
172 addRegisterClass(VT: MVT::bf16, RC: &AMDGPU::SReg_32RegClass);
173 }
174
175 // Unless there are also VOP3P operations, not operations are really legal.
176 addRegisterClass(VT: MVT::v2i16, RC: &AMDGPU::SReg_32RegClass);
177 addRegisterClass(VT: MVT::v2f16, RC: &AMDGPU::SReg_32RegClass);
178 addRegisterClass(VT: MVT::v2bf16, RC: &AMDGPU::SReg_32RegClass);
179 addRegisterClass(VT: MVT::v4i16, RC: &AMDGPU::SReg_64RegClass);
180 addRegisterClass(VT: MVT::v4f16, RC: &AMDGPU::SReg_64RegClass);
181 addRegisterClass(VT: MVT::v4bf16, RC: &AMDGPU::SReg_64RegClass);
182 addRegisterClass(VT: MVT::v8i16, RC: &AMDGPU::SGPR_128RegClass);
183 addRegisterClass(VT: MVT::v8f16, RC: &AMDGPU::SGPR_128RegClass);
184 addRegisterClass(VT: MVT::v8bf16, RC: &AMDGPU::SGPR_128RegClass);
185 addRegisterClass(VT: MVT::v16i16, RC: &AMDGPU::SGPR_256RegClass);
186 addRegisterClass(VT: MVT::v16f16, RC: &AMDGPU::SGPR_256RegClass);
187 addRegisterClass(VT: MVT::v16bf16, RC: &AMDGPU::SGPR_256RegClass);
188 addRegisterClass(VT: MVT::v32i16, RC: &AMDGPU::SGPR_512RegClass);
189 addRegisterClass(VT: MVT::v32f16, RC: &AMDGPU::SGPR_512RegClass);
190 addRegisterClass(VT: MVT::v32bf16, RC: &AMDGPU::SGPR_512RegClass);
191 }
192
193 addRegisterClass(VT: MVT::v32i32, RC: &AMDGPU::VReg_1024RegClass);
194 addRegisterClass(VT: MVT::v32f32,
195 RC: TRI->getDefaultVectorSuperClassForBitWidth(BitWidth: 1024));
196
197 computeRegisterProperties(TRI: Subtarget->getRegisterInfo());
198
199 setMinFunctionAlignment(Align(4));
200 setPrefFunctionAlignment(Align(STI.getInstCacheLineSize()));
201
202 // The boolean content concept here is too inflexible. Compares only ever
203 // really produce a 1-bit result. Any copy/extend from these will turn into a
204 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
205 // it's what most targets use.
206 setBooleanContents(ZeroOrOneBooleanContent);
207 setBooleanVectorContents(ZeroOrOneBooleanContent);
208
209 // We need to custom lower vector stores from local memory
210 setOperationAction(Ops: ISD::LOAD,
211 VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
212 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
213 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
214 MVT::i1, MVT::v32i32},
215 Action: Custom);
216
217 setOperationAction(Ops: ISD::STORE,
218 VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
219 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
220 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
221 MVT::i1, MVT::v32i32},
222 Action: Custom);
223
224 if (isTypeLegal(VT: MVT::bf16)) {
225 for (unsigned Opc :
226 {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
227 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
228 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
229 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
230 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
231 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
232 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
233 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
234 ISD::SETCC}) {
235 setOperationAction(Op: Opc, VT: MVT::bf16, Action: Promote);
236 }
237
238 setOperationAction(Op: ISD::FP_ROUND, VT: MVT::bf16, Action: Expand);
239
240 setOperationAction(Op: ISD::SELECT, VT: MVT::bf16, Action: Promote);
241 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::bf16, DestVT: MVT::i16);
242
243 setOperationAction(Op: ISD::FABS, VT: MVT::bf16, Action: Legal);
244 setOperationAction(Op: ISD::FNEG, VT: MVT::bf16, Action: Legal);
245 setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::bf16, Action: Legal);
246
247 // We only need to custom lower because we can't specify an action for bf16
248 // sources.
249 setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i32, Action: Custom);
250 setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i32, Action: Custom);
251 }
252
253 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i16, Action: Expand);
254 setTruncStoreAction(ValVT: MVT::v3i32, MemVT: MVT::v3i16, Action: Expand);
255 setTruncStoreAction(ValVT: MVT::v4i32, MemVT: MVT::v4i16, Action: Expand);
256 setTruncStoreAction(ValVT: MVT::v8i32, MemVT: MVT::v8i16, Action: Expand);
257 setTruncStoreAction(ValVT: MVT::v16i32, MemVT: MVT::v16i16, Action: Expand);
258 setTruncStoreAction(ValVT: MVT::v32i32, MemVT: MVT::v32i16, Action: Expand);
259 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i8, Action: Expand);
260 setTruncStoreAction(ValVT: MVT::v4i32, MemVT: MVT::v4i8, Action: Expand);
261 setTruncStoreAction(ValVT: MVT::v8i32, MemVT: MVT::v8i8, Action: Expand);
262 setTruncStoreAction(ValVT: MVT::v16i32, MemVT: MVT::v16i8, Action: Expand);
263 setTruncStoreAction(ValVT: MVT::v32i32, MemVT: MVT::v32i8, Action: Expand);
264 setTruncStoreAction(ValVT: MVT::v2i16, MemVT: MVT::v2i8, Action: Expand);
265 setTruncStoreAction(ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Expand);
266 setTruncStoreAction(ValVT: MVT::v8i16, MemVT: MVT::v8i8, Action: Expand);
267 setTruncStoreAction(ValVT: MVT::v16i16, MemVT: MVT::v16i8, Action: Expand);
268 setTruncStoreAction(ValVT: MVT::v32i16, MemVT: MVT::v32i8, Action: Expand);
269
270 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i16, Action: Expand);
271 setTruncStoreAction(ValVT: MVT::v3i64, MemVT: MVT::v3i32, Action: Expand);
272 setTruncStoreAction(ValVT: MVT::v4i64, MemVT: MVT::v4i8, Action: Expand);
273 setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i8, Action: Expand);
274 setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i16, Action: Expand);
275 setTruncStoreAction(ValVT: MVT::v8i64, MemVT: MVT::v8i32, Action: Expand);
276 setTruncStoreAction(ValVT: MVT::v16i64, MemVT: MVT::v16i32, Action: Expand);
277
278 setOperationAction(Ops: ISD::GlobalAddress, VTs: {MVT::i32, MVT::i64}, Action: Custom);
279 setOperationAction(Ops: ISD::ExternalSymbol, VTs: {MVT::i32, MVT::i64}, Action: Custom);
280
281 setOperationAction(Op: ISD::SELECT, VT: MVT::i1, Action: Promote);
282 setOperationAction(Op: ISD::SELECT, VT: MVT::i64, Action: Custom);
283 setOperationAction(Op: ISD::SELECT, VT: MVT::f64, Action: Promote);
284 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::f64, DestVT: MVT::i64);
285
286 setOperationAction(Ops: ISD::FSQRT, VTs: {MVT::f32, MVT::f64}, Action: Custom);
287
288 setOperationAction(Ops: ISD::SELECT_CC,
289 VTs: {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Action: Expand);
290
291 setOperationAction(Op: ISD::SETCC, VT: MVT::i1, Action: Promote);
292 setOperationAction(Ops: ISD::SETCC, VTs: {MVT::v2i1, MVT::v4i1}, Action: Expand);
293 AddPromotedToType(Opc: ISD::SETCC, OrigVT: MVT::i1, DestVT: MVT::i32);
294
295 setOperationAction(Ops: ISD::TRUNCATE,
296 VTs: {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
297 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
298 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
299 Action: Expand);
300 setOperationAction(Ops: ISD::FP_ROUND,
301 VTs: {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
302 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
303 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
304 Action: Expand);
305
306 setOperationAction(Ops: ISD::SIGN_EXTEND_INREG,
307 VTs: {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
308 MVT::v3i16, MVT::v4i16, MVT::Other},
309 Action: Custom);
310
311 setOperationAction(Op: ISD::BRCOND, VT: MVT::Other, Action: Custom);
312 setOperationAction(Ops: ISD::BR_CC,
313 VTs: {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Action: Expand);
314
315 setOperationAction(Ops: {ISD::ABS, ISD::UADDO, ISD::USUBO}, VT: MVT::i32, Action: Legal);
316
317 setOperationAction(Ops: {ISD::UADDO_CARRY, ISD::USUBO_CARRY}, VT: MVT::i32, Action: Legal);
318
319 setOperationAction(Ops: {ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, VT: MVT::i64,
320 Action: Expand);
321
322#if 0
323 setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal);
324#endif
325
326 // We only support LOAD/STORE and vector manipulation ops for vectors
327 // with > 4 elements.
328 for (MVT VT :
329 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
330 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
331 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
332 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
333 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
334 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
335 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
336 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
337 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
338 switch (Op) {
339 case ISD::LOAD:
340 case ISD::STORE:
341 case ISD::BUILD_VECTOR:
342 case ISD::BITCAST:
343 case ISD::UNDEF:
344 case ISD::EXTRACT_VECTOR_ELT:
345 case ISD::INSERT_VECTOR_ELT:
346 case ISD::SCALAR_TO_VECTOR:
347 case ISD::IS_FPCLASS:
348 break;
349 case ISD::EXTRACT_SUBVECTOR:
350 case ISD::INSERT_SUBVECTOR:
351 case ISD::CONCAT_VECTORS:
352 setOperationAction(Op, VT, Action: Custom);
353 break;
354 default:
355 setOperationAction(Op, VT, Action: Expand);
356 break;
357 }
358 }
359 }
360
361 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::v4f32, Action: Expand);
362
363 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
364 // is expanded to avoid having two separate loops in case the index is a VGPR.
365
366 // Most operations are naturally 32-bit vector operations. We only support
367 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
368 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
369 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
370 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v4i32);
371
372 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
373 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v4i32);
374
375 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
376 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v4i32);
377
378 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
379 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v4i32);
380 }
381
382 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
383 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
384 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v6i32);
385
386 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
387 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v6i32);
388
389 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
390 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v6i32);
391
392 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
393 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v6i32);
394 }
395
396 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
397 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
398 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v8i32);
399
400 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
401 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v8i32);
402
403 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
404 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v8i32);
405
406 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
407 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v8i32);
408 }
409
410 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
411 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
412 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v16i32);
413
414 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
415 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v16i32);
416
417 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
418 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v16i32);
419
420 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
421 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v16i32);
422 }
423
424 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
425 setOperationAction(Op: ISD::BUILD_VECTOR, VT: Vec64, Action: Promote);
426 AddPromotedToType(Opc: ISD::BUILD_VECTOR, OrigVT: Vec64, DestVT: MVT::v32i32);
427
428 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT: Vec64, Action: Promote);
429 AddPromotedToType(Opc: ISD::EXTRACT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v32i32);
430
431 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec64, Action: Promote);
432 AddPromotedToType(Opc: ISD::INSERT_VECTOR_ELT, OrigVT: Vec64, DestVT: MVT::v32i32);
433
434 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT: Vec64, Action: Promote);
435 AddPromotedToType(Opc: ISD::SCALAR_TO_VECTOR, OrigVT: Vec64, DestVT: MVT::v32i32);
436 }
437
438 setOperationAction(Ops: ISD::VECTOR_SHUFFLE,
439 VTs: {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
440 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
441 Action: Custom);
442
443 if (Subtarget->hasPkMovB32()) {
444 // TODO: 16-bit element vectors should be legal with even aligned elements.
445 // TODO: Can be legal with wider source types than the result with
446 // subregister extracts.
447 setOperationAction(Ops: ISD::VECTOR_SHUFFLE, VTs: {MVT::v2i32, MVT::v2f32}, Action: Legal);
448 }
449
450 setOperationAction(Ops: {ISD::AND, ISD::OR, ISD::XOR}, VT: MVT::v2i32, Action: Legal);
451 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
452 // instead lower to cndmask in SITargetLowering::LowerSELECT().
453 setOperationAction(Op: ISD::SELECT, VT: MVT::v2i32, Action: Custom);
454 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
455 // alignbit.
456 setOperationAction(Op: ISD::ROTR, VT: MVT::v2i32, Action: Custom);
457
458 setOperationAction(Ops: ISD::BUILD_VECTOR, VTs: {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
459 Action: Custom);
460
461 // Avoid stack access for these.
462 // TODO: Generalize to more vector types.
463 setOperationAction(Ops: {ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
464 VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
465 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
466 Action: Custom);
467
468 // Deal with vec3 vector operations when widened to vec4.
469 setOperationAction(Ops: ISD::INSERT_SUBVECTOR,
470 VTs: {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Action: Custom);
471
472 // Deal with vec5/6/7 vector operations when widened to vec8.
473 setOperationAction(Ops: ISD::INSERT_SUBVECTOR,
474 VTs: {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
475 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
476 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
477 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
478 Action: Custom);
479
480 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
481 // and output demarshalling
482 setOperationAction(Ops: ISD::ATOMIC_CMP_SWAP, VTs: {MVT::i32, MVT::i64}, Action: Custom);
483
484 // We can't return success/failure, only the old value,
485 // let LLVM add the comparison
486 setOperationAction(Ops: ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VTs: {MVT::i32, MVT::i64},
487 Action: Expand);
488
489 setOperationAction(Ops: ISD::ADDRSPACECAST, VTs: {MVT::i32, MVT::i64}, Action: Custom);
490
491 setOperationAction(Ops: ISD::BITREVERSE, VTs: {MVT::i32, MVT::i64}, Action: Legal);
492
493 // FIXME: This should be narrowed to i32, but that only happens if i64 is
494 // illegal.
495 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
496 setOperationAction(Ops: ISD::BSWAP, VTs: {MVT::i64, MVT::i32}, Action: Legal);
497
498 // On SI this is s_memtime and s_memrealtime on VI.
499 setOperationAction(Op: ISD::READCYCLECOUNTER, VT: MVT::i64, Action: Legal);
500
501 if (Subtarget->hasSMemRealTime() ||
502 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
503 setOperationAction(Op: ISD::READSTEADYCOUNTER, VT: MVT::i64, Action: Legal);
504 setOperationAction(Ops: {ISD::TRAP, ISD::DEBUGTRAP}, VT: MVT::Other, Action: Custom);
505
506 if (Subtarget->has16BitInsts()) {
507 setOperationAction(Ops: {ISD::FPOW, ISD::FPOWI}, VT: MVT::f16, Action: Promote);
508 setOperationAction(Ops: {ISD::FLOG, ISD::FEXP, ISD::FLOG10}, VT: MVT::f16, Action: Custom);
509 setOperationAction(Ops: ISD::IS_FPCLASS, VTs: {MVT::f16, MVT::f32, MVT::f64}, Action: Legal);
510 setOperationAction(Ops: {ISD::FLOG2, ISD::FEXP2}, VT: MVT::f16, Action: Legal);
511 setOperationAction(Op: ISD::FCANONICALIZE, VT: MVT::f16, Action: Legal);
512 } else {
513 setOperationAction(Op: ISD::FSQRT, VT: MVT::f16, Action: Custom);
514 }
515
516 if (Subtarget->hasMadMacF32Insts())
517 setOperationAction(Op: ISD::FMAD, VT: MVT::f32, Action: Legal);
518
519 setOperationAction(Ops: {ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT: MVT::i32, Action: Custom);
520 setOperationAction(Ops: {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, VT: MVT::i32, Action: Custom);
521
522 // We only really have 32-bit BFE instructions (and 16-bit on VI).
523 //
524 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
525 // effort to match them now. We want this to be false for i64 cases when the
526 // extraction isn't restricted to the upper or lower half. Ideally we would
527 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
528 // span the midpoint are probably relatively rare, so don't worry about them
529 // for now.
530 setHasExtractBitsInsn(true);
531
532 // Clamp modifier on add/sub
533 if (Subtarget->hasIntClamp())
534 setOperationAction(Ops: {ISD::UADDSAT, ISD::USUBSAT}, VT: MVT::i32, Action: Legal);
535
536 if (Subtarget->hasAddNoCarryInsts())
537 setOperationAction(Ops: {ISD::SADDSAT, ISD::SSUBSAT}, VTs: {MVT::i16, MVT::i32},
538 Action: Legal);
539
540 setOperationAction(
541 Ops: {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
542 VTs: {MVT::f32, MVT::f64}, Action: Custom);
543
544 // These are really only legal for ieee_mode functions. We should be avoiding
545 // them for functions that don't have ieee_mode enabled, so just say they are
546 // legal.
547 setOperationAction(Ops: {ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
548 VTs: {MVT::f32, MVT::f64}, Action: Legal);
549
550 if (Subtarget->haveRoundOpsF64())
551 setOperationAction(Ops: {ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, VT: MVT::f64,
552 Action: Legal);
553 else
554 setOperationAction(Ops: {ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
555 VT: MVT::f64, Action: Custom);
556
557 setOperationAction(Op: ISD::FFLOOR, VT: MVT::f64, Action: Legal);
558 setOperationAction(Ops: {ISD::FLDEXP, ISD::STRICT_FLDEXP}, VTs: {MVT::f32, MVT::f64},
559 Action: Legal);
560 setOperationAction(Ops: ISD::FFREXP, VTs: {MVT::f32, MVT::f64}, Action: Custom);
561
562 setOperationAction(Ops: {ISD::FSIN, ISD::FCOS, ISD::FDIV}, VT: MVT::f32, Action: Custom);
563 setOperationAction(Op: ISD::FDIV, VT: MVT::f64, Action: Custom);
564
565 setOperationAction(Ops: ISD::BF16_TO_FP, VTs: {MVT::i16, MVT::f32, MVT::f64}, Action: Expand);
566 setOperationAction(Ops: ISD::FP_TO_BF16, VTs: {MVT::i16, MVT::f32, MVT::f64}, Action: Expand);
567
568 setOperationAction(Ops: {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT: MVT::i32,
569 Action: Custom);
570 setOperationAction(Ops: {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT: MVT::i16,
571 Action: Custom);
572 setOperationAction(Ops: {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT: MVT::i1,
573 Action: Custom);
574
575 // Custom lower these because we can't specify a rule based on an illegal
576 // source bf16.
577 setOperationAction(Ops: {ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, VT: MVT::f32, Action: Custom);
578 setOperationAction(Ops: {ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, VT: MVT::f64, Action: Custom);
579
580 if (Subtarget->has16BitInsts()) {
581 setOperationAction(Ops: {ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
582 ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
583 VT: MVT::i16, Action: Legal);
584
585 AddPromotedToType(Opc: ISD::SIGN_EXTEND, OrigVT: MVT::i16, DestVT: MVT::i32);
586
587 setOperationAction(Ops: {ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC},
588 VT: MVT::i16, Action: Expand);
589
590 setOperationAction(Ops: {ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
591 ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
592 ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
593 ISD::CTPOP},
594 VT: MVT::i16, Action: Promote);
595
596 setOperationAction(Op: ISD::LOAD, VT: MVT::i16, Action: Custom);
597
598 setTruncStoreAction(ValVT: MVT::i64, MemVT: MVT::i16, Action: Expand);
599
600 setOperationAction(Op: ISD::FP16_TO_FP, VT: MVT::i16, Action: Promote);
601 AddPromotedToType(Opc: ISD::FP16_TO_FP, OrigVT: MVT::i16, DestVT: MVT::i32);
602 setOperationAction(Op: ISD::FP_TO_FP16, VT: MVT::i16, Action: Promote);
603 AddPromotedToType(Opc: ISD::FP_TO_FP16, OrigVT: MVT::i16, DestVT: MVT::i32);
604
605 setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT: MVT::i16, Action: Custom);
606 setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i16, Action: Custom);
607 setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i1, Action: Custom);
608
609 setOperationAction(Ops: {ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT: MVT::i32, Action: Custom);
610
611 // F16 - Constant Actions.
612 setOperationAction(Op: ISD::ConstantFP, VT: MVT::f16, Action: Legal);
613 setOperationAction(Op: ISD::ConstantFP, VT: MVT::bf16, Action: Legal);
614
615 // F16 - Load/Store Actions.
616 setOperationAction(Op: ISD::LOAD, VT: MVT::f16, Action: Promote);
617 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::f16, DestVT: MVT::i16);
618 setOperationAction(Op: ISD::STORE, VT: MVT::f16, Action: Promote);
619 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::f16, DestVT: MVT::i16);
620
621 // BF16 - Load/Store Actions.
622 setOperationAction(Op: ISD::LOAD, VT: MVT::bf16, Action: Promote);
623 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::bf16, DestVT: MVT::i16);
624 setOperationAction(Op: ISD::STORE, VT: MVT::bf16, Action: Promote);
625 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::bf16, DestVT: MVT::i16);
626
627 // F16 - VOP1 Actions.
628 setOperationAction(Ops: {ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
629 ISD::FSIN, ISD::FROUND},
630 VT: MVT::f16, Action: Custom);
631
632 // BF16 - VOP1 Actions.
633 if (Subtarget->hasBF16TransInsts())
634 setOperationAction(Ops: {ISD::FCOS, ISD::FSIN, ISD::FDIV}, VT: MVT::bf16, Action: Custom);
635
636 setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
637 ISD::FP_TO_UINT_SAT},
638 VT: MVT::f16, Action: Promote);
639 setOperationAction(Ops: {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
640 ISD::FP_TO_UINT_SAT},
641 VT: MVT::bf16, Action: Promote);
642
643 // F16 - VOP2 Actions.
644 setOperationAction(Ops: {ISD::BR_CC, ISD::SELECT_CC}, VTs: {MVT::f16, MVT::bf16},
645 Action: Expand);
646 setOperationAction(Ops: {ISD::FLDEXP, ISD::STRICT_FLDEXP}, VT: MVT::f16, Action: Custom);
647 setOperationAction(Op: ISD::FFREXP, VT: MVT::f16, Action: Custom);
648 setOperationAction(Op: ISD::FDIV, VT: MVT::f16, Action: Custom);
649
650 // F16 - VOP3 Actions.
651 setOperationAction(Op: ISD::FMA, VT: MVT::f16, Action: Legal);
652 if (STI.hasMadF16())
653 setOperationAction(Op: ISD::FMAD, VT: MVT::f16, Action: Legal);
654
655 for (MVT VT :
656 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
657 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
658 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
659 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
660 switch (Op) {
661 case ISD::LOAD:
662 case ISD::STORE:
663 case ISD::BUILD_VECTOR:
664 case ISD::BITCAST:
665 case ISD::UNDEF:
666 case ISD::EXTRACT_VECTOR_ELT:
667 case ISD::INSERT_VECTOR_ELT:
668 case ISD::INSERT_SUBVECTOR:
669 case ISD::SCALAR_TO_VECTOR:
670 case ISD::IS_FPCLASS:
671 break;
672 case ISD::EXTRACT_SUBVECTOR:
673 case ISD::CONCAT_VECTORS:
674 case ISD::FSIN:
675 case ISD::FCOS:
676 setOperationAction(Op, VT, Action: Custom);
677 break;
678 default:
679 setOperationAction(Op, VT, Action: Expand);
680 break;
681 }
682 }
683 }
684
685 // v_perm_b32 can handle either of these.
686 setOperationAction(Ops: ISD::BSWAP, VTs: {MVT::i16, MVT::v2i16}, Action: Legal);
687 setOperationAction(Op: ISD::BSWAP, VT: MVT::v4i16, Action: Custom);
688
689 // XXX - Do these do anything? Vector constants turn into build_vector.
690 setOperationAction(Ops: ISD::Constant, VTs: {MVT::v2i16, MVT::v2f16}, Action: Legal);
691
692 setOperationAction(Ops: ISD::UNDEF, VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
693 Action: Legal);
694
695 setOperationAction(Op: ISD::STORE, VT: MVT::v2i16, Action: Promote);
696 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2i16, DestVT: MVT::i32);
697 setOperationAction(Op: ISD::STORE, VT: MVT::v2f16, Action: Promote);
698 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v2f16, DestVT: MVT::i32);
699
700 setOperationAction(Op: ISD::LOAD, VT: MVT::v2i16, Action: Promote);
701 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2i16, DestVT: MVT::i32);
702 setOperationAction(Op: ISD::LOAD, VT: MVT::v2f16, Action: Promote);
703 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v2f16, DestVT: MVT::i32);
704
705 setOperationAction(Op: ISD::AND, VT: MVT::v2i16, Action: Promote);
706 AddPromotedToType(Opc: ISD::AND, OrigVT: MVT::v2i16, DestVT: MVT::i32);
707 setOperationAction(Op: ISD::OR, VT: MVT::v2i16, Action: Promote);
708 AddPromotedToType(Opc: ISD::OR, OrigVT: MVT::v2i16, DestVT: MVT::i32);
709 setOperationAction(Op: ISD::XOR, VT: MVT::v2i16, Action: Promote);
710 AddPromotedToType(Opc: ISD::XOR, OrigVT: MVT::v2i16, DestVT: MVT::i32);
711
712 setOperationAction(Op: ISD::LOAD, VT: MVT::v4i16, Action: Promote);
713 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
714 setOperationAction(Op: ISD::LOAD, VT: MVT::v4f16, Action: Promote);
715 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
716 setOperationAction(Op: ISD::LOAD, VT: MVT::v4bf16, Action: Promote);
717 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v4bf16, DestVT: MVT::v2i32);
718
719 setOperationAction(Op: ISD::STORE, VT: MVT::v4i16, Action: Promote);
720 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
721 setOperationAction(Op: ISD::STORE, VT: MVT::v4f16, Action: Promote);
722 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
723 setOperationAction(Op: ISD::STORE, VT: MVT::v4bf16, Action: Promote);
724 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4bf16, DestVT: MVT::v2i32);
725
726 setOperationAction(Op: ISD::LOAD, VT: MVT::v8i16, Action: Promote);
727 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8i16, DestVT: MVT::v4i32);
728 setOperationAction(Op: ISD::LOAD, VT: MVT::v8f16, Action: Promote);
729 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8f16, DestVT: MVT::v4i32);
730 setOperationAction(Op: ISD::LOAD, VT: MVT::v8bf16, Action: Promote);
731 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v8bf16, DestVT: MVT::v4i32);
732
733 setOperationAction(Op: ISD::STORE, VT: MVT::v4i16, Action: Promote);
734 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4i16, DestVT: MVT::v2i32);
735 setOperationAction(Op: ISD::STORE, VT: MVT::v4f16, Action: Promote);
736 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v4f16, DestVT: MVT::v2i32);
737
738 setOperationAction(Op: ISD::STORE, VT: MVT::v8i16, Action: Promote);
739 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8i16, DestVT: MVT::v4i32);
740 setOperationAction(Op: ISD::STORE, VT: MVT::v8f16, Action: Promote);
741 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8f16, DestVT: MVT::v4i32);
742 setOperationAction(Op: ISD::STORE, VT: MVT::v8bf16, Action: Promote);
743 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v8bf16, DestVT: MVT::v4i32);
744
745 setOperationAction(Op: ISD::LOAD, VT: MVT::v16i16, Action: Promote);
746 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16i16, DestVT: MVT::v8i32);
747 setOperationAction(Op: ISD::LOAD, VT: MVT::v16f16, Action: Promote);
748 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16f16, DestVT: MVT::v8i32);
749 setOperationAction(Op: ISD::LOAD, VT: MVT::v16bf16, Action: Promote);
750 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v16bf16, DestVT: MVT::v8i32);
751
752 setOperationAction(Op: ISD::STORE, VT: MVT::v16i16, Action: Promote);
753 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16i16, DestVT: MVT::v8i32);
754 setOperationAction(Op: ISD::STORE, VT: MVT::v16f16, Action: Promote);
755 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16f16, DestVT: MVT::v8i32);
756 setOperationAction(Op: ISD::STORE, VT: MVT::v16bf16, Action: Promote);
757 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v16bf16, DestVT: MVT::v8i32);
758
759 setOperationAction(Op: ISD::LOAD, VT: MVT::v32i16, Action: Promote);
760 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32i16, DestVT: MVT::v16i32);
761 setOperationAction(Op: ISD::LOAD, VT: MVT::v32f16, Action: Promote);
762 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32f16, DestVT: MVT::v16i32);
763 setOperationAction(Op: ISD::LOAD, VT: MVT::v32bf16, Action: Promote);
764 AddPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::v32bf16, DestVT: MVT::v16i32);
765
766 setOperationAction(Op: ISD::STORE, VT: MVT::v32i16, Action: Promote);
767 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32i16, DestVT: MVT::v16i32);
768 setOperationAction(Op: ISD::STORE, VT: MVT::v32f16, Action: Promote);
769 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32f16, DestVT: MVT::v16i32);
770 setOperationAction(Op: ISD::STORE, VT: MVT::v32bf16, Action: Promote);
771 AddPromotedToType(Opc: ISD::STORE, OrigVT: MVT::v32bf16, DestVT: MVT::v16i32);
772
773 setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
774 VT: MVT::v2i32, Action: Expand);
775 setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::v2f32, Action: Expand);
776
777 setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
778 VT: MVT::v4i32, Action: Expand);
779
780 setOperationAction(Ops: {ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
781 VT: MVT::v8i32, Action: Expand);
782
783 setOperationAction(Ops: ISD::BUILD_VECTOR, VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
784 Action: Subtarget->hasVOP3PInsts() ? Legal : Custom);
785
786 setOperationAction(Ops: ISD::FNEG, VTs: {MVT::v2f16, MVT::v2bf16}, Action: Legal);
787 // This isn't really legal, but this avoids the legalizer unrolling it (and
788 // allows matching fneg (fabs x) patterns)
789 setOperationAction(Ops: ISD::FABS, VTs: {MVT::v2f16, MVT::v2bf16}, Action: Legal);
790
791 // Can do this in one BFI plus a constant materialize.
792 setOperationAction(Ops: ISD::FCOPYSIGN,
793 VTs: {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
794 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
795 MVT::v32f16, MVT::v32bf16},
796 Action: Custom);
797
798 setOperationAction(
799 Ops: {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
800 VT: MVT::f16, Action: Custom);
801 setOperationAction(Ops: {ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, VT: MVT::f16, Action: Legal);
802
803 setOperationAction(Ops: {ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
804 ISD::FMAXIMUMNUM},
805 VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
806 Action: Custom);
807
808 setOperationAction(Ops: {ISD::FMINNUM, ISD::FMAXNUM},
809 VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
810 Action: Expand);
811
812 for (MVT Vec16 :
813 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
814 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
815 setOperationAction(
816 Ops: {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
817 VT: Vec16, Action: Custom);
818 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT: Vec16, Action: Expand);
819 }
820 }
821
822 if (Subtarget->hasVOP3PInsts()) {
823 setOperationAction(Ops: {ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
824 ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
825 ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
826 VT: MVT::v2i16, Action: Legal);
827
828 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
829 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
830 VT: MVT::v2f16, Action: Legal);
831
832 setOperationAction(Ops: ISD::EXTRACT_VECTOR_ELT,
833 VTs: {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Action: Custom);
834
835 setOperationAction(Ops: ISD::VECTOR_SHUFFLE,
836 VTs: {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
837 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
838 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
839 Action: Custom);
840
841 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
842 // Split vector operations.
843 setOperationAction(Ops: {ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
844 ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN,
845 ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
846 ISD::SSUBSAT},
847 VT, Action: Custom);
848
849 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
850 // Split vector operations.
851 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
852 VT, Action: Custom);
853
854 setOperationAction(
855 Ops: {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
856 VTs: {MVT::v2f16, MVT::v4f16}, Action: Custom);
857
858 setOperationAction(Op: ISD::FEXP, VT: MVT::v2f16, Action: Custom);
859 setOperationAction(Ops: ISD::SELECT, VTs: {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
860 Action: Custom);
861
862 if (Subtarget->hasBF16PackedInsts()) {
863 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
864 // Split vector operations.
865 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
866 VT, Action: Custom);
867 }
868
869 if (Subtarget->hasPackedFP32Ops()) {
870 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
871 VT: MVT::v2f32, Action: Legal);
872 setOperationAction(Ops: {ISD::FADD, ISD::FMUL, ISD::FMA},
873 VTs: {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
874 Action: Custom);
875 }
876 }
877
878 setOperationAction(Ops: {ISD::FNEG, ISD::FABS}, VT: MVT::v4f16, Action: Custom);
879
880 if (Subtarget->has16BitInsts()) {
881 setOperationAction(Op: ISD::SELECT, VT: MVT::v2i16, Action: Promote);
882 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2i16, DestVT: MVT::i32);
883 setOperationAction(Op: ISD::SELECT, VT: MVT::v2f16, Action: Promote);
884 AddPromotedToType(Opc: ISD::SELECT, OrigVT: MVT::v2f16, DestVT: MVT::i32);
885 } else {
886 // Legalization hack.
887 setOperationAction(Ops: ISD::SELECT, VTs: {MVT::v2i16, MVT::v2f16}, Action: Custom);
888
889 setOperationAction(Ops: {ISD::FNEG, ISD::FABS}, VT: MVT::v2f16, Action: Custom);
890 }
891
892 setOperationAction(Ops: ISD::SELECT,
893 VTs: {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
894 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
895 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
896 MVT::v32f16, MVT::v32bf16},
897 Action: Custom);
898
899 setOperationAction(Ops: {ISD::SMULO, ISD::UMULO}, VT: MVT::i64, Action: Custom);
900
901 if (Subtarget->hasVectorMulU64())
902 setOperationAction(Op: ISD::MUL, VT: MVT::i64, Action: Legal);
903 else if (Subtarget->hasScalarSMulU64())
904 setOperationAction(Op: ISD::MUL, VT: MVT::i64, Action: Custom);
905
906 if (Subtarget->hasMad64_32())
907 setOperationAction(Ops: {ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT: MVT::i32, Action: Custom);
908
909 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
910 setOperationAction(Op: ISD::PREFETCH, VT: MVT::Other, Action: Custom);
911
912 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
913 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM},
914 VTs: {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Action: Legal);
915 } else {
916 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
917 if (Subtarget->hasMinimum3Maximum3F32())
918 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM}, VT: MVT::f32, Action: Legal);
919
920 if (Subtarget->hasMinimum3Maximum3PKF16()) {
921 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM}, VT: MVT::v2f16, Action: Legal);
922
923 // If only the vector form is available, we need to widen to a vector.
924 if (!Subtarget->hasMinimum3Maximum3F16())
925 setOperationAction(Ops: {ISD::FMAXIMUM, ISD::FMINIMUM}, VT: MVT::f16, Action: Custom);
926 }
927 }
928
929 if (Subtarget->hasVOP3PInsts()) {
930 // We want to break these into v2f16 pieces, not scalarize.
931 setOperationAction(Ops: {ISD::FMINIMUM, ISD::FMAXIMUM},
932 VTs: {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
933 Action: Custom);
934 }
935
936 if (Subtarget->hasIntMinMax64())
937 setOperationAction(Ops: {ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, VT: MVT::i64,
938 Action: Legal);
939
940 setOperationAction(Ops: ISD::INTRINSIC_WO_CHAIN,
941 VTs: {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
942 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
943 MVT::i8},
944 Action: Custom);
945
946 setOperationAction(Ops: ISD::INTRINSIC_W_CHAIN,
947 VTs: {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
948 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
949 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
950 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
951 Action: Custom);
952
953 setOperationAction(Ops: ISD::INTRINSIC_VOID,
954 VTs: {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
955 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
956 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
957 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
958 Action: Custom);
959
960 setOperationAction(Op: ISD::STACKSAVE, VT: MVT::Other, Action: Custom);
961 setOperationAction(Op: ISD::GET_ROUNDING, VT: MVT::i32, Action: Custom);
962 setOperationAction(Op: ISD::SET_ROUNDING, VT: MVT::Other, Action: Custom);
963 setOperationAction(Op: ISD::GET_FPENV, VT: MVT::i64, Action: Custom);
964 setOperationAction(Op: ISD::SET_FPENV, VT: MVT::i64, Action: Custom);
965
966 // TODO: Could move this to custom lowering, could benefit from combines on
967 // extract of relevant bits.
968 setOperationAction(Op: ISD::GET_FPMODE, VT: MVT::i32, Action: Legal);
969
970 setOperationAction(Op: ISD::MUL, VT: MVT::i1, Action: Promote);
971
972 if (Subtarget->hasBF16ConversionInsts()) {
973 setOperationAction(Ops: ISD::FP_ROUND, VTs: {MVT::bf16, MVT::v2bf16}, Action: Custom);
974 setOperationAction(Op: ISD::BUILD_VECTOR, VT: MVT::v2bf16, Action: Legal);
975 }
976
977 if (Subtarget->hasBF16PackedInsts()) {
978 setOperationAction(
979 Ops: {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
980 VT: MVT::v2bf16, Action: Legal);
981 }
982
983 if (Subtarget->hasBF16TransInsts()) {
984 setOperationAction(Ops: {ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, VT: MVT::bf16, Action: Legal);
985 }
986
987 if (Subtarget->hasCvtPkF16F32Inst()) {
988 setOperationAction(Ops: ISD::FP_ROUND,
989 VTs: {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
990 Action: Custom);
991 }
992
993 setTargetDAGCombine({ISD::ADD,
994 ISD::PTRADD,
995 ISD::UADDO_CARRY,
996 ISD::SUB,
997 ISD::USUBO_CARRY,
998 ISD::MUL,
999 ISD::FADD,
1000 ISD::FSUB,
1001 ISD::FDIV,
1002 ISD::FMUL,
1003 ISD::FMINNUM,
1004 ISD::FMAXNUM,
1005 ISD::FMINNUM_IEEE,
1006 ISD::FMAXNUM_IEEE,
1007 ISD::FMINIMUM,
1008 ISD::FMAXIMUM,
1009 ISD::FMINIMUMNUM,
1010 ISD::FMAXIMUMNUM,
1011 ISD::FMA,
1012 ISD::SMIN,
1013 ISD::SMAX,
1014 ISD::UMIN,
1015 ISD::UMAX,
1016 ISD::SETCC,
1017 ISD::SELECT,
1018 ISD::SMIN,
1019 ISD::SMAX,
1020 ISD::UMIN,
1021 ISD::UMAX,
1022 ISD::AND,
1023 ISD::OR,
1024 ISD::XOR,
1025 ISD::SHL,
1026 ISD::SRL,
1027 ISD::SRA,
1028 ISD::FSHR,
1029 ISD::SINT_TO_FP,
1030 ISD::UINT_TO_FP,
1031 ISD::FCANONICALIZE,
1032 ISD::SCALAR_TO_VECTOR,
1033 ISD::ZERO_EXTEND,
1034 ISD::SIGN_EXTEND_INREG,
1035 ISD::ANY_EXTEND,
1036 ISD::EXTRACT_VECTOR_ELT,
1037 ISD::INSERT_VECTOR_ELT,
1038 ISD::FCOPYSIGN});
1039
1040 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1041 setTargetDAGCombine(ISD::FP_ROUND);
1042
1043 // All memory operations. Some folding on the pointer operand is done to help
1044 // matching the constant offsets in the addressing modes.
1045 setTargetDAGCombine({ISD::LOAD,
1046 ISD::STORE,
1047 ISD::ATOMIC_LOAD,
1048 ISD::ATOMIC_STORE,
1049 ISD::ATOMIC_CMP_SWAP,
1050 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1051 ISD::ATOMIC_SWAP,
1052 ISD::ATOMIC_LOAD_ADD,
1053 ISD::ATOMIC_LOAD_SUB,
1054 ISD::ATOMIC_LOAD_AND,
1055 ISD::ATOMIC_LOAD_OR,
1056 ISD::ATOMIC_LOAD_XOR,
1057 ISD::ATOMIC_LOAD_NAND,
1058 ISD::ATOMIC_LOAD_MIN,
1059 ISD::ATOMIC_LOAD_MAX,
1060 ISD::ATOMIC_LOAD_UMIN,
1061 ISD::ATOMIC_LOAD_UMAX,
1062 ISD::ATOMIC_LOAD_FADD,
1063 ISD::ATOMIC_LOAD_FMIN,
1064 ISD::ATOMIC_LOAD_FMAX,
1065 ISD::ATOMIC_LOAD_UINC_WRAP,
1066 ISD::ATOMIC_LOAD_UDEC_WRAP,
1067 ISD::ATOMIC_LOAD_USUB_COND,
1068 ISD::ATOMIC_LOAD_USUB_SAT,
1069 ISD::INTRINSIC_VOID,
1070 ISD::INTRINSIC_W_CHAIN});
1071
1072 // FIXME: In other contexts we pretend this is a per-function property.
1073 setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
1074
1075 setSchedulingPreference(Sched::RegPressure);
1076}
1077
1078const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1079
1080ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
1081 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1082 return RCRegs;
1083}
1084
1085//===----------------------------------------------------------------------===//
1086// TargetLowering queries
1087//===----------------------------------------------------------------------===//
1088
1089// v_mad_mix* support a conversion from f16 to f32.
1090//
1091// There is only one special case when denormals are enabled we don't currently,
1092// where this is OK to use.
1093bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1094 EVT DestVT, EVT SrcVT) const {
1095 return DestVT.getScalarType() == MVT::f32 &&
1096 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1097 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1098 SrcVT.getScalarType() == MVT::f16) ||
1099 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1100 SrcVT.getScalarType() == MVT::bf16)) &&
1101 // TODO: This probably only requires no input flushing?
1102 denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
1103}
1104
1105bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
1106 LLT DestTy, LLT SrcTy) const {
1107 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1108 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1109 DestTy.getScalarSizeInBits() == 32 &&
1110 SrcTy.getScalarSizeInBits() == 16 &&
1111 // TODO: This probably only requires no input flushing?
1112 denormalModeIsFlushAllF32(MF: *MI.getMF());
1113}
1114
1115bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
1116 // SI has some legal vector types, but no legal vector operations. Say no
1117 // shuffles are legal in order to prefer scalarizing some vector operations.
1118 return false;
1119}
1120
1121MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1122 CallingConv::ID CC,
1123 EVT VT) const {
1124 if (CC == CallingConv::AMDGPU_KERNEL)
1125 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1126
1127 if (VT.isVector()) {
1128 EVT ScalarVT = VT.getScalarType();
1129 unsigned Size = ScalarVT.getSizeInBits();
1130 if (Size == 16) {
1131 return Subtarget->has16BitInsts()
1132 ? MVT::getVectorVT(VT: ScalarVT.getSimpleVT(), NumElements: 2)
1133 : MVT::i32;
1134 }
1135
1136 if (Size < 16)
1137 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1138 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1139 }
1140
1141 if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16)
1142 return MVT::i32;
1143
1144 if (VT.getSizeInBits() > 32)
1145 return MVT::i32;
1146
1147 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1148}
1149
1150unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1151 CallingConv::ID CC,
1152 EVT VT) const {
1153 if (CC == CallingConv::AMDGPU_KERNEL)
1154 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1155
1156 if (VT.isVector()) {
1157 unsigned NumElts = VT.getVectorNumElements();
1158 EVT ScalarVT = VT.getScalarType();
1159 unsigned Size = ScalarVT.getSizeInBits();
1160
1161 // FIXME: Should probably promote 8-bit vectors to i16.
1162 if (Size == 16)
1163 return (NumElts + 1) / 2;
1164
1165 if (Size <= 32)
1166 return NumElts;
1167
1168 if (Size > 32)
1169 return NumElts * ((Size + 31) / 32);
1170 } else if (VT.getSizeInBits() > 32)
1171 return (VT.getSizeInBits() + 31) / 32;
1172
1173 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1174}
1175
1176unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
1177 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1178 unsigned &NumIntermediates, MVT &RegisterVT) const {
1179 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1180 unsigned NumElts = VT.getVectorNumElements();
1181 EVT ScalarVT = VT.getScalarType();
1182 unsigned Size = ScalarVT.getSizeInBits();
1183 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1184 // support, but unless we can properly handle 3-vectors, it will be still be
1185 // inconsistent.
1186 if (Size == 16) {
1187 MVT SimpleIntermediateVT =
1188 MVT::getVectorVT(VT: ScalarVT.getSimpleVT(), EC: ElementCount::getFixed(MinVal: 2));
1189 IntermediateVT = SimpleIntermediateVT;
1190 RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
1191 NumIntermediates = (NumElts + 1) / 2;
1192 return (NumElts + 1) / 2;
1193 }
1194
1195 if (Size == 32) {
1196 RegisterVT = ScalarVT.getSimpleVT();
1197 IntermediateVT = RegisterVT;
1198 NumIntermediates = NumElts;
1199 return NumIntermediates;
1200 }
1201
1202 if (Size < 16 && Subtarget->has16BitInsts()) {
1203 // FIXME: Should probably form v2i16 pieces
1204 RegisterVT = MVT::i16;
1205 IntermediateVT = ScalarVT;
1206 NumIntermediates = NumElts;
1207 return NumIntermediates;
1208 }
1209
1210 if (Size != 16 && Size <= 32) {
1211 RegisterVT = MVT::i32;
1212 IntermediateVT = ScalarVT;
1213 NumIntermediates = NumElts;
1214 return NumIntermediates;
1215 }
1216
1217 if (Size > 32) {
1218 RegisterVT = MVT::i32;
1219 IntermediateVT = RegisterVT;
1220 NumIntermediates = NumElts * ((Size + 31) / 32);
1221 return NumIntermediates;
1222 }
1223 }
1224
1225 return TargetLowering::getVectorTypeBreakdownForCallingConv(
1226 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1227}
1228
1229static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1230 const DataLayout &DL, Type *Ty,
1231 unsigned MaxNumLanes) {
1232 assert(MaxNumLanes != 0);
1233
1234 LLVMContext &Ctx = Ty->getContext();
1235 if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
1236 unsigned NumElts = std::min(a: MaxNumLanes, b: VT->getNumElements());
1237 return EVT::getVectorVT(Context&: Ctx, VT: TLI.getValueType(DL, Ty: VT->getElementType()),
1238 NumElements: NumElts);
1239 }
1240
1241 return TLI.getValueType(DL, Ty);
1242}
1243
1244// Peek through TFE struct returns to only use the data size.
1245static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1246 const DataLayout &DL, Type *Ty,
1247 unsigned MaxNumLanes) {
1248 auto *ST = dyn_cast<StructType>(Val: Ty);
1249 if (!ST)
1250 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1251
1252 // TFE intrinsics return an aggregate type.
1253 assert(ST->getNumContainedTypes() == 2 &&
1254 ST->getContainedType(1)->isIntegerTy(32));
1255 return memVTFromLoadIntrData(TLI, DL, Ty: ST->getContainedType(i: 0), MaxNumLanes);
1256}
1257
1258/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1259/// in-memory representation. This return value is a custom type because there
1260/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1261/// could cause issues during codegen, these address space 7 pointers will be
1262/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1263/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1264/// for cost modeling, to work. (This also sets us up decently for doing the
1265/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1266MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1267 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1268 return MVT::amdgpuBufferFatPointer;
1269 if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1270 DL.getPointerSizeInBits(AS) == 192)
1271 return MVT::amdgpuBufferStridedPointer;
1272 return AMDGPUTargetLowering::getPointerTy(DL, AS);
1273}
1274/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1275/// v8i32 when padding is added.
1276/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1277/// also v8i32 with padding.
1278MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1279 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1280 DL.getPointerSizeInBits(AS) == 160) ||
1281 (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1282 DL.getPointerSizeInBits(AS) == 192))
1283 return MVT::v8i32;
1284 return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1285}
1286
/// \returns the memory access width in bits for the given async load/store,
/// cooperative-atomic, or monitor-load intrinsic. Asserts on any intrinsic
/// not listed here.
static unsigned getIntrMemWidth(unsigned IntrID) {
  switch (IntrID) {
  // 8-bit accesses.
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
    return 8;
  // 32-bit accesses.
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b32:
    return 32;
  // 64-bit accesses.
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b64:
    return 64;
  // 128-bit accesses.
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b128:
    return 128;
  default:
    llvm_unreachable("Unknown width");
  }
}
1321
1322static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI,
1323 unsigned ArgIdx) {
1324 Value *OrderingArg = CI.getArgOperand(i: ArgIdx);
1325 unsigned Ord = cast<ConstantInt>(Val: OrderingArg)->getZExtValue();
1326 switch (AtomicOrderingCABI(Ord)) {
1327 case AtomicOrderingCABI::acquire:
1328 return AtomicOrdering::Acquire;
1329 break;
1330 case AtomicOrderingCABI::release:
1331 return AtomicOrdering::Release;
1332 break;
1333 case AtomicOrderingCABI::seq_cst:
1334 return AtomicOrdering::SequentiallyConsistent;
1335 break;
1336 default:
1337 return AtomicOrdering::Monotonic;
1338 }
1339}
1340
1341static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx) {
1342 MDNode *ScopeMD = cast<MDNode>(
1343 Val: cast<MetadataAsValue>(Val: CI.getArgOperand(i: ArgIdx))->getMetadata());
1344 StringRef Scope = cast<MDString>(Val: ScopeMD->getOperand(I: 0))->getString();
1345 return CI.getContext().getOrInsertSyncScopeID(SSN: Scope);
1346}
1347
1348void SITargetLowering::getTgtMemIntrinsic(SmallVectorImpl<IntrinsicInfo> &Infos,
1349 const CallBase &CI,
1350 MachineFunction &MF,
1351 unsigned IntrID) const {
1352 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
1353 if (CI.hasMetadata(KindID: LLVMContext::MD_invariant_load))
1354 Flags |= MachineMemOperand::MOInvariant;
1355 if (CI.hasMetadata(KindID: LLVMContext::MD_nontemporal))
1356 Flags |= MachineMemOperand::MONonTemporal;
1357 Flags |= getTargetMMOFlags(I: CI);
1358
1359 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1360 AMDGPU::lookupRsrcIntrinsic(Intr: IntrID)) {
1361 AttributeSet Attr =
1362 Intrinsic::getFnAttributes(C&: CI.getContext(), id: (Intrinsic::ID)IntrID);
1363 MemoryEffects ME = Attr.getMemoryEffects();
1364 if (ME.doesNotAccessMemory())
1365 return;
1366
1367 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1368 if (!IsSPrefetch) {
1369 auto *Aux = cast<ConstantInt>(Val: CI.getArgOperand(i: CI.arg_size() - 1));
1370 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1371 Flags |= MachineMemOperand::MOVolatile;
1372 }
1373 Flags |= MachineMemOperand::MODereferenceable;
1374
1375 IntrinsicInfo Info;
1376 // TODO: Should images get their own address space?
1377 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1378
1379 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1380 if (RsrcIntr->IsImage) {
1381 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1382 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID);
1383 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Intr->BaseOpcode);
1384 Info.align.reset();
1385 }
1386
1387 Value *RsrcArg = CI.getArgOperand(i: RsrcIntr->RsrcArg);
1388 if (auto *RsrcPtrTy = dyn_cast<PointerType>(Val: RsrcArg->getType())) {
1389 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1390 // We conservatively set the memory operand of a buffer intrinsic to the
1391 // base resource pointer, so that we can access alias information about
1392 // those pointers. Cases like "this points at the same value
1393 // but with a different offset" are handled in
1394 // areMemAccessesTriviallyDisjoint.
1395 Info.ptrVal = RsrcArg;
1396 }
1397
1398 if (ME.onlyReadsMemory()) {
1399 if (RsrcIntr->IsImage) {
1400 unsigned MaxNumLanes = 4;
1401
1402 if (!BaseOpcode->Gather4) {
1403 // If this isn't a gather, we may have excess loaded elements in the
1404 // IR type. Check the dmask for the real number of elements loaded.
1405 unsigned DMask =
1406 cast<ConstantInt>(Val: CI.getArgOperand(i: 0))->getZExtValue();
1407 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(Value: DMask);
1408 }
1409
1410 Info.memVT = memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(),
1411 Ty: CI.getType(), MaxNumLanes);
1412 } else {
1413 Info.memVT =
1414 memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(), Ty: CI.getType(),
1415 MaxNumLanes: std::numeric_limits<unsigned>::max());
1416 }
1417
1418 // FIXME: What does alignment mean for an image?
1419 Info.opc = ISD::INTRINSIC_W_CHAIN;
1420 Info.flags = Flags | MachineMemOperand::MOLoad;
1421 } else if (ME.onlyWritesMemory()) {
1422 Info.opc = ISD::INTRINSIC_VOID;
1423
1424 Type *DataTy = CI.getArgOperand(i: 0)->getType();
1425 if (RsrcIntr->IsImage) {
1426 unsigned DMask = cast<ConstantInt>(Val: CI.getArgOperand(i: 1))->getZExtValue();
1427 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(Value: DMask);
1428 Info.memVT = memVTFromLoadIntrData(TLI: *this, DL: MF.getDataLayout(), Ty: DataTy,
1429 MaxNumLanes: DMaskLanes);
1430 } else
1431 Info.memVT = getValueType(DL: MF.getDataLayout(), Ty: DataTy);
1432
1433 Info.flags = Flags | MachineMemOperand::MOStore;
1434 } else {
1435 // Atomic, NoReturn Sampler or prefetch
1436 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1437 : ISD::INTRINSIC_W_CHAIN;
1438
1439 switch (IntrID) {
1440 default:
1441 Info.flags = Flags | MachineMemOperand::MOLoad;
1442 if (!IsSPrefetch)
1443 Info.flags |= MachineMemOperand::MOStore;
1444
1445 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1446 // Fake memory access type for no return sampler intrinsics
1447 Info.memVT = MVT::i32;
1448 } else {
1449 // XXX - Should this be volatile without known ordering?
1450 Info.flags |= MachineMemOperand::MOVolatile;
1451 Info.memVT = MVT::getVT(Ty: CI.getArgOperand(i: 0)->getType());
1452 }
1453 break;
1454 case Intrinsic::amdgcn_raw_buffer_load_lds:
1455 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
1456 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1457 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
1458 case Intrinsic::amdgcn_struct_buffer_load_lds:
1459 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
1460 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
1461 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
1462 unsigned Width = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getZExtValue();
1463
1464 // Entry 0: Load from buffer.
1465 // Don't set an offset, since the pointer value always represents the
1466 // base of the buffer.
1467 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: Width * 8);
1468 Info.flags = Flags | MachineMemOperand::MOLoad;
1469 Infos.push_back(Elt: Info);
1470
1471 // Entry 1: Store to LDS.
1472 // Instruction offset is applied, and an additional per-lane offset
1473 // which we simulate using a larger memory type.
1474 Info.memVT = EVT::getIntegerVT(
1475 Context&: CI.getContext(), BitWidth: Width * 8 * Subtarget->getWavefrontSize());
1476 Info.ptrVal = CI.getArgOperand(i: 1); // LDS destination pointer
1477 Info.offset = cast<ConstantInt>(Val: CI.getArgOperand(i: CI.arg_size() - 2))
1478 ->getZExtValue();
1479 Info.fallbackAddressSpace = AMDGPUAS::LOCAL_ADDRESS;
1480 Info.flags = Flags | MachineMemOperand::MOStore;
1481 Infos.push_back(Elt: Info);
1482 return;
1483 }
1484 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1485 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1486 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1487 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1488 Info.memVT =
1489 memVTFromLoadIntrReturn(TLI: *this, DL: MF.getDataLayout(), Ty: CI.getType(),
1490 MaxNumLanes: std::numeric_limits<unsigned>::max());
1491 Info.flags = Flags | MachineMemOperand::MOLoad;
1492 Infos.push_back(Elt: Info);
1493 return;
1494 }
1495 }
1496 }
1497 Infos.push_back(Elt: Info);
1498 return;
1499 }
1500
1501 IntrinsicInfo Info;
1502 switch (IntrID) {
1503 case Intrinsic::amdgcn_ds_ordered_add:
1504 case Intrinsic::amdgcn_ds_ordered_swap: {
1505 Info.opc = ISD::INTRINSIC_W_CHAIN;
1506 Info.memVT = MVT::getVT(Ty: CI.getType());
1507 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1508 Info.align.reset();
1509 Info.flags = Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1510
1511 const ConstantInt *Vol = cast<ConstantInt>(Val: CI.getOperand(i_nocapture: 4));
1512 if (!Vol->isZero())
1513 Info.flags |= MachineMemOperand::MOVolatile;
1514
1515 Infos.push_back(Elt: Info);
1516 return;
1517 }
1518 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1519 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1520 Info.opc = ISD::INTRINSIC_W_CHAIN;
1521 Info.memVT = MVT::getVT(Ty: CI.getOperand(i_nocapture: 0)->getType());
1522 Info.ptrVal = nullptr;
1523 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1524 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1525 Infos.push_back(Elt: Info);
1526 return;
1527 }
1528 case Intrinsic::amdgcn_ds_append:
1529 case Intrinsic::amdgcn_ds_consume: {
1530 Info.opc = ISD::INTRINSIC_W_CHAIN;
1531 Info.memVT = MVT::getVT(Ty: CI.getType());
1532 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1533 Info.align.reset();
1534 Info.flags = Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1535
1536 const ConstantInt *Vol = cast<ConstantInt>(Val: CI.getOperand(i_nocapture: 1));
1537 if (!Vol->isZero())
1538 Info.flags |= MachineMemOperand::MOVolatile;
1539
1540 Infos.push_back(Elt: Info);
1541 return;
1542 }
1543 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1544 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1545 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1546 ? ISD::INTRINSIC_W_CHAIN
1547 : ISD::INTRINSIC_VOID;
1548 Info.memVT = MVT::getVT(Ty: CI.getType());
1549 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1550 Info.memVT = MVT::i64;
1551 Info.size = 8;
1552 Info.align.reset();
1553 Info.flags = Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1554 Infos.push_back(Elt: Info);
1555 return;
1556 }
1557 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1558 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1559 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1560 Info.opc = ISD::INTRINSIC_W_CHAIN;
1561 Info.memVT =
1562 MVT::getVT(Ty: IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1563 ? CI.getType()
1564 : cast<StructType>(Val: CI.getType())
1565 ->getElementType(N: 0)); // XXX: what is correct VT?
1566
1567 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1568 Info.align.reset();
1569 Info.flags = Flags | MachineMemOperand::MOLoad |
1570 MachineMemOperand::MODereferenceable;
1571 Infos.push_back(Elt: Info);
1572 return;
1573 }
1574 case Intrinsic::amdgcn_global_atomic_fmin_num:
1575 case Intrinsic::amdgcn_global_atomic_fmax_num:
1576 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1577 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1578 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1579 Info.opc = ISD::INTRINSIC_W_CHAIN;
1580 Info.memVT = MVT::getVT(Ty: CI.getType());
1581 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1582 Info.align.reset();
1583 Info.flags =
1584 Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
1585 MachineMemOperand::MODereferenceable | MachineMemOperand::MOVolatile;
1586 Infos.push_back(Elt: Info);
1587 return;
1588 }
1589 case Intrinsic::amdgcn_cluster_load_b32:
1590 case Intrinsic::amdgcn_cluster_load_b64:
1591 case Intrinsic::amdgcn_cluster_load_b128:
1592 case Intrinsic::amdgcn_ds_load_tr6_b96:
1593 case Intrinsic::amdgcn_ds_load_tr4_b64:
1594 case Intrinsic::amdgcn_ds_load_tr8_b64:
1595 case Intrinsic::amdgcn_ds_load_tr16_b128:
1596 case Intrinsic::amdgcn_global_load_tr6_b96:
1597 case Intrinsic::amdgcn_global_load_tr4_b64:
1598 case Intrinsic::amdgcn_global_load_tr_b64:
1599 case Intrinsic::amdgcn_global_load_tr_b128:
1600 case Intrinsic::amdgcn_ds_read_tr4_b64:
1601 case Intrinsic::amdgcn_ds_read_tr6_b96:
1602 case Intrinsic::amdgcn_ds_read_tr8_b64:
1603 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1604 Info.opc = ISD::INTRINSIC_W_CHAIN;
1605 Info.memVT = MVT::getVT(Ty: CI.getType());
1606 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1607 Info.align.reset();
1608 Info.flags = Flags | MachineMemOperand::MOLoad;
1609 Infos.push_back(Elt: Info);
1610 return;
1611 }
1612 case Intrinsic::amdgcn_flat_load_monitor_b32:
1613 case Intrinsic::amdgcn_flat_load_monitor_b64:
1614 case Intrinsic::amdgcn_flat_load_monitor_b128:
1615 case Intrinsic::amdgcn_global_load_monitor_b32:
1616 case Intrinsic::amdgcn_global_load_monitor_b64:
1617 case Intrinsic::amdgcn_global_load_monitor_b128: {
1618 Info.opc = ISD::INTRINSIC_W_CHAIN;
1619 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1620 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1621 Info.align.reset();
1622 Info.flags = MachineMemOperand::MOLoad;
1623 Info.order = parseAtomicOrderingCABIArg(CI, ArgIdx: 1);
1624 Info.ssid = parseSyncscopeMDArg(CI, ArgIdx: 2);
1625 Infos.push_back(Elt: Info);
1626 return;
1627 }
1628 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1629 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1630 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1631 Info.opc = ISD::INTRINSIC_W_CHAIN;
1632 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1633 Info.ptrVal = CI.getOperand(i_nocapture: 0);
1634 Info.align.reset();
1635 Info.flags = (MachineMemOperand::MOLoad | MOCooperative);
1636 Info.order = parseAtomicOrderingCABIArg(CI, ArgIdx: 1);
1637 Info.ssid = parseSyncscopeMDArg(CI, ArgIdx: 2);
1638 Infos.push_back(Elt: Info);
1639 return;
1640 }
1641 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1642 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1643 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1644 Info.opc = ISD::INTRINSIC_VOID;
1645 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1646 Info.ptrVal = CI.getArgOperand(i: 0);
1647 Info.align.reset();
1648 Info.flags = (MachineMemOperand::MOStore | MOCooperative);
1649 Info.order = parseAtomicOrderingCABIArg(CI, ArgIdx: 2);
1650 Info.ssid = parseSyncscopeMDArg(CI, ArgIdx: 3);
1651 Infos.push_back(Elt: Info);
1652 return;
1653 }
1654 case Intrinsic::amdgcn_ds_gws_init:
1655 case Intrinsic::amdgcn_ds_gws_barrier:
1656 case Intrinsic::amdgcn_ds_gws_sema_v:
1657 case Intrinsic::amdgcn_ds_gws_sema_br:
1658 case Intrinsic::amdgcn_ds_gws_sema_p:
1659 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1660 Info.opc = ISD::INTRINSIC_VOID;
1661
1662 const GCNTargetMachine &TM =
1663 static_cast<const GCNTargetMachine &>(getTargetMachine());
1664
1665 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1666 Info.ptrVal = MFI->getGWSPSV(TM);
1667
1668 // This is an abstract access, but we need to specify a type and size.
1669 Info.memVT = MVT::i32;
1670 Info.size = 4;
1671 Info.align = Align(4);
1672
1673 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1674 Info.flags = Flags | MachineMemOperand::MOLoad;
1675 else
1676 Info.flags = Flags | MachineMemOperand::MOStore;
1677 Infos.push_back(Elt: Info);
1678 return;
1679 }
1680 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1681 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1682 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1683 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1684 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1685 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1686 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1687 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1688 // Entry 0: Load from source (global/flat).
1689 Info.opc = ISD::INTRINSIC_VOID;
1690 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1691 Info.ptrVal = CI.getArgOperand(i: 0); // Global pointer
1692 Info.offset = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getSExtValue();
1693 Info.flags = Flags | MachineMemOperand::MOLoad;
1694 Infos.push_back(Elt: Info);
1695
1696 // Entry 1: Store to LDS (same offset).
1697 Info.flags = Flags | MachineMemOperand::MOStore;
1698 Info.ptrVal = CI.getArgOperand(i: 1); // LDS pointer
1699 Infos.push_back(Elt: Info);
1700 return;
1701 }
1702 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1703 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1704 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1705 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1706 // Entry 0: Load from LDS.
1707 Info.opc = ISD::INTRINSIC_VOID;
1708 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: getIntrMemWidth(IntrID));
1709 Info.ptrVal = CI.getArgOperand(i: 1); // LDS pointer
1710 Info.offset = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getSExtValue();
1711 Info.flags = Flags | MachineMemOperand::MOLoad;
1712 Infos.push_back(Elt: Info);
1713
1714 // Entry 1: Store to global (same offset).
1715 Info.flags = Flags | MachineMemOperand::MOStore;
1716 Info.ptrVal = CI.getArgOperand(i: 0); // Global pointer
1717 Infos.push_back(Elt: Info);
1718 return;
1719 }
1720 case Intrinsic::amdgcn_load_to_lds:
1721 case Intrinsic::amdgcn_load_async_to_lds:
1722 case Intrinsic::amdgcn_global_load_lds:
1723 case Intrinsic::amdgcn_global_load_async_lds: {
1724 unsigned Width = cast<ConstantInt>(Val: CI.getArgOperand(i: 2))->getZExtValue();
1725 auto *Aux = cast<ConstantInt>(Val: CI.getArgOperand(i: CI.arg_size() - 1));
1726 bool IsVolatile = Aux->getZExtValue() & AMDGPU::CPol::VOLATILE;
1727 if (IsVolatile)
1728 Flags |= MachineMemOperand::MOVolatile;
1729
1730 // Entry 0: Load from source (global/flat).
1731 Info.opc = ISD::INTRINSIC_VOID;
1732 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: Width * 8);
1733 Info.ptrVal = CI.getArgOperand(i: 0); // Source pointer
1734 Info.offset = cast<ConstantInt>(Val: CI.getArgOperand(i: 3))->getSExtValue();
1735 Info.flags = Flags | MachineMemOperand::MOLoad;
1736 Infos.push_back(Elt: Info);
1737
1738 // Entry 1: Store to LDS.
1739 // Same offset from the instruction, but an additional per-lane offset is
1740 // added. Represent that using a wider memory type.
1741 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(),
1742 BitWidth: Width * 8 * Subtarget->getWavefrontSize());
1743 Info.ptrVal = CI.getArgOperand(i: 1); // LDS destination pointer
1744 Info.flags = Flags | MachineMemOperand::MOStore;
1745 Infos.push_back(Elt: Info);
1746 return;
1747 }
1748 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1749 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1750 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1751 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1752 Info.opc = ISD::INTRINSIC_W_CHAIN;
1753
1754 const GCNTargetMachine &TM =
1755 static_cast<const GCNTargetMachine &>(getTargetMachine());
1756
1757 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1758 Info.ptrVal = MFI->getGWSPSV(TM);
1759
1760 // This is an abstract access, but we need to specify a type and size.
1761 Info.memVT = MVT::i32;
1762 Info.size = 4;
1763 Info.align = Align(4);
1764
1765 Info.flags = Flags | MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1766 Infos.push_back(Elt: Info);
1767 return;
1768 }
1769 case Intrinsic::amdgcn_s_prefetch_data:
1770 case Intrinsic::amdgcn_flat_prefetch:
1771 case Intrinsic::amdgcn_global_prefetch: {
1772 Info.opc = ISD::INTRINSIC_VOID;
1773 Info.memVT = EVT::getIntegerVT(Context&: CI.getContext(), BitWidth: 8);
1774 Info.ptrVal = CI.getArgOperand(i: 0);
1775 Info.flags = Flags | MachineMemOperand::MOLoad;
1776 Infos.push_back(Elt: Info);
1777 return;
1778 }
1779 default:
1780 return;
1781 }
1782}
1783
1784void SITargetLowering::CollectTargetIntrinsicOperands(
1785 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1786 switch (cast<IntrinsicInst>(Val: I).getIntrinsicID()) {
1787 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1788 // The DAG's ValueType loses the addrspaces.
1789 // Add them as 2 extra Constant operands "from" and "to".
1790 unsigned SrcAS = I.getOperand(i_nocapture: 0)->getType()->getPointerAddressSpace();
1791 unsigned DstAS = I.getType()->getPointerAddressSpace();
1792 Ops.push_back(Elt: DAG.getTargetConstant(Val: SrcAS, DL: SDLoc(), VT: MVT::i32));
1793 Ops.push_back(Elt: DAG.getTargetConstant(Val: DstAS, DL: SDLoc(), VT: MVT::i32));
1794 break;
1795 }
1796 default:
1797 break;
1798 }
1799}
1800
1801bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
1802 SmallVectorImpl<Value *> &Ops,
1803 Type *&AccessTy) const {
1804 Value *Ptr = nullptr;
1805 switch (II->getIntrinsicID()) {
1806 case Intrinsic::amdgcn_cluster_load_b128:
1807 case Intrinsic::amdgcn_cluster_load_b64:
1808 case Intrinsic::amdgcn_cluster_load_b32:
1809 case Intrinsic::amdgcn_ds_append:
1810 case Intrinsic::amdgcn_ds_consume:
1811 case Intrinsic::amdgcn_ds_load_tr8_b64:
1812 case Intrinsic::amdgcn_ds_load_tr16_b128:
1813 case Intrinsic::amdgcn_ds_load_tr4_b64:
1814 case Intrinsic::amdgcn_ds_load_tr6_b96:
1815 case Intrinsic::amdgcn_ds_read_tr4_b64:
1816 case Intrinsic::amdgcn_ds_read_tr6_b96:
1817 case Intrinsic::amdgcn_ds_read_tr8_b64:
1818 case Intrinsic::amdgcn_ds_read_tr16_b64:
1819 case Intrinsic::amdgcn_ds_ordered_add:
1820 case Intrinsic::amdgcn_ds_ordered_swap:
1821 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1822 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1823 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1824 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1825 case Intrinsic::amdgcn_global_atomic_fmax_num:
1826 case Intrinsic::amdgcn_global_atomic_fmin_num:
1827 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1828 case Intrinsic::amdgcn_global_load_tr_b64:
1829 case Intrinsic::amdgcn_global_load_tr_b128:
1830 case Intrinsic::amdgcn_global_load_tr4_b64:
1831 case Intrinsic::amdgcn_global_load_tr6_b96:
1832 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1833 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1834 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1835 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1836 Ptr = II->getArgOperand(i: 0);
1837 break;
1838 case Intrinsic::amdgcn_load_to_lds:
1839 case Intrinsic::amdgcn_load_async_to_lds:
1840 case Intrinsic::amdgcn_global_load_lds:
1841 case Intrinsic::amdgcn_global_load_async_lds:
1842 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1843 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1844 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1845 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1846 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1847 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1848 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1849 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1850 Ptr = II->getArgOperand(i: 1);
1851 break;
1852 default:
1853 return false;
1854 }
1855 AccessTy = II->getType();
1856 Ops.push_back(Elt: Ptr);
1857 return true;
1858}
1859
1860bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1861 unsigned AddrSpace) const {
1862 if (!Subtarget->hasFlatInstOffsets()) {
1863 // Flat instructions do not have offsets, and only have the register
1864 // address.
1865 return AM.BaseOffs == 0 && AM.Scale == 0;
1866 }
1867
1868 decltype(SIInstrFlags::FLAT) FlatVariant =
1869 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ? SIInstrFlags::FlatGlobal
1870 : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? SIInstrFlags::FlatScratch
1871 : SIInstrFlags::FLAT;
1872
1873 return AM.Scale == 0 &&
1874 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1875 Offset: AM.BaseOffs, AddrSpace, FlatVariant));
1876}
1877
1878bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1879 if (Subtarget->hasFlatGlobalInsts())
1880 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::GLOBAL_ADDRESS);
1881
1882 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1883 // Assume the we will use FLAT for all global memory accesses
1884 // on VI.
1885 // FIXME: This assumption is currently wrong. On VI we still use
1886 // MUBUF instructions for the r + i addressing mode. As currently
1887 // implemented, the MUBUF instructions only work on buffer < 4GB.
1888 // It may be possible to support > 4GB buffers with MUBUF instructions,
1889 // by setting the stride value in the resource descriptor which would
1890 // increase the size limit to (stride * 4GB). However, this is risky,
1891 // because it has never been validated.
1892 return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS);
1893 }
1894
1895 return isLegalMUBUFAddressingMode(AM);
1896}
1897
1898bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1899 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1900 // additionally can do r + r + i with addr64. 32-bit has more addressing
1901 // mode options. Depending on the resource constant, it can also do
1902 // (i64 r0) + (i32 r1) * (i14 i).
1903 //
1904 // Private arrays end up using a scratch buffer most of the time, so also
1905 // assume those use MUBUF instructions. Scratch loads / stores are currently
1906 // implemented as mubuf instructions with offen bit set, so slightly
1907 // different than the normal addr64.
1908 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1909 if (!TII->isLegalMUBUFImmOffset(Imm: AM.BaseOffs))
1910 return false;
1911
1912 // FIXME: Since we can split immediate into soffset and immediate offset,
1913 // would it make sense to allow any immediate?
1914
1915 switch (AM.Scale) {
1916 case 0: // r + i or just i, depending on HasBaseReg.
1917 return true;
1918 case 1:
1919 return true; // We have r + r or r + i.
1920 case 2:
1921 if (AM.HasBaseReg) {
1922 // Reject 2 * r + r.
1923 return false;
1924 }
1925
1926 // Allow 2 * r as r + r
1927 // Or 2 * r + i is allowed as r + r + i.
1928 return true;
1929 default: // Don't allow n * r
1930 return false;
1931 }
1932}
1933
/// Determine whether the addressing mode described by \p AM
/// (BaseGV + BaseReg + BaseOffs + Scale * IndexReg) is natively supported for
/// an access of type \p Ty in address space \p AS on this subtarget. Each
/// address space is dispatched to the matching instruction family's rules.
bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                             const AddrMode &AM, Type *Ty,
                                             unsigned AS,
                                             Instruction *I) const {
  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return isLegalGlobalAddressingMode(AM);

  // Constant and buffer address spaces: checked against scalar (SMRD/SMEM)
  // and buffer encodings below.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
      AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
    // If the offset isn't a multiple of 4, it probably isn't going to be
    // correctly aligned.
    // FIXME: Can we get the real alignment here?
    if (AM.BaseOffs % 4 != 0)
      return isLegalMUBUFAddressingMode(AM);

    if (!Subtarget->hasScalarSubwordLoads()) {
      // There are no SMRD extloads, so if we have to do a small type access we
      // will use a MUBUF load.
      // FIXME?: We also need to do this if unaligned, but we don't know the
      // alignment here.
      if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
        return isLegalGlobalAddressingMode(AM);
    }

    // The immediate-offset encoding varies by hardware generation; reject
    // offsets that do not fit the encoding for this subtarget.
    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
      // SMRD instructions have an 8-bit, dword offset on SI.
      if (!isUInt<8>(x: AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
      // On CI+, this can also be a 32-bit literal constant offset. If it fits
      // in 8-bits, it can use a smaller encoding.
      if (!isUInt<32>(x: AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
      if (!isUInt<20>(x: AM.BaseOffs))
        return false;
    } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
      // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
      // for S_BUFFER_* instructions).
      if (!isInt<21>(x: AM.BaseOffs))
        return false;
    } else {
      // On GFX12, all offsets are signed 24-bit in bytes.
      if (!isInt<24>(x: AM.BaseOffs))
        return false;
    }

    if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
         AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
        AM.BaseOffs < 0) {
      // Scalar (non-buffer) loads can only use a negative offset if
      // soffset+offset is non-negative. Since the compiler can only prove that
      // in a few special cases, it is safer to claim that negative offsets are
      // not supported.
      return false;
    }

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;
  }

  // Private memory: flat-scratch when enabled, otherwise MUBUF scratch.
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return Subtarget->hasFlatScratchEnabled()
               ? isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::PRIVATE_ADDRESS)
               : isLegalMUBUFAddressingMode(AM);

  if (AS == AMDGPUAS::LOCAL_ADDRESS ||
      (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
    // field.
    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
    // an 8-bit dword offset but we don't know the alignment here.
    if (!isUInt<16>(x: AM.BaseOffs))
      return false;

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;
  }

  if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
    // For an unknown address space, this usually means that this is for some
    // reason being used for pure arithmetic, and not based on some addressing
    // computation. We don't have instructions that compute pointers with any
    // addressing modes, so treat them as having no offset like flat
    // instructions.
    return isLegalFlatAddressingMode(AM, AddrSpace: AMDGPUAS::FLAT_ADDRESS);
  }

  // Assume a user alias of global for unknown address spaces.
  return isLegalGlobalAddressingMode(AM);
}
2042
2043bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
2044 const MachineFunction &MF) const {
2045 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
2046 return (MemVT.getSizeInBits() <= 4 * 32);
2047 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
2048 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
2049 return (MemVT.getSizeInBits() <= MaxPrivateBits);
2050 }
2051 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
2052 return (MemVT.getSizeInBits() <= 2 * 32);
2053 return true;
2054}
2055
/// Report whether a misaligned access of \p Size bits in \p AddrSpace with
/// the given \p Alignment is allowed, and optionally how fast it is via
/// \p IsFast. The value written to \p IsFast is a relative "speed rank"
/// (see the comments on the LDS cases below), not an additive cost; 0 or 1
/// mean "slow".
bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
    unsigned Size, unsigned AddrSpace, Align Alignment,
    MachineMemOperand::Flags Flags, unsigned *IsFast) const {
  // Pessimistic default: "slowest possible" until a branch below upgrades it.
  if (IsFast)
    *IsFast = 0;

  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
      AddrSpace == AMDGPUAS::REGION_ADDRESS) {
    // Check if alignment requirements for ds_read/write instructions are
    // disabled.
    if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
      return false;

    Align RequiredAlignment(
        PowerOf2Ceil(A: divideCeil(Numerator: Size, Denominator: 8))); // Natural alignment.
    if (Subtarget->hasLDSMisalignedBugInWGPMode() && Size > 32 &&
        Alignment < RequiredAlignment)
      return false;

    // Either, the alignment requirements are "enabled", or there is an
    // unaligned LDS access related hardware bug though alignment requirements
    // are "disabled". In either case, we need to check for proper alignment
    // requirements.
    //
    switch (Size) {
    case 64:
      // SI has a hardware bug in the LDS / GDS bounds checking: if the base
      // address is negative, then the instruction is incorrectly treated as
      // out-of-bounds even if base + offsets is in bounds. Split vectorized
      // loads here to avoid emitting ds_read2_b32. We may re-combine the
      // load later in the SILoadStoreOptimizer.
      if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
        return false;

      // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
      // can do a 4 byte aligned, 8 byte access in a single operation using
      // ds_read2/write2_b32 with adjacent offsets.
      RequiredAlignment = Align(4);

      if (Subtarget->hasUnalignedDSAccessEnabled()) {
        // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
        // ds_write2_b32 depending on the alignment. In either case with either
        // alignment there is no faster way of doing this.

        // The numbers returned here and below are not additive, it is a 'speed
        // rank'. They are just meant to be compared to decide if a certain way
        // of lowering an operation is faster than another. For that purpose
        // naturally aligned operation gets it bitsize to indicate that "it
        // operates with a speed comparable to N-bit wide load". With the full
        // alignment ds128 is slower than ds96 for example. If underaligned it
        // is comparable to a speed of a single dword access, which would then
        // mean 32 < 128 and it is faster to issue a wide load regardless.
        // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a
        // wider load which will not be aligned anymore the latter is slower.
        if (IsFast)
          *IsFast = (Alignment >= RequiredAlignment) ? 64
                    : (Alignment < Align(4))         ? 32
                                                     : 1;
        return true;
      }

      break;
    case 96:
      if (!Subtarget->hasDS96AndDS128())
        return false;

      // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
      // gfx8 and older.

      if (Subtarget->hasUnalignedDSAccessEnabled()) {
        // Naturally aligned access is fastest. However, also report it is Fast
        // if memory is aligned less than DWORD. A narrow load or store will be
        // be equally slow as a single ds_read_b96/ds_write_b96, but there will
        // be more of them, so overall we will pay less penalty issuing a single
        // instruction.

        // See comment on the values above.
        if (IsFast)
          *IsFast = (Alignment >= RequiredAlignment) ? 96
                    : (Alignment < Align(4))         ? 32
                                                     : 1;
        return true;
      }

      break;
    case 128:
      if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
        return false;

      // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
      // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
      // single operation using ds_read2/write2_b64.
      RequiredAlignment = Align(8);

      if (Subtarget->hasUnalignedDSAccessEnabled()) {
        // Naturally aligned access is fastest. However, also report it is Fast
        // if memory is aligned less than DWORD. A narrow load or store will be
        // be equally slow as a single ds_read_b128/ds_write_b128, but there
        // will be more of them, so overall we will pay less penalty issuing a
        // single instruction.

        // See comment on the values above.
        if (IsFast)
          *IsFast = (Alignment >= RequiredAlignment) ? 128
                    : (Alignment < Align(4))         ? 32
                                                     : 1;
        return true;
      }

      break;
    default:
      if (Size > 32)
        return false;

      break;
    }

    // See comment on the values above.
    // Note that we have a single-dword or sub-dword here, so if underaligned
    // it is a slowest possible access, hence returned value is 0.
    if (IsFast)
      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||
           Subtarget->hasUnalignedDSAccessEnabled();
  }

  // FIXME: We have to be conservative here and assume that flat operations
  // will access scratch.  If we had access to the IR function, then we
  // could determine if any private memory was used in the function.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
      AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
    bool AlignedBy4 = Alignment >= Align(4);
    if (Subtarget->hasUnalignedScratchAccessEnabled()) {
      if (IsFast)
        *IsFast = AlignedBy4 ? Size : 1;
      return true;
    }

    if (IsFast)
      *IsFast = AlignedBy4;

    return AlignedBy4;
  }

  // So long as they are correct, wide global memory operations perform better
  // than multiple smaller memory ops -- even when misaligned
  if (AMDGPU::isExtendedGlobalAddrSpace(AS: AddrSpace)) {
    if (IsFast)
      *IsFast = Size;

    return Alignment >= Align(4) ||
           Subtarget->hasUnalignedBufferAccessEnabled();
  }

  // Ensure robust out-of-bounds guarantees for buffer accesses are met if
  // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
  // out-of-bounds behavior, but in the edge case where an access starts
  // out-of-bounds and then enter in-bounds, the entire access would be treated
  // as out-of-bounds. Prevent misaligned memory accesses by requiring the
  // natural alignment of buffer accesses.
  if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
      AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
      AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
    if (!Subtarget->hasRelaxedBufferOOBMode() &&
        Alignment < Align(PowerOf2Ceil(A: divideCeil(Numerator: Size, Denominator: 8))))
      return false;
  }

  // Smaller than dword value must be aligned.
  if (Size < 32)
    return false;

  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
  // byte-address are ignored, thus forcing Dword alignment.
  // This applies to private, global, and constant memory.
  if (IsFast)
    *IsFast = 1;

  return Size >= 32 && Alignment >= Align(4);
}
2237
2238bool SITargetLowering::allowsMisalignedMemoryAccesses(
2239 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2240 unsigned *IsFast) const {
2241 return allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace,
2242 Alignment, Flags, IsFast);
2243}
2244
2245EVT SITargetLowering::getOptimalMemOpType(
2246 LLVMContext &Context, const MemOp &Op,
2247 const AttributeList &FuncAttributes) const {
2248 // FIXME: Should account for address space here.
2249
2250 // The default fallback uses the private pointer size as a guess for a type to
2251 // use. Make sure we switch these to 64-bit accesses.
2252
2253 if (Op.size() >= 16 &&
2254 Op.isDstAligned(AlignCheck: Align(4))) // XXX: Should only do for global
2255 return MVT::v4i32;
2256
2257 if (Op.size() >= 8 && Op.isDstAligned(AlignCheck: Align(4)))
2258 return MVT::v2i32;
2259
2260 // Use the default.
2261 return MVT::Other;
2262}
2263
2264bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
2265 const MemSDNode *MemNode = cast<MemSDNode>(Val: N);
2266 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2267}
2268
2269bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
2270 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
2271 AS == AMDGPUAS::PRIVATE_ADDRESS;
2272}
2273
2274bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
2275 unsigned DestAS) const {
2276 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2277 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2278 Subtarget->hasGloballyAddressableScratch()) {
2279 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2280 return false;
2281 }
2282
2283 // Flat -> private/local is a simple truncate.
2284 // Flat -> global is no-op
2285 return true;
2286 }
2287
2288 const GCNTargetMachine &TM =
2289 static_cast<const GCNTargetMachine &>(getTargetMachine());
2290 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2291}
2292
2293TargetLoweringBase::LegalizeTypeAction
2294SITargetLowering::getPreferredVectorAction(MVT VT) const {
2295 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2296 VT.getScalarType().bitsLE(VT: MVT::i16))
2297 return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
2298 return TargetLoweringBase::getPreferredVectorAction(VT);
2299}
2300
2301bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
2302 Type *Ty) const {
2303 // FIXME: Could be smarter if called for vector constants.
2304 return true;
2305}
2306
2307bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
2308 unsigned Index) const {
2309 if (!isOperationLegalOrCustom(Op: ISD::EXTRACT_SUBVECTOR, VT: ResVT))
2310 return false;
2311
2312 // TODO: Add more cases that are cheap.
2313 return Index == 0;
2314}
2315
2316bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2317 // TODO: This should be more aggressive, particular for 16-bit element
2318 // vectors. However there are some mixed improvements and regressions.
2319 EVT EltTy = VT.getVectorElementType();
2320 unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;
2321 return EltTy.getSizeInBits() % MinAlign == 0;
2322}
2323
2324bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
2325 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2326 switch (Op) {
2327 case ISD::LOAD:
2328 case ISD::STORE:
2329 return true;
2330 default:
2331 return false;
2332 }
2333 }
2334
2335 // SimplifySetCC uses this function to determine whether or not it should
2336 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2337 if (VT == MVT::i1 && Op == ISD::SETCC)
2338 return false;
2339
2340 return TargetLowering::isTypeDesirableForOp(Op, VT);
2341}
2342
2343MachinePointerInfo
2344SITargetLowering::getKernargSegmentPtrInfo(MachineFunction &MF) const {
2345 // This isn't really a constant pool but close enough.
2346 MachinePointerInfo PtrInfo(MF.getPSVManager().getConstantPool());
2347 PtrInfo.AddrSpace = AMDGPUAS::CONSTANT_ADDRESS;
2348 return PtrInfo;
2349}
2350
/// Build a pointer to byte offset \p Offset within the kernarg segment.
///
/// Returns the preloaded kernarg segment pointer plus \p Offset, or a bare
/// constant \p Offset if no kernarg segment pointer argument exists.
SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
                                                   const SDLoc &SL,
                                                   SDValue Chain,
                                                   uint64_t Offset) const {
  const DataLayout &DL = DAG.getDataLayout();
  MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  MVT PtrVT = getPointerTy(DL, AS: AMDGPUAS::CONSTANT_ADDRESS);

  auto [InputPtrReg, RC, ArgTy] =
      Info->getPreloadedValue(Value: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);

  // We may not have the kernarg segment argument if we have no kernel
  // arguments.
  if (!InputPtrReg)
    return DAG.getConstant(Val: Offset, DL: SL, VT: PtrVT);

  // Read the base pointer out of the live-in virtual register for the
  // preloaded SGPR argument.
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  SDValue BasePtr = DAG.getCopyFromReg(
      Chain, dl: SL, Reg: MRI.getLiveInVirtReg(PReg: InputPtrReg->getRegister()), VT: PtrVT);

  return DAG.getObjectPtrOffset(SL, Ptr: BasePtr, Offset: TypeSize::getFixed(ExactSize: Offset));
}
2374
2375SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2376 const SDLoc &SL) const {
2377 uint64_t Offset =
2378 getImplicitParameterOffset(MF: DAG.getMachineFunction(), Param: FIRST_IMPLICIT);
2379 return lowerKernArgParameterPtr(DAG, SL, Chain: DAG.getEntryNode(), Offset);
2380}
2381
2382SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2383 const SDLoc &SL) const {
2384
2385 Function &F = DAG.getMachineFunction().getFunction();
2386 std::optional<uint32_t> KnownSize =
2387 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
2388 if (KnownSize.has_value())
2389 return DAG.getConstant(Val: *KnownSize, DL: SL, VT: MVT::i32);
2390 return SDValue();
2391}
2392
/// Convert an argument value loaded with memory type \p MemVT to the expected
/// argument type \p VT: narrow a widened vector, honor sext/zext argument
/// flags, and extend/truncate/round the elements or scalar as needed.
SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                         const SDLoc &SL, SDValue Val,
                                         bool Signed,
                                         const ISD::InputArg *Arg) const {
  // First, if it is a widened vector, narrow it.
  if (VT.isVector() &&
      VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
    EVT NarrowedVT =
        EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getVectorElementType(),
                         NumElements: VT.getVectorNumElements());
    Val = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: NarrowedVT, N1: Val,
                      N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
  }

  // Then convert the vector elements or scalar value.
  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(VT: MemVT)) {
    // Record the known sign/zero extension of the in-register value so later
    // combines can exploit it.
    unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
    Val = DAG.getNode(Opcode: Opc, DL: SL, VT: MemVT, N1: Val, N2: DAG.getValueType(VT));
  }

  if (MemVT.isFloatingPoint()) {
    if (VT.isFloatingPoint()) {
      Val = getFPExtOrFPRound(DAG, Op: Val, DL: SL, VT);
    } else {
      // FP in memory but integer expected: reinterpret the bits as an
      // integer, then resize.
      assert(!MemVT.isVector());
      EVT IntVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
      SDValue Cast = DAG.getBitcast(VT: IntVT, V: Val);
      Val = DAG.getAnyExtOrTrunc(Op: Cast, DL: SL, VT);
    }
  } else if (Signed)
    Val = DAG.getSExtOrTrunc(Op: Val, DL: SL, VT);
  else
    Val = DAG.getZExtOrTrunc(Op: Val, DL: SL, VT);

  return Val;
}
2429
/// Load a kernel argument of type \p MemVT from offset \p Offset in the
/// kernarg segment and convert it to the expected type \p VT.
///
/// Returns a merge of {converted value, load chain}.
SDValue SITargetLowering::lowerKernargMemParameter(
    SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
    uint64_t Offset, Align Alignment, bool Signed,
    const ISD::InputArg *Arg) const {

  MachinePointerInfo PtrInfo =
      getKernargSegmentPtrInfo(MF&: DAG.getMachineFunction());

  // Try to avoid using an extload by loading earlier than the argument address,
  // and extracting the relevant bits. The load should hopefully be merged with
  // the previous argument.
  if (MemVT.getStoreSize() < 4 && Alignment < 4) {
    // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
    int64_t AlignDownOffset = alignDown(Value: Offset, Align: 4);
    int64_t OffsetDiff = Offset - AlignDownOffset;

    EVT IntVT = MemVT.changeTypeToInteger();

    // TODO: If we passed in the base kernel offset we could have a better
    // alignment than 4, but we don't really need it.
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset: AlignDownOffset);
    SDValue Load = DAG.getLoad(VT: MVT::i32, dl: SL, Chain, Ptr,
                               PtrInfo: PtrInfo.getWithOffset(O: AlignDownOffset), Alignment: Align(4),
                               MMOFlags: MachineMemOperand::MODereferenceable |
                                   MachineMemOperand::MOInvariant);

    // Shift the wanted bytes down to bit 0 and truncate to the argument's
    // integer type.
    SDValue ShiftAmt = DAG.getConstant(Val: OffsetDiff * 8, DL: SL, VT: MVT::i32);
    SDValue Extract = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: Load, N2: ShiftAmt);

    SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: IntVT, Operand: Extract);
    ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MemVT, Operand: ArgVal);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, Val: ArgVal, Signed, Arg);

    return DAG.getMergeValues(Ops: {ArgVal, Load.getValue(R: 1)}, dl: SL);
  }

  // Common case: a direct, dereferenceable, invariant load at the argument's
  // own offset.
  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
  SDValue Load = DAG.getLoad(
      VT: MemVT, dl: SL, Chain, Ptr, PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment,
      MMOFlags: MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Val: Load, Signed, Arg);
  return DAG.getMergeValues(Ops: {Val, Load.getValue(R: 1)}, dl: SL);
}
2474
/// Coerce an argument which was passed in a different ABI type to the original
/// expected value type.
///
/// \p Val carries the location type (VA.getLocVT()); the result has the value
/// type (VA.getValVT()).
SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
                                                    SDValue Val,
                                                    CCValAssign &VA,
                                                    const SDLoc &SL) const {
  EVT ValVT = VA.getValVT();

  // If this is an 8 or 16-bit value, it is really passed promoted
  // to 32 bits. Insert an assert[sz]ext to capture this, then
  // truncate to the right size.
  switch (VA.getLocInfo()) {
  case CCValAssign::Full:
    // Passed in its own type; nothing to do.
    return Val;
  case CCValAssign::BCvt:
    // Same bits, different type.
    return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: ValVT, Operand: Val);
  case CCValAssign::SExt:
    Val = DAG.getNode(Opcode: ISD::AssertSext, DL: SL, VT: VA.getLocVT(), N1: Val,
                      N2: DAG.getValueType(ValVT));
    return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ValVT, Operand: Val);
  case CCValAssign::ZExt:
    Val = DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: VA.getLocVT(), N1: Val,
                      N2: DAG.getValueType(ValVT));
    return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ValVT, Operand: Val);
  case CCValAssign::AExt:
    // Upper bits are undefined; a plain truncate suffices.
    return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: ValVT, Operand: Val);
  default:
    llvm_unreachable("Unknown loc info!");
  }
}
2505
/// Lower an incoming argument that was assigned a stack location.
///
/// Byval arguments return the frame index directly; other arguments are
/// loaded from a fixed stack object (with the appropriate extension) and
/// coerced back to the expected value type.
SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
                                              CCValAssign &VA, const SDLoc &SL,
                                              SDValue Chain,
                                              const ISD::InputArg &Arg) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  if (Arg.Flags.isByVal()) {
    // The caller passed the object itself on the stack; hand back its address.
    unsigned Size = Arg.Flags.getByValSize();
    int FrameIdx = MFI.CreateFixedObject(Size, SPOffset: VA.getLocMemOffset(), IsImmutable: false);
    return DAG.getFrameIndex(FI: FrameIdx, VT: MVT::i32);
  }

  unsigned ArgOffset = VA.getLocMemOffset();
  unsigned ArgSize = VA.getValVT().getStoreSize();

  int FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset, IsImmutable: true);

  // Create load nodes to retrieve arguments from the stack.
  SDValue FIN = DAG.getFrameIndex(FI, VT: MVT::i32);

  // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
  MVT MemVT = VA.getValVT();

  // Pick the extension kind that matches how the value was stored.
  switch (VA.getLocInfo()) {
  default:
    break;
  case CCValAssign::BCvt:
    MemVT = VA.getLocVT();
    break;
  case CCValAssign::SExt:
    ExtType = ISD::SEXTLOAD;
    break;
  case CCValAssign::ZExt:
    ExtType = ISD::ZEXTLOAD;
    break;
  case CCValAssign::AExt:
    ExtType = ISD::EXTLOAD;
    break;
  }

  SDValue ArgValue = DAG.getExtLoad(
      ExtType, dl: SL, VT: VA.getLocVT(), Chain, Ptr: FIN,
      PtrInfo: MachinePointerInfo::getFixedStack(MF&: DAG.getMachineFunction(), FI), MemVT);

  SDValue ConvertedVal = convertABITypeToValueType(DAG, Val: ArgValue, VA, SL);
  if (ConvertedVal == ArgValue)
    return ConvertedVal;

  // The conversion produced a new node; merge it with the load's chain.
  return DAG.getMergeValues(Ops: {ConvertedVal, ArgValue.getValue(R: 1)}, dl: SL);
}
2558
/// Lower a workgroup-id query for one dimension, accounting for clusters.
///
/// Without cluster support this is simply the preloaded workgroup ID. With
/// clusters, the preloaded value is the cluster ID, and the global workgroup
/// ID must be reconstructed from the cluster ID, the cluster size, and the
/// workgroup's position within the cluster.
SDValue SITargetLowering::lowerWorkGroupId(
    SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
    AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
    AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
    AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
  if (!Subtarget->hasClusters())
    return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);

  // Clusters are supported. Return the global position in the grid. If clusters
  // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.

  // WorkGroupIdXYZ = ClusterId == 0 ?
  //     ClusterIdXYZ :
  //     ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
  SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
  SDLoc SL(ClusterIdXYZ);
  SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
  SDValue One = DAG.getConstant(Val: 1, DL: SL, VT);
  // Cluster size = max index + 1.
  SDValue ClusterSizeXYZ = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ClusterMaxIdXYZ, N2: One);
  SDValue ClusterWorkGroupIdXYZ =
      getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
  SDValue GlobalIdXYZ =
      DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ClusterWorkGroupIdXYZ,
                  N2: DAG.getNode(Opcode: ISD::MUL, DL: SL, VT, N1: ClusterIdXYZ, N2: ClusterSizeXYZ));

  switch (MFI.getClusterDims().getKind()) {
  case AMDGPU::ClusterDimsAttr::Kind::FixedDims:
  case AMDGPU::ClusterDimsAttr::Kind::VariableDims:
    // Clusters are known to be in use: the reconstructed ID is correct.
    return GlobalIdXYZ;
  case AMDGPU::ClusterDimsAttr::Kind::NoCluster:
    // No clusters: the preloaded value already is the workgroup ID.
    return ClusterIdXYZ;
  case AMDGPU::ClusterDimsAttr::Kind::Unknown: {
    // Not known at compile time: read the cluster-id field from IB_STS2 at
    // runtime and select between the two forms (zero means no clusters).
    using namespace AMDGPU::Hwreg;
    SDValue ClusterIdField =
        DAG.getTargetConstant(Val: HwregEncoding::encode(Values: ID_IB_STS2, Values: 6, Values: 4), DL: SL, VT);
    SDNode *GetReg =
        DAG.getMachineNode(Opcode: AMDGPU::S_GETREG_B32_const, dl: SL, VT, Op1: ClusterIdField);
    SDValue ClusterId(GetReg, 0);
    SDValue Zero = DAG.getConstant(Val: 0, DL: SL, VT);
    return DAG.getNode(Opcode: ISD::SELECT_CC, DL: SL, VT, N1: ClusterId, N2: Zero, N3: ClusterIdXYZ,
                       N4: GlobalIdXYZ, N5: DAG.getCondCode(Cond: ISD::SETEQ));
  }
  }

  llvm_unreachable("nothing should reach here");
}
2605
/// Get the SDValue for a value preloaded into a register before the function
/// starts (workgroup IDs, dispatch pointer, etc.).
///
/// With architected SGPRs, several IDs live in fixed TTMP registers and are
/// described locally; otherwise the location comes from the function info.
/// Returns a constant when the value is statically known, and poison if the
/// value was not made available to this function.
SDValue SITargetLowering::getPreloadedValue(
    SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
    AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
  const ArgDescriptor *Reg = nullptr;
  const TargetRegisterClass *RC;
  LLT Ty;

  CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
  // Architected SGPR locations: workgroup IDs live in TTMP9 (X) and the two
  // halves of TTMP7 (Y/Z); cluster workgroup IDs and max-IDs are packed as
  // 4-bit fields of TTMP6.
  const ArgDescriptor WorkGroupIDX =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP9);
  // If GridZ is not programmed in an entry function then the hardware will set
  // it to all zeros, so there is no need to mask the GridY value in the low
  // order bits.
  const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
      Reg: AMDGPU::TTMP7,
      Mask: AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
  const ArgDescriptor WorkGroupIDZ =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP7, Mask: 0xFFFF0000u);
  const ArgDescriptor ClusterWorkGroupIDX =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0000000Fu);
  const ArgDescriptor ClusterWorkGroupIDY =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x000000F0u);
  const ArgDescriptor ClusterWorkGroupIDZ =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x00000F00u);
  const ArgDescriptor ClusterWorkGroupMaxIDX =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0000F000u);
  const ArgDescriptor ClusterWorkGroupMaxIDY =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x000F0000u);
  const ArgDescriptor ClusterWorkGroupMaxIDZ =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x00F00000u);
  const ArgDescriptor ClusterWorkGroupMaxFlatID =
      ArgDescriptor::createRegister(Reg: AMDGPU::TTMP6, Mask: 0x0F000000u);

  // Helper to fold a statically-known value into a plain constant.
  auto LoadConstant = [&](unsigned N) {
    return DAG.getConstant(Val: N, DL: SDLoc(), VT);
  };

  if (Subtarget->hasArchitectedSGPRs() &&
      (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
    AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
    bool HasFixedDims = ClusterDims.isFixedDims();

    switch (PVID) {
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
      Reg = &WorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
      Reg = &WorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
      Reg = &WorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    // For cluster IDs with fixed dims of 1, the ID is trivially 0; for
    // fixed-dims max IDs, the answer is the compile-time dimension - 1.
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X:
      if (HasFixedDims && ClusterDims.getDims()[0] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y:
      if (HasFixedDims && ClusterDims.getDims()[1] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z:
      if (HasFixedDims && ClusterDims.getDims()[2] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X:
      if (HasFixedDims)
        return LoadConstant(ClusterDims.getDims()[0] - 1);
      Reg = &ClusterWorkGroupMaxIDX;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y:
      if (HasFixedDims)
        return LoadConstant(ClusterDims.getDims()[1] - 1);
      Reg = &ClusterWorkGroupMaxIDY;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z:
      if (HasFixedDims)
        return LoadConstant(ClusterDims.getDims()[2] - 1);
      Reg = &ClusterWorkGroupMaxIDZ;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    case AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID:
      Reg = &ClusterWorkGroupMaxFlatID;
      RC = &AMDGPU::SReg_32RegClass;
      Ty = LLT::scalar(SizeInBits: 32);
      break;
    default:
      break;
    }
  }

  // Fall back to the location recorded in the function info.
  if (!Reg)
    std::tie(args&: Reg, args&: RC, args&: Ty) = MFI.getPreloadedValue(Value: PVID);
  if (!Reg) {
    if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
      // It's possible for a kernarg intrinsic call to appear in a kernel with
      // no allocated segment, in which case we do not add the user sgpr
      // argument, so just return null.
      return DAG.getConstant(Val: 0, DL: SDLoc(), VT);
    }

    // It's undefined behavior if a function marked with the amdgpu-no-*
    // attributes uses the corresponding intrinsic.
    return DAG.getPOISON(VT);
  }

  return loadInputValue(DAG, RC, VT, SL: SDLoc(DAG.getEntryNode()), Arg: *Reg);
}
2733
/// Filter pixel-shader input arguments into \p Splits, skipping unused PS
/// inputs (recorded in \p Skipped) and tracking which PS input slots are
/// allocated/enabled in \p Info. Non-PS arguments pass through unchanged.
static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
                               CallingConv::ID CallConv,
                               ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
                               FunctionType *FType,
                               SIMachineFunctionInfo *Info) {
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
    const ISD::InputArg *Arg = &Ins[I];

    assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
           "vector type argument should have been split");

    // First check if it's a PS input addr.
    if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
        PSInputNum <= 15) {
      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(Index: PSInputNum);

      // Inconveniently only the first part of the split is marked as isSplit,
      // so skip to the end. We only want to increment PSInputNum once for the
      // entire split argument.
      if (Arg->Flags.isSplit()) {
        while (!Arg->Flags.isSplitEnd()) {
          assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
                 "unexpected vector split in ps argument type");
          if (!SkipArg)
            Splits.push_back(Elt: *Arg);
          Arg = &Ins[++I];
        }
      }

      if (SkipArg) {
        // We can safely skip PS inputs.
        Skipped.set(Arg->getOrigArgIndex());
        ++PSInputNum;
        continue;
      }

      Info->markPSInputAllocated(Index: PSInputNum);
      if (Arg->Used)
        Info->markPSInputEnabled(Index: PSInputNum);

      ++PSInputNum;
    }

    Splits.push_back(Elt: *Arg);
  }
}
2780
// Allocate special inputs passed in VGPRs.
//
// For entry functions, work item IDs arrive in VGPR0..VGPR2, or packed into
// 10-bit fields of VGPR0 when the subtarget has packed TIDs. Records the
// resulting ArgDescriptors in \p Info and marks the registers live-in.
void SITargetLowering::allocateSpecialEntryInputVGPRs(
    CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
    SIMachineFunctionInfo &Info) const {
  const LLT S32 = LLT::scalar(SizeInBits: 32);
  MachineRegisterInfo &MRI = MF.getRegInfo();

  if (Info.hasWorkItemIDX()) {
    Register Reg = AMDGPU::VGPR0;
    MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);

    CCInfo.AllocateReg(Reg);
    // With packed TIDs and a Y component present, X occupies only the low 10
    // bits of VGPR0; otherwise it may use the whole register.
    unsigned Mask =
        (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
    Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
  }

  if (Info.hasWorkItemIDY()) {
    assert(Info.hasWorkItemIDX());
    if (Subtarget->hasPackedTID()) {
      // Packed: Y lives in bits [19:10] of VGPR0.
      Info.setWorkItemIDY(
          ArgDescriptor::createRegister(Reg: AMDGPU::VGPR0, Mask: 0x3ff << 10));
    } else {
      unsigned Reg = AMDGPU::VGPR1;
      MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);

      CCInfo.AllocateReg(Reg);
      Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
    }
  }

  if (Info.hasWorkItemIDZ()) {
    assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
    if (Subtarget->hasPackedTID()) {
      // Packed: Z lives in bits [29:20] of VGPR0.
      Info.setWorkItemIDZ(
          ArgDescriptor::createRegister(Reg: AMDGPU::VGPR0, Mask: 0x3ff << 20));
    } else {
      unsigned Reg = AMDGPU::VGPR2;
      MRI.setType(VReg: MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass), Ty: S32);

      CCInfo.AllocateReg(Reg);
      Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
    }
  }
}
2826
// Try to allocate a VGPR at the end of the argument list, or if no argument
// VGPRs are left allocating a stack slot.
// If \p Mask is given it indicates bitfield position in the register.
// If \p Arg is given, use it with the new \p Mask instead of allocating a new
// register.
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
                                         ArgDescriptor Arg = ArgDescriptor()) {
  // Reuse the previously chosen location, just with a different bitfield.
  if (Arg.isSet())
    return ArgDescriptor::createArg(Arg, Mask);

  ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
  unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgVGPRs);
  if (RegIdx == ArgVGPRs.size()) {
    // Spill to stack required.
    int64_t Offset = CCInfo.AllocateStack(Size: 4, Alignment: Align(4));

    return ArgDescriptor::createStack(Offset, Mask);
  }

  unsigned Reg = ArgVGPRs[RegIdx];
  Reg = CCInfo.AllocateReg(Reg);
  assert(Reg != AMDGPU::NoRegister);

  // Mark the register live-in and give it an s32 type for GlobalISel.
  MachineFunction &MF = CCInfo.getMachineFunction();
  Register LiveInVReg = MF.addLiveIn(PReg: Reg, RC: &AMDGPU::VGPR_32RegClass);
  MF.getRegInfo().setType(VReg: LiveInVReg, Ty: LLT::scalar(SizeInBits: 32));
  return ArgDescriptor::createRegister(Reg, Mask);
}
2854
/// Allocate the next free register of \p RC for an implicit SGPR input,
/// marking it live-in. Fatal error if no register is available.
// NOTE(review): \p NumArgRegs is currently unused — the search window is
// hard-coded to the first 32 registers of \p RC. Presumably intentional (the
// 64-bit caller passes 16 but pairs span 32 SGPRs); confirm before relying on
// the parameter.
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
                                             const TargetRegisterClass *RC,
                                             unsigned NumArgRegs) {
  ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
  unsigned RegIdx = CCInfo.getFirstUnallocated(Regs: ArgSGPRs);
  if (RegIdx == ArgSGPRs.size())
    report_fatal_error(reason: "ran out of SGPRs for arguments");

  unsigned Reg = ArgSGPRs[RegIdx];
  Reg = CCInfo.AllocateReg(Reg);
  assert(Reg != AMDGPU::NoRegister);

  MachineFunction &MF = CCInfo.getMachineFunction();
  MF.addLiveIn(PReg: Reg, RC);
  return ArgDescriptor::createRegister(Reg);
}
2871
// If this has a fixed position, we still should allocate the register in the
// CCInfo state. Technically we could get away with this for values passed
// outside of the normal argument range.
//
// Marks \p Reg allocated and live-in; asserts that the allocation succeeded.
static void allocateFixedSGPRInputImpl(CCState &CCInfo,
                                       const TargetRegisterClass *RC,
                                       MCRegister Reg) {
  Reg = CCInfo.AllocateReg(Reg);
  assert(Reg != AMDGPU::NoRegister);
  MachineFunction &MF = CCInfo.getMachineFunction();
  MF.addLiveIn(PReg: Reg, RC);
}
2883
2884static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2885 if (Arg) {
2886 allocateFixedSGPRInputImpl(CCInfo, RC: &AMDGPU::SGPR_32RegClass,
2887 Reg: Arg.getRegister());
2888 } else
2889 Arg = allocateSGPR32InputImpl(CCInfo, RC: &AMDGPU::SGPR_32RegClass, NumArgRegs: 32);
2890}
2891
2892static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2893 if (Arg) {
2894 allocateFixedSGPRInputImpl(CCInfo, RC: &AMDGPU::SGPR_64RegClass,
2895 Reg: Arg.getRegister());
2896 } else
2897 Arg = allocateSGPR32InputImpl(CCInfo, RC: &AMDGPU::SGPR_64RegClass, NumArgRegs: 16);
2898}
2899
/// Allocate implicit function VGPR arguments at the end of allocated user
/// arguments.
///
/// The three work item IDs share a single VGPR as 10-bit fields when
/// possible: Y and Z reuse the register chosen for X (via \p Arg) with
/// shifted masks.
void SITargetLowering::allocateSpecialInputVGPRs(
    CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
    SIMachineFunctionInfo &Info) const {
  const unsigned Mask = 0x3ff;
  ArgDescriptor Arg;

  if (Info.hasWorkItemIDX()) {
    Arg = allocateVGPR32Input(CCInfo, Mask);
    Info.setWorkItemIDX(Arg);
  }

  if (Info.hasWorkItemIDY()) {
    Arg = allocateVGPR32Input(CCInfo, Mask: Mask << 10, Arg);
    Info.setWorkItemIDY(Arg);
  }

  if (Info.hasWorkItemIDZ())
    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask: Mask << 20, Arg));
}
2921
2922/// Allocate implicit function VGPR arguments in fixed registers.
2923void SITargetLowering::allocateSpecialInputVGPRsFixed(
2924 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2925 SIMachineFunctionInfo &Info) const {
2926 Register Reg = CCInfo.AllocateReg(Reg: AMDGPU::VGPR31);
2927 if (!Reg)
2928 report_fatal_error(reason: "failed to allocate VGPR for implicit arguments");
2929
2930 const unsigned Mask = 0x3ff;
2931 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2932 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask: Mask << 10));
2933 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask: Mask << 20));
2934}
2935
/// Allocate implicit SGPR inputs (dispatch/queue pointers, workgroup IDs,
/// etc.) for a non-entry function, honoring any pre-assigned locations in the
/// argument info.
void SITargetLowering::allocateSpecialInputSGPRs(
    CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
    SIMachineFunctionInfo &Info) const {
  auto &ArgInfo = Info.getArgInfo();
  const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();

  // TODO: Unify handling with private memory pointers.
  if (UserSGPRInfo.hasDispatchPtr())
    allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchPtr);

  if (UserSGPRInfo.hasQueuePtr())
    allocateSGPR64Input(CCInfo, Arg&: ArgInfo.QueuePtr);

  // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
  // constant offset from the kernarg segment.
  if (Info.hasImplicitArgPtr())
    allocateSGPR64Input(CCInfo, Arg&: ArgInfo.ImplicitArgPtr);

  if (UserSGPRInfo.hasDispatchID())
    allocateSGPR64Input(CCInfo, Arg&: ArgInfo.DispatchID);

  // flat_scratch_init is not applicable for non-kernel functions.

  if (Info.hasWorkGroupIDX())
    allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDX);

  if (Info.hasWorkGroupIDY())
    allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDY);

  if (Info.hasWorkGroupIDZ())
    allocateSGPR32Input(CCInfo, Arg&: ArgInfo.WorkGroupIDZ);

  if (Info.hasLDSKernelId())
    allocateSGPR32Input(CCInfo, Arg&: ArgInfo.LDSKernelId);
}
2971
// Allocate special inputs passed in user SGPRs.
//
// Each enabled user SGPR (implicit buffer pointer, private segment buffer,
// dispatch/queue/kernarg/dispatch-id pointers, flat scratch init, private
// segment size) is registered with \p Info, marked live-in, and reserved in
// \p CCInfo. The order of these checks fixes the user SGPR layout.
void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
                                            MachineFunction &MF,
                                            const SIRegisterInfo &TRI,
                                            SIMachineFunctionInfo &Info) const {
  const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
  if (UserSGPRInfo.hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(PReg: ImplicitBufferPtrReg, RC: &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(Reg: ImplicitBufferPtrReg);
  }

  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PReg: PrivateSegmentBufferReg, RC: &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(Reg: PrivateSegmentBufferReg);
  }

  if (UserSGPRInfo.hasDispatchPtr()) {
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(PReg: DispatchPtrReg, RC: &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(Reg: DispatchPtrReg);
  }

  if (UserSGPRInfo.hasQueuePtr()) {
    Register QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(PReg: QueuePtrReg, RC: &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(Reg: QueuePtrReg);
  }

  if (UserSGPRInfo.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    CCInfo.AllocateReg(Reg: InputPtrReg);

    // Give the live-in a constant-address-space pointer type for GlobalISel.
    Register VReg = MF.addLiveIn(PReg: InputPtrReg, RC: &AMDGPU::SGPR_64RegClass);
    MRI.setType(VReg, Ty: LLT::pointer(AddressSpace: AMDGPUAS::CONSTANT_ADDRESS, SizeInBits: 64));
  }

  if (UserSGPRInfo.hasDispatchID()) {
    Register DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(PReg: DispatchIDReg, RC: &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(Reg: DispatchIDReg);
  }

  if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(PReg: FlatScratchInitReg, RC: &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(Reg: FlatScratchInitReg);
  }

  if (UserSGPRInfo.hasPrivateSegmentSize()) {
    Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
    MF.addLiveIn(PReg: PrivateSegmentSizeReg, RC: &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(Reg: PrivateSegmentSizeReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}
3033
// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
// sequential starting from the first argument.
void SITargetLowering::allocatePreloadKernArgSGPRs(
    CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
    const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
    const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
  Function &F = MF.getFunction();
  unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
  GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
  // Preloading must cover a contiguous prefix of the kernarg segment; once an
  // argument cannot be preloaded, no later argument is considered.
  bool InPreloadSequence = true;
  unsigned InIdx = 0;
  // Hidden (implicit) arguments start at an implementation-defined alignment
  // past the explicit arguments; the extra padding is computed lazily when
  // the first hidden argument is encountered.
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())
      break;

    unsigned ArgIdx = Arg.getArgNo();
    // Don't preload non-original args or parts not in the current preload
    // sequence.
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
      break;

    // One IR argument may have been split into several ISD input parts; walk
    // all parts that originate from this argument.
    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
         InIdx++) {
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      const Align KernelArgBaseAlign = Align(16);
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset: ArgOffset);
      // Number of 32-bit SGPRs needed to hold this part, rounded up.
      unsigned NumAllocSGPRs =
          alignTo(Value: ArgLoc.getLocVT().getFixedSizeInBits(), Align: 32) / 32;

      // Fix alignment for hidden arguments.
      if (Arg.hasAttribute(Kind: "amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
          ImplicitArgOffset =
              alignTo(Size: LastExplicitArgOffset,
                      A: Subtarget->getAlignmentForImplicitArgPtr()) -
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        }
        ArgOffset += ImplicitArgOffset;
      }

      // Arg is preloaded into the previous SGPR.
      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Elt: Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
        continue;
      }

      // Padding SGPRs cover the gap between the end of the previous argument
      // and the (aligned) start of this one.
      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Value: Padding, Align: 4) / 4;
      // Check for free user SGPRs for preloading.
      if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
        InPreloadSequence = false;
        break;
      }

      // Preload this argument.
      const TargetRegisterClass *RC =
          TRI.getSGPRClassForBitWidth(BitWidth: NumAllocSGPRs * 32);
      SmallVectorImpl<MCRegister> *PreloadRegs =
          Info.addPreloadedKernArg(TRI, RC, AllocSizeDWord: NumAllocSGPRs, KernArgIdx: InIdx, PaddingSGPRs);

      // When the argument had to be split across multiple SGPRs (no suitably
      // aligned tuple available), each piece lives in an individual SGPR_32.
      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {
        assert(Reg);
        MF.addLiveIn(PReg: Reg, RC);
        CCInfo.AllocateReg(Reg);
      }

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
    }
  }
}
3115
3116void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
3117 const SIRegisterInfo &TRI,
3118 SIMachineFunctionInfo &Info) const {
3119 // Always allocate this last since it is a synthetic preload.
3120 if (Info.hasLDSKernelId()) {
3121 Register Reg = Info.addLDSKernelId();
3122 MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
3123 CCInfo.AllocateReg(Reg);
3124 }
3125}
3126
3127// Allocate special input registers that are initialized per-wave.
// Allocate special input registers that are initialized per-wave.
void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
                                           SIMachineFunctionInfo &Info,
                                           CallingConv::ID CallConv,
                                           bool IsShader) const {
  bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
  if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {
    // Note: user SGPRs are handled by the front-end for graphics shaders
    // Pad up the used user SGPRs with dead inputs.

    // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
    // before enabling architected SGPRs for workgroup IDs.
    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
    // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
    // rely on it to reach 16 since if we end up having no stack usage, it will
    // not really be added.
    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    // Reserve dummy user SGPRs until user + system inputs total 16.
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      Register Reg = Info.addReservedUserSGPR();
      MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
      CCInfo.AllocateReg(Reg);
    }
  }

  // Workgroup IDs only need explicit system-SGPR slots when the hardware does
  // not provide them in architected SGPRs.
  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      Register Reg = Info.addWorkGroupIDX();
      MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
      CCInfo.AllocateReg(Reg);
    }

    if (Info.hasWorkGroupIDY()) {
      Register Reg = Info.addWorkGroupIDY();
      MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
      CCInfo.AllocateReg(Reg);
    }

    if (Info.hasWorkGroupIDZ()) {
      Register Reg = Info.addWorkGroupIDZ();
      MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
      CCInfo.AllocateReg(Reg);
    }
  }

  if (Info.hasWorkGroupInfo()) {
    Register Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(PReg: Reg, RC: &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(Reg);
  }

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    // Scratch wave offset passed in system SGPR.
    unsigned PrivateSegmentWaveByteOffsetReg;

    if (IsShader) {
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      // This is true if the scratch wave byte offset doesn't have a fixed
      // location.
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      }
    } else
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PReg: PrivateSegmentWaveByteOffsetReg, RC: &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(Reg: PrivateSegmentWaveByteOffsetReg);
  }

  // The padding loop above must have brought the preloaded SGPR count to at
  // least 16 on subtargets with the user-SGPR-init bug.
  assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader ||
         Info.getNumPreloadedSGPRs() >= 16);
}
3205
// Decide which physical registers hold the scratch resource descriptor, stack
// pointer, and frame pointer for an entry function, reserving argument
// registers for direct use where possible.
static void reservePrivateMemoryRegs(const TargetMachine &TM,
                                     MachineFunction &MF,
                                     const SIRegisterInfo &TRI,
                                     SIMachineFunctionInfo &Info) {
  // Now that we've figured out where the scratch register inputs are, see if
  // we should reserve the arguments and use them directly.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool HasStackObjects = MFI.hasStackObjects();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  // Record that we know we have non-spill stack objects so we don't need to
  // check all stack objects later.
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

  // Everything live out of a block is spilled with fast regalloc, so it's
  // almost certain that spilling will be required.
  if (TM.getOptLevel() == CodeGenOptLevel::None)
    HasStackObjects = true;

  // For now assume stack access is needed in any callee functions, so we need
  // the scratch registers to pass in.
  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.hasFlatScratchEnabled()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(F: MF.getFunction())) {
      // If we have stack objects, we unquestionably need the private buffer
      // resource. For the Code Object V2 ABI, this will be the first 4 user
      // SGPR inputs. We can reserve those and use them directly.

      Register PrivateSegmentBufferReg =
          Info.getPreloadedReg(Value: AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
    } else {
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      // We tentatively reserve the last registers (skipping the last few
      // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
      // we'll replace these with the ones immediately after those which were
      // really allocated. In the prologue copies will be inserted from the
      // argument to these reserved registers.

      // Without HSA, relocations are used for the scratch pointer and the
      // buffer resource setup is always inserted in the prologue. Scratch wave
      // offset is still in an input SGPR.
      Info.setScratchRSrcReg(ReservedBufferReg);
    }
  }

  MachineRegisterInfo &MRI = MF.getRegInfo();

  // For entry functions we have to set up the stack pointer if we use it,
  // whereas non-entry functions get this "for free". This means there is no
  // intrinsic advantage to using S32 over S34 in cases where we do not have
  // calls but do need a frame pointer (i.e. if we are requested to have one
  // because frame pointer elimination is disabled). To keep things simple we
  // only ever use S32 as the call ABI stack pointer, and so using it does not
  // imply we need a separate frame pointer.
  //
  // Try to use s32 as the SP, but move it if it would interfere with input
  // arguments. This won't work with calls though.
  //
  // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
  // registers.
  if (!MRI.isLiveIn(Reg: AMDGPU::SGPR32)) {
    Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
  } else {
    assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));

    if (MFI.hasCalls())
      report_fatal_error(reason: "call in graphics shader with too many input SGPRs");

    // Pick the first SGPR that is not already used for an input argument.
    for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
      if (!MRI.isLiveIn(Reg)) {
        Info.setStackPtrOffsetReg(Reg);
        break;
      }
    }

    // SP_REG is the "unset" sentinel; if it is still set, the scan failed.
    if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
      report_fatal_error(reason: "failed to find register for SP");
  }

  // hasFP should be accurate for entry functions even before the frame is
  // finalized, because it does not rely on the known stack size, only
  // properties like whether variable sized objects are present.
  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);
  }
}
3295
3296bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
3297 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3298 return !Info->isEntryFunction();
3299}
3300
3301void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {}
3302
// Implement split CSR: instead of saving callee-saved registers in the
// prologue/epilogue, copy each one into a fresh virtual register at function
// entry and copy it back before every return, letting the register allocator
// place the save/restore.
void SITargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  // Null-terminated list of CSRs handled via copies; nothing to do if the
  // target provides none.
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(MF: Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (AMDGPU::SReg_64RegClass.contains(Reg: *I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(Reg: *I))
      RC = &AMDGPU::SGPR_32RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    Register NewVR = MRI->createVirtualRegister(RegClass: RC);
    // Create copy from CSR to a virtual register.
    Entry->addLiveIn(PhysReg: *I);
    BuildMI(BB&: *Entry, I: MBBI, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NewVR)
        .addReg(RegNo: *I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(BB&: *Exit, I: Exit->getFirstTerminator(), MIMD: DebugLoc(),
              MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: *I)
          .addReg(RegNo: NewVR);
  }
}
3337
// Lower the incoming formal arguments of a function: allocate the special
// user/system SGPR and VGPR inputs for entry functions, assign the explicit
// arguments to registers or kernarg-segment memory, and materialize each
// argument value into InVals.
SDValue SITargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  const Function &Fn = MF.getFunction();
  FunctionType *FType = MF.getFunction().getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  bool IsError = false;

  // Graphics calling conventions are not supported on HSA; diagnose but keep
  // lowering (producing poison values) so later errors are still reported.
  if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CC: CallConv)) {
    DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
    IsError = true;
  }

  SmallVector<ISD::InputArg, 16> Splits;
  SmallVector<CCValAssign, 16> ArgLocs;
  BitVector Skipped(Ins.size());
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  bool IsGraphics = AMDGPU::isGraphics(CC: CallConv);
  bool IsKernel = AMDGPU::isKernel(CC: CallConv);
  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC: CallConv);

  // Sanity-check that no compute-only inputs were requested for graphics.
  if (IsGraphics) {
    const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
    assert(!UserSGPRInfo.hasDispatchPtr() &&
           !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
    (void)UserSGPRInfo;
    if (!Subtarget->hasFlatScratchEnabled())
      assert(!UserSGPRInfo.hasFlatScratchInit());
    if ((CallConv != CallingConv::AMDGPU_CS &&
         CallConv != CallingConv::AMDGPU_Gfx &&
         CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
        !Subtarget->hasArchitectedSGPRs())
      assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
             !Info->hasWorkGroupIDZ());
  }

  bool IsWholeWaveFunc = Info->isWholeWaveFunction();

  if (CallConv == CallingConv::AMDGPU_PS) {
    processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);

    // At least one interpolation mode must be enabled or else the GPU will
    // hang.
    //
    // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
    // set PSInputAddr, the user wants to enable some bits after the compilation
    // based on run-time states. Since we can't know what the final PSInputEna
    // will look like, so we shouldn't do anything here and the user should take
    // responsibility for the correct programming.
    //
    // Otherwise, the following restrictions apply:
    // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
    // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
    //   enabled too.
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(Index: 11))) {
      CCInfo.AllocateReg(Reg: AMDGPU::VGPR0);
      CCInfo.AllocateReg(Reg: AMDGPU::VGPR1);
      Info->markPSInputAllocated(Index: 0);
      Info->markPSInputEnabled(Index: 0);
    }
    if (Subtarget->isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together.  (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the
      // frontend set up an input arg for a particular interpolation mode, but
      // nothing uses that input arg. Really we should have an earlier pass
      // that removes such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(Index: llvm::countr_zero(Val: Info->getPSInputAddr()));
    }
  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
  } else {
    // Whole-wave functions carry a synthetic first input (the original EXEC
    // mask) that is produced by WHOLE_WAVE_SETUP below, not by the CC.
    Splits.append(in_start: IsWholeWaveFunc ? std::next(x: Ins.begin()) : Ins.begin(),
                  in_end: Ins.end());
  }

  if (IsKernel)
    analyzeFormalArgumentsCompute(State&: CCInfo, Ins);

  if (IsEntryFunc) {
    allocateSpecialEntryInputVGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
    allocateHSAUserSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
    if (IsKernel && Subtarget->hasKernargPreload())
      allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, TRI: *TRI, Info&: *Info);

    allocateLDSKernelId(CCInfo, MF, TRI: *TRI, Info&: *Info);
  } else if (!IsGraphics) {
    // For the fixed ABI, pass workitem IDs in the last argument register.
    allocateSpecialInputVGPRsFixed(CCInfo, MF, TRI: *TRI, Info&: *Info);

    // FIXME: Sink this into allocateSpecialInputSGPRs
    if (!Subtarget->hasFlatScratchEnabled())
      CCInfo.AllocateReg(Reg: Info->getScratchRSrcReg());

    allocateSpecialInputSGPRs(CCInfo, MF, TRI: *TRI, Info&: *Info);
  }

  if (!IsKernel) {
    CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg: isVarArg);
    CCInfo.AnalyzeFormalArguments(Ins: Splits, Fn: AssignFn);

    // This assumes the registers are allocated by CCInfo in ascending order
    // with no gaps.
    Info->setNumWaveDispatchSGPRs(
        CCInfo.getFirstUnallocated(Regs: AMDGPU::SGPR_32RegClass.getRegisters()));
    Info->setNumWaveDispatchVGPRs(
        CCInfo.getFirstUnallocated(Regs: AMDGPU::VGPR_32RegClass.getRegisters()));
  } else if (Info->getNumKernargPreloadedSGPRs()) {
    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
  }

  SmallVector<SDValue, 16> Chains;

  // Whole-wave functions produce the original EXEC mask as a synthetic first
  // result; chain it in so it executes before argument reads.
  if (IsWholeWaveFunc) {
    SDValue Setup = DAG.getNode(Opcode: AMDGPUISD::WHOLE_WAVE_SETUP, DL,
                                ResultTys: {MVT::i1, MVT::Other}, Ops: Chain);
    InVals.push_back(Elt: Setup.getValue(R: 0));
    Chains.push_back(Elt: Setup.getValue(R: 1));
  }

  // FIXME: This is the minimum kernel argument alignment. We should improve
  // this to the maximum alignment of the arguments.
  //
  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
  // kern arg offset.
  const Align KernelArgBaseAlign = Align(16);

  for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
       ++i) {
    const ISD::InputArg &Arg = Ins[i];
    // Skipped PS inputs (and everything after a fatal diagnostic) become
    // poison values.
    if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
      InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[ArgIdx++];
    MVT VT = VA.getLocVT();

    // Kernel arguments live in the kernarg segment (memory), possibly
    // preloaded into user SGPRs.
    if (IsEntryFunc && VA.isMemLoc()) {
      VT = Ins[i].VT;
      EVT MemVT = VA.getLocVT();

      const uint64_t Offset = VA.getLocMemOffset();
      Align Alignment = commonAlignment(A: KernelArgBaseAlign, Offset);

      // byref arguments are passed as a pointer into the kernarg segment
      // rather than being loaded.
      if (Arg.Flags.isByRef()) {
        SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain, Offset);

        const GCNTargetMachine &TM =
            static_cast<const GCNTargetMachine &>(getTargetMachine());
        if (!TM.isNoopAddrSpaceCast(SrcAS: AMDGPUAS::CONSTANT_ADDRESS,
                                    DestAS: Arg.Flags.getPointerAddrSpace())) {
          Ptr = DAG.getAddrSpaceCast(dl: DL, VT, Ptr, SrcAS: AMDGPUAS::CONSTANT_ADDRESS,
                                     DestAS: Arg.Flags.getPointerAddrSpace());
        }

        InVals.push_back(Elt: Ptr);
        continue;
      }

      SDValue NewArg;
      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(Val: i)) {
        if (MemVT.getStoreSize() < 4 && Alignment < 4) {
          // In this case the argument is packed into the previous preload SGPR.
          int64_t AlignDownOffset = alignDown(Value: Offset, Align: 4);
          int64_t OffsetDiff = Offset - AlignDownOffset;
          EVT IntVT = MemVT.changeTypeToInteger();

          const SIMachineFunctionInfo *Info =
              MF.getInfo<SIMachineFunctionInfo>();
          MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
          Register Reg =
              Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs[0];

          assert(Reg);
          Register VReg = MRI.getLiveInVirtReg(PReg: Reg);
          SDValue Copy = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i32);

          // Shift the packed sub-dword value down to bit 0, then truncate to
          // the in-memory type.
          SDValue ShiftAmt = DAG.getConstant(Val: OffsetDiff * 8, DL, VT: MVT::i32);
          SDValue Extract = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: Copy, N2: ShiftAmt);

          SDValue ArgVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: Extract);
          ArgVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MemVT, Operand: ArgVal);
          NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: ArgVal,
                                  Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);

          NewArg = DAG.getMergeValues(Ops: {NewArg, Copy.getValue(R: 1)}, dl: DL);
        } else {
          const SIMachineFunctionInfo *Info =
              MF.getInfo<SIMachineFunctionInfo>();
          MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
          const SmallVectorImpl<MCRegister> &PreloadRegs =
              Info->getArgInfo().PreloadKernArgs.find(Val: i)->getSecond().Regs;

          SDValue Copy;
          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PReg: PreloadRegs[0]);
            const TargetRegisterClass *RC = MRI.getRegClass(Reg: VReg);
            NewArg = DAG.getCopyFromReg(
                Chain, dl: DL, Reg: VReg,
                VT: EVT::getIntegerVT(Context&: *DAG.getContext(),
                                   BitWidth: TRI->getRegSizeInBits(RC: *RC)));

          } else {
            // If the kernarg alignment does not match the alignment of the SGPR
            // tuple RC that can accommodate this argument, it will be built up
            // via copies from from the individual SGPRs that the argument was
            // preloaded to.
            SmallVector<SDValue, 4> Elts;
            for (auto Reg : PreloadRegs) {
              Register VReg = MRI.getLiveInVirtReg(PReg: Reg);
              Copy = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i32);
              Elts.push_back(Elt: Copy);
            }
            NewArg =
                DAG.getBuildVector(VT: EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
                                                     NumElements: PreloadRegs.size()),
                                   DL, Ops: Elts);
          }

          // If the argument was preloaded to multiple consecutive 32-bit
          // registers because of misalignment between addressable SGPR tuples
          // and the argument size, we can still assume that because of kernarg
          // segment alignment restrictions that NewArg's size is the same as
          // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
          // truncate since we cannot preload to less than a single SGPR and the
          // MemVT may be smaller.
          EVT MemVTInt =
              EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
          if (MemVT.bitsLT(VT: NewArg.getSimpleValueType()))
            NewArg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVTInt, Operand: NewArg);

          NewArg = DAG.getBitcast(VT: MemVT, V: NewArg);
          NewArg = convertArgType(DAG, VT, MemVT, SL: DL, Val: NewArg,
                                  Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
          NewArg = DAG.getMergeValues(Ops: {NewArg, Chain}, dl: DL);
        }
      } else {
        // Hidden arguments that are in the kernel signature must be preloaded
        // to user SGPRs. Print a diagnostic error if a hidden argument is in
        // the argument list and is not preloaded.
        if (Arg.isOrigArg()) {
          Argument *OrigArg = Fn.getArg(i: Arg.getOrigArgIndex());
          if (OrigArg->hasAttribute(Kind: "amdgpu-hidden-argument")) {
            DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
                *OrigArg->getParent(),
                "hidden argument in kernel signature was not preloaded",
                DL.getDebugLoc()));
          }
        }

        // Ordinary (non-preloaded) kernarg: load it from the kernarg segment.
        NewArg =
            lowerKernargMemParameter(DAG, VT, MemVT, SL: DL, Chain, Offset,
                                     Alignment, Signed: Ins[i].Flags.isSExt(), Arg: &Ins[i]);
      }
      Chains.push_back(Elt: NewArg.getValue(R: 1));

      auto *ParamTy =
          dyn_cast<PointerType>(Val: FType->getParamType(i: Ins[i].getOrigArgIndex()));
      if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
          ParamTy &&
          (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
           ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
        // On SI local pointers are just offsets into LDS, so they are always
        // less than 16-bits.  On CI and newer they could potentially be
        // real pointers, so we can't guarantee their size.
        NewArg = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: NewArg.getValueType(), N1: NewArg,
                             N2: DAG.getValueType(MVT::i16));
      }

      InVals.push_back(Elt: NewArg);
      continue;
    }
    // Non-entry functions take stack-passed arguments from the call frame.
    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, SL: DL, Chain, Arg);
      InVals.push_back(Elt: Val);
      if (!Arg.Flags.isByVal())
        Chains.push_back(Elt: Val.getValue(R: 1));
      continue;
    }

    assert(VA.isRegLoc() && "Parameter must be in a register!");

    Register Reg = VA.getLocReg();
    const TargetRegisterClass *RC = nullptr;
    if (AMDGPU::VGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::VGPR_32RegClass;
    else if (AMDGPU::SGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::SGPR_32RegClass;
    else
      llvm_unreachable("Unexpected register class in LowerFormalArguments!");

    Reg = MF.addLiveIn(PReg: Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT);
    // inreg arguments arriving in VGPRs are made uniform with readfirstlane.
    if (Arg.Flags.isInReg() && RC == &AMDGPU::VGPR_32RegClass) {
      // FIXME: Need to forward the chains created by `CopyFromReg`s, make sure
      // they will read physical regs before any side effect instructions.
      SDValue ReadFirstLane =
          DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
      Val = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Val.getValueType(),
                        N1: ReadFirstLane, N2: Val);
    }

    if (Arg.Flags.isSRet()) {
      // The return object should be reasonably addressable.

      // FIXME: This helps when the return is a real sret. If it is a
      // automatically inserted sret (i.e. CanLowerReturn returns false), an
      // extra copy is inserted in SelectionDAGBuilder which obscures this.
      unsigned NumBits =
          32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
      Val = DAG.getNode(
          Opcode: ISD::AssertZext, DL, VT, N1: Val,
          N2: DAG.getValueType(EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: NumBits)));
    }

    Val = convertABITypeToValueType(DAG, Val, VA, SL: DL);
    InVals.push_back(Elt: Val);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc)
    allocateSystemSGPRs(CCInfo, MF, Info&: *Info, CallConv, IsShader: IsGraphics);

  unsigned StackArgSize = CCInfo.getStackSize();
  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
                        : DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: Chains);
}
3683
3684// TODO: If return values can't fit in registers, we should return as many as
3685// possible in registers before passing on stack.
3686bool SITargetLowering::CanLowerReturn(
3687 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3688 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3689 const Type *RetTy) const {
3690 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3691 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3692 // for shaders. Vector types should be explicitly handled by CC.
3693 if (AMDGPU::isEntryFunctionCC(CC: CallConv))
3694 return true;
3695
3696 SmallVector<CCValAssign, 16> RVLocs;
3697 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3698 if (!CCInfo.CheckReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg)))
3699 return false;
3700
3701 // We must use the stack if return would require unavailable registers.
3702 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3703 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3704 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3705 if (CCInfo.isAllocated(Reg: AMDGPU::VGPR_32RegClass.getRegister(i)))
3706 return false;
3707
3708 return true;
3709}
3710
// Lower an IR return: copy return values into the ABI return registers
// (uniformizing SGPR returns with readfirstlane) and emit the appropriate
// terminator node for the calling convention.
SDValue
SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                              bool isVarArg,
                              const SmallVectorImpl<ISD::OutputArg> &Outs,
                              const SmallVectorImpl<SDValue> &OutVals,
                              const SDLoc &DL, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

  // Kernels use the generic AMDGPU lowering (they do not return values).
  if (AMDGPU::isKernel(CC: CallConv)) {
    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                             OutVals, DL, DAG);
  }

  bool IsShader = AMDGPU::isShader(CC: CallConv);

  Info->setIfReturnsVoid(Outs.empty());
  // A void-returning shader terminates the wave with ENDPGM.
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 48> RVLocs;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  CCInfo.AnalyzeReturn(Outs, Fn: CCAssignFnForReturn(CC: CallConv, IsVarArg: isVarArg));

  SDValue Glue;
  SmallVector<SDValue, 48> RetOps;
  RetOps.push_back(Elt: Chain); // Operand #0 = Chain (updated below)

  SDValue ReadFirstLane =
      DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
  // Copy the result values into the output registers.
  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");
    // TODO: Partially return in registers if return values don't fit.
    SDValue Arg = OutVals[RealRVLocIdx];

    // Copied from other backends.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }
    // Values returned in SGPRs must be wave-uniform; force uniformity with
    // readfirstlane.
    if (TRI->isSGPRPhysReg(Reg: VA.getLocReg()))
      Arg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Arg.getValueType(),
                        N1: ReadFirstLane, N2: Arg);
    Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: VA.getLocReg(), N: Arg, Glue);
    Glue = Chain.getValue(R: 1);
    RetOps.push_back(Elt: DAG.getRegister(Reg: VA.getLocReg(), VT: VA.getLocVT()));
  }

  // FIXME: Does sret work properly?
  // Callee-saved registers handled via split CSR are added as extra return
  // operands so they are kept live to the return.
  if (!Info->isEntryFunction()) {
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    const MCPhysReg *I =
        TRI->getCalleeSavedRegsViaCopy(MF: &DAG.getMachineFunction());
    if (I) {
      for (; *I; ++I) {
        if (AMDGPU::SReg_64RegClass.contains(Reg: *I))
          RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i64));
        else if (AMDGPU::SReg_32RegClass.contains(Reg: *I))
          RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i32));
        else
          llvm_unreachable("Unexpected register class in CSRsViaCopy!");
      }
    }
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Glue.getNode())
    RetOps.push_back(Elt: Glue);

  unsigned Opc = AMDGPUISD::ENDPGM;
  if (!IsWaveEnd)
    Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
          : IsShader               ? AMDGPUISD::RETURN_TO_EPILOG
                                   : AMDGPUISD::RET_GLUE;
  return DAG.getNode(Opcode: Opc, DL, VT: MVT::Other, Ops: RetOps);
}
3811
/// Lower the return values of a call: copy each returned value out of the
/// physical register(s) assigned by the return calling convention and append
/// the resulting SDValues to \p InVals.
///
/// \p Chain / \p InGlue are the chain and glue coming out of the call node;
/// the returned SDValue is the updated chain. \p IsThisReturn and \p ThisVal
/// are accepted for interface parity with other targets but are unused here.
SDValue SITargetLowering::LowerCallResult(
    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
    SDValue ThisVal) const {
  CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv, IsVarArg);

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, Fn: RetCC);

  // Copy all of the result registers out of their specified physreg.
  // Note the chain and glue are re-threaded through each CopyFromReg so the
  // copies stay ordered immediately after the call.
  for (CCValAssign VA : RVLocs) {
    SDValue Val;

    if (VA.isRegLoc()) {
      Val =
          DAG.getCopyFromReg(Chain, dl: DL, Reg: VA.getLocReg(), VT: VA.getLocVT(), Glue: InGlue);
      Chain = Val.getValue(R: 1);
      InGlue = Val.getValue(R: 2);
    } else if (VA.isMemLoc()) {
      report_fatal_error(reason: "TODO: return values in memory");
    } else
      llvm_unreachable("unknown argument location type");

    // Convert the loc VT value back into the val VT the IR expects. For
    // sign/zero extension the callee guarantees the high bits, so assert
    // them for the optimizer before truncating.
    switch (VA.getLocInfo()) {
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: VA.getLocVT(), N1: Val,
                        N2: DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(Opcode: ISD::AssertSext, DL, VT: VA.getLocVT(), N1: Val,
                        N2: DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
      break;
    case CCValAssign::AExt:
      // Any-extended: high bits are unspecified, just truncate.
      Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Val);
      break;
    default:
      llvm_unreachable("Unknown loc info!");
    }

    InVals.push_back(Elt: Val);
  }

  return Chain;
}
3867
// Add code to pass special inputs required depending on used features separate
// from the explicit user arguments present in the IR.
//
// These are the implicit ABI inputs (dispatch pointer, queue pointer,
// workgroup/workitem IDs, ...) that the fixed function ABI reserves
// registers or stack slots for. For each input the callee may need, the
// value is forwarded from the caller's corresponding incoming argument (or
// synthesized/poisoned when the caller has none), and the outgoing
// register/stack slot is marked allocated in \p CCInfo so user arguments
// are not assigned to it.
void SITargetLowering::passSpecialInputs(
    CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
    SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
  // If we don't have a call site, this was a call inserted by
  // legalization. These can never use special inputs.
  if (!CLI.CB)
    return;

  SelectionDAG &DAG = CLI.DAG;
  const SDLoc &DL = CLI.DL;
  const Function &F = DAG.getMachineFunction().getFunction();

  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();

  const AMDGPUFunctionArgInfo &CalleeArgInfo =
      AMDGPUFunctionArgInfo::FixedABIFunctionInfo;

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
  // Table of implicit inputs and the "amdgpu-no-*" attributes that prove the
  // callee does not need them (an empty second attribute means there is only
  // one relevant attribute for that input).
  // clang-format off
  static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
                             std::array<StringLiteral, 2>> ImplicitAttrs[] = {
    {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
    {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
    {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
    {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
    {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
    {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
    {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
    {AMDGPU::FunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
  };
  // clang-format on

  for (auto [InputID, Attrs] : ImplicitAttrs) {
    // If the callee does not use the attribute value, skip copying the value.
    if (all_of(Range&: Attrs, P: [&](StringRef Attr) {
          return Attr.empty() || CLI.CB->hasFnAttr(Kind: Attr);
        }))
      continue;

    const auto [OutgoingArg, ArgRC, ArgTy] =
        CalleeArgInfo.getPreloadedValue(Value: InputID);
    if (!OutgoingArg)
      continue;

    const auto [IncomingArg, IncomingArgRC, Ty] =
        CallerArgInfo.getPreloadedValue(Value: InputID);
    assert(IncomingArgRC == ArgRC);

    // All special arguments are ints for now.
    EVT ArgVT = TRI->getSpillSize(RC: *ArgRC) == 8 ? MVT::i64 : MVT::i32;
    SDValue InputReg;

    if (IncomingArg) {
      InputReg = loadInputValue(DAG, RC: ArgRC, VT: ArgVT, SL: DL, Arg: *IncomingArg);
    } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
      // The implicit arg ptr is special because it doesn't have a corresponding
      // input for kernels, and is computed from the kernarg segment pointer.
      InputReg = getImplicitArgPtr(DAG, SL: DL);
    } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
      std::optional<uint32_t> Id =
          AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
      if (Id.has_value()) {
        InputReg = DAG.getConstant(Val: *Id, DL, VT: ArgVT);
      } else {
        InputReg = DAG.getPOISON(VT: ArgVT);
      }
    } else {
      // We may have proven the input wasn't needed, although the ABI is
      // requiring it. We just need to allocate the register appropriately.
      InputReg = DAG.getPOISON(VT: ArgVT);
    }

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);
      if (!CCInfo.AllocateReg(Reg: OutgoingArg->getRegister()))
        report_fatal_error(reason: "failed to allocate implicit input argument");
    } else {
      unsigned SpecialArgOffset =
          CCInfo.AllocateStack(Size: ArgVT.getStoreSize(), Alignment: Align(4));
      SDValue ArgStore =
          storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg, Offset: SpecialArgOffset);
      MemOpChains.push_back(Elt: ArgStore);
    }
  }

  // Pack workitem IDs into a single register or pass it as is if already
  // packed. The packed layout is X in bits [9:0], Y in [19:10], Z in [29:20].

  auto [OutgoingArg, ArgRC, Ty] =
      CalleeArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (!OutgoingArg)
    std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
        CalleeArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  if (!OutgoingArg)
    std::tie(args&: OutgoingArg, args&: ArgRC, args&: Ty) =
        CalleeArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  if (!OutgoingArg)
    return;

  const ArgDescriptor *IncomingArgX = std::get<0>(
      t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_X));
  const ArgDescriptor *IncomingArgY = std::get<0>(
      t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
  const ArgDescriptor *IncomingArgZ = std::get<0>(
      t: CallerArgInfo.getPreloadedValue(Value: AMDGPUFunctionArgInfo::WORKITEM_ID_Z));

  SDValue InputReg;
  SDLoc SL;

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr(Kind: "amdgpu-no-workitem-id-z");

  // If incoming ids are not packed we need to pack them.
  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX &&
      NeedWorkItemIDX) {
    if (Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 0) != 0) {
      InputReg = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgX);
    } else {
      // X dimension is known to be a single workitem; its ID is always 0.
      InputReg = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
    }
  }

  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY &&
      NeedWorkItemIDY && Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 1) != 0) {
    SDValue Y = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgY);
    Y = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Y,
                    N2: DAG.getShiftAmountConstant(Val: 10, VT: MVT::i32, DL: SL));
    InputReg = InputReg.getNode()
                   ? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: InputReg, N2: Y)
                   : Y;
  }

  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ &&
      NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(Kernel: F, Dimension: 2) != 0) {
    SDValue Z = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: *IncomingArgZ);
    Z = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Z,
                    N2: DAG.getShiftAmountConstant(Val: 20, VT: MVT::i32, DL: SL));
    InputReg = InputReg.getNode()
                   ? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: InputReg, N2: Z)
                   : Z;
  }

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
      // We're in a situation where the outgoing function requires the workitem
      // ID, but the calling function does not have it (e.g a graphics function
      // calling a C calling convention function). This is illegal, but we need
      // to produce something.
      InputReg = DAG.getPOISON(VT: MVT::i32);
    } else {
      // Workitem ids are already packed, any of present incoming arguments
      // will carry all required fields.
      ArgDescriptor IncomingArg =
          ArgDescriptor::createArg(Arg: IncomingArgX ? *IncomingArgX
                                   : IncomingArgY ? *IncomingArgY
                                                  : *IncomingArgZ,
                                   Mask: ~0u);
      InputReg = loadInputValue(DAG, RC: ArgRC, VT: MVT::i32, SL: DL, Arg: IncomingArg);
    }
  }

  if (OutgoingArg->isRegister()) {
    if (InputReg)
      RegsToPass.emplace_back(Args: OutgoingArg->getRegister(), Args&: InputReg);

    // Reserve the register even if nothing is copied into it.
    CCInfo.AllocateReg(Reg: OutgoingArg->getRegister());
  } else {
    unsigned SpecialArgOffset = CCInfo.AllocateStack(Size: 4, Alignment: Align(4));
    if (InputReg) {
      SDValue ArgStore =
          storeStackInputValue(DAG, SL: DL, Chain, ArgVal: InputReg, Offset: SpecialArgOffset);
      MemOpChains.push_back(Elt: ArgStore);
    }
  }
}
4050
/// Decide whether this call may be lowered as a tail call.
///
/// Chain calls (llvm.amdgcn.cs.chain) are always tail calls by construction.
/// Otherwise a sequence of checks rejects cases where a plain jump would be
/// unsound: divergent call targets, entry-function callers, varargs, byval
/// caller arguments, mismatched result conventions, callee-clobbered
/// caller-saved registers, oversized outgoing stack arguments, and divergent
/// values bound to SGPR argument registers.
bool SITargetLowering::isEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  if (AMDGPU::isChainCC(CC: CalleeCC))
    return true;

  if (!AMDGPU::mayTailCallThisCC(CC: CalleeCC))
    return false;

  // For a divergent call target, we need to do a waterfall loop over the
  // possible callees which precludes us from using a simple jump.
  if (Callee->isDivergent())
    return false;

  MachineFunction &MF = DAG.getMachineFunction();
  const Function &CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF.getCallingConv();
  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  // Kernels aren't callable, and don't have a live in return address so it
  // doesn't make sense to do a tail call with entry functions.
  if (!CallerPreserved)
    return false;

  bool CCMatch = CallerCC == CalleeCC;

  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    if (AMDGPU::canGuaranteeTCO(CC: CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // TODO: Can we handle var args?
  if (IsVarArg)
    return false;

  // Byval caller arguments live in the caller's stack argument area, which
  // the callee would reuse.
  for (const Argument &Arg : CallerF.args()) {
    if (Arg.hasByValAttr())
      return false;
  }

  LLVMContext &Ctx = *DAG.getContext();

  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C&: Ctx, Ins,
                                  CalleeFn: CCAssignFnForCall(CC: CalleeCC, IsVarArg),
                                  CallerFn: CCAssignFnForCall(CC: CallerCC, IsVarArg)))
    return false;

  // The callee has to preserve all registers the caller needs to preserve.
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(mask0: CallerPreserved, mask1: CalleePreserved))
      return false;
  }

  // Nothing more to check if the callee is taking no arguments.
  if (Outs.empty())
    return true;

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  // FIXME: We are not allocating special input registers, so we will be
  // deciding based on incorrect register assignments.
  CCInfo.AnalyzeCallOperands(Outs, Fn: CCAssignFnForCall(CC: CalleeCC, IsVarArg));

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  // If the stack arguments for this call do not fit into our own save area then
  // the call cannot be made tail.
  // TODO: Is this really necessary?
  if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
    return false;

  for (const auto &[CCVA, ArgVal] : zip_equal(t&: ArgLocs, u: OutVals)) {
    // FIXME: What about inreg arguments that end up passed in memory?
    if (!CCVA.isRegLoc())
      continue;

    // If we are passing an argument in an SGPR, and the value is divergent,
    // this call requires a waterfall loop.
    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(Reg: CCVA.getLocReg())) {
      LLVM_DEBUG(
          dbgs() << "Cannot tail call due to divergent outgoing argument in "
                 << printReg(CCVA.getLocReg(), TRI) << '\n');
      return false;
    }
  }

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  return parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals);
}
4146
4147bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
4148 if (!CI->isTailCall())
4149 return false;
4150
4151 const Function *ParentFn = CI->getFunction();
4152 if (AMDGPU::isEntryFunctionCC(CC: ParentFn->getCallingConv()))
4153 return false;
4154 return true;
4155}
4156
namespace {
// Chain calls have special arguments that we need to handle. These are
// tagging along at the end of the arguments list(s), after the SGPR and VGPR
// arguments (index 0 and 1 respectively).
//
// Indices into CLI.Args for an amdgcn.cs.chain call site.
enum ChainCallArgIdx {
  Exec = 2,       // value to install in EXEC before the jump
  Flags,          // flag word; bit 0 selects dynamic-VGPR mode
  NumVGPRs,       // (dynamic-VGPR mode only) VGPR count to allocate
  FallbackExec,   // (dynamic-VGPR mode only) EXEC if allocation fails
  FallbackCallee  // (dynamic-VGPR mode only) callee if allocation fails
};
} // anonymous namespace
4169
4170// The wave scratch offset register is used as the global base pointer.
4171SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
4172 SmallVectorImpl<SDValue> &InVals) const {
4173 CallingConv::ID CallConv = CLI.CallConv;
4174 bool IsChainCallConv = AMDGPU::isChainCC(CC: CallConv);
4175
4176 SelectionDAG &DAG = CLI.DAG;
4177
4178 const SDLoc &DL = CLI.DL;
4179 SDValue Chain = CLI.Chain;
4180 SDValue Callee = CLI.Callee;
4181
4182 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4183 bool UsesDynamicVGPRs = false;
4184 if (IsChainCallConv) {
4185 // The last arguments should be the value that we need to put in EXEC,
4186 // followed by the flags and any other arguments with special meanings.
4187 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4188 // we don't treat them like the "real" arguments.
4189 auto RequestedExecIt =
4190 llvm::find_if(Range&: CLI.Outs, P: [](const ISD::OutputArg &Arg) {
4191 return Arg.OrigArgIndex == 2;
4192 });
4193 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4194
4195 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4196 CLI.OutVals.erase(CS: CLI.OutVals.begin() + SpecialArgsBeginIdx,
4197 CE: CLI.OutVals.end());
4198 CLI.Outs.erase(CS: RequestedExecIt, CE: CLI.Outs.end());
4199
4200 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4201 "Haven't popped all the special args");
4202
4203 TargetLowering::ArgListEntry RequestedExecArg =
4204 CLI.Args[ChainCallArgIdx::Exec];
4205 if (!RequestedExecArg.Ty->isIntegerTy(Bitwidth: Subtarget->getWavefrontSize()))
4206 return lowerUnhandledCall(CLI, InVals, Reason: "Invalid value for EXEC");
4207
4208 // Convert constants into TargetConstants, so they become immediate operands
4209 // instead of being selected into S_MOV.
4210 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4211 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Val&: Arg.Node)) {
4212 ChainCallSpecialArgs.push_back(Elt: DAG.getTargetConstant(
4213 Val: ArgNode->getAPIntValue(), DL, VT: ArgNode->getValueType(ResNo: 0)));
4214 } else
4215 ChainCallSpecialArgs.push_back(Elt: Arg.Node);
4216 };
4217
4218 PushNodeOrTargetConstant(RequestedExecArg);
4219
4220 // Process any other special arguments depending on the value of the flags.
4221 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4222
4223 const APInt &FlagsValue = cast<ConstantSDNode>(Val&: Flags.Node)->getAPIntValue();
4224 if (FlagsValue.isZero()) {
4225 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4226 return lowerUnhandledCall(CLI, InVals,
4227 Reason: "no additional args allowed if flags == 0");
4228 } else if (FlagsValue.isOneBitSet(BitNo: 0)) {
4229 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4230 return lowerUnhandledCall(CLI, InVals, Reason: "expected 3 additional args");
4231 }
4232
4233 if (!Subtarget->isWave32()) {
4234 return lowerUnhandledCall(
4235 CLI, InVals, Reason: "dynamic VGPR mode is only supported for wave32");
4236 }
4237
4238 UsesDynamicVGPRs = true;
4239 std::for_each(first: CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4240 last: CLI.Args.end(), f: PushNodeOrTargetConstant);
4241 }
4242 }
4243
4244 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
4245 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4246 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
4247 bool &IsTailCall = CLI.IsTailCall;
4248 bool IsVarArg = CLI.IsVarArg;
4249 bool IsSibCall = false;
4250 MachineFunction &MF = DAG.getMachineFunction();
4251
4252 if (Callee.isUndef() || isNullConstant(V: Callee)) {
4253 if (!CLI.IsTailCall) {
4254 for (ISD::InputArg &Arg : CLI.Ins)
4255 InVals.push_back(Elt: DAG.getPOISON(VT: Arg.VT));
4256 }
4257
4258 return Chain;
4259 }
4260
4261 if (IsVarArg) {
4262 return lowerUnhandledCall(CLI, InVals,
4263 Reason: "unsupported call to variadic function ");
4264 }
4265
4266 if (!CLI.CB)
4267 return lowerUnhandledCall(CLI, InVals, Reason: "unsupported libcall legalization");
4268
4269 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4270 return lowerUnhandledCall(CLI, InVals,
4271 Reason: "unsupported required tail call to function ");
4272 }
4273
4274 if (IsTailCall) {
4275 IsTailCall = isEligibleForTailCallOptimization(Callee, CalleeCC: CallConv, IsVarArg,
4276 Outs, OutVals, Ins, DAG);
4277 if (!IsTailCall &&
4278 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4279 report_fatal_error(reason: "failed to perform tail call elimination on a call "
4280 "site marked musttail or on llvm.amdgcn.cs.chain");
4281 }
4282
4283 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4284
4285 // A sibling call is one where we're under the usual C ABI and not planning
4286 // to change that but can still do a tail call:
4287 if (!TailCallOpt && IsTailCall)
4288 IsSibCall = true;
4289
4290 if (IsTailCall)
4291 ++NumTailCalls;
4292 }
4293
4294 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4295 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
4296 SmallVector<SDValue, 8> MemOpChains;
4297
4298 // Analyze operands of the call, assigning locations to each operand.
4299 SmallVector<CCValAssign, 16> ArgLocs;
4300 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4301 CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg);
4302
4303 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CC: CallConv) &&
4304 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
4305 // With a fixed ABI, allocate fixed registers before user arguments.
4306 passSpecialInputs(CLI, CCInfo, Info: *Info, RegsToPass, MemOpChains, Chain);
4307 }
4308
4309 // Mark the scratch resource descriptor as allocated so the CC analysis
4310 // does not assign user arguments to these registers, matching the callee.
4311 if (!Subtarget->hasFlatScratchEnabled())
4312 CCInfo.AllocateReg(Reg: Info->getScratchRSrcReg());
4313
4314 CCInfo.AnalyzeCallOperands(Outs, Fn: AssignFn);
4315
4316 // Get a count of how many bytes are to be pushed on the stack.
4317 unsigned NumBytes = CCInfo.getStackSize();
4318
4319 if (IsSibCall) {
4320 // Since we're not changing the ABI to make this a tail call, the memory
4321 // operands are already available in the caller's incoming argument space.
4322 NumBytes = 0;
4323 }
4324
4325 // FPDiff is the byte offset of the call's argument area from the callee's.
4326 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4327 // by this amount for a tail call. In a sibling call it must be 0 because the
4328 // caller will deallocate the entire stack and the callee still expects its
4329 // arguments to begin at SP+0. Completely unused for non-tail calls.
4330 int32_t FPDiff = 0;
4331 MachineFrameInfo &MFI = MF.getFrameInfo();
4332 auto *TRI = Subtarget->getRegisterInfo();
4333
4334 // Adjust the stack pointer for the new arguments...
4335 // These operations are automatically eliminated by the prolog/epilog pass
4336 if (!IsSibCall)
4337 Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL);
4338
4339 if (!IsSibCall || IsChainCallConv) {
4340 if (!Subtarget->hasFlatScratchEnabled()) {
4341 SmallVector<SDValue, 4> CopyFromChains;
4342
4343 // In the HSA case, this should be an identity copy.
4344 SDValue ScratchRSrcReg =
4345 DAG.getCopyFromReg(Chain, dl: DL, Reg: Info->getScratchRSrcReg(), VT: MVT::v4i32);
4346 RegsToPass.emplace_back(Args: IsChainCallConv
4347 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4348 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4349 Args&: ScratchRSrcReg);
4350 CopyFromChains.push_back(Elt: ScratchRSrcReg.getValue(R: 1));
4351 Chain = DAG.getTokenFactor(DL, Vals&: CopyFromChains);
4352 }
4353 }
4354
4355 const unsigned NumSpecialInputs = RegsToPass.size();
4356
4357 MVT PtrVT = MVT::i32;
4358
4359 // Walk the register/memloc assignments, inserting copies/loads.
4360 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4361 CCValAssign &VA = ArgLocs[i];
4362 SDValue Arg = OutVals[i];
4363
4364 // Promote the value if needed.
4365 switch (VA.getLocInfo()) {
4366 case CCValAssign::Full:
4367 break;
4368 case CCValAssign::BCvt:
4369 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
4370 break;
4371 case CCValAssign::ZExt:
4372 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
4373 break;
4374 case CCValAssign::SExt:
4375 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
4376 break;
4377 case CCValAssign::AExt:
4378 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
4379 break;
4380 case CCValAssign::FPExt:
4381 Arg = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
4382 break;
4383 default:
4384 llvm_unreachable("Unknown loc info!");
4385 }
4386
4387 if (VA.isRegLoc()) {
4388 RegsToPass.push_back(Elt: std::pair(VA.getLocReg(), Arg));
4389 } else {
4390 assert(VA.isMemLoc());
4391
4392 SDValue DstAddr;
4393 MachinePointerInfo DstInfo;
4394
4395 unsigned LocMemOffset = VA.getLocMemOffset();
4396 int32_t Offset = LocMemOffset;
4397
4398 SDValue PtrOff = DAG.getConstant(Val: Offset, DL, VT: PtrVT);
4399 MaybeAlign Alignment;
4400
4401 if (IsTailCall) {
4402 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4403 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4404 : VA.getValVT().getStoreSize();
4405
4406 // FIXME: We can have better than the minimum byval required alignment.
4407 Alignment =
4408 Flags.isByVal()
4409 ? Flags.getNonZeroByValAlign()
4410 : commonAlignment(A: Subtarget->getStackAlignment(), Offset);
4411
4412 Offset = Offset + FPDiff;
4413 int FI = MFI.CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true);
4414
4415 DstAddr = DAG.getFrameIndex(FI, VT: PtrVT);
4416 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4417
4418 // Make sure any stack arguments overlapping with where we're storing
4419 // are loaded before this eventual operation. Otherwise they'll be
4420 // clobbered.
4421
4422 // FIXME: Why is this really necessary? This seems to just result in a
4423 // lot of code to copy the stack and write them back to the same
4424 // locations, which are supposed to be immutable?
4425 Chain = addTokenForArgument(Chain, DAG, MFI, ClobberedFI: FI);
4426 } else {
4427 // Stores to the argument stack area are relative to the stack pointer.
4428 SDValue SP = DAG.getCopyFromReg(Chain, dl: DL, Reg: Info->getStackPtrOffsetReg(),
4429 VT: MVT::i32);
4430 DstAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SP, N2: PtrOff);
4431 DstInfo = MachinePointerInfo::getStack(MF, Offset: LocMemOffset);
4432 Alignment =
4433 commonAlignment(A: Subtarget->getStackAlignment(), Offset: LocMemOffset);
4434 }
4435
4436 if (Outs[i].Flags.isByVal()) {
4437 SDValue SizeNode =
4438 DAG.getConstant(Val: Outs[i].Flags.getByValSize(), DL, VT: MVT::i32);
4439 SDValue Cpy =
4440 DAG.getMemcpy(Chain, dl: DL, Dst: DstAddr, Src: Arg, Size: SizeNode,
4441 Alignment: Outs[i].Flags.getNonZeroByValAlign(),
4442 /*isVol = */ false, /*AlwaysInline = */ true,
4443 /*CI=*/nullptr, OverrideTailCall: std::nullopt, DstPtrInfo: DstInfo,
4444 SrcPtrInfo: MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
4445
4446 MemOpChains.push_back(Elt: Cpy);
4447 } else {
4448 SDValue Store =
4449 DAG.getStore(Chain, dl: DL, Val: Arg, Ptr: DstAddr, PtrInfo: DstInfo, Alignment);
4450 MemOpChains.push_back(Elt: Store);
4451 }
4452 }
4453 }
4454
4455 if (!MemOpChains.empty())
4456 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOpChains);
4457
4458 SDValue ReadFirstLaneID =
4459 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
4460
4461 SDValue TokenGlue;
4462 if (CLI.ConvergenceControlToken) {
4463 TokenGlue = DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL, VT: MVT::Glue,
4464 Operand: CLI.ConvergenceControlToken);
4465 }
4466
4467 // Build a sequence of copy-to-reg nodes chained together with token chain
4468 // and flag operands which copy the outgoing args into the appropriate regs.
4469 SDValue InGlue;
4470
4471 unsigned ArgIdx = 0;
4472 for (auto [Reg, Val] : RegsToPass) {
4473 if (ArgIdx++ >= NumSpecialInputs &&
4474 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4475 // For chain calls, the inreg arguments are required to be
4476 // uniform. Speculatively Insert a readfirstlane in case we cannot prove
4477 // they are uniform.
4478 //
4479 // For other calls, if an inreg arguments is known to be uniform,
4480 // speculatively insert a readfirstlane in case it is in a VGPR.
4481 //
4482 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4483 // value, so let that continue to produce invalid code.
4484
4485 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4486 if (TokenGlue)
4487 ReadfirstlaneArgs.push_back(Elt: TokenGlue);
4488 Val = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Val.getValueType(),
4489 Ops: ReadfirstlaneArgs);
4490 }
4491
4492 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg, N: Val, Glue: InGlue);
4493 InGlue = Chain.getValue(R: 1);
4494 }
4495
4496 // We don't usually want to end the call-sequence here because we would tidy
4497 // the frame up *after* the call, however in the ABI-changing tail-call case
4498 // we've carefully laid out the parameters so that when sp is reset they'll be
4499 // in the correct location.
4500 if (IsTailCall && !IsSibCall) {
4501 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: 0, Glue: InGlue, DL);
4502 InGlue = Chain.getValue(R: 1);
4503 }
4504
4505 std::vector<SDValue> Ops({Chain});
4506
4507 // Add a redundant copy of the callee global which will not be legalized, as
4508 // we need direct access to the callee later.
4509 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
4510 const GlobalValue *GV = GSD->getGlobal();
4511 Ops.push_back(x: Callee);
4512 Ops.push_back(x: DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i64));
4513 } else {
4514 if (IsTailCall) {
4515 // isEligibleForTailCallOptimization considered whether the call target is
4516 // divergent, but we may still end up with a uniform value in a VGPR.
4517 // Insert a readfirstlane just in case.
4518 SDValue ReadFirstLaneID =
4519 DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
4520
4521 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4522 if (TokenGlue)
4523 ReadfirstlaneArgs.push_back(Elt: TokenGlue); // Wire up convergence token.
4524 Callee = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Callee.getValueType(),
4525 Ops: ReadfirstlaneArgs);
4526 }
4527
4528 Ops.push_back(x: Callee);
4529 Ops.push_back(x: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i64));
4530 }
4531
4532 if (IsTailCall) {
4533 // Each tail call may have to adjust the stack by a different amount, so
4534 // this information must travel along with the operation for eventual
4535 // consumption by emitEpilogue.
4536 Ops.push_back(x: DAG.getTargetConstant(Val: FPDiff, DL, VT: MVT::i32));
4537 }
4538
4539 if (IsChainCallConv)
4540 llvm::append_range(C&: Ops, R&: ChainCallSpecialArgs);
4541
4542 // Add argument registers to the end of the list so that they are known live
4543 // into the call.
4544 for (auto &[Reg, Val] : RegsToPass)
4545 Ops.push_back(x: DAG.getRegister(Reg, VT: Val.getValueType()));
4546
4547 // Add a register mask operand representing the call-preserved registers.
4548 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4549 assert(Mask && "Missing call preserved mask for calling convention");
4550 Ops.push_back(x: DAG.getRegisterMask(RegMask: Mask));
4551
4552 if (SDValue Token = CLI.ConvergenceControlToken) {
4553 SmallVector<SDValue, 2> GlueOps;
4554 GlueOps.push_back(Elt: Token);
4555 if (InGlue)
4556 GlueOps.push_back(Elt: InGlue);
4557
4558 InGlue = SDValue(DAG.getMachineNode(Opcode: TargetOpcode::CONVERGENCECTRL_GLUE, dl: DL,
4559 VT: MVT::Glue, Ops: GlueOps),
4560 0);
4561 }
4562
4563 if (InGlue)
4564 Ops.push_back(x: InGlue);
4565
4566 // If we're doing a tall call, use a TC_RETURN here rather than an
4567 // actual call instruction.
4568 if (IsTailCall) {
4569 MFI.setHasTailCall();
4570 unsigned OPC = AMDGPUISD::TC_RETURN;
4571 switch (CallConv) {
4572 case CallingConv::AMDGPU_Gfx:
4573 OPC = AMDGPUISD::TC_RETURN_GFX;
4574 break;
4575 case CallingConv::AMDGPU_CS_Chain:
4576 case CallingConv::AMDGPU_CS_ChainPreserve:
4577 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4578 : AMDGPUISD::TC_RETURN_CHAIN;
4579 break;
4580 }
4581
4582 // If the caller is a whole wave function, we need to use a special opcode
4583 // so we can patch up EXEC.
4584 if (Info->isWholeWaveFunction())
4585 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4586
4587 return DAG.getNode(Opcode: OPC, DL, VT: MVT::Other, Ops);
4588 }
4589
4590 // Returns a chain and a flag for retval copy to use.
4591 SDValue Call = DAG.getNode(Opcode: AMDGPUISD::CALL, DL, ResultTys: {MVT::Other, MVT::Glue}, Ops);
4592 Chain = Call.getValue(R: 0);
4593 InGlue = Call.getValue(R: 1);
4594
4595 uint64_t CalleePopBytes = NumBytes;
4596 Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: CalleePopBytes, Glue: InGlue, DL);
4597 if (!Ins.empty())
4598 InGlue = Chain.getValue(R: 1);
4599
4600 // Handle result values, copying them out of physregs into vregs that we
4601 // return.
4602 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4603 InVals, /*IsThisReturn=*/false, ThisVal: SDValue());
4604}
4605
4606// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4607// except for:
4608// 1. Stack growth direction(default: downwards, AMDGPU: upwards), and
4609// 2. Scale size where, scale = wave-reduction(alloca-size) * wave-size
SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                  SelectionDAG &DAG) const {
  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  SDValue Chain = Op.getOperand(i: 0);
  Register SPReg = Info->getStackPtrOffsetReg();

  // Chain the dynamic stack allocation so that it doesn't modify the stack
  // pointer when other instructions are using the stack.
  Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL: dl);

  SDValue Size = Op.getOperand(i: 1);
  SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, Reg: SPReg, VT);
  Align Alignment = cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getAlignValue();

  const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
  assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
         "Stack grows upwards for AMDGPU");

  Chain = BaseAddr.getValue(R: 1);
  Align StackAlign = TFL->getStackAlign();
  if (Alignment > StackAlign) {
    // The stack pointer holds a swizzled (per-lane scaled) address, so a
    // requested over-alignment must also be scaled by the wavefront size
    // before rounding the base address up to it.
    uint64_t ScaledAlignment = Alignment.value()
                               << Subtarget->getWavefrontSizeLog2();
    uint64_t StackAlignMask = ScaledAlignment - 1;
    // BaseAddr = (BaseAddr + Mask) & ~Mask: round up to the scaled alignment.
    SDValue TmpAddr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr,
                                  N2: DAG.getConstant(Val: StackAlignMask, DL: dl, VT));
    BaseAddr = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: TmpAddr,
                           N2: DAG.getSignedConstant(Val: -ScaledAlignment, DL: dl, VT));
  }

  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
  SDValue NewSP;
  if (isa<ConstantSDNode>(Val: Size)) {
    // For constant sized alloca, scale alloca size by wave-size
    SDValue ScaledSize = DAG.getNode(
        Opcode: ISD::SHL, DL: dl, VT, N1: Size,
        N2: DAG.getConstant(Val: Subtarget->getWavefrontSizeLog2(), DL: dl, VT: MVT::i32));
    NewSP = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr, N2: ScaledSize); // Value
  } else {
    // For dynamic sized alloca, perform wave-wide reduction to get max of
    // alloca size(divergent) and then scale it by wave-size
    SDValue WaveReduction =
        DAG.getTargetConstant(Val: Intrinsic::amdgcn_wave_reduce_umax, DL: dl, VT: MVT::i32);
    Size = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: MVT::i32, N1: WaveReduction,
                       N2: Size, N3: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32));
    SDValue ScaledSize = DAG.getNode(
        Opcode: ISD::SHL, DL: dl, VT, N1: Size,
        N2: DAG.getConstant(Val: Subtarget->getWavefrontSizeLog2(), DL: dl, VT: MVT::i32));
    NewSP =
        DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: BaseAddr, N2: ScaledSize); // Value in vgpr.
    // The stack pointer is an SGPR; move the computed value back to a scalar
    // register so the CopyToReg below is uniform.
    SDValue ReadFirstLaneID =
        DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: dl, VT: MVT::i32);
    NewSP = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: MVT::i32, N1: ReadFirstLaneID,
                        N2: NewSP);
  }

  // Commit the bumped stack pointer and close the call sequence. The result
  // of the alloca is the (aligned) old stack pointer, i.e. the start of the
  // newly reserved region, since the AMDGPU stack grows upwards.
  Chain = DAG.getCopyToReg(Chain, dl, Reg: SPReg, N: NewSP); // Output chain
  SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: SDValue(), DL: dl);

  return DAG.getMergeValues(Ops: {BaseAddr, CallSeqEnd}, dl);
}
4675
4676SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
4677 if (Op.getValueType() != MVT::i32)
4678 return Op; // Defer to cannot select error.
4679
4680 Register SP = getStackPointerRegisterToSaveRestore();
4681 SDLoc SL(Op);
4682
4683 SDValue CopyFromSP = DAG.getCopyFromReg(Chain: Op->getOperand(Num: 0), dl: SL, Reg: SP, VT: MVT::i32);
4684
4685 // Convert from wave uniform to swizzled vector address. This should protect
4686 // from any edge cases where the stacksave result isn't directly used with
4687 // stackrestore.
4688 SDValue VectorAddress =
4689 DAG.getNode(Opcode: AMDGPUISD::WAVE_ADDRESS, DL: SL, VT: MVT::i32, Operand: CopyFromSP);
4690 return DAG.getMergeValues(Ops: {VectorAddress, CopyFromSP.getValue(R: 1)}, dl: SL);
4691}
4692
// Lower ISD::GET_ROUNDING by reading both 2-bit rounding fields from the MODE
// hardware register and mapping them to the FLT_ROUNDS enumeration through a
// 64-bit constant lookup table.
SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc SL(Op);
  assert(Op.getValueType() == MVT::i32);

  // Encode a read of MODE[3:0], which contains both rounding mode fields.
  uint32_t BothRoundHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 4);
  SDValue GetRoundBothImm = DAG.getTargetConstant(Val: BothRoundHwReg, DL: SL, VT: MVT::i32);

  SDValue IntrinID =
      DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_getreg, DL: SL, VT: MVT::i32);
  SDValue GetReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList: Op->getVTList(),
                               N1: Op.getOperand(i: 0), N2: IntrinID, N3: GetRoundBothImm);

  // There are two rounding modes, one for f32 and one for f64/f16. We only
  // report in the standard value range if both are the same.
  //
  // The raw values also differ from the expected FLT_ROUNDS values. Nearest
  // ties away from zero is not supported, and the other values are rotated by
  // 1.
  //
  // If the two rounding modes are not the same, report a target defined value.

  // Mode register rounding mode fields:
  //
  // [1:0] Single-precision round mode.
  // [3:2] Double/Half-precision round mode.
  //
  // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
  //
  //             Hardware Spec
  // Toward-0        3        0
  // Nearest Even    0        1
  // +Inf            1        2
  // -Inf            2        3
  //  NearestAway0  N/A       4
  //
  // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
  // table we can index by the raw hardware mode.
  //
  // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf

  SDValue BitTable =
      DAG.getConstant(Val: AMDGPU::FltRoundConversionTable, DL: SL, VT: MVT::i64);

  // Each table entry is 4 bits wide, so the shift amount is fp_round * 4.
  SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
  SDValue RoundModeTimesNumBits =
      DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: GetReg, N2: Two);

  // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
  // knew only one mode was demanded.
  SDValue TableValue =
      DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i64, N1: BitTable, N2: RoundModeTimesNumBits);
  SDValue TruncTable = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: TableValue);

  SDValue EntryMask = DAG.getConstant(Val: 0xf, DL: SL, VT: MVT::i32);
  SDValue TableEntry =
      DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: TruncTable, N2: EntryMask);

  // There's a gap in the 4-bit encoded table and actual enum values, so offset
  // if it's an extended value.
  SDValue Four = DAG.getConstant(Val: 4, DL: SL, VT: MVT::i32);
  SDValue IsStandardValue =
      DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: TableEntry, RHS: Four, Cond: ISD::SETULT);
  SDValue EnumOffset = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: TableEntry, N2: Four);
  SDValue Result = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i32, N1: IsStandardValue,
                               N2: TableEntry, N3: EnumOffset);

  // Return both the translated rounding mode and the chain of the s_getreg.
  return DAG.getMergeValues(Ops: {Result, GetReg.getValue(R: 1)}, dl: SL);
}
4763
// Lower ISD::SET_ROUNDING by translating the FLT_ROUNDS value to the 4-bit
// MODE.fp_round encoding (via a constant table) and writing it with s_setreg.
SDValue SITargetLowering::lowerSET_ROUNDING(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue NewMode = Op.getOperand(i: 1);
  assert(NewMode.getValueType() == MVT::i32);

  // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
  // hardware MODE.fp_round values.
  if (auto *ConstMode = dyn_cast<ConstantSDNode>(Val&: NewMode)) {
    // Constant mode: do the table lookup at compile time. Out-of-range values
    // are clamped to the last supported entry.
    uint32_t ClampedVal = std::min(
        a: static_cast<uint32_t>(ConstMode->getZExtValue()),
        b: static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
    NewMode = DAG.getConstant(
        Val: AMDGPU::decodeFltRoundToHWConversionTable(FltRounds: ClampedVal), DL: SL, VT: MVT::i32);
  } else {
    // If we know the input can only be one of the supported standard modes in
    // the range 0-3, we can use a simplified mapping to hardware values.
    KnownBits KB = DAG.computeKnownBits(Op: NewMode);
    const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
    // The supported standard values are 0-3. The extended values start at 8. We
    // need to offset by 4 if the value is in the extended range.

    if (UseReducedTable) {
      // Only the entries for the 4 standard modes are needed, so the low 16
      // bits of the table (4 entries x 4 bits) fit in an i32 and avoid the
      // 64-bit shift below.
      SDValue BitTable = DAG.getConstant(
          Val: AMDGPU::FltRoundToHWConversionTable & 0xffff, DL: SL, VT: MVT::i32);

      // Each entry is 4 bits wide: shift amount = mode * 4.
      SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
      SDValue RoundModeTimesNumBits =
          DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: NewMode, N2: Two);

      NewMode =
          DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: BitTable, N2: RoundModeTimesNumBits);

      // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
      // the table extracted bits into inline immediates.
    } else {
      // table_index = umin(value, value - 4)
      // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
      SDValue BitTable =
          DAG.getConstant(Val: AMDGPU::FltRoundToHWConversionTable, DL: SL, VT: MVT::i64);

      // For standard values (0-3) the subtraction wraps, so umin picks the
      // value itself; for extended values (>= 8) it picks value - 4, closing
      // the gap between the enum and the packed table.
      SDValue Four = DAG.getConstant(Val: 4, DL: SL, VT: MVT::i32);
      SDValue OffsetEnum = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: NewMode, N2: Four);
      SDValue IndexVal =
          DAG.getNode(Opcode: ISD::UMIN, DL: SL, VT: MVT::i32, N1: NewMode, N2: OffsetEnum);

      SDValue Two = DAG.getConstant(Val: 2, DL: SL, VT: MVT::i32);
      SDValue RoundModeTimesNumBits =
          DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: IndexVal, N2: Two);

      SDValue TableValue =
          DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i64, N1: BitTable, N2: RoundModeTimesNumBits);
      SDValue TruncTable = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: TableValue);

      // No need to mask out the high bits since the setreg will ignore them
      // anyway.
      NewMode = TruncTable;
    }

    // Insert a readfirstlane in case the value is a VGPR. We could do this
    // earlier and keep more operations scalar, but that interferes with
    // combining the source.
    SDValue ReadFirstLaneID =
        DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: SL, VT: MVT::i32);
    NewMode = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
                          N1: ReadFirstLaneID, N2: NewMode);
  }

  // N.B. The setreg will be later folded into s_round_mode on supported
  // targets.
  SDValue IntrinID =
      DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_setreg, DL: SL, VT: MVT::i32);
  uint32_t BothRoundHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 4);
  SDValue RoundBothImm = DAG.getTargetConstant(Val: BothRoundHwReg, DL: SL, VT: MVT::i32);

  SDValue SetReg =
      DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VTList: Op->getVTList(), N1: Op.getOperand(i: 0),
                  N2: IntrinID, N3: RoundBothImm, N4: NewMode);

  return SetReg;
}
4848
4849SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
4850 if (Op->isDivergent() &&
4851 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(i: 4)))
4852 // Cannot do I$ prefetch with divergent pointer.
4853 return SDValue();
4854
4855 switch (cast<MemSDNode>(Val&: Op)->getAddressSpace()) {
4856 case AMDGPUAS::FLAT_ADDRESS:
4857 case AMDGPUAS::GLOBAL_ADDRESS:
4858 case AMDGPUAS::CONSTANT_ADDRESS:
4859 break;
4860 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
4861 if (Subtarget->hasSafeSmemPrefetch())
4862 break;
4863 [[fallthrough]];
4864 default:
4865 return SDValue();
4866 }
4867
4868 // I$ prefetch
4869 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(i: 4))
4870 return SDValue();
4871
4872 return Op;
4873}
4874
4875// Work around DAG legality rules only based on the result type.
4876SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
4877 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4878 SDValue Src = Op.getOperand(i: IsStrict ? 1 : 0);
4879 EVT SrcVT = Src.getValueType();
4880
4881 if (SrcVT.getScalarType() != MVT::bf16)
4882 return Op;
4883
4884 SDLoc SL(Op);
4885 SDValue BitCast =
4886 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: SrcVT.changeTypeToInteger(), Operand: Src);
4887
4888 EVT DstVT = Op.getValueType();
4889 if (IsStrict)
4890 llvm_unreachable("Need STRICT_BF16_TO_FP");
4891
4892 return DAG.getNode(Opcode: ISD::BF16_TO_FP, DL: SL, VT: DstVT, Operand: BitCast);
4893}
4894
// Lower ISD::GET_FPENV: the FP environment is split across two hardware
// registers, MODE[22:0] and TRAPSTS[4:0]; read both and pack them into an i64
// with the mode bits in the low half and the trap bits in the high half.
SDValue SITargetLowering::lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  // Only the i64 form is handled here; defer anything else.
  if (Op.getValueType() != MVT::i64)
    return Op;

  uint32_t ModeHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);
  SDValue ModeHwRegImm = DAG.getTargetConstant(Val: ModeHwReg, DL: SL, VT: MVT::i32);
  uint32_t TrapHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
  SDValue TrapHwRegImm = DAG.getTargetConstant(Val: TrapHwReg, DL: SL, VT: MVT::i32);

  // Both reads hang off the incoming chain and are joined by a TokenFactor so
  // neither is ordered relative to the other.
  SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other);
  SDValue IntrinID =
      DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_getreg, DL: SL, VT: MVT::i32);
  SDValue GetModeReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList,
                                   N1: Op.getOperand(i: 0), N2: IntrinID, N3: ModeHwRegImm);
  SDValue GetTrapReg = DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL: SL, VTList,
                                   N1: Op.getOperand(i: 0), N2: IntrinID, N3: TrapHwRegImm);
  SDValue TokenReg =
      DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: GetModeReg.getValue(R: 1),
                  N2: GetTrapReg.getValue(R: 1));

  // Pack {mode, trapsts} into a single i64 result.
  SDValue CvtPtr =
      DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: GetModeReg, N2: GetTrapReg);
  SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);

  return DAG.getMergeValues(Ops: {Result, TokenReg}, dl: SL);
}
4924
// Lower ISD::SET_FPENV: unpack the i64 environment produced by lowerGET_FPENV
// (mode in the low 32 bits, trapsts in the high 32 bits) and write the two
// hardware registers back with s_setreg.
SDValue SITargetLowering::lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  // Only the i64 form is handled here; defer anything else.
  if (Op.getOperand(i: 1).getValueType() != MVT::i64)
    return Op;

  SDValue Input = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1));
  SDValue NewModeReg = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Input,
                                   N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
  SDValue NewTrapReg = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Input,
                                   N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));

  // s_setreg takes a scalar source; readfirstlane ensures the values are
  // uniform even if the input happens to live in VGPRs.
  SDValue ReadFirstLaneID =
      DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL: SL, VT: MVT::i32);
  NewModeReg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
                           N1: ReadFirstLaneID, N2: NewModeReg);
  NewTrapReg = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
                           N1: ReadFirstLaneID, N2: NewTrapReg);

  // Same encodings as in lowerGET_FPENV: MODE[22:0] and TRAPSTS[4:0].
  unsigned ModeHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_MODE, Values: 0, Values: 23);
  SDValue ModeHwRegImm = DAG.getTargetConstant(Val: ModeHwReg, DL: SL, VT: MVT::i32);
  unsigned TrapHwReg =
      AMDGPU::Hwreg::HwregEncoding::encode(Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: 0, Values: 5);
  SDValue TrapHwRegImm = DAG.getTargetConstant(Val: TrapHwReg, DL: SL, VT: MVT::i32);

  // Both writes hang off the incoming chain; the TokenFactor joins them
  // without imposing an order between the two setregs.
  SDValue IntrinID =
      DAG.getTargetConstant(Val: Intrinsic::amdgcn_s_setreg, DL: SL, VT: MVT::i32);
  SDValue SetModeReg =
      DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VT: MVT::Other, N1: Op.getOperand(i: 0),
                  N2: IntrinID, N3: ModeHwRegImm, N4: NewModeReg);
  SDValue SetTrapReg =
      DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL: SL, VT: MVT::Other, N1: Op.getOperand(i: 0),
                  N2: IntrinID, N3: TrapHwRegImm, N4: NewTrapReg);
  return DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other, N1: SetTrapReg, N2: SetModeReg);
}
4960
4961Register SITargetLowering::getRegisterByName(const char *RegName, LLT VT,
4962 const MachineFunction &MF) const {
4963 const Function &Fn = MF.getFunction();
4964
4965 Register Reg = StringSwitch<Register>(RegName)
4966 .Case(S: "m0", Value: AMDGPU::M0)
4967 .Case(S: "exec", Value: AMDGPU::EXEC)
4968 .Case(S: "exec_lo", Value: AMDGPU::EXEC_LO)
4969 .Case(S: "exec_hi", Value: AMDGPU::EXEC_HI)
4970 .Case(S: "flat_scratch", Value: AMDGPU::FLAT_SCR)
4971 .Case(S: "flat_scratch_lo", Value: AMDGPU::FLAT_SCR_LO)
4972 .Case(S: "flat_scratch_hi", Value: AMDGPU::FLAT_SCR_HI)
4973 .Default(Value: Register());
4974 if (!Reg)
4975 return Reg;
4976
4977 if (!Subtarget->hasFlatScrRegister() &&
4978 Subtarget->getRegisterInfo()->regsOverlap(RegA: Reg, RegB: AMDGPU::FLAT_SCR)) {
4979 Fn.getContext().emitError(ErrorStr: Twine("invalid register \"" + StringRef(RegName) +
4980 "\" for subtarget."));
4981 }
4982
4983 switch (Reg) {
4984 case AMDGPU::M0:
4985 case AMDGPU::EXEC_LO:
4986 case AMDGPU::EXEC_HI:
4987 case AMDGPU::FLAT_SCR_LO:
4988 case AMDGPU::FLAT_SCR_HI:
4989 if (VT.getSizeInBits() == 32)
4990 return Reg;
4991 break;
4992 case AMDGPU::EXEC:
4993 case AMDGPU::FLAT_SCR:
4994 if (VT.getSizeInBits() == 64)
4995 return Reg;
4996 break;
4997 default:
4998 llvm_unreachable("missing register type checking");
4999 }
5000
5001 report_fatal_error(
5002 reason: Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
5003}
5004
5005// If kill is not the last instruction, split the block so kill is always a
5006// proper terminator.
5007MachineBasicBlock *
5008SITargetLowering::splitKillBlock(MachineInstr &MI,
5009 MachineBasicBlock *BB) const {
5010 MachineBasicBlock *SplitBB = BB->splitAt(SplitInst&: MI, /*UpdateLiveIns=*/true);
5011 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5012 MI.setDesc(TII->getKillTerminatorFromPseudo(Opcode: MI.getOpcode()));
5013 return SplitBB;
5014}
5015
// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
// \p MI will be the only instruction in the loop body block. Otherwise, it will
// be the first instruction in the remainder block.
//
/// \returns { LoopBody, Remainder }
static std::pair<MachineBasicBlock *, MachineBasicBlock *>
splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
  MachineFunction *MF = MBB.getParent();
  MachineBasicBlock::iterator I(&MI);

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  // Insert both new blocks immediately after MBB, in layout order
  // MBB -> LoopBB -> RemainderBB.
  MF->insert(MBBI, MBB: LoopBB);
  MF->insert(MBBI, MBB: RemainderBB);

  // The loop block branches back to itself or falls through to the remainder.
  LoopBB->addSuccessor(Succ: LoopBB);
  LoopBB->addSuccessor(Succ: RemainderBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(FromMBB: &MBB);

  if (InstInLoop) {
    auto Next = std::next(x: I);

    // Move instruction to loop body.
    LoopBB->splice(Where: LoopBB->begin(), Other: &MBB, From: I, To: Next);

    // Move the rest of the block.
    RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: Next, To: MBB.end());
  } else {
    RemainderBB->splice(Where: RemainderBB->begin(), Other: &MBB, From: I, To: MBB.end());
  }

  // The original block now falls through into the loop.
  MBB.addSuccessor(Succ: LoopBB);

  return std::pair(LoopBB, RemainderBB);
}
5058
5059/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
5060void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
5061 MachineBasicBlock *MBB = MI.getParent();
5062 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5063 auto I = MI.getIterator();
5064 auto E = std::next(x: I);
5065
5066 // clang-format off
5067 BuildMI(BB&: *MBB, I: E, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_WAITCNT))
5068 .addImm(Val: 0);
5069 // clang-format on
5070
5071 MIBundleBuilder Bundler(*MBB, I, E);
5072 finalizeBundle(MBB&: *MBB, FirstMI: Bundler.begin());
5073}
5074
// Expand a GWS instruction into a loop that retries the operation while
// TRAP_STS.MEM_VIOL is raised: clear the bit, run the (waitcnt-bundled)
// instruction, re-read the bit, and branch back if it is still set.
MachineBasicBlock *
SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
                                         MachineBasicBlock *BB) const {
  const DebugLoc &DL = MI.getDebugLoc();

  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();

  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Apparently kill flags are only valid if the def is in the same block?
  if (MachineOperand *Src = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::data0))
    Src->setIsKill(false);

  // MI becomes the sole instruction of the loop body block.
  auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB&: *BB, InstInLoop: true);

  MachineBasicBlock::iterator I = LoopBB->end();

  const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
      Values: AMDGPU::Hwreg::ID_TRAPSTS, Values: AMDGPU::Hwreg::OFFSET_MEM_VIOL, Values: 1);

  // Clear TRAP_STS.MEM_VIOL
  BuildMI(BB&: *LoopBB, I: LoopBB->begin(), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SETREG_IMM32_B32))
      .addImm(Val: 0)
      .addImm(Val: EncodedReg);

  // Bundle MI with an S_WAITCNT 0 so the memory-violation status is settled
  // before it is read back below.
  bundleInstWithWaitcnt(MI);

  Register Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);

  // Load and check TRAP_STS.MEM_VIOL
  BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: Reg)
      .addImm(Val: EncodedReg);

  // FIXME: Do we need to use an isel pseudo that may clobber scc?
  BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
      .addReg(RegNo: Reg, Flags: RegState::Kill)
      .addImm(Val: 0);
  // Loop while the violation bit was non-zero.
  // clang-format off
  BuildMI(BB&: *LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
      .addMBB(MBB: LoopBB);
  // clang-format on

  return RemainderBB;
}
5119
// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
// wavefront. If the value is uniform and just happens to be in a VGPR, this
// will only do one iteration. In the worst case, this will loop 64 times.
//
// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
static MachineBasicBlock::iterator
emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
                       MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
                       const DebugLoc &DL, const MachineOperand &Idx,
                       unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
                       unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
                       Register &SGPRIdxReg) {

  MachineFunction *MF = OrigBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
  MachineBasicBlock::iterator I = LoopBB.begin();

  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
  Register PhiExec = MRI.createVirtualRegister(RegClass: BoolRC);
  Register NewExec = MRI.createVirtualRegister(RegClass: BoolRC);
  Register CurrentIdxReg =
      MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
  Register CondReg = MRI.createVirtualRegister(RegClass: BoolRC);

  // Loop-carried value: initial value on entry from OrigBB, accumulated
  // result on the back edge.
  BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiReg)
      .addReg(RegNo: InitReg)
      .addMBB(MBB: &OrigBB)
      .addReg(RegNo: ResultReg)
      .addMBB(MBB: &LoopBB);

  // Loop-carried exec mask of lanes still to be processed.
  BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::PHI), DestReg: PhiExec)
      .addReg(RegNo: InitSaveExecReg)
      .addMBB(MBB: &OrigBB)
      .addReg(RegNo: NewExec)
      .addMBB(MBB: &LoopBB)

  // Read the next variant <- also loop target.
  BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: CurrentIdxReg)
      .addReg(RegNo: Idx.getReg(), Flags: getUndefRegState(B: Idx.isUndef()));

  // Compare the just read M0 value to all possible Idx values.
  BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CMP_EQ_U32_e64), DestReg: CondReg)
      .addReg(RegNo: CurrentIdxReg)
      .addReg(RegNo: Idx.getReg(), Flags: {}, SubReg: Idx.getSubReg());

  // Update EXEC, save the original EXEC value to VCC.
  BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: LMC.AndSaveExecOpc), DestReg: NewExec)
      .addReg(RegNo: CondReg, Flags: RegState::Kill);

  MRI.setSimpleHint(VReg: NewExec, PrefReg: CondReg);

  if (UseGPRIdxMode) {
    // GPR-index mode: expose the (possibly offset) scalar index to the caller
    // via SGPRIdxReg instead of writing M0.
    if (Offset == 0) {
      SGPRIdxReg = CurrentIdxReg;
    } else {
      SGPRIdxReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SGPR_32RegClass);
      BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: SGPRIdxReg)
          .addReg(RegNo: CurrentIdxReg, Flags: RegState::Kill)
          .addImm(Val: Offset);
    }
  } else {
    // Move index from VCC into M0
    if (Offset == 0) {
      BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
          .addReg(RegNo: CurrentIdxReg, Flags: RegState::Kill);
    } else {
      BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: AMDGPU::M0)
          .addReg(RegNo: CurrentIdxReg, Flags: RegState::Kill)
          .addImm(Val: Offset);
    }
  }

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  MachineInstr *InsertPt =
      BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: LMC.XorTermOpc), DestReg: LMC.ExecReg)
          .addReg(RegNo: LMC.ExecReg)
          .addReg(RegNo: NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  // clang-format off
  BuildMI(BB&: LoopBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
      .addMBB(MBB: &LoopBB);
  // clang-format on

  // Return the point after the exec-mask update, where per-iteration work
  // (the actual indirect access) should be inserted by the caller.
  return InsertPt->getIterator();
}
5211
// This has slightly sub-optimal regalloc when the source vector is killed by
// the read. The register allocator does not understand that the kill is
// per-workitem, so is kept alive for the whole loop so we end up not re-using a
// subregister from it, using 1 more VGPR than necessary. This was saved when
// this was expanded after register allocation.
static MachineBasicBlock::iterator
loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
               unsigned InitResultReg, unsigned PhiReg, int Offset,
               bool UseGPRIdxMode, Register &SGPRIdxReg) {
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
  Register DstReg = MI.getOperand(i: 0).getReg();
  Register SaveExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(RegClass: BoolXExecRC);
  const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);

  // TmpExec only seeds the exec PHI in the loop header; its value is never
  // read on the entry edge.
  BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: TmpExec);

  // Save the EXEC mask
  // clang-format off
  BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: SaveExec)
      .addReg(RegNo: LMC.ExecReg);
  // clang-format on

  // MI becomes the first instruction of the remainder block; the new loop
  // block is emitted between MBB and the remainder.
  auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, InstInLoop: false);

  const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);

  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, OrigBB&: MBB, LoopBB&: *LoopBB, DL, Idx: *Idx,
                                      InitReg: InitResultReg, ResultReg: DstReg, PhiReg, InitSaveExecReg: TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);

  // Insert a landing pad between the loop and the remainder whose only job is
  // to restore the EXEC mask saved above once all lanes have been handled.
  MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(LoopBB);
  ++MBBI;
  MF->insert(MBBI, MBB: LandingPad);
  LoopBB->removeSuccessor(Succ: RemainderBB);
  LandingPad->addSuccessor(Succ: RemainderBB);
  LoopBB->addSuccessor(Succ: LandingPad);
  MachineBasicBlock::iterator First = LandingPad->begin();
  // clang-format off
  BuildMI(BB&: *LandingPad, I: First, MIMD: DL, MCID: TII->get(Opcode: LMC.MovOpc), DestReg: LMC.ExecReg)
      .addReg(RegNo: SaveExec);
  // clang-format on

  return InsPt;
}
5265
5266// Returns subreg index, offset
5267static std::pair<unsigned, int>
5268computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
5269 const TargetRegisterClass *SuperRC, unsigned VecReg,
5270 int Offset) {
5271 int NumElts = TRI.getRegSizeInBits(RC: *SuperRC) / 32;
5272
5273 // Skip out of bounds offsets, or else we would end up using an undefined
5274 // register.
5275 if (Offset >= NumElts || Offset < 0)
5276 return std::pair(AMDGPU::sub0, Offset);
5277
5278 return std::pair(SIRegisterInfo::getSubRegFromChannel(Channel: Offset), 0);
5279}
5280
5281static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
5282 MachineRegisterInfo &MRI, MachineInstr &MI,
5283 int Offset) {
5284 MachineBasicBlock *MBB = MI.getParent();
5285 const DebugLoc &DL = MI.getDebugLoc();
5286 MachineBasicBlock::iterator I(&MI);
5287
5288 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
5289
5290 assert(Idx->getReg() != AMDGPU::NoRegister);
5291
5292 if (Offset == 0) {
5293 // clang-format off
5294 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AMDGPU::M0)
5295 .add(MO: *Idx);
5296 // clang-format on
5297 } else {
5298 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: AMDGPU::M0)
5299 .add(MO: *Idx)
5300 .addImm(Val: Offset);
5301 }
5302}
5303
5304static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
5305 MachineRegisterInfo &MRI, MachineInstr &MI,
5306 int Offset) {
5307 MachineBasicBlock *MBB = MI.getParent();
5308 const DebugLoc &DL = MI.getDebugLoc();
5309 MachineBasicBlock::iterator I(&MI);
5310
5311 const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
5312
5313 if (Offset == 0)
5314 return Idx->getReg();
5315
5316 Register Tmp = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5317 BuildMI(BB&: *MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_I32), DestReg: Tmp)
5318 .add(MO: *Idx)
5319 .addImm(Val: Offset);
5320 return Tmp;
5321}
5322
/// Lower an indirect vector-element read pseudo: move the 32-bit element of
/// vector register 'src' at dynamic position 'idx' + constant 'offset' into
/// the destination register.
///
/// Three expansion strategies are used, selected by the index register class
/// and subtarget:
///  * SGPR index with GPR-index mode available: a GPRIDX pseudo.
///  * SGPR index without GPR-index mode: write the index into M0 and read
///    relative to it with V_MOVRELS_B32.
///  * VGPR (possibly divergent) index: build a loop over index values via
///    loadM0FromVGPR and emit the read inside it; the loop block is returned.
///
/// \returns the block the caller should continue lowering in (either \p MBB
/// or the newly created loop block). \p MI is erased.
static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const GCNSubtarget &ST) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  Register Dst = MI.getOperand(i: 0).getReg();
  const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
  Register SrcReg = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm();

  const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcReg);
  const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg());

  // Fold an in-range constant offset into a static subregister index; an
  // out-of-range offset is left in 'Offset' (with sub0 as a placeholder).
  unsigned SubReg;
  std::tie(args&: SubReg, args&: Offset) =
      computeIndirectRegAndOffset(TRI, SuperRC: VecRC, VecReg: SrcReg, Offset);

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  // Check for a SGPR index.
  if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    if (UseGPRIdxMode) {
      // TODO: Look at the uses to avoid the copy. This may require rescheduling
      // to avoid interfering with other uses, so probably requires a new
      // optimization pass.
      Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);

      const MCInstrDesc &GPRIDXDesc =
          TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: true);
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
          .addReg(RegNo: SrcReg)
          .addReg(RegNo: Idx)
          .addImm(Val: SubReg);
    } else {
      // M0 holds idx + Offset; V_MOVRELS reads SrcReg:SubReg relative to M0.
      // The implicit use of the full vector keeps it live for the movrel.
      setM0ToIndexFromSGPR(TII, MRI, MI, Offset);

      BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: Dst)
          .addReg(RegNo: SrcReg, Flags: {}, SubReg)
          .addReg(RegNo: SrcReg, Flags: RegState::Implicit);
    }

    MI.eraseFromParent();

    return &MBB;
  }

  // Control flow needs to be inserted if indexing with a VGPR.
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  // The loop built by loadM0FromVGPR threads the result through PhiReg; the
  // initial value is irrelevant, hence IMPLICIT_DEF for InitReg.
  Register PhiReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);

  BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF), DestReg: InitReg);

  Register SGPRIdxReg;
  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: InitReg, PhiReg, Offset,
                              UseGPRIdxMode, SGPRIdxReg);

  MachineBasicBlock *LoopBB = InsPt->getParent();

  // Inside the loop the index is uniform (either in SGPRIdxReg or already in
  // M0, depending on mode), so the same two lowerings as above apply.
  if (UseGPRIdxMode) {
    const MCInstrDesc &GPRIDXDesc =
        TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: true);

    BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
        .addReg(RegNo: SrcReg)
        .addReg(RegNo: SGPRIdxReg)
        .addImm(Val: SubReg);
  } else {
    BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOVRELS_B32_e32), DestReg: Dst)
        .addReg(RegNo: SrcReg, Flags: {}, SubReg)
        .addReg(RegNo: SrcReg, Flags: RegState::Implicit);
  }

  MI.eraseFromParent();

  return LoopBB;
}
5408
/// Lower an indirect vector-element write pseudo: insert 'val' into vector
/// 'src' at dynamic position 'idx' + constant 'offset', producing the updated
/// vector in the destination register.
///
/// Expansion strategies, by index:
///  * No index register (constant-only): a plain INSERT_SUBREG.
///  * SGPR index with GPR-index mode: a GPRIDX write pseudo.
///  * SGPR index without GPR-index mode: M0 + a movrel write pseudo.
///  * VGPR index: a loop built via loadM0FromVGPR; the write happens inside
///    the loop and the loop block is returned.
///
/// \returns the block the caller should continue lowering in. \p MI is erased.
static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const GCNSubtarget &ST) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  Register Dst = MI.getOperand(i: 0).getReg();
  const MachineOperand *SrcVec = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::src);
  const MachineOperand *Idx = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::idx);
  const MachineOperand *Val = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::val);
  int Offset = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::offset)->getImm();
  const TargetRegisterClass *VecRC = MRI.getRegClass(Reg: SrcVec->getReg());
  const TargetRegisterClass *IdxRC = MRI.getRegClass(Reg: Idx->getReg());

  // This can be an immediate, but will be folded later.
  assert(Val->getReg());

  // Fold an in-range constant offset into a static subregister index; any
  // remainder stays in 'Offset'.
  unsigned SubReg;
  std::tie(args&: SubReg, args&: Offset) =
      computeIndirectRegAndOffset(TRI, SuperRC: VecRC, VecReg: SrcVec->getReg(), Offset);
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  // Fully constant index: no dynamic indexing machinery is needed at all.
  if (Idx->getReg() == AMDGPU::NoRegister) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    assert(Offset == 0);

    BuildMI(BB&: MBB, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: Dst)
        .add(MO: *SrcVec)
        .add(MO: *Val)
        .addImm(Val: SubReg);

    MI.eraseFromParent();
    return &MBB;
  }

  // Check for a SGPR index.
  if (TII->getRegisterInfo().isSGPRClass(RC: IdxRC)) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    if (UseGPRIdxMode) {
      Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);

      const MCInstrDesc &GPRIDXDesc =
          TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false);
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
          .addReg(RegNo: SrcVec->getReg())
          .add(MO: *Val)
          .addReg(RegNo: Idx)
          .addImm(Val: SubReg);
    } else {
      // M0 holds idx + Offset; the movrel write pseudo updates one lane.
      setM0ToIndexFromSGPR(TII, MRI, MI, Offset);

      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          VecSize: TRI.getRegSizeInBits(RC: *VecRC), EltSize: 32, IsSGPR: false);
      BuildMI(BB&: MBB, I, MIMD: DL, MCID: MovRelDesc, DestReg: Dst)
          .addReg(RegNo: SrcVec->getReg())
          .add(MO: *Val)
          .addImm(Val: SubReg);
    }
    MI.eraseFromParent();
    return &MBB;
  }

  // Control flow needs to be inserted if indexing with a VGPR.
  // Val is read on every loop iteration, so any kill flag would be stale.
  if (Val->isReg())
    MRI.clearKillFlags(Reg: Val->getReg());

  const DebugLoc &DL = MI.getDebugLoc();

  // The loop threads the (partially updated) vector through PhiReg, seeded
  // with the original source vector.
  Register PhiReg = MRI.createVirtualRegister(RegClass: VecRC);

  Register SGPRIdxReg;
  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitResultReg: SrcVec->getReg(), PhiReg, Offset,
                              UseGPRIdxMode, SGPRIdxReg);
  MachineBasicBlock *LoopBB = InsPt->getParent();

  // Inside the loop the index is uniform, so the same two SGPR-style
  // lowerings apply, operating on PhiReg instead of the original vector.
  if (UseGPRIdxMode) {
    const MCInstrDesc &GPRIDXDesc =
        TII->getIndirectGPRIDXPseudo(VecSize: TRI.getRegSizeInBits(RC: *VecRC), IsIndirectSrc: false);

    BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: GPRIDXDesc, DestReg: Dst)
        .addReg(RegNo: PhiReg)
        .add(MO: *Val)
        .addReg(RegNo: SGPRIdxReg)
        .addImm(Val: SubReg);
  } else {
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        VecSize: TRI.getRegSizeInBits(RC: *VecRC), EltSize: 32, IsSGPR: false);
    BuildMI(BB&: *LoopBB, I: InsPt, MIMD: DL, MCID: MovRelDesc, DestReg: Dst)
        .addReg(RegNo: PhiReg)
        .add(MO: *Val)
        .addImm(Val: SubReg);
  }

  MI.eraseFromParent();
  return LoopBB;
}
5511
5512static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
5513 MachineBasicBlock *BB) {
5514 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5515 // For GFX12, we emit s_add_u64 and s_sub_u64.
5516 MachineFunction *MF = BB->getParent();
5517 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5518 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5519 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
5520 const DebugLoc &DL = MI.getDebugLoc();
5521 MachineOperand &Dest = MI.getOperand(i: 0);
5522 MachineOperand &Src0 = MI.getOperand(i: 1);
5523 MachineOperand &Src1 = MI.getOperand(i: 2);
5524 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5525 if (ST.hasScalarAddSub64()) {
5526 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5527 // clang-format off
5528 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest.getReg())
5529 .add(MO: Src0)
5530 .add(MO: Src1);
5531 // clang-format on
5532 } else {
5533 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5534 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5535
5536 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5537 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5538
5539 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5540 MI, MRI, SuperReg: Src0, SuperRC: BoolRC, SubIdx: AMDGPU::sub0, SubRC: &AMDGPU::SReg_32RegClass);
5541 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5542 MI, MRI, SuperReg: Src0, SuperRC: BoolRC, SubIdx: AMDGPU::sub1, SubRC: &AMDGPU::SReg_32RegClass);
5543
5544 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5545 MI, MRI, SuperReg: Src1, SuperRC: BoolRC, SubIdx: AMDGPU::sub0, SubRC: &AMDGPU::SReg_32RegClass);
5546 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5547 MI, MRI, SuperReg: Src1, SuperRC: BoolRC, SubIdx: AMDGPU::sub1, SubRC: &AMDGPU::SReg_32RegClass);
5548
5549 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5550 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5551 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: LoOpc), DestReg: DestSub0).add(MO: Src0Sub0).add(MO: Src1Sub0);
5552 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: HiOpc), DestReg: DestSub1).add(MO: Src0Sub1).add(MO: Src1Sub1);
5553 BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: Dest.getReg())
5554 .addReg(RegNo: DestSub0)
5555 .addImm(Val: AMDGPU::sub0)
5556 .addReg(RegNo: DestSub1)
5557 .addImm(Val: AMDGPU::sub1);
5558 }
5559 MI.eraseFromParent();
5560 return BB;
5561}
5562
5563static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
5564 switch (Opc) {
5565 case AMDGPU::S_MIN_U32:
5566 return std::numeric_limits<uint32_t>::max();
5567 case AMDGPU::S_MIN_I32:
5568 return std::numeric_limits<int32_t>::max();
5569 case AMDGPU::S_MAX_U32:
5570 return std::numeric_limits<uint32_t>::min();
5571 case AMDGPU::S_MAX_I32:
5572 return std::numeric_limits<int32_t>::min();
5573 case AMDGPU::V_ADD_F32_e64: // -0.0
5574 return 0x80000000;
5575 case AMDGPU::V_SUB_F32_e64: // +0.0
5576 return 0x0;
5577 case AMDGPU::S_ADD_I32:
5578 case AMDGPU::S_SUB_I32:
5579 case AMDGPU::S_OR_B32:
5580 case AMDGPU::S_XOR_B32:
5581 return std::numeric_limits<uint32_t>::min();
5582 case AMDGPU::S_AND_B32:
5583 return std::numeric_limits<uint32_t>::max();
5584 case AMDGPU::V_MIN_F32_e64:
5585 case AMDGPU::V_MAX_F32_e64:
5586 return 0x7fc00000; // qNAN
5587 default:
5588 llvm_unreachable(
5589 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5590 }
5591}
5592
5593static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
5594 switch (Opc) {
5595 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5596 return std::numeric_limits<uint64_t>::max();
5597 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5598 return std::numeric_limits<int64_t>::max();
5599 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5600 return std::numeric_limits<uint64_t>::min();
5601 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5602 return std::numeric_limits<int64_t>::min();
5603 case AMDGPU::V_MIN_F64_e64:
5604 case AMDGPU::V_MAX_F64_e64:
5605 case AMDGPU::V_MIN_NUM_F64_e64:
5606 case AMDGPU::V_MAX_NUM_F64_e64:
5607 return 0x7FF8000000000000; // qNAN
5608 case AMDGPU::S_ADD_U64_PSEUDO:
5609 case AMDGPU::S_SUB_U64_PSEUDO:
5610 case AMDGPU::S_OR_B64:
5611 case AMDGPU::S_XOR_B64:
5612 return std::numeric_limits<uint64_t>::min();
5613 case AMDGPU::S_AND_B64:
5614 return std::numeric_limits<uint64_t>::max();
5615 case AMDGPU::V_ADD_F64_e64:
5616 case AMDGPU::V_ADD_F64_pseudo_e64:
5617 return 0x8000000000000000; // -0.0
5618 default:
5619 llvm_unreachable(
5620 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5621 }
5622}
5623
5624static bool is32bitWaveReduceOperation(unsigned Opc) {
5625 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5626 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5627 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5628 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5629 Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5630 Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5631 Opc == AMDGPU::V_SUB_F32_e64;
5632}
5633
5634static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
5635 return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5636 Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64 ||
5637 Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 ||
5638 Opc == AMDGPU::V_MIN_NUM_F64_e64 || Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
5639 Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64;
5640}
5641
5642static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5643 MachineBasicBlock &BB,
5644 const GCNSubtarget &ST,
5645 unsigned Opc) {
5646 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
5647 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5648 const DebugLoc &DL = MI.getDebugLoc();
5649 const SIInstrInfo *TII = ST.getInstrInfo();
5650
5651 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5652 Register SrcReg = MI.getOperand(i: 1).getReg();
5653 bool isSGPR = TRI->isSGPRClass(RC: MRI.getRegClass(Reg: SrcReg));
5654 Register DstReg = MI.getOperand(i: 0).getReg();
5655 MachineBasicBlock *RetBB = nullptr;
5656 if (isSGPR) {
5657 switch (Opc) {
5658 case AMDGPU::S_MIN_U32:
5659 case AMDGPU::S_MIN_I32:
5660 case AMDGPU::V_MIN_F32_e64:
5661 case AMDGPU::S_MAX_U32:
5662 case AMDGPU::S_MAX_I32:
5663 case AMDGPU::V_MAX_F32_e64:
5664 case AMDGPU::S_AND_B32:
5665 case AMDGPU::S_OR_B32: {
5666 // Idempotent operations.
5667 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: DstReg).addReg(RegNo: SrcReg);
5668 RetBB = &BB;
5669 break;
5670 }
5671 case AMDGPU::V_CMP_LT_U64_e64: // umin
5672 case AMDGPU::V_CMP_LT_I64_e64: // min
5673 case AMDGPU::V_CMP_GT_U64_e64: // umax
5674 case AMDGPU::V_CMP_GT_I64_e64: // max
5675 case AMDGPU::V_MIN_F64_e64:
5676 case AMDGPU::V_MIN_NUM_F64_e64:
5677 case AMDGPU::V_MAX_F64_e64:
5678 case AMDGPU::V_MAX_NUM_F64_e64:
5679 case AMDGPU::S_AND_B64:
5680 case AMDGPU::S_OR_B64: {
5681 // Idempotent operations.
5682 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B64), DestReg: DstReg).addReg(RegNo: SrcReg);
5683 RetBB = &BB;
5684 break;
5685 }
5686 case AMDGPU::S_XOR_B32:
5687 case AMDGPU::S_XOR_B64:
5688 case AMDGPU::S_ADD_I32:
5689 case AMDGPU::S_ADD_U64_PSEUDO:
5690 case AMDGPU::V_ADD_F32_e64:
5691 case AMDGPU::V_ADD_F64_e64:
5692 case AMDGPU::V_ADD_F64_pseudo_e64:
5693 case AMDGPU::S_SUB_I32:
5694 case AMDGPU::S_SUB_U64_PSEUDO:
5695 case AMDGPU::V_SUB_F32_e64: {
5696 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5697 const TargetRegisterClass *DstRegClass = MRI.getRegClass(Reg: DstReg);
5698 Register ExecMask = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5699 Register NumActiveLanes =
5700 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5701
5702 bool IsWave32 = ST.isWave32();
5703 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5704 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5705 unsigned BitCountOpc =
5706 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5707
5708 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: MovOpc), DestReg: ExecMask).addReg(RegNo: ExecReg);
5709
5710 auto NewAccumulator =
5711 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: BitCountOpc), DestReg: NumActiveLanes)
5712 .addReg(RegNo: ExecMask);
5713
5714 switch (Opc) {
5715 case AMDGPU::S_XOR_B32:
5716 case AMDGPU::S_XOR_B64: {
5717 // Performing an XOR operation on a uniform value
5718 // depends on the parity of the number of active lanes.
5719 // For even parity, the result will be 0, for odd
5720 // parity the result will be the same as the input value.
5721 Register ParityRegister =
5722 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5723
5724 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_AND_B32), DestReg: ParityRegister)
5725 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg())
5726 .addImm(Val: 1)
5727 .setOperandDead(3); // Dead scc
5728 if (Opc == AMDGPU::S_XOR_B32) {
5729 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
5730 .addReg(RegNo: SrcReg)
5731 .addReg(RegNo: ParityRegister);
5732 } else {
5733 Register DestSub0 =
5734 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5735 Register DestSub1 =
5736 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5737
5738 const TargetRegisterClass *SrcRC = MRI.getRegClass(Reg: SrcReg);
5739 const TargetRegisterClass *SrcSubRC =
5740 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5741
5742 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5743 MI, MRI, SuperReg: MI.getOperand(i: 1), SuperRC: SrcRC, SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
5744 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5745 MI, MRI, SuperReg: MI.getOperand(i: 1), SuperRC: SrcRC, SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
5746
5747 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DestSub0)
5748 .add(MO: Op1L)
5749 .addReg(RegNo: ParityRegister);
5750
5751 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DestSub1)
5752 .add(MO: Op1H)
5753 .addReg(RegNo: ParityRegister);
5754
5755 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: DstReg)
5756 .addReg(RegNo: DestSub0)
5757 .addImm(Val: AMDGPU::sub0)
5758 .addReg(RegNo: DestSub1)
5759 .addImm(Val: AMDGPU::sub1);
5760 }
5761 break;
5762 }
5763 case AMDGPU::S_SUB_I32: {
5764 Register NegatedVal = MRI.createVirtualRegister(RegClass: DstRegClass);
5765
5766 // Take the negation of the source operand.
5767 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SUB_I32), DestReg: NegatedVal)
5768 .addImm(Val: 0)
5769 .addReg(RegNo: SrcReg);
5770 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
5771 .addReg(RegNo: NegatedVal)
5772 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg());
5773 break;
5774 }
5775 case AMDGPU::S_ADD_I32: {
5776 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DstReg)
5777 .addReg(RegNo: SrcReg)
5778 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg());
5779 break;
5780 }
5781 case AMDGPU::S_ADD_U64_PSEUDO:
5782 case AMDGPU::S_SUB_U64_PSEUDO: {
5783 Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5784 Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5785 Register Op1H_Op0L_Reg =
5786 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5787 Register Op1L_Op0H_Reg =
5788 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5789 Register CarryReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5790 Register AddReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5791 Register NegatedValLo =
5792 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5793 Register NegatedValHi =
5794 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5795
5796 const TargetRegisterClass *Src1RC = MRI.getRegClass(Reg: SrcReg);
5797 const TargetRegisterClass *Src1SubRC =
5798 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5799
5800 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5801 MI, MRI, SuperReg: MI.getOperand(i: 1), SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);
5802 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5803 MI, MRI, SuperReg: MI.getOperand(i: 1), SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);
5804
5805 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5806 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_SUB_I32), DestReg: NegatedValLo)
5807 .addImm(Val: 0)
5808 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg())
5809 .setOperandDead(3); // Dead scc
5810 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ASHR_I32), DestReg: NegatedValHi)
5811 .addReg(RegNo: NegatedValLo)
5812 .addImm(Val: 31)
5813 .setOperandDead(3); // Dead scc
5814 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: Op1L_Op0H_Reg)
5815 .add(MO: Op1L)
5816 .addReg(RegNo: NegatedValHi);
5817 }
5818 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5819 ? NegatedValLo
5820 : NewAccumulator->getOperand(i: 0).getReg();
5821 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: DestSub0)
5822 .add(MO: Op1L)
5823 .addReg(RegNo: LowOpcode);
5824 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_HI_U32), DestReg: CarryReg)
5825 .add(MO: Op1L)
5826 .addReg(RegNo: LowOpcode);
5827 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MUL_I32), DestReg: Op1H_Op0L_Reg)
5828 .add(MO: Op1H)
5829 .addReg(RegNo: LowOpcode);
5830
5831 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5832 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: HiVal)
5833 .addReg(RegNo: CarryReg)
5834 .addReg(RegNo: Op1H_Op0L_Reg)
5835 .setOperandDead(3); // Dead scc
5836
5837 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5838 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ADD_U32), DestReg: DestSub1)
5839 .addReg(RegNo: HiVal)
5840 .addReg(RegNo: Op1L_Op0H_Reg)
5841 .setOperandDead(3); // Dead scc
5842 }
5843 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: DstReg)
5844 .addReg(RegNo: DestSub0)
5845 .addImm(Val: AMDGPU::sub0)
5846 .addReg(RegNo: DestSub1)
5847 .addImm(Val: AMDGPU::sub1);
5848 break;
5849 }
5850 case AMDGPU::V_ADD_F32_e64:
5851 case AMDGPU::V_ADD_F64_e64:
5852 case AMDGPU::V_ADD_F64_pseudo_e64:
5853 case AMDGPU::V_SUB_F32_e64: {
5854 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5855 const TargetRegisterClass *VregRC = TII->getRegClass(MCID: TII->get(Opcode: Opc), OpNum: 0);
5856 Register ActiveLanesVreg = MRI.createVirtualRegister(RegClass: VregRC);
5857 Register DstVreg = MRI.createVirtualRegister(RegClass: VregRC);
5858 // Get number of active lanes as a float val.
5859 BuildMI(BB, I&: MI, MIMD: DL,
5860 MCID: TII->get(Opcode: is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
5861 : AMDGPU::V_CVT_F64_I32_e64),
5862 DestReg: ActiveLanesVreg)
5863 .addReg(RegNo: NewAccumulator->getOperand(i: 0).getReg())
5864 .addImm(Val: 0) // clamp
5865 .addImm(Val: 0); // output-modifier
5866
5867 // Take negation of input for SUB reduction
5868 unsigned srcMod =
5869 (Opc == AMDGPU::V_SUB_F32_e64 ||
5870 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)
5871 ? SISrcMods::NEG
5872 : SISrcMods::NONE;
5873 unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
5874 : ST.getGeneration() >= AMDGPUSubtarget::GFX12
5875 ? AMDGPU::V_MUL_F64_pseudo_e64
5876 : AMDGPU::V_MUL_F64_e64;
5877 auto DestVregInst = BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: MulOpc),
5878 DestReg: DstVreg)
5879 .addImm(Val: srcMod) // src0 modifier
5880 .addReg(RegNo: SrcReg)
5881 .addImm(Val: SISrcMods::NONE) // src1 modifier
5882 .addReg(RegNo: ActiveLanesVreg)
5883 .addImm(Val: SISrcMods::NONE) // clamp
5884 .addImm(Val: SISrcMods::NONE); // output-mod
5885 if (is32BitOpc) {
5886 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg)
5887 .addReg(RegNo: DstVreg);
5888 } else {
5889 Register LaneValueLoReg =
5890 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5891 Register LaneValueHiReg =
5892 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
5893 const TargetRegisterClass *VregSubRC =
5894 TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
5895 MachineOperand Op1L =
5896 TII->buildExtractSubRegOrImm(MI, MRI, SuperReg: DestVregInst->getOperand(i: 0),
5897 SuperRC: VregRC, SubIdx: AMDGPU::sub0, SubRC: VregSubRC);
5898 MachineOperand Op1H =
5899 TII->buildExtractSubRegOrImm(MI, MRI, SuperReg: DestVregInst->getOperand(i: 0),
5900 SuperRC: VregRC, SubIdx: AMDGPU::sub1, SubRC: VregSubRC);
5901 // lane value input should be in an sgpr
5902 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32),
5903 DestReg: LaneValueLoReg)
5904 .add(MO: Op1L);
5905 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32),
5906 DestReg: LaneValueHiReg)
5907 .add(MO: Op1H);
5908 NewAccumulator =
5909 BuildMI(BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: DstReg)
5910 .addReg(RegNo: LaneValueLoReg)
5911 .addImm(Val: AMDGPU::sub0)
5912 .addReg(RegNo: LaneValueHiReg)
5913 .addImm(Val: AMDGPU::sub1);
5914 }
5915 }
5916 }
5917 RetBB = &BB;
5918 }
5919 }
5920 } else {
5921 // TODO: Implement DPP Strategy and switch based on immediate strategy
5922 // operand. For now, for all the cases (default, Iterative and DPP we use
5923 // iterative approach by default.)
5924
5925 // To reduce the VGPR using iterative approach, we need to iterate
5926 // over all the active lanes. Lowering consists of ComputeLoop,
5927 // which iterate over only active lanes. We use copy of EXEC register
5928 // as induction variable and every active lane modifies it using bitset0
5929 // so that we will get the next active lane for next iteration.
5930 MachineBasicBlock::iterator I = BB.end();
5931 Register SrcReg = MI.getOperand(i: 1).getReg();
5932 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5933 bool isFPOp = isFloatingPointWaveReduceOperation(Opc);
5934
5935 // Create Control flow for loop
5936 // Split MI's Machine Basic block into For loop
5937 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, MBB&: BB, InstInLoop: true);
5938
5939 // Create virtual registers required for lowering.
5940 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5941 const TargetRegisterClass *DstRegClass = MRI.getRegClass(Reg: DstReg);
5942 Register LoopIterator = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5943 Register IdentityValReg = MRI.createVirtualRegister(RegClass: DstRegClass);
5944 Register AccumulatorReg = MRI.createVirtualRegister(RegClass: DstRegClass);
5945 Register ActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5946 Register NewActiveBitsReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
5947 Register FF1Reg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
5948 Register LaneValueReg = MRI.createVirtualRegister(RegClass: DstRegClass);
5949
5950 bool IsWave32 = ST.isWave32();
5951 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5952 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5953
5954 // Create initial values of induction variable from Exec, Accumulator and
5955 // insert branch instr to newly created ComputeBlock
5956 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: MovOpcForExec), DestReg: LoopIterator).addReg(RegNo: ExecReg);
5957 if (is32BitOpc) {
5958 uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
5959 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32), DestReg: IdentityValReg)
5960 .addImm(Val: IdentityValue);
5961 } else {
5962 uint64_t IdentityValue =
5963 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
5964 ? 0x0 // +0.0 for double sub reduction
5965 : getIdentityValueFor64BitWaveReduction(Opc);
5966 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B64_IMM_PSEUDO), DestReg: IdentityValReg)
5967 .addImm(Val: IdentityValue);
5968 }
5969 // clang-format off
5970 BuildMI(BB, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_BRANCH))
5971 .addMBB(MBB: ComputeLoop);
5972 // clang-format on
5973
5974 // Start constructing ComputeLoop
5975 I = ComputeLoop->begin();
5976 auto Accumulator =
5977 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::PHI), DestReg: AccumulatorReg)
5978 .addReg(RegNo: IdentityValReg)
5979 .addMBB(MBB: &BB);
5980 auto ActiveBits =
5981 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::PHI), DestReg: ActiveBitsReg)
5982 .addReg(RegNo: LoopIterator)
5983 .addMBB(MBB: &BB);
5984
5985 I = ComputeLoop->end();
5986 MachineInstr *NewAccumulator;
5987 // Perform the computations
5988 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5989 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: SFFOpc), DestReg: FF1Reg)
5990 .addReg(RegNo: ActiveBitsReg);
5991 if (is32BitOpc) {
5992 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32),
5993 DestReg: LaneValueReg)
5994 .addReg(RegNo: SrcReg)
5995 .addReg(RegNo: FF1Reg);
5996 if (isFPOp) {
5997 Register LaneValVreg =
5998 MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: SrcReg));
5999 Register DstVreg = MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: SrcReg));
6000 // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
6001 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32),
6002 DestReg: LaneValVreg)
6003 .addReg(RegNo: LaneValueReg);
6004 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstVreg)
6005 .addImm(Val: 0) // src0 modifier
6006 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg())
6007 .addImm(Val: 0) // src1 modifier
6008 .addReg(RegNo: LaneValVreg)
6009 .addImm(Val: 0) // clamp
6010 .addImm(Val: 0); // omod
6011 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
6012 MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: DstReg)
6013 .addReg(RegNo: DstVreg);
6014 } else {
6015 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstReg)
6016 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg())
6017 .addReg(RegNo: LaneValueReg);
6018 }
6019 } else {
6020 Register LaneValueLoReg =
6021 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6022 Register LaneValueHiReg =
6023 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6024 Register LaneValReg = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_64RegClass);
6025 const TargetRegisterClass *SrcRC = MRI.getRegClass(Reg: SrcReg);
6026 const TargetRegisterClass *SrcSubRC =
6027 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
6028 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
6029 MI, MRI, SuperReg: MI.getOperand(i: 1), SuperRC: SrcRC, SubIdx: AMDGPU::sub0, SubRC: SrcSubRC);
6030 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
6031 MI, MRI, SuperReg: MI.getOperand(i: 1), SuperRC: SrcRC, SubIdx: AMDGPU::sub1, SubRC: SrcSubRC);
6032 // lane value input should be in an sgpr
6033 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32),
6034 DestReg: LaneValueLoReg)
6035 .add(MO: Op1L)
6036 .addReg(RegNo: FF1Reg);
6037 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READLANE_B32),
6038 DestReg: LaneValueHiReg)
6039 .add(MO: Op1H)
6040 .addReg(RegNo: FF1Reg);
6041 auto LaneValue = BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
6042 MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: LaneValReg)
6043 .addReg(RegNo: LaneValueLoReg)
6044 .addImm(Val: AMDGPU::sub0)
6045 .addReg(RegNo: LaneValueHiReg)
6046 .addImm(Val: AMDGPU::sub1);
6047 switch (Opc) {
6048 case AMDGPU::S_OR_B64:
6049 case AMDGPU::S_AND_B64:
6050 case AMDGPU::S_XOR_B64: {
6051 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstReg)
6052 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg())
6053 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg())
6054 .setOperandDead(3); // Dead scc
6055 break;
6056 }
6057 case AMDGPU::V_CMP_GT_I64_e64:
6058 case AMDGPU::V_CMP_GT_U64_e64:
6059 case AMDGPU::V_CMP_LT_I64_e64:
6060 case AMDGPU::V_CMP_LT_U64_e64: {
6061 Register LaneMaskReg = MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
6062 Register ComparisonResultReg =
6063 MRI.createVirtualRegister(RegClass: WaveMaskRegClass);
6064 int SrcIdx =
6065 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src);
6066 const TargetRegisterClass *VregClass =
6067 TRI->getAllocatableClass(RC: TII->getRegClass(MCID: MI.getDesc(), OpNum: SrcIdx));
6068 const TargetRegisterClass *VSubRegClass =
6069 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
6070 Register AccumulatorVReg = MRI.createVirtualRegister(RegClass: VregClass);
6071 MachineOperand SrcReg0Sub0 =
6072 TII->buildExtractSubRegOrImm(MI, MRI, SuperReg: Accumulator->getOperand(i: 0),
6073 SuperRC: VregClass, SubIdx: AMDGPU::sub0, SubRC: VSubRegClass);
6074 MachineOperand SrcReg0Sub1 =
6075 TII->buildExtractSubRegOrImm(MI, MRI, SuperReg: Accumulator->getOperand(i: 0),
6076 SuperRC: VregClass, SubIdx: AMDGPU::sub1, SubRC: VSubRegClass);
6077 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE),
6078 DestReg: AccumulatorVReg)
6079 .add(MO: SrcReg0Sub0)
6080 .addImm(Val: AMDGPU::sub0)
6081 .add(MO: SrcReg0Sub1)
6082 .addImm(Val: AMDGPU::sub1);
6083 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: LaneMaskReg)
6084 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg())
6085 .addReg(RegNo: AccumulatorVReg);
6086
6087 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6088 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AndOpc), DestReg: ComparisonResultReg)
6089 .addReg(RegNo: LaneMaskReg)
6090 .addReg(RegNo: ActiveBitsReg);
6091
6092 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
6093 MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B64), DestReg: DstReg)
6094 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg())
6095 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg());
6096 break;
6097 }
6098 case AMDGPU::V_MIN_F64_e64:
6099 case AMDGPU::V_MIN_NUM_F64_e64:
6100 case AMDGPU::V_MAX_F64_e64:
6101 case AMDGPU::V_MAX_NUM_F64_e64:
6102 case AMDGPU::V_ADD_F64_e64:
6103 case AMDGPU::V_ADD_F64_pseudo_e64: {
6104 int SrcIdx =
6105 AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::src);
6106 const TargetRegisterClass *VregRC =
6107 TRI->getAllocatableClass(RC: TII->getRegClass(MCID: MI.getDesc(), OpNum: SrcIdx));
6108 const TargetRegisterClass *VregSubRC =
6109 TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
6110 Register AccumulatorVReg = MRI.createVirtualRegister(RegClass: VregRC);
6111 Register DstVreg = MRI.createVirtualRegister(RegClass: VregRC);
6112 Register LaneValLo =
6113 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6114 Register LaneValHi =
6115 MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
6116 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: AccumulatorVReg)
6117 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg());
6118 unsigned Modifier =
6119 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6120 ? SISrcMods::NEG
6121 : SISrcMods::NONE;
6122 auto DstVregInst = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstVreg)
6123 .addImm(Val: Modifier) // src0 modifiers
6124 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg())
6125 .addImm(Val: SISrcMods::NONE) // src1 modifiers
6126 .addReg(RegNo: AccumulatorVReg)
6127 .addImm(Val: SISrcMods::NONE) // clamp
6128 .addImm(Val: SISrcMods::NONE); // omod
6129 auto ReadLaneLo =
6130 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32),
6131 DestReg: LaneValLo);
6132 auto ReadLaneHi =
6133 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32),
6134 DestReg: LaneValHi);
6135 MachineBasicBlock::iterator Iters = *ReadLaneLo;
6136 MachineOperand Op1L =
6137 TII->buildExtractSubRegOrImm(MI: Iters, MRI, SuperReg: DstVregInst->getOperand(i: 0),
6138 SuperRC: VregRC, SubIdx: AMDGPU::sub0, SubRC: VregSubRC);
6139 MachineOperand Op1H =
6140 TII->buildExtractSubRegOrImm(MI: Iters, MRI, SuperReg: DstVregInst->getOperand(i: 0),
6141 SuperRC: VregRC, SubIdx: AMDGPU::sub1, SubRC: VregSubRC);
6142 ReadLaneLo.add(MO: Op1L);
6143 ReadLaneHi.add(MO: Op1H);
6144 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL,
6145 MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: DstReg)
6146 .addReg(RegNo: LaneValLo)
6147 .addImm(Val: AMDGPU::sub0)
6148 .addReg(RegNo: LaneValHi)
6149 .addImm(Val: AMDGPU::sub1);
6150 break;
6151 }
6152 case AMDGPU::S_ADD_U64_PSEUDO:
6153 case AMDGPU::S_SUB_U64_PSEUDO: {
6154 NewAccumulator = BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: DstReg)
6155 .addReg(RegNo: Accumulator->getOperand(i: 0).getReg())
6156 .addReg(RegNo: LaneValue->getOperand(i: 0).getReg());
6157 ComputeLoop = Expand64BitScalarArithmetic(MI&: *NewAccumulator, BB: ComputeLoop);
6158 break;
6159 }
6160 }
6161 }
6162 // Manipulate the iterator to get the next active lane
6163 unsigned BITSETOpc =
6164 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
6165 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: BITSETOpc), DestReg: NewActiveBitsReg)
6166 .addReg(RegNo: FF1Reg)
6167 .addReg(RegNo: ActiveBitsReg);
6168
6169 // Add phi nodes
6170 Accumulator.addReg(RegNo: DstReg).addMBB(MBB: ComputeLoop);
6171 ActiveBits.addReg(RegNo: NewActiveBitsReg).addMBB(MBB: ComputeLoop);
6172
6173 // Creating branching
6174 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
6175 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: CMPOpc))
6176 .addReg(RegNo: NewActiveBitsReg)
6177 .addImm(Val: 0);
6178 BuildMI(BB&: *ComputeLoop, I, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
6179 .addMBB(MBB: ComputeLoop);
6180
6181 RetBB = ComputeEnd;
6182 }
6183 MI.eraseFromParent();
6184 return RetBB;
6185}
6186
/// Expand pseudo instructions flagged with "usesCustomInserter" at the
/// MachineInstr level. Returns the basic block into which subsequent
/// instructions should be emitted — this may differ from \p BB when the
/// expansion splits the block (e.g. ENDPGM_TRAP, SIMULATED_TRAP).
MachineBasicBlock *
SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                              MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = MI.getDebugLoc();

  switch (MI.getOpcode()) {
  // Wave-wide reductions: each pseudo is handed to lowerWaveReduce() together
  // with the opcode used to combine two values inside the reduction loop.
  // 32-bit integer ops use scalar ALU opcodes; 64-bit min/max use VALU
  // compares; FP ops use VALU arithmetic. For f64 on GFX12+ the *_NUM_*
  // variants are selected instead of the legacy opcodes.
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MIN_U32);
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_CMP_LT_U64_e64);
  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MIN_I32);
  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_CMP_LT_I64_e64);
  case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_MIN_F32_e64);
  case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(),
                           Opc: ST.getGeneration() >= AMDGPUSubtarget::GFX12
                               ? AMDGPU::V_MIN_NUM_F64_e64
                               : AMDGPU::V_MIN_F64_e64);
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MAX_U32);
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_CMP_GT_U64_e64);
  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_MAX_I32);
  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_CMP_GT_I64_e64);
  case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_MAX_F32_e64);
  case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(),
                           Opc: ST.getGeneration() >= AMDGPUSubtarget::GFX12
                               ? AMDGPU::V_MAX_NUM_F64_e64
                               : AMDGPU::V_MAX_F64_e64);
  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_ADD_I32);
  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_ADD_U64_PSEUDO);
  case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_ADD_F32_e64);
  case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(),
                           Opc: ST.getGeneration() >= AMDGPUSubtarget::GFX12
                               ? AMDGPU::V_ADD_F64_pseudo_e64
                               : AMDGPU::V_ADD_F64_e64);
  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_SUB_I32);
  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_SUB_U64_PSEUDO);
  case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::V_SUB_F32_e64);
  case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
    // There is no S/V_SUB_F64 opcode. Double type subtraction is expanded as
    // fadd + neg, by setting the NEG bit in the instruction.
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(),
                           Opc: ST.getGeneration() >= AMDGPUSubtarget::GFX12
                               ? AMDGPU::V_ADD_F64_pseudo_e64
                               : AMDGPU::V_ADD_F64_e64);
  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_AND_B32);
  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_AND_B64);
  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_OR_B32);
  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_OR_B64);
  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_XOR_B32);
  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
    return lowerWaveReduce(MI, BB&: *BB, ST: *getSubtarget(), Opc: AMDGPU::S_XOR_B64);
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {
    // Scalar add/sub with overflow: the arithmetic instruction sets SCC on
    // carry/borrow, and the following S_CSELECT materializes SCC into the
    // second (boolean) result as an all-ones/zero lane mask.
    MachineOperand &Dest0 = MI.getOperand(i: 0);
    MachineOperand &Dest1 = MI.getOperand(i: 1);
    MachineOperand &Src0 = MI.getOperand(i: 2);
    MachineOperand &Src1 = MI.getOperand(i: 3);

    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       ? AMDGPU::S_ADD_U32
                       : AMDGPU::S_SUB_U32;
    // clang-format off
    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest0.getReg())
        .add(MO: Src0)
        .add(MO: Src1);
    // clang-format on

    // Select width follows the wave size so the mask covers every lane.
    unsigned SelOpc =
        Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: SelOpc), DestReg: Dest1.getReg()).addImm(Val: -1).addImm(Val: 0);

    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {
    // 64-bit scalar add/sub is split into lo/hi 32-bit halves elsewhere.
    return Expand64BitScalarArithmetic(MI, BB);
  }
  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);

    MachineOperand &Dest = MI.getOperand(i: 0);
    MachineOperand &Src0 = MI.getOperand(i: 1);
    MachineOperand &Src1 = MI.getOperand(i: 2);

    // Fast path: subtargets with a native 64-bit VALU add/sub.
    if (ST.hasAddSubU64Insts()) {
      auto I = BuildMI(BB&: *BB, I&: MI, MIMD: DL,
                       MCID: TII->get(Opcode: IsAdd ? AMDGPU::V_ADD_U64_e64
                                        : AMDGPU::V_SUB_U64_e64),
                       DestReg: Dest.getReg())
                   .add(MO: Src0)
                   .add(MO: Src1)
                   .addImm(Val: 0); // clamp
      TII->legalizeOperands(MI&: *I);
      MI.eraseFromParent();
      return BB;
    }

    // Adds can reuse V_LSHL_ADD_U64 with a shift amount of zero.
    if (IsAdd && ST.hasLshlAddU64Inst()) {
      auto Add = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_LSHL_ADD_U64_e64),
                         DestReg: Dest.getReg())
                     .add(MO: Src0)
                     .addImm(Val: 0)
                     .add(MO: Src1);
      TII->legalizeOperands(MI&: *Add);
      MI.eraseFromParent();
      return BB;
    }

    // Generic expansion: lo half with carry-out, hi half consuming the carry,
    // then recombine with REG_SEQUENCE.
    const auto *CarryRC = TRI->getWaveMaskRegClass();

    Register DestSub0 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);

    Register CarryReg = MRI.createVirtualRegister(RegClass: CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(RegClass: CarryRC);

    const TargetRegisterClass *Src0RC = Src0.isReg()
                                            ? MRI.getRegClass(Reg: Src0.getReg())
                                            : &AMDGPU::VReg_64RegClass;
    const TargetRegisterClass *Src1RC = Src1.isReg()
                                            ? MRI.getRegClass(Reg: Src1.getReg())
                                            : &AMDGPU::VReg_64RegClass;

    const TargetRegisterClass *Src0SubRC =
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
    const TargetRegisterClass *Src1SubRC =
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

    MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
    MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);

    MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
    MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);

    unsigned LoOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    MachineInstr *LoHalf = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: LoOpc), DestReg: DestSub0)
                               .addReg(RegNo: CarryReg, Flags: RegState::Define)
                               .add(MO: SrcReg0Sub0)
                               .add(MO: SrcReg1Sub0)
                               .addImm(Val: 0); // clamp bit

    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    MachineInstr *HiHalf =
        BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: HiOpc), DestReg: DestSub1)
            .addReg(RegNo: DeadCarryReg, Flags: RegState::Define | RegState::Dead)
            .add(MO: SrcReg0Sub1)
            .add(MO: SrcReg1Sub1)
            .addReg(RegNo: CarryReg, Flags: RegState::Kill)
            .addImm(Val: 0); // clamp bit

    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::REG_SEQUENCE), DestReg: Dest.getReg())
        .addReg(RegNo: DestSub0)
        .addImm(Val: AMDGPU::sub0)
        .addReg(RegNo: DestSub1)
        .addImm(Val: AMDGPU::sub1);
    TII->legalizeOperands(MI&: *LoHalf);
    TII->legalizeOperands(MI&: *HiHalf);
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {
    // This pseudo has a chance to be selected
    // only from uniform add/subcarry node. All the VGPR operands
    // therefore assumed to be splat vectors.
    MachineBasicBlock::iterator MII = MI;
    MachineOperand &Dest = MI.getOperand(i: 0);
    MachineOperand &CarryDest = MI.getOperand(i: 1);
    MachineOperand &Src0 = MI.getOperand(i: 2);
    MachineOperand &Src1 = MI.getOperand(i: 3);
    MachineOperand &Src2 = MI.getOperand(i: 4);
    // Uniform (splat) VGPR inputs are moved to SGPRs via readfirstlane so the
    // scalar ALU can operate on them.
    if (Src0.isReg() && TRI->isVectorRegister(MRI, Reg: Src0.getReg())) {
      Register RegOp0 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
      BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp0)
          .addReg(RegNo: Src0.getReg());
      Src0.setReg(RegOp0);
    }
    if (Src1.isReg() && TRI->isVectorRegister(MRI, Reg: Src1.getReg())) {
      Register RegOp1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
      BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp1)
          .addReg(RegNo: Src1.getReg());
      Src1.setReg(RegOp1);
    }
    Register RegOp2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32_XM0RegClass);
    if (TRI->isVectorRegister(MRI, Reg: Src2.getReg())) {
      BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_READFIRSTLANE_B32), DestReg: RegOp2)
          .addReg(RegNo: Src2.getReg());
      Src2.setReg(RegOp2);
    }

    // Convert the incoming carry (Src2, a lane mask) into SCC by comparing it
    // against zero; hardware without 64-bit scalar compares ORs the halves.
    if (ST.isWave64()) {
      if (ST.hasScalarCompareEq64()) {
        BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U64))
            .addReg(RegNo: Src2.getReg())
            .addImm(Val: 0);
      } else {
        const TargetRegisterClass *Src2RC = MRI.getRegClass(Reg: Src2.getReg());
        const TargetRegisterClass *SubRC =
            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
        MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
            MI: MII, MRI, SuperReg: Src2, SuperRC: Src2RC, SubIdx: AMDGPU::sub0, SubRC);
        MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
            MI: MII, MRI, SuperReg: Src2, SuperRC: Src2RC, SubIdx: AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);

        BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_OR_B32), DestReg: Src2_32)
            .add(MO: Src2Sub0)
            .add(MO: Src2Sub1);

        BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
            .addReg(RegNo: Src2_32, Flags: RegState::Kill)
            .addImm(Val: 0);
      }
    } else {
      BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_LG_U32))
          .addReg(RegNo: Src2.getReg())
          .addImm(Val: 0);
    }

    // The carry-consuming add/sub reads SCC set above.
    unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;

    BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: Dest.getReg()).add(MO: Src0).add(MO: Src1);

    // Materialize the carry-out (SCC) as an all-ones/zero wave mask.
    unsigned SelOpc =
        ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;

    BuildMI(BB&: *BB, I: MII, MIMD: DL, MCID: TII->get(Opcode: SelOpc), DestReg: CarryDest.getReg())
        .addImm(Val: -1)
        .addImm(Val: 0);

    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::SI_INIT_M0: {
    // Initialize M0 from either a register (COPY) or an immediate (S_MOV_B32).
    MachineOperand &M0Init = MI.getOperand(i: 0);
    BuildMI(BB&: *BB, I: MI.getIterator(), MIMD: MI.getDebugLoc(),
            MCID: TII->get(Opcode: M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
            DestReg: AMDGPU::M0)
        .add(MO: M0Init);
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
    // Set SCC to true, in case the barrier instruction gets converted to a NOP.
    BuildMI(BB&: *BB, I: MI.getIterator(), MIMD: MI.getDebugLoc(),
            MCID: TII->get(Opcode: AMDGPU::S_CMP_EQ_U32))
        .addImm(Val: 0)
        .addImm(Val: 0);
    return BB;
  }
  case AMDGPU::GET_GROUPSTATICSIZE: {
    // Replace with a move of the statically-known LDS size for this function.
    assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
           getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_MOV_B32))
        .add(MO: MI.getOperand(i: 0))
        .addImm(Val: MFI->getLDSSize());
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::GET_SHADERCYCLESHILO: {
    assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
    // The algorithm is:
    //
    // hi1 = getreg(SHADER_CYCLES_HI)
    // lo1 = getreg(SHADER_CYCLES_LO)
    // hi2 = getreg(SHADER_CYCLES_HI)
    //
    // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
    // Otherwise there was overflow and the result is hi2:0. In both cases the
    // result should represent the actual time at some point during the sequence
    // of three getregs.
    using namespace AMDGPU::Hwreg;
    Register RegHi1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegHi1)
        .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES_HI, Values: 0, Values: 32));
    Register RegLo1 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegLo1)
        .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES, Values: 0, Values: 32));
    Register RegHi2 = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_GETREG_B32), DestReg: RegHi2)
        .addImm(Val: HwregEncoding::encode(Values: ID_SHADER_CYCLES_HI, Values: 0, Values: 32));
    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CMP_EQ_U32))
        .addReg(RegNo: RegHi1)
        .addReg(RegNo: RegHi2);
    Register RegLo = MRI.createVirtualRegister(RegClass: &AMDGPU::SReg_32RegClass);
    // Lo result: lo1 if no wraparound was observed, else 0.
    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CSELECT_B32), DestReg: RegLo)
        .addReg(RegNo: RegLo1)
        .addImm(Val: 0);
    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE))
        .add(MO: MI.getOperand(i: 0))
        .addReg(RegNo: RegLo)
        .addImm(Val: AMDGPU::sub0)
        .addReg(RegNo: RegHi2)
        .addImm(Val: AMDGPU::sub1);
    MI.eraseFromParent();
    return BB;
  }
  // Variable-index vector element reads, one pseudo per vector width.
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V3:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V5:
  case AMDGPU::SI_INDIRECT_SRC_V6:
  case AMDGPU::SI_INDIRECT_SRC_V7:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:
    return emitIndirectSrc(MI, MBB&: *BB, ST: *getSubtarget());
  // Variable-index vector element writes.
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V3:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V5:
  case AMDGPU::SI_INDIRECT_DST_V6:
  case AMDGPU::SI_INDIRECT_DST_V7:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:
    return emitIndirectDst(MI, MBB&: *BB, ST: *getSubtarget());
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:
    return splitKillBlock(MI, BB);
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    // 64-bit select: split into two 32-bit V_CNDMASK_B32s over the lo/hi
    // halves, both driven by the same condition mask, then recombine.
    Register Dst = MI.getOperand(i: 0).getReg();
    const MachineOperand &Src0 = MI.getOperand(i: 1);
    const MachineOperand &Src1 = MI.getOperand(i: 2);
    Register SrcCond = MI.getOperand(i: 3).getReg();

    Register DstLo = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getWaveMaskRegClass();
    Register SrcCondCopy = MRI.createVirtualRegister(RegClass: CondRC);

    const TargetRegisterClass *Src0RC = Src0.isReg()
                                            ? MRI.getRegClass(Reg: Src0.getReg())
                                            : &AMDGPU::VReg_64RegClass;
    const TargetRegisterClass *Src1RC = Src1.isReg()
                                            ? MRI.getRegClass(Reg: Src1.getReg())
                                            : &AMDGPU::VReg_64RegClass;

    const TargetRegisterClass *Src0SubRC =
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
    const TargetRegisterClass *Src1SubRC =
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub0, SubRC: Src0SubRC);
    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
        MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub0, SubRC: Src1SubRC);

    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, SuperReg: Src0, SuperRC: Src0RC, SubIdx: AMDGPU::sub1, SubRC: Src0SubRC);
    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
        MI, MRI, SuperReg: Src1, SuperRC: Src1RC, SubIdx: AMDGPU::sub1, SubRC: Src1SubRC);

    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::COPY), DestReg: SrcCondCopy).addReg(RegNo: SrcCond);
    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstLo)
        .addImm(Val: 0)
        .add(MO: Src0Sub0)
        .addImm(Val: 0)
        .add(MO: Src1Sub0)
        .addReg(RegNo: SrcCondCopy);
    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_CNDMASK_B32_e64), DestReg: DstHi)
        .addImm(Val: 0)
        .add(MO: Src0Sub1)
        .addImm(Val: 0)
        .add(MO: Src1Sub1)
        .addReg(RegNo: SrcCondCopy);

    BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::REG_SEQUENCE), DestReg: Dst)
        .addReg(RegNo: DstLo)
        .addImm(Val: AMDGPU::sub0)
        .addReg(RegNo: DstHi)
        .addImm(Val: AMDGPU::sub1);
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::SI_BR_UNDEF: {
    MachineInstr *Br = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_SCC1))
                           .add(MO: MI.getOperand(i: 0));
    Br->getOperand(i: 1).setIsUndef(); // read undef SCC
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {
    // Attach implicit def/use of the stack pointer so later passes see the
    // dependency; the pseudo itself is kept in place.
    const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, &MI);
    MIB.addReg(RegNo: Info->getStackPtrOffsetReg(), Flags: RegState::ImplicitDefine)
        .addReg(RegNo: Info->getStackPtrOffsetReg(), Flags: RegState::Implicit);
    return BB;
  }
  case AMDGPU::SI_CALL_ISEL: {
    // Rewrite SI_CALL_ISEL into SI_CALL, adding the return-address register
    // as the explicit def and cloning the remaining operands and memrefs.
    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(MF: *MF);

    MachineInstrBuilder MIB;
    MIB = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::SI_CALL), DestReg: ReturnAddrReg);

    for (const MachineOperand &MO : MI.operands())
      MIB.add(MO);

    MIB.cloneMemRefs(OtherMI: MI);
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {
    // TODO: Define distinct V_*_I32_Pseudo instructions instead.
    unsigned Opc = MI.getOpcode();

    // If the e32 form has no valid MC encoding on this subtarget, switch to
    // the VOP3 (e64) form, which requires an explicit clamp operand.
    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opcode: Opc) == -1) {
      Opc = AMDGPU::getVOPe64(Opcode: Opc);
      NeedClampOperand = true;
    }

    auto I = BuildMI(BB&: *BB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: MI.getOperand(i: 0).getReg());
    if (TII->isVOP3(MI: *I)) {
      I.addReg(RegNo: TRI->getVCC(), Flags: RegState::Define);
    }
    I.add(MO: MI.getOperand(i: 1)).add(MO: MI.getOperand(i: 2));
    if (NeedClampOperand)
      I.addImm(Val: 0); // clamp bit for e64 encoding

    TII->legalizeOperands(MI&: *I);

    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:
    // These instructions have an implicit use of vcc which counts towards the
    // constant bus limit.
    TII->legalizeOperands(MI);
    return BB;
  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
    // A s_waitcnt 0 is required to be the instruction immediately following.
    if (getSubtarget()->hasGWSAutoReplay()) {
      bundleInstWithWaitcnt(MI);
      return BB;
    }

    return emitGWSMemViolTestLoop(MI, BB);
  case AMDGPU::S_SETREG_B32: {
    // Try to optimize cases that only set the denormal mode or rounding mode.
    //
    // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
    // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
    // instead.
    //
    // FIXME: This could be predicates on the immediate, but tablegen doesn't
    // allow you to have a no side effect instruction in the output of a
    // sideeffecting pattern.
    auto [ID, Offset, Width] =
        AMDGPU::Hwreg::HwregEncoding::decode(Encoded: MI.getOperand(i: 1).getImm());
    if (ID != AMDGPU::Hwreg::ID_MODE)
      return BB;

    const unsigned WidthMask = maskTrailingOnes<unsigned>(N: Width);
    const unsigned SetMask = WidthMask << Offset;

    if (getSubtarget()->hasDenormModeInst()) {
      unsigned SetDenormOp = 0;
      unsigned SetRoundOp = 0;

      // The dedicated instructions can only set the whole denorm or round mode
      // at once, not a subset of bits in either.
      if (SetMask ==
          (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
        // If this fully sets both the round and denorm mode, emit the two
        // dedicated instructions for these.
        SetRoundOp = AMDGPU::S_ROUND_MODE;
        SetDenormOp = AMDGPU::S_DENORM_MODE;
      } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
        SetRoundOp = AMDGPU::S_ROUND_MODE;
      } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
        SetDenormOp = AMDGPU::S_DENORM_MODE;
      }

      if (SetRoundOp || SetDenormOp) {
        // This only works if the value being written is a known constant.
        MachineInstr *Def = MRI.getVRegDef(Reg: MI.getOperand(i: 0).getReg());
        if (Def && Def->isMoveImmediate() && Def->getOperand(i: 1).isImm()) {
          unsigned ImmVal = Def->getOperand(i: 1).getImm();
          if (SetRoundOp) {
            BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: SetRoundOp))
                .addImm(Val: ImmVal & 0xf);

            // If we also have the denorm mode, get just the denorm mode bits.
            ImmVal >>= 4;
          }

          if (SetDenormOp) {
            BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: SetDenormOp))
                .addImm(Val: ImmVal & 0xf);
          }

          MI.eraseFromParent();
          return BB;
        }
      }
    }

    // If only FP bits are touched, used the no side effects pseudo.
    if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
                    AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
      MI.setDesc(TII->get(Opcode: AMDGPU::S_SETREG_B32_mode));

    return BB;
  }
  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64:
    // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
    // necessary. After that they are equivalent to a COPY.
    MI.setDesc(TII->get(Opcode: AMDGPU::COPY));
    return BB;
  case AMDGPU::ENDPGM_TRAP: {
    // If this is already the last instruction of a block with no successors,
    // it can simply become an S_ENDPGM in place.
    if (BB->succ_empty() && std::next(x: MI.getIterator()) == BB->end()) {
      MI.setDesc(TII->get(Opcode: AMDGPU::S_ENDPGM));
      MI.addOperand(Op: MachineOperand::CreateImm(Val: 0));
      return BB;
    }

    // We need a block split to make the real endpgm a terminator. We also don't
    // want to break phis in successor blocks, so we can't just delete to the
    // end of the block.

    MachineBasicBlock *SplitBB = BB->splitAt(SplitInst&: MI, UpdateLiveIns: false /*UpdateLiveIns*/);
    MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
    MF->push_back(MBB: TrapBB);
    // clang-format off
    BuildMI(BB&: *TrapBB, I: TrapBB->end(), MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_ENDPGM))
      .addImm(Val: 0);
    BuildMI(BB&: *BB, I: &MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::S_CBRANCH_EXECNZ))
      .addMBB(MBB: TrapBB);
    // clang-format on

    BB->addSuccessor(Succ: TrapBB);
    MI.eraseFromParent();
    return SplitBB;
  }
  case AMDGPU::SIMULATED_TRAP: {
    assert(Subtarget->hasPrivEnabledTrap2NopBug());
    MachineBasicBlock *SplitBB =
        TII->insertSimulatedTrap(MRI, MBB&: *BB, MI, DL: MI.getDebugLoc());
    MI.eraseFromParent();
    return SplitBB;
  }
  case AMDGPU::SI_TCRETURN_GFX_WholeWave:
  case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
    assert(MFI->isWholeWaveFunction());

    // During ISel, it's difficult to propagate the original EXEC mask to use as
    // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
    MachineInstr *Setup = TII->getWholeWaveFunctionSetup(MF&: *BB->getParent());
    assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
    Register OriginalExec = Setup->getOperand(i: 0).getReg();
    MF->getRegInfo().clearKillFlags(Reg: OriginalExec);
    MI.getOperand(i: 0).setReg(OriginalExec);
    return BB;
  }
  default:
    // Image and buffer loads may need their result registers pre-initialized
    // (e.g. for partially-covered TFE/LWE results); stores do not.
    if (TII->isImage(MI) || TII->isMUBUF(MI)) {
      if (!MI.mayStore())
        AddMemOpInit(MI);
      return BB;
    }
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, MBB: BB);
  }
}
6807
6808bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
6809 // This currently forces unfolding various combinations of fsub into fma with
6810 // free fneg'd operands. As long as we have fast FMA (controlled by
6811 // isFMAFasterThanFMulAndFAdd), we should perform these.
6812
6813 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6814 // most of these combines appear to be cycle neutral but save on instruction
6815 // count / code size.
6816 return true;
6817}
6818
6819bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
6820
6821EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
6822 EVT VT) const {
6823 if (!VT.isVector()) {
6824 return MVT::i1;
6825 }
6826 return EVT::getVectorVT(Context&: Ctx, VT: MVT::i1, NumElements: VT.getVectorNumElements());
6827}
6828
6829MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
6830 // TODO: Should i16 be used always if legal? For now it would force VALU
6831 // shifts.
6832 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6833}
6834
6835LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
6836 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6837 ? Ty.changeElementSize(NewEltSize: 16)
6838 : Ty.changeElementSize(NewEltSize: 32);
6839}
6840
6841// Answering this is somewhat tricky and depends on the specific device which
6842// have different rates for fma or all f64 operations.
6843//
6844// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6845// regardless of which device (although the number of cycles differs between
6846// devices), so it is always profitable for f64.
6847//
6848// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6849// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6850// which we can always do even without fused FP ops since it returns the same
6851// result as the separate operations and since it is always full
6852// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6853// however does not support denormals, so we do report fma as faster if we have
6854// a fast fma device and require denormals.
6855//
6856bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6857 EVT VT) const {
6858 VT = VT.getScalarType();
6859
6860 switch (VT.getSimpleVT().SimpleTy) {
6861 case MVT::f32: {
6862 // If mad is not available this depends only on if f32 fma is full rate.
6863 if (!Subtarget->hasMadMacF32Insts())
6864 return Subtarget->hasFastFMAF32();
6865
6866 // Otherwise f32 mad is always full rate and returns the same result as
6867 // the separate operations so should be preferred over fma.
6868 // However does not support denormals.
6869 if (!denormalModeIsFlushAllF32(MF))
6870 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6871
6872 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6873 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6874 }
6875 case MVT::f64:
6876 return true;
6877 case MVT::f16:
6878 case MVT::bf16:
6879 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6880 default:
6881 break;
6882 }
6883
6884 return false;
6885}
6886
6887bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6888 LLT Ty) const {
6889 switch (Ty.getScalarSizeInBits()) {
6890 case 16:
6891 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f16);
6892 case 32:
6893 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f32);
6894 case 64:
6895 return isFMAFasterThanFMulAndFAdd(MF, VT: MVT::f64);
6896 default:
6897 break;
6898 }
6899
6900 return false;
6901}
6902
6903bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
6904 if (!Ty.isScalar())
6905 return false;
6906
6907 if (Ty.getScalarSizeInBits() == 16)
6908 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(MF: *MI.getMF());
6909 if (Ty.getScalarSizeInBits() == 32)
6910 return Subtarget->hasMadMacF32Insts() &&
6911 denormalModeIsFlushAllF32(MF: *MI.getMF());
6912
6913 return false;
6914}
6915
6916bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
6917 const SDNode *N) const {
6918 // TODO: Check future ftz flag
6919 // v_mad_f32/v_mac_f32 do not support denormals.
6920 EVT VT = N->getValueType(ResNo: 0);
6921 if (VT == MVT::f32)
6922 return Subtarget->hasMadMacF32Insts() &&
6923 denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
6924 if (VT == MVT::f16) {
6925 return Subtarget->hasMadF16() &&
6926 denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction());
6927 }
6928
6929 return false;
6930}
6931
6932//===----------------------------------------------------------------------===//
6933// Custom DAG Lowering Operations
6934//===----------------------------------------------------------------------===//
6935
6936// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6937// wider vector type is legal.
6938SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
6939 SelectionDAG &DAG) const {
6940 unsigned Opc = Op.getOpcode();
6941 EVT VT = Op.getValueType();
6942 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6943 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6944 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6945 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6946 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6947 VT == MVT::v32bf16);
6948
6949 auto [Lo, Hi] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
6950
6951 SDLoc SL(Op);
6952 SDValue OpLo = DAG.getNode(Opcode: Opc, DL: SL, VT: Lo.getValueType(), Operand: Lo, Flags: Op->getFlags());
6953 SDValue OpHi = DAG.getNode(Opcode: Opc, DL: SL, VT: Hi.getValueType(), Operand: Hi, Flags: Op->getFlags());
6954
6955 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
6956}
6957
6958// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6959// regression whereby extra unnecessary instructions were added to codegen
6960// for rotr operations, casued by legalising v2i32 or. This resulted in extra
6961// instructions to extract the result from the vector.
6962SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
6963 [[maybe_unused]] EVT VT = Op.getValueType();
6964
6965 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6966 VT == MVT::v16i32) &&
6967 "Unexpected ValueType.");
6968
6969 return DAG.UnrollVectorOp(N: Op.getNode());
6970}
6971
6972// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6973// wider vector type is legal.
6974SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
6975 SelectionDAG &DAG) const {
6976 unsigned Opc = Op.getOpcode();
6977 EVT VT = Op.getValueType();
6978 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6979 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6980 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6981 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6982 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6983 VT == MVT::v32bf16);
6984
6985 auto [Lo0, Hi0] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
6986 auto [Lo1, Hi1] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 1);
6987
6988 SDLoc SL(Op);
6989
6990 SDValue OpLo =
6991 DAG.getNode(Opcode: Opc, DL: SL, VT: Lo0.getValueType(), N1: Lo0, N2: Lo1, Flags: Op->getFlags());
6992 SDValue OpHi =
6993 DAG.getNode(Opcode: Opc, DL: SL, VT: Hi0.getValueType(), N1: Hi0, N2: Hi1, Flags: Op->getFlags());
6994
6995 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
6996}
6997
6998SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6999 SelectionDAG &DAG) const {
7000 unsigned Opc = Op.getOpcode();
7001 EVT VT = Op.getValueType();
7002 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7003 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
7004 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7005 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
7006 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
7007 VT == MVT::v32bf16);
7008
7009 SDValue Op0 = Op.getOperand(i: 0);
7010 auto [Lo0, Hi0] = Op0.getValueType().isVector()
7011 ? DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0)
7012 : std::pair(Op0, Op0);
7013
7014 auto [Lo1, Hi1] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 1);
7015 auto [Lo2, Hi2] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 2);
7016
7017 SDLoc SL(Op);
7018 auto ResVT = DAG.GetSplitDestVTs(VT);
7019
7020 SDValue OpLo =
7021 DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.first, N1: Lo0, N2: Lo1, N3: Lo2, Flags: Op->getFlags());
7022 SDValue OpHi =
7023 DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT.second, N1: Hi0, N2: Hi1, N3: Hi2, Flags: Op->getFlags());
7024
7025 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT, N1: OpLo, N2: OpHi);
7026}
7027
// Central custom-lowering dispatch: route each operation marked Custom to its
// dedicated lowering helper, split wide vector ops into halves, and fall back
// to the AMDGPU-common implementation for everything else.
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::BRCOND:
    return LowerBRCOND(Op, DAG);
  case ISD::RETURNADDR:
    return LowerRETURNADDR(Op, DAG);
  case ISD::SPONENTRY:
    return LowerSPONENTRY(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Result;
  }
  case ISD::FSQRT: {
    // Only f32/f64 sqrt get custom expansions; other types are left alone.
    EVT VT = Op.getValueType();
    if (VT == MVT::f32)
      return lowerFSQRTF32(Op, DAG);
    if (VT == MVT::f64)
      return lowerFSQRTF64(Op, DAG);
    return SDValue();
  }
  case ISD::FSIN:
  case ISD::FCOS:
    return LowerTrig(Op, DAG);
  case ISD::SELECT:
    return LowerSELECT(Op, DAG);
  case ISD::FDIV:
    return LowerFDIV(Op, DAG);
  case ISD::FFREXP:
    return LowerFFREXP(Op, DAG);
  case ISD::ATOMIC_CMP_SWAP:
    return LowerATOMIC_CMP_SWAP(Op, DAG);
  case ISD::STORE:
    return LowerSTORE(Op, DAG);
  case ISD::GlobalAddress: {
    MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    return LowerGlobalAddress(MFI, Op, DAG);
  }
  case ISD::ExternalSymbol:
    return LowerExternalSymbol(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN:
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN:
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
  case ISD::INTRINSIC_VOID:
    return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::ADDRSPACECAST:
    return lowerADDRSPACECAST(Op, DAG);
  case ISD::INSERT_SUBVECTOR:
    return lowerINSERT_SUBVECTOR(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:
    return lowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT:
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::VECTOR_SHUFFLE:
    return lowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:
    return lowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::BUILD_VECTOR:
    return lowerBUILD_VECTOR(Op, DAG);
  case ISD::FP_ROUND:
  case ISD::STRICT_FP_ROUND:
    return lowerFP_ROUND(Op, DAG);
  case ISD::TRAP:
    return lowerTRAP(Op, DAG);
  case ISD::DEBUGTRAP:
    return lowerDEBUGTRAP(Op, DAG);
  // Unary vector ops that are only Custom for wide vectors: split in half.
  case ISD::ABS:
  case ISD::FABS:
  case ISD::FNEG:
  case ISD::FCANONICALIZE:
  case ISD::BSWAP:
    return splitUnaryVectorOp(Op, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
    return lowerFMINNUM_FMAXNUM(Op, DAG);
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
    return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
    return lowerFMINIMUM_FMAXIMUM(Op, DAG);
  case ISD::FLDEXP:
  case ISD::STRICT_FLDEXP:
    return lowerFLDEXP(Op, DAG);
  case ISD::FMA:
    return splitTernaryVectorOp(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&
        Op.getValueType() == MVT::i16 &&
        Op.getOperand(i: 0).getValueType() == MVT::f32) {
      // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
      return Op;
    }
    return LowerFP_TO_INT(Op, DAG);
  // Binary vector ops that are only Custom for wide vectors: split in half.
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:
  case ISD::ADD:
  case ISD::SUB:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:
  case ISD::FADD:
  case ISD::FMUL:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::UADDSAT:
  case ISD::USUBSAT:
  case ISD::SADDSAT:
  case ISD::SSUBSAT:
    return splitBinaryVectorOp(Op, DAG);
  case ISD::FCOPYSIGN:
    return lowerFCOPYSIGN(Op, DAG);
  case ISD::MUL:
    return lowerMUL(Op, DAG);
  case ISD::SMULO:
  case ISD::UMULO:
    return lowerXMULO(Op, DAG);
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return lowerXMUL_LOHI(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
    return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::STACKSAVE:
    return LowerSTACKSAVE(Op, DAG);
  case ISD::GET_ROUNDING:
    return lowerGET_ROUNDING(Op, DAG);
  case ISD::SET_ROUNDING:
    return lowerSET_ROUNDING(Op, DAG);
  case ISD::PREFETCH:
    return lowerPREFETCH(Op, DAG);
  case ISD::FP_EXTEND:
  case ISD::STRICT_FP_EXTEND:
    return lowerFP_EXTEND(Op, DAG);
  case ISD::GET_FPENV:
    return lowerGET_FPENV(Op, DAG);
  case ISD::SET_FPENV:
    return lowerSET_FPENV(Op, DAG);
  case ISD::ROTR:
    return lowerROTR(Op, DAG);
  }
  // Not reached: every case of the switch above returns.
  return SDValue();
}
7178
// Used for D16: Casts the result of an instruction into the right vector,
// packs values if loads return unpacked values.
//
// \p Result is the raw load result, \p LoadVT the type the caller originally
// requested. \p Unpacked indicates the subtarget returns one 32-bit register
// per 16-bit element instead of packed pairs.
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
                                       const SDLoc &DL, SelectionDAG &DAG,
                                       bool Unpacked) {
  // Scalar results need no repacking.
  if (!LoadVT.isVector())
    return Result;

  // Cast back to the original packed type or to a larger type that is a
  // multiple of 32 bit for D16. Widening the return type is required for
  // legalization.
  EVT FittingLoadVT = LoadVT;
  if ((LoadVT.getVectorNumElements() % 2) == 1) {
    FittingLoadVT =
        EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(),
                         NumElements: LoadVT.getVectorNumElements() + 1);
  }

  if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
    // Truncate to v2i16/v4i16.
    EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();

    // Workaround legalizer not scalarizing truncate after vector op
    // legalization but not creating intermediate vector trunc.
    SmallVector<SDValue, 4> Elts;
    DAG.ExtractVectorElements(Op: Result, Args&: Elts);
    for (SDValue &Elt : Elts)
      Elt = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Elt);

    // Pad illegal v1i16/v3f16 to v4i16
    if ((LoadVT.getVectorNumElements() % 2) == 1)
      Elts.push_back(Elt: DAG.getPOISON(VT: MVT::i16));

    Result = DAG.getBuildVector(VT: IntLoadVT, DL, Ops: Elts);

    // Bitcast to original type (v2f16/v4f16).
    return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result);
  }

  // Cast back to the original packed type.
  return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: FittingLoadVT, Operand: Result);
}
7221
// Re-emit the memory node \p M with a register type that is legal for D16
// loads (widened odd-element vectors, or i32-per-element on unpacked-D16
// subtargets), then repack the result back to M's original value type.
// Returns merged {adjusted value, chain}.
SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
                                              SelectionDAG &DAG,
                                              ArrayRef<SDValue> Ops,
                                              bool IsIntrinsic) const {
  SDLoc DL(M);

  bool Unpacked = Subtarget->hasUnpackedD16VMem();
  EVT LoadVT = M->getValueType(ResNo: 0);

  EVT EquivLoadVT = LoadVT;
  if (LoadVT.isVector()) {
    if (Unpacked) {
      // Unpacked D16: each 16-bit element occupies a full 32-bit register.
      EquivLoadVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32,
                                     NumElements: LoadVT.getVectorNumElements());
    } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
      // Widen v3f16 to legal type
      EquivLoadVT =
          EVT::getVectorVT(Context&: *DAG.getContext(), VT: LoadVT.getVectorElementType(),
                           NumElements: LoadVT.getVectorNumElements() + 1);
    }
  }

  // Change from v4f16/v2f16 to EquivLoadVT.
  SDVTList VTList = DAG.getVTList(VT1: EquivLoadVT, VT2: MVT::Other);

  SDValue Load = DAG.getMemIntrinsicNode(
      Opcode: IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, dl: DL, VTList, Ops,
      MemVT: M->getMemoryVT(), MMO: M->getMemOperand());

  // Repack/bitcast the widened result back to the requested type.
  SDValue Adjusted = adjustLoadValueTypeImpl(Result: Load, LoadVT, DL, DAG, Unpacked);

  return DAG.getMergeValues(Ops: {Adjusted, Load.getValue(R: 1)}, dl: DL);
}
7255
7256SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
7257 SelectionDAG &DAG,
7258 ArrayRef<SDValue> Ops) const {
7259 SDLoc DL(M);
7260 EVT LoadVT = M->getValueType(ResNo: 0);
7261 EVT EltType = LoadVT.getScalarType();
7262 EVT IntVT = LoadVT.changeTypeToInteger();
7263
7264 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7265
7266 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7267 bool IsTFE = M->getNumValues() == 3;
7268
7269 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7270 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7271 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7272 : AMDGPUISD::BUFFER_LOAD;
7273
7274 if (IsD16) {
7275 return adjustLoadValueType(Opcode: AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
7276 }
7277
7278 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7279 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
7280 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, MMO: M->getMemOperand(),
7281 IsTFE);
7282
7283 if (isTypeLegal(VT: LoadVT)) {
7284 return getMemIntrinsicNode(Opcode: Opc, DL, VTList: M->getVTList(), Ops, MemVT: IntVT,
7285 MMO: M->getMemOperand(), DAG);
7286 }
7287
7288 EVT CastVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: LoadVT);
7289 SDVTList VTList = DAG.getVTList(VT1: CastVT, VT2: MVT::Other);
7290 SDValue MemNode = getMemIntrinsicNode(Opcode: Opc, DL, VTList, Ops, MemVT: CastVT,
7291 MMO: M->getMemOperand(), DAG);
7292 return DAG.getMergeValues(
7293 Ops: {DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: MemNode), MemNode.getValue(R: 1)},
7294 dl: DL);
7295}
7296
// Lower llvm.amdgcn.icmp to AMDGPUISD::SETCC, which produces a wavefront-wide
// lane mask. Returns poison for non-integer predicates, and zero-extends or
// truncates the mask when the requested result width differs from the
// wavefront size.
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
                                  SelectionDAG &DAG) {
  EVT VT = N->getValueType(ResNo: 0);
  unsigned CondCode = N->getConstantOperandVal(Num: 3);
  // Reject predicates outside the integer-compare range.
  if (!ICmpInst::isIntPredicate(P: static_cast<ICmpInst::Predicate>(CondCode)))
    return DAG.getPOISON(VT);

  ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);

  SDValue LHS = N->getOperand(Num: 1);
  SDValue RHS = N->getOperand(Num: 2);

  SDLoc DL(N);

  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(VT: MVT::i16)) {
    // Promote illegal i16 compares to i32, extending per signedness of the
    // predicate so the comparison result is unchanged.
    unsigned PromoteOp =
        ICmpInst::isSigned(Pred: IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    LHS = DAG.getNode(Opcode: PromoteOp, DL, VT: MVT::i32, Operand: LHS);
    RHS = DAG.getNode(Opcode: PromoteOp, DL, VT: MVT::i32, Operand: RHS);
  }

  ISD::CondCode CCOpcode = getICmpCondCode(Pred: IcInput);

  // The mask result is one bit per lane, i.e. wavefront-size wide.
  unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
  EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize);

  SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL, VT: CCVT, N1: LHS, N2: RHS,
                              N3: DAG.getCondCode(Cond: CCOpcode));
  if (VT.bitsEq(VT: CCVT))
    return SetCC;
  return DAG.getZExtOrTrunc(Op: SetCC, DL, VT);
}
7330
// Lower llvm.amdgcn.fcmp to AMDGPUISD::SETCC, producing a wavefront-wide lane
// mask. Mirrors lowerICMPIntrinsic but for floating-point predicates:
// non-FP predicates yield poison, and illegal f16 operands are extended to
// f32 before comparing.
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N,
                                  SelectionDAG &DAG) {
  EVT VT = N->getValueType(ResNo: 0);

  unsigned CondCode = N->getConstantOperandVal(Num: 3);
  if (!FCmpInst::isFPPredicate(P: static_cast<FCmpInst::Predicate>(CondCode)))
    return DAG.getPOISON(VT);

  SDValue Src0 = N->getOperand(Num: 1);
  SDValue Src1 = N->getOperand(Num: 2);
  EVT CmpVT = Src0.getValueType();
  SDLoc SL(N);

  // Promote illegal f16 compares to f32; fpext is exact, so the comparison
  // result is unchanged.
  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(VT: CmpVT)) {
    Src0 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src0);
    Src1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Src1);
  }

  FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
  ISD::CondCode CCOpcode = getFCmpCondCode(Pred: IcInput);
  // The mask result is one bit per lane, i.e. wavefront-size wide.
  unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
  EVT CCVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: WavefrontSize);
  SDValue SetCC = DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT: CCVT, N1: Src0, N2: Src1,
                              N3: DAG.getCondCode(Cond: CCOpcode));
  if (VT.bitsEq(VT: CCVT))
    return SetCC;
  return DAG.getZExtOrTrunc(Op: SetCC, DL: SL, VT);
}
7359
// Lower llvm.amdgcn.ballot: collect a per-lane i1 condition into a scalar
// lane-mask value. Folds directly through setcc sources, constant-folds
// ballot(0)/ballot(1), and otherwise compares (zext src) != 0.
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
                                    SelectionDAG &DAG) {
  EVT VT = N->getValueType(ResNo: 0);
  SDValue Src = N->getOperand(Num: 1);
  SDLoc SL(N);

  if (Src.getOpcode() == ISD::SETCC) {
    SDValue Op0 = Src.getOperand(i: 0);
    SDValue Op1 = Src.getOperand(i: 1);
    // Need to expand bfloat to float for comparison (setcc).
    if (Op0.getValueType() == MVT::bf16) {
      Op0 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Op0);
      Op1 = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Op1);
    }
    // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
    return DAG.getNode(Opcode: AMDGPUISD::SETCC, DL: SL, VT, N1: Op0, N2: Op1, N3: Src.getOperand(i: 2));
  }
  if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Val&: Src)) {
    // (ballot 0) -> 0
    if (Arg->isZero())
      return DAG.getConstant(Val: 0, DL: SL, VT);

    // (ballot 1) -> EXEC/EXEC_LO
    if (Arg->isOne()) {
      Register Exec;
      // Pick the exec register matching the requested mask width.
      if (VT.getScalarSizeInBits() == 32)
        Exec = AMDGPU::EXEC_LO;
      else if (VT.getScalarSizeInBits() == 64)
        Exec = AMDGPU::EXEC;
      else
        return SDValue();

      return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: Exec, VT);
    }
  }

  // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
  // ISD::SETNE)
  return DAG.getNode(
      Opcode: AMDGPUISD::SETCC, DL: SL, VT, N1: DAG.getZExtOrTrunc(Op: Src, DL: SL, VT: MVT::i32),
      N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), N3: DAG.getCondCode(Cond: ISD::SETNE));
}
7402
// Legalize cross-lane intrinsics (readlane, writelane, permlane, update.dpp,
// set.inactive, ...) whose value type is not natively supported: sub-dword
// values are widened to i32, and wider values are split into 32-bit (or, for
// DP-ALU DPP, 64-bit) pieces that are each processed by the native intrinsic
// and then reassembled. Returns SDValue() when the type is already legal.
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
                           SelectionDAG &DAG) {
  EVT VT = N->getValueType(ResNo: 0);
  unsigned ValSize = VT.getSizeInBits();
  unsigned IID = N->getConstantOperandVal(Num: 0);
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
  SDLoc SL(N);
  MVT IntVT = MVT::getIntegerVT(BitWidth: ValSize);
  const GCNSubtarget *ST = TLI.getSubtarget();
  // Default to 32-bit pieces; update_dpp can operate on 64-bit pieces when
  // the subtarget's DP ALU supports the requested DPP control.
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
      ST->hasDPALU_DPP() &&
      AMDGPU::isLegalDPALU_DPPControl(ST: *ST, DC: N->getConstantOperandVal(Num: 3)))
    SplitSize = 64;

  // Rebuild the intrinsic with the given sources at type ValT. Operands are
  // pushed in reverse (the fallthroughs accumulate trailing operands first)
  // and then reversed into the correct order.
  auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
                                          SDValue Src2, MVT ValT) -> SDValue {
    SmallVector<SDValue, 8> Operands;
    switch (IID) {
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16:
    case Intrinsic::amdgcn_update_dpp:
      Operands.push_back(Elt: N->getOperand(Num: 6));
      Operands.push_back(Elt: N->getOperand(Num: 5));
      Operands.push_back(Elt: N->getOperand(Num: 4));
      [[fallthrough]];
    case Intrinsic::amdgcn_writelane:
      Operands.push_back(Elt: Src2);
      [[fallthrough]];
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
    case Intrinsic::amdgcn_mov_dpp8:
      Operands.push_back(Elt: Src1);
      [[fallthrough]];
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      Operands.push_back(Elt: Src0);
      break;
    default:
      llvm_unreachable("unhandled lane op");
    }

    Operands.push_back(Elt: DAG.getTargetConstant(Val: IID, DL: SL, VT: MVT::i32));
    std::reverse(first: Operands.begin(), last: Operands.end());

    // Preserve any convergence-control token glued to the original node.
    if (SDNode *GL = N->getGluedNode()) {
      assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
      GL = GL->getOperand(Num: 0).getNode();
      Operands.push_back(Elt: DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL: SL, VT: MVT::Glue,
                                      Operand: SDValue(GL, 0)));
    }

    return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: ValT, Ops: Operands);
  };

  // Gather the value sources used by this particular intrinsic.
  SDValue Src0 = N->getOperand(Num: 1);
  SDValue Src1, Src2;
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_mov_dpp8 ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = N->getOperand(Num: 2);
    if (IID == Intrinsic::amdgcn_writelane ||
        IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
      Src2 = N->getOperand(Num: 3);
  }

  if (ValSize == SplitSize) {
    // Already legal
    return SDValue();
  }

  if (ValSize < 32) {
    // Sub-dword value: bitcast floats to integer, widen to i32, run the op,
    // then narrow and bitcast back.
    bool IsFloat = VT.isFloatingPoint();
    Src0 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src0) : Src0,
                                DL: SL, VT: MVT::i32);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
      Src1 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src1) : Src1,
                                  DL: SL, VT: MVT::i32);
    }

    if (IID == Intrinsic::amdgcn_writelane) {
      Src2 = DAG.getAnyExtOrTrunc(Op: IsFloat ? DAG.getBitcast(VT: IntVT, V: Src2) : Src2,
                                  DL: SL, VT: MVT::i32);
    }

    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
    SDValue Trunc = DAG.getAnyExtOrTrunc(Op: LaneOp, DL: SL, VT: IntVT);
    return IsFloat ? DAG.getBitcast(VT, V: Trunc) : Trunc;
  }

  // Wider values must split evenly into pieces.
  if (ValSize % SplitSize != 0)
    return SDValue();

  // Expand a vector-typed lane op into one scalar lane op per element, then
  // rebuild the vector result.
  auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
    EVT VT = N->getValueType(ResNo: 0);
    unsigned NE = VT.getVectorNumElements();
    EVT EltVT = VT.getVectorElementType();
    SmallVector<SDValue, 8> Scalars;
    unsigned NumOperands = N->getNumOperands();
    SmallVector<SDValue, 4> Operands(NumOperands);
    SDNode *GL = N->getGluedNode();

    // only handle convergencectrl_glue
    assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);

    for (unsigned i = 0; i != NE; ++i) {
      for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
           ++j) {
        SDValue Operand = N->getOperand(Num: j);
        EVT OperandVT = Operand.getValueType();
        if (OperandVT.isVector()) {
          // A vector operand; extract a single element.
          EVT OperandEltVT = OperandVT.getVectorElementType();
          Operands[j] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: OperandEltVT,
                                    N1: Operand, N2: DAG.getVectorIdxConstant(Val: i, DL: SL));
        } else {
          // A scalar operand; just use it as is.
          Operands[j] = Operand;
        }
      }

      // Re-glue the convergence token onto each scalar piece.
      if (GL)
        Operands[NumOperands - 1] =
            DAG.getNode(Opcode: ISD::CONVERGENCECTRL_GLUE, DL: SL, VT: MVT::Glue,
                        Operand: SDValue(GL->getOperand(Num: 0).getNode(), 0));

      Scalars.push_back(Elt: DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: EltVT, Ops: Operands));
    }

    EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NE);
    return DAG.getBuildVector(VT: VecVT, DL: SL, Ops: Scalars);
  };

  if (VT.isVector()) {
    switch (MVT::SimpleValueType EltTy =
                VT.getVectorElementType().getSimpleVT().SimpleTy) {
    case MVT::i32:
    case MVT::f32:
      if (SplitSize == 32) {
        // 32-bit elements at 32-bit split size: unroll element by element.
        SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
        return unrollLaneOp(LaneOp.getNode());
      }
      [[fallthrough]];
    case MVT::i16:
    case MVT::f16:
    case MVT::bf16: {
      // 16-bit elements (or 32-bit at 64-bit split): process SplitSize-wide
      // subvectors and concatenate the results.
      unsigned SubVecNumElt =
          SplitSize / VT.getVectorElementType().getSizeInBits();
      MVT SubVecVT = MVT::getVectorVT(VT: EltTy, NumElements: SubVecNumElt);
      SmallVector<SDValue, 4> Pieces;
      SDValue Src0SubVec, Src1SubVec, Src2SubVec;
      for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
        Src0SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src0,
                                 N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));

        if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
            IsPermLane16)
          Src1SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src1,
                                   N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));

        if (IID == Intrinsic::amdgcn_writelane)
          Src2SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: SubVecVT, N1: Src2,
                                   N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));

        Pieces.push_back(
            Elt: IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
                ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
                : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
        EltIdx += SubVecNumElt;
      }
      return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT, Ops: Pieces);
    }
    default:
      // Handle all other cases by bitcasting to i32 vectors
      break;
    }
  }

  // Generic path: bitcast to a vector of SplitSize-wide integers, unroll,
  // and bitcast the reassembled result back to the original type.
  MVT VecVT =
      MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: SplitSize), NumElements: ValSize / SplitSize);
  Src0 = DAG.getBitcast(VT: VecVT, V: Src0);

  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    Src1 = DAG.getBitcast(VT: VecVT, V: Src1);

  if (IID == Intrinsic::amdgcn_writelane)
    Src2 = DAG.getBitcast(VT: VecVT, V: Src2);

  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
  SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
  return DAG.getBitcast(VT, V: UnrolledLaneOp);
}
7600
// Lower a wave shuffle (read of another lane's 32-bit value by a dynamic lane
// index) to ds_bpermute. On wave64 subtargets where ds_bpermute only crosses
// 32 lanes, emulate a wave-wide bpermute using permlane64 plus whole wave
// mode, then select between the same-half and other-half results per lane.
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N,
                                SelectionDAG &DAG) {
  EVT VT = N->getValueType(ResNo: 0);

  // Only 32-bit payloads are handled here.
  if (VT.getSizeInBits() != 32)
    return SDValue();

  SDLoc SL(N);

  SDValue Value = N->getOperand(Num: 1);
  SDValue Index = N->getOperand(Num: 2);

  // ds_bpermute requires index to be multiplied by 4
  SDValue ShiftAmount = DAG.getShiftAmountConstant(Val: 2, VT: MVT::i32, DL: SL);
  SDValue ShiftedIndex =
      DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: Index.getValueType(), N1: Index, N2: ShiftAmount);

  // Intrinsics will require i32 to operate on
  SDValue ValueI32 = DAG.getBitcast(VT: MVT::i32, V: Value);

  // Convenience wrapper: builds an INTRINSIC_WO_CHAIN node for the given
  // intrinsic ID, prepending the ID as the first operand.
  auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
                                   SmallVector<SDValue> IntrinArgs) -> SDValue {
    SmallVector<SDValue> Operands(1);
    Operands[0] = DAG.getTargetConstant(Val: IID, DL: SL, VT: MVT::i32);
    Operands.append(RHS: IntrinArgs);
    return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: RetVT, Ops: Operands);
  };

  // If we can bpermute across the whole wave, then just do that
  if (TLI.getSubtarget()->supportsWaveWideBPermute()) {
    SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
                                     {ShiftedIndex, ValueI32});
    return DAG.getBitcast(VT, V: BPermute);
  }

  assert(TLI.getSubtarget()->isWave64());

  // Otherwise, we need to make use of whole wave mode
  SDValue PoisonVal = DAG.getPOISON(VT: ValueI32->getValueType(ResNo: 0));

  // Set inactive lanes to poison
  SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
                                   {ValueI32, PoisonVal});
  SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
                                   {ShiftedIndex, PoisonVal});

  // permlane64 exchanges the value between the two 32-lane halves of the wave.
  SDValue Swapped =
      MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});

  // Get permutation of each half, then we'll select which one to use
  SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
                                        {WWMIndex, WWMValue});
  SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
                                         MVT::i32, {WWMIndex, Swapped});
  SDValue BPermOtherHalfWWM =
      MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});

  // Select which side to take the permute from
  SDValue ThreadIDMask = DAG.getAllOnesConstant(DL: SL, VT: MVT::i32);
  // We can get away with only using mbcnt_lo here since we're only
  // trying to detect which side of 32 each lane is on, and mbcnt_lo
  // returns 32 for lanes 32-63.
  SDValue ThreadID =
      MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
                    {ThreadIDMask, DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32)});

  // Bit 5 of (ThreadID ^ Index) is set iff the source lane lives in the other
  // 32-lane half than this lane.
  SDValue SameOrOtherHalf =
      DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32,
                  N1: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: ThreadID, N2: Index),
                  N2: DAG.getTargetConstant(Val: 32, DL: SL, VT: MVT::i32));
  SDValue UseSameHalf =
      DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: SameOrOtherHalf,
                   RHS: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), Cond: ISD::SETEQ);
  SDValue Result = DAG.getSelect(DL: SL, VT: MVT::i32, Cond: UseSameHalf, LHS: BPermSameHalf,
                                 RHS: BPermOtherHalfWWM);
  return DAG.getBitcast(VT, V: Result);
}
7678
// Produce replacement values for nodes whose result type is illegal. Cases
// that push nothing into \p Results fall back to the generic AMDGPU handling
// via the default case.
void SITargetLowering::ReplaceNodeResults(SDNode *N,
                                          SmallVectorImpl<SDValue> &Results,
                                          SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::INSERT_VECTOR_ELT: {
    if (SDValue Res = lowerINSERT_VECTOR_ELT(Op: SDValue(N, 0), DAG))
      Results.push_back(Elt: Res);
    return;
  }
  case ISD::EXTRACT_VECTOR_ELT: {
    if (SDValue Res = lowerEXTRACT_VECTOR_ELT(Op: SDValue(N, 0), DAG))
      Results.push_back(Elt: Res);
    return;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = N->getConstantOperandVal(Num: 0);
    switch (IID) {
    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(Elt: lowerPointerAsRsrcIntrin(Op: N, DAG));
      return;
    case Intrinsic::amdgcn_cvt_pkrtz: {
      // Emit the packed conversion with an i32 result, then bitcast to the
      // illegal v2f16 result type.
      SDValue Src0 = N->getOperand(Num: 1);
      SDValue Src1 = N->getOperand(Num: 2);
      SDLoc SL(N);
      SDValue Cvt =
          DAG.getNode(Opcode: AMDGPUISD::CVT_PKRTZ_F16_F32, DL: SL, VT: MVT::i32, N1: Src0, N2: Src1);
      Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Cvt));
      return;
    }
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {
      SDValue Src0 = N->getOperand(Num: 1);
      SDValue Src1 = N->getOperand(Num: 2);
      SDLoc SL(N);
      unsigned Opcode;

      // Map the intrinsic ID onto the corresponding target node.
      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
        Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
        Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
        Opcode = AMDGPUISD::CVT_PK_I16_I32;
      else
        Opcode = AMDGPUISD::CVT_PK_U16_U32;

      // Emit directly when v2i16 is legal; otherwise emit as i32 + bitcast.
      EVT VT = N->getValueType(ResNo: 0);
      if (isTypeLegal(VT))
        Results.push_back(Elt: DAG.getNode(Opcode, DL: SL, VT, N1: Src0, N2: Src1));
      else {
        SDValue Cvt = DAG.getNode(Opcode, DL: SL, VT: MVT::i32, N1: Src0, N2: Src1);
        Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: Cvt));
      }
      return;
    }
    case Intrinsic::amdgcn_s_buffer_load: {
      // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
      // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
      // combiner tries to merge the s_buffer_load_u8 with a sext instruction
      // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
      // s_buffer_load_i8.
      if (!Subtarget->hasScalarSubwordLoads())
        return;
      SDValue Op = SDValue(N, 0);
      SDValue Rsrc = Op.getOperand(i: 1);
      SDValue Offset = Op.getOperand(i: 2);
      SDValue CachePolicy = Op.getOperand(i: 3);
      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
      SDLoc DL(Op);
      MachineFunction &MF = DAG.getMachineFunction();
      const DataLayout &DataLayout = DAG.getDataLayout();
      Align Alignment =
          DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext()));
      MachineMemOperand *MMO = MF.getMachineMemOperand(
          PtrInfo: MachinePointerInfo(),
          F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
              MachineMemOperand::MOInvariant,
          Size: VT.getStoreSize(), BaseAlignment: Alignment);
      SDValue LoadVal;
      if (!Offset->isDivergent()) {
        // Uniform offset: scalar unsigned byte load, truncated back to i8.
        SDValue Ops[] = {Rsrc, // source register
                         Offset, CachePolicy};
        SDValue BufferLoad =
            DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD_UBYTE, dl: DL,
                                    VTList: DAG.getVTList(VT: MVT::i32), Ops, MemVT: VT, MMO);
        LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
      } else {
        // Divergent offset: fall back to a vector buffer load; the three
        // placeholder operands are filled in by setBufferOffsets().
        SDValue Ops[] = {
            DAG.getEntryNode(),                            // Chain
            Rsrc,                                          // rsrc
            DAG.getConstant(Val: 0, DL, VT: MVT::i32),      // vindex
            {},                                            // voffset
            {},                                            // soffset
            {},                                            // offset
            CachePolicy,                                   // cachepolicy
            DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
        };
        setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], Alignment: Align(4));
        LoadVal = handleByteShortBufferLoads(DAG, LoadVT: VT, DL, Ops, MMO);
      }
      Results.push_back(Elt: LoadVal);
      return;
    }
    case Intrinsic::amdgcn_dead: {
      // Every result of a dead intrinsic is simply poison.
      for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
        Results.push_back(Elt: DAG.getPOISON(VT: N->getValueType(ResNo: I)));
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    if (SDValue Res = LowerINTRINSIC_W_CHAIN(Op: SDValue(N, 0), DAG)) {
      if (Res.getOpcode() == ISD::MERGE_VALUES) {
        // FIXME: Hacky
        for (unsigned I = 0; I < Res.getNumOperands(); I++) {
          Results.push_back(Elt: Res.getOperand(i: I));
        }
      } else {
        Results.push_back(Elt: Res);
        Results.push_back(Elt: Res.getValue(R: 1));
      }
      return;
    }

    break;
  }
  case ISD::SELECT: {
    // Select on an illegal type: bitcast both values to an equivalent legal
    // integer type, widening sub-i32 types to i32 for the select itself.
    SDLoc SL(N);
    EVT VT = N->getValueType(ResNo: 0);
    EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT);
    SDValue LHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: 1));
    SDValue RHS = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: N->getOperand(Num: 2));

    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(VT: MVT::i32)) {
      LHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: LHS);
      RHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: RHS);
      SelectVT = MVT::i32;
    }

    SDValue NewSelect =
        DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: SelectVT, N1: N->getOperand(Num: 0), N2: LHS, N3: RHS);

    if (NewVT != SelectVT)
      NewSelect = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: NewVT, Operand: NewSelect);
    Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: NewSelect));
    return;
  }
  case ISD::FNEG: {
    // v2f16 fneg: flip both sign bits with a single i32 xor.
    if (N->getValueType(ResNo: 0) != MVT::v2f16)
      break;

    SDLoc SL(N);
    SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: N->getOperand(Num: 0));

    SDValue Op = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: BC,
                             N2: DAG.getConstant(Val: 0x80008000, DL: SL, VT: MVT::i32));
    Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Op));
    return;
  }
  case ISD::FABS: {
    // v2f16 fabs: clear both sign bits with a single i32 and.
    if (N->getValueType(ResNo: 0) != MVT::v2f16)
      break;

    SDLoc SL(N);
    SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: N->getOperand(Num: 0));

    SDValue Op = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: BC,
                             N2: DAG.getConstant(Val: 0x7fff7fff, DL: SL, VT: MVT::i32));
    Results.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2f16, Operand: Op));
    return;
  }
  case ISD::FSQRT: {
    if (N->getValueType(ResNo: 0) != MVT::f16)
      break;
    Results.push_back(Elt: lowerFSQRTF16(Op: SDValue(N, 0), DAG));
    break;
  }
  default:
    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    break;
  }
}
7865
7866/// Helper function for LowerBRCOND
7867static SDNode *findUser(SDValue Value, unsigned Opcode) {
7868
7869 for (SDUse &U : Value->uses()) {
7870 if (U.get() != Value)
7871 continue;
7872
7873 if (U.getUser()->getOpcode() == Opcode)
7874 return U.getUser();
7875 }
7876 return nullptr;
7877}
7878
7879unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7880 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7881 switch (Intr->getConstantOperandVal(Num: 1)) {
7882 case Intrinsic::amdgcn_if:
7883 return AMDGPUISD::IF;
7884 case Intrinsic::amdgcn_else:
7885 return AMDGPUISD::ELSE;
7886 case Intrinsic::amdgcn_loop:
7887 return AMDGPUISD::LOOP;
7888 case Intrinsic::amdgcn_end_cf:
7889 llvm_unreachable("should not occur");
7890 default:
7891 return 0;
7892 }
7893 }
7894
7895 // break, if_break, else_break are all only used as inputs to loop, not
7896 // directly as branch conditions.
7897 return 0;
7898}
7899
7900bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
7901 const Triple &TT = getTargetMachine().getTargetTriple();
7902 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
7903 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
7904 AMDGPU::shouldEmitConstantsToTextSection(TT);
7905}
7906
7907bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
7908 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7909 return false;
7910
7911 // FIXME: Either avoid relying on address space here or change the default
7912 // address space for functions to avoid the explicit check.
7913 return (GV->getValueType()->isFunctionTy() ||
7914 !isNonGlobalAddrSpace(AS: GV->getAddressSpace())) &&
7915 !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(GV);
7916}
7917
7918bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
7919 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7920}
7921
7922bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
7923 if (!GV->hasExternalLinkage())
7924 return true;
7925
7926 const auto OS = getTargetMachine().getTargetTriple().getOS();
7927 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7928}
7929
/// This transforms the control flow intrinsics to get the branch destination as
/// last parameter, also switches branch target with BR if the need arise
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
  SDLoc DL(BRCOND);

  SDNode *Intr = BRCOND.getOperand(i: 1).getNode();
  SDValue Target = BRCOND.getOperand(i: 2);
  SDNode *BR = nullptr;
  SDNode *SetCC = nullptr;

  // Peel off a setcc or (xor c, nonzero) that negates the intrinsic's
  // condition; in the non-negated case the real target comes from the
  // unconditional BR that follows this BRCOND.
  switch (Intr->getOpcode()) {
  case ISD::SETCC: {
    // As long as we negate the condition everything is fine
    SetCC = Intr;
    Intr = SetCC->getOperand(Num: 0).getNode();
    break;
  }
  case ISD::XOR: {
    // Similar to SETCC, if we have (xor c, -1), we will be fine.
    SDValue LHS = Intr->getOperand(Num: 0);
    SDValue RHS = Intr->getOperand(Num: 1);
    if (auto *C = dyn_cast<ConstantSDNode>(Val&: RHS); C && C->getZExtValue()) {
      Intr = LHS.getNode();
      break;
    }
    [[fallthrough]];
  }
  default: {
    // Get the target from BR if we don't negate the condition
    BR = findUser(Value: BRCOND, Opcode: ISD::BR);
    assert(BR && "brcond missing unconditional branch user");
    Target = BR->getOperand(Num: 1);
  }
  }

  unsigned CFNode = isCFIntrinsic(Intr);
  if (CFNode == 0) {
    // This is a uniform branch so we don't need to legalize.
    return BRCOND;
  }

  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
                   Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;

  // A negating setcc is only expected in the canonical (setcc cond, 1, setne)
  // form.
  assert(!SetCC ||
         (SetCC->getConstantOperandVal(1) == 1 &&
          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
              ISD::SETNE));

  // operands of the new intrinsic call
  SmallVector<SDValue, 4> Ops;
  if (HaveChain)
    Ops.push_back(Elt: BRCOND.getOperand(i: 0));

  // Skip the chain (if present) and the intrinsic ID operand; keep the
  // intrinsic's real arguments, then append the branch target.
  Ops.append(in_start: Intr->op_begin() + (HaveChain ? 2 : 1), in_end: Intr->op_end());
  Ops.push_back(Elt: Target);

  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());

  // build the new intrinsic call
  SDNode *Result = DAG.getNode(Opcode: CFNode, DL, VTList: DAG.getVTList(VTs: Res), Ops).getNode();

  if (!HaveChain) {
    // Attach BRCOND's incoming chain to the chainless replacement.
    SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(i: 0)};

    Result = DAG.getMergeValues(Ops, dl: DL).getNode();
  }

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {BR->getOperand(Num: 0), BRCOND.getOperand(i: 2)};
    SDValue NewBR = DAG.getNode(Opcode: ISD::BR, DL, VTList: BR->getVTList(), Ops);
    DAG.ReplaceAllUsesWith(From: BR, To: NewBR.getNode());
  }

  // The chain is always the last result of the new node.
  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(Value: SDValue(Intr, i), Opcode: ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: CopyToReg->getOperand(Num: 1),
                             N: SDValue(Result, i - 1), Glue: SDValue());

    DAG.ReplaceAllUsesWith(From: SDValue(CopyToReg, 0), To: CopyToReg->getOperand(Num: 0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(From: SDValue(Intr, Intr->getNumValues() - 1),
                                To: Intr->getOperand(Num: 0));

  return Chain;
}
8025
8026SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
8027 MVT VT = Op.getSimpleValueType();
8028 SDLoc DL(Op);
8029 // Checking the depth
8030 if (Op.getConstantOperandVal(i: 0) != 0)
8031 return DAG.getConstant(Val: 0, DL, VT);
8032
8033 MachineFunction &MF = DAG.getMachineFunction();
8034 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8035 // Check for kernel and shader functions
8036 if (Info->isEntryFunction())
8037 return DAG.getConstant(Val: 0, DL, VT);
8038
8039 MachineFrameInfo &MFI = MF.getFrameInfo();
8040 // There is a call to @llvm.returnaddress in this function
8041 MFI.setReturnAddressIsTaken(true);
8042
8043 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
8044 // Get the return address reg and mark it as an implicit live-in
8045 Register Reg = MF.addLiveIn(PReg: TRI->getReturnAddressReg(MF),
8046 RC: getRegClassFor(VT, isDivergent: Op.getNode()->isDivergent()));
8047
8048 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg, VT);
8049}
8050
8051SDValue SITargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
8052 MachineFunction &MF = DAG.getMachineFunction();
8053 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
8054
8055 // For functions that set up their own stack, select the GET_STACK_BASE
8056 // pseudo.
8057 if (MFI->isBottomOfStack())
8058 return Op;
8059
8060 // For everything else, create a dummy stack object.
8061 int FI = MF.getFrameInfo().CreateFixedObject(Size: 1, SPOffset: 0, /*IsImmutable=*/false);
8062 return DAG.getFrameIndex(FI, VT: Op.getValueType());
8063}
8064
8065SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
8066 const SDLoc &DL, EVT VT) const {
8067 return Op.getValueType().bitsLE(VT)
8068 ? DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT, Operand: Op)
8069 : DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Op,
8070 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
8071}
8072
8073SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
8074 SelectionDAG &DAG) const {
8075 EVT DstVT = Op.getValueType();
8076 unsigned NumElts = DstVT.getVectorNumElements();
8077 assert(NumElts > 2 && isPowerOf2_32(NumElts));
8078
8079 auto [Lo, Hi] = DAG.SplitVectorOperand(N: Op.getNode(), OpNo: 0);
8080
8081 SDLoc DL(Op);
8082 unsigned Opc = Op.getOpcode();
8083 SDValue Flags = Op.getOperand(i: 1);
8084 EVT HalfDstVT =
8085 EVT::getVectorVT(Context&: *DAG.getContext(), VT: DstVT.getScalarType(), NumElements: NumElts / 2);
8086 SDValue OpLo = DAG.getNode(Opcode: Opc, DL, VT: HalfDstVT, N1: Lo, N2: Flags);
8087 SDValue OpHi = DAG.getNode(Opcode: Opc, DL, VT: HalfDstVT, N1: Hi, N2: Flags);
8088
8089 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: DstVT, N1: OpLo, N2: OpHi);
8090}
8091
// Custom lowering of FP_ROUND: packed f32 -> f16 conversions, f64 -> f16
// (safe and approximate variants), and f64 -> bf16 via round-to-odd.
SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(i: 0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();

  // Vector f16 results: only f32 sources are handled here; wider vectors are
  // split down to the v2f32 -> v2f16 form.
  if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
    assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
    if (SrcVT.getScalarType() != MVT::f32)
      return SDValue();
    return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
  }

  if (SrcVT.getScalarType() != MVT::f64)
    return Op;

  // From here on the source is f64.
  SDLoc DL(Op);
  if (DstVT == MVT::f16) {
    // TODO: Handle strictfp
    if (Op.getOpcode() != ISD::FP_ROUND)
      return Op;

    if (!Subtarget->has16BitInsts()) {
      // No 16-bit instructions: convert via the i32-returning FP_TO_FP16
      // node and reinterpret the low 16 bits as f16.
      SDValue FpToFp16 = DAG.getNode(Opcode: ISD::FP_TO_FP16, DL, VT: MVT::i32, Operand: Src);
      SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToFp16);
      return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f16, Operand: Trunc);
    }
    if (Op->getFlags().hasApproximateFuncs()) {
      // afn permits the fast double rounding f64 -> f32 -> f16.
      SDValue Flags = Op.getOperand(i: 1);
      SDValue Src32 = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f32, N1: Src, N2: Flags);
      return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: MVT::f16, N1: Src32, N2: Flags);
    }
    // Otherwise use the double-rounding-safe f64 -> f16 expansion.
    SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
    SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: FpToFp16);
    return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f16, Operand: Trunc);
  }

  assert(DstVT.getScalarType() == MVT::bf16 &&
         "custom lower FP_ROUND for f16 or bf16");
  assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");

  // Round-inexact-to-odd f64 to f32, then do the final rounding using the
  // hardware f32 -> bf16 instruction.
  EVT F32VT = SrcVT.changeElementType(Context&: *DAG.getContext(), EltVT: MVT::f32);
  SDValue Rod = expandRoundInexactToOdd(ResultVT: F32VT, Op: Src, DL, DAG);
  return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: DstVT, N1: Rod,
                     N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
}
8139
8140SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
8141 SelectionDAG &DAG) const {
8142 EVT VT = Op.getValueType();
8143 const MachineFunction &MF = DAG.getMachineFunction();
8144 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8145 bool IsIEEEMode = Info->getMode().IEEE;
8146
8147 // FIXME: Assert during selection that this is only selected for
8148 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
8149 // mode functions, but this happens to be OK since it's only done in cases
8150 // where there is known no sNaN.
8151 if (IsIEEEMode)
8152 return expandFMINNUM_FMAXNUM(N: Op.getNode(), DAG);
8153
8154 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8155 VT == MVT::v16bf16)
8156 return splitBinaryVectorOp(Op, DAG);
8157 return Op;
8158}
8159
8160SDValue
8161SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
8162 SelectionDAG &DAG) const {
8163 EVT VT = Op.getValueType();
8164 const MachineFunction &MF = DAG.getMachineFunction();
8165 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8166 bool IsIEEEMode = Info->getMode().IEEE;
8167
8168 if (IsIEEEMode)
8169 return expandFMINIMUMNUM_FMAXIMUMNUM(N: Op.getNode(), DAG);
8170
8171 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8172 VT == MVT::v16bf16)
8173 return splitBinaryVectorOp(Op, DAG);
8174 return Op;
8175}
8176
8177SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
8178 SelectionDAG &DAG) const {
8179 EVT VT = Op.getValueType();
8180 if (VT.isVector())
8181 return splitBinaryVectorOp(Op, DAG);
8182
8183 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
8184 !Subtarget->hasMinimum3Maximum3F16() &&
8185 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
8186 "should not need to widen f16 minimum/maximum to v2f16");
8187
8188 // Widen f16 operation to v2f16
8189
8190 // fminimum f16:x, f16:y ->
8191 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
8192 // (v2f16 (scalar_to_vector y))), 0
8193 SDLoc SL(Op);
8194 SDValue WideSrc0 =
8195 DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: SL, VT: MVT::v2f16, Operand: Op.getOperand(i: 0));
8196 SDValue WideSrc1 =
8197 DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: SL, VT: MVT::v2f16, Operand: Op.getOperand(i: 1));
8198
8199 SDValue Widened =
8200 DAG.getNode(Opcode: Op.getOpcode(), DL: SL, VT: MVT::v2f16, N1: WideSrc0, N2: WideSrc1);
8201
8202 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::f16, N1: Widened,
8203 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
8204}
8205
8206SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
8207 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
8208 EVT VT = Op.getValueType();
8209 assert(VT == MVT::f16);
8210
8211 SDValue Exp = Op.getOperand(i: IsStrict ? 2 : 1);
8212 EVT ExpVT = Exp.getValueType();
8213 if (ExpVT == MVT::i16)
8214 return Op;
8215
8216 SDLoc DL(Op);
8217
8218 // Correct the exponent type for f16 to i16.
8219 // Clamp the range of the exponent to the instruction's range.
8220
8221 // TODO: This should be a generic narrowing legalization, and can easily be
8222 // for GlobalISel.
8223
8224 SDValue MinExp = DAG.getSignedConstant(Val: minIntN(N: 16), DL, VT: ExpVT);
8225 SDValue ClampMin = DAG.getNode(Opcode: ISD::SMAX, DL, VT: ExpVT, N1: Exp, N2: MinExp);
8226
8227 SDValue MaxExp = DAG.getSignedConstant(Val: maxIntN(N: 16), DL, VT: ExpVT);
8228 SDValue Clamp = DAG.getNode(Opcode: ISD::SMIN, DL, VT: ExpVT, N1: ClampMin, N2: MaxExp);
8229
8230 SDValue TruncExp = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Clamp);
8231
8232 if (IsStrict) {
8233 return DAG.getNode(Opcode: ISD::STRICT_FLDEXP, DL, ResultTys: {VT, MVT::Other},
8234 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1), TruncExp});
8235 }
8236
8237 return DAG.getNode(Opcode: ISD::FLDEXP, DL, VT, N1: Op.getOperand(i: 0), N2: TruncExp);
8238}
8239
8240static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
8241 switch (Op->getOpcode()) {
8242 case ISD::SRA:
8243 case ISD::SMIN:
8244 case ISD::SMAX:
8245 return ISD::SIGN_EXTEND;
8246 case ISD::SRL:
8247 case ISD::UMIN:
8248 case ISD::UMAX:
8249 return ISD::ZERO_EXTEND;
8250 case ISD::ADD:
8251 case ISD::SUB:
8252 case ISD::AND:
8253 case ISD::OR:
8254 case ISD::XOR:
8255 case ISD::SHL:
8256 case ISD::SELECT:
8257 case ISD::MUL:
8258 // operation result won't be influenced by garbage high bits.
8259 // TODO: are all of those cases correct, and are there more?
8260 return ISD::ANY_EXTEND;
8261 case ISD::SETCC: {
8262 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
8263 return ISD::isSignedIntSetCC(Code: CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
8264 }
8265 default:
8266 llvm_unreachable("unexpected opcode!");
8267 }
8268}
8269
8270SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
8271 DAGCombinerInfo &DCI) const {
8272 const unsigned Opc = Op.getOpcode();
8273 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
8274 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
8275 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
8276 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
8277 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
8278
8279 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
8280 : Op->getOperand(Num: 0).getValueType();
8281 auto &DAG = DCI.DAG;
8282 auto ExtTy = OpTy.changeElementType(Context&: *DAG.getContext(), EltVT: MVT::i32);
8283
8284 if (DCI.isBeforeLegalizeOps() ||
8285 isNarrowingProfitable(N: Op.getNode(), SrcVT: ExtTy, DestVT: OpTy))
8286 return SDValue();
8287
8288 SDLoc DL(Op);
8289 SDValue LHS;
8290 SDValue RHS;
8291 if (Opc == ISD::SELECT) {
8292 LHS = Op->getOperand(Num: 1);
8293 RHS = Op->getOperand(Num: 2);
8294 } else {
8295 LHS = Op->getOperand(Num: 0);
8296 RHS = Op->getOperand(Num: 1);
8297 }
8298
8299 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
8300 LHS = DAG.getNode(Opcode: ExtOp, DL, VT: ExtTy, Operand: {LHS});
8301
8302 // Special case: for shifts, the RHS always needs a zext.
8303 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
8304 RHS = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: ExtTy, Operand: {RHS});
8305 else
8306 RHS = DAG.getNode(Opcode: ExtOp, DL, VT: ExtTy, Operand: {RHS});
8307
8308 // setcc always return i1/i1 vec so no need to truncate after.
8309 if (Opc == ISD::SETCC) {
8310 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
8311 return DAG.getSetCC(DL, VT: Op.getValueType(), LHS, RHS, Cond: CC);
8312 }
8313
8314 // For other ops, we extend the operation's return type as well so we need to
8315 // truncate back to the original type.
8316 SDValue NewVal;
8317 if (Opc == ISD::SELECT)
8318 NewVal = DAG.getNode(Opcode: ISD::SELECT, DL, VT: ExtTy, Ops: {Op->getOperand(Num: 0), LHS, RHS});
8319 else
8320 NewVal = DAG.getNode(Opcode: Opc, DL, VT: ExtTy, Ops: {LHS, RHS});
8321
8322 return DAG.getZExtOrTrunc(Op: NewVal, DL, VT: OpTy);
8323}
8324
8325SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8326 SDValue Mag = Op.getOperand(i: 0);
8327 EVT MagVT = Mag.getValueType();
8328
8329 if (MagVT.getVectorNumElements() > 2)
8330 return splitBinaryVectorOp(Op, DAG);
8331
8332 SDValue Sign = Op.getOperand(i: 1);
8333 EVT SignVT = Sign.getValueType();
8334
8335 if (MagVT == SignVT)
8336 return Op;
8337
8338 // fcopysign v2f16:mag, v2f32:sign ->
8339 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
8340
8341 SDLoc SL(Op);
8342 SDValue SignAsInt32 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Sign);
8343 SDValue SignAsInt16 = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::v2i16, Operand: SignAsInt32);
8344
8345 SDValue SignAsHalf16 = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MagVT, Operand: SignAsInt16);
8346
8347 return DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT: MagVT, N1: Mag, N2: SignAsHalf16);
8348}
8349
// Custom lowering for vector multiplications and s_mul_u64.
SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  // Split vector operands.
  if (VT.isVector())
    return splitBinaryVectorOp(Op, DAG);

  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");

  // There are four ways to lower s_mul_u64:
  //
  // 1. If all the operands are uniform, then we lower it as it is.
  //
  // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
  //    multiplications because there is not a vector equivalent of s_mul_u64.
  //
  // 3. If the cost model decides that it is more efficient to use vector
  //    registers, then we have to split s_mul_u64 in 32-bit multiplications.
  //    This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
  //
  // 4. If the cost model decides to use vector registers and both of the
  //    operands are zero-extended/sign-extended from 32-bits, then we split the
  //    s_mul_u64 in two 32-bit multiplications. The problem is that it is not
  //    possible to check if the operands are zero-extended or sign-extended in
  //    SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
  //    s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
  //    s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
  //    If the cost model decides that we have to use vector registers, then
  //    splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
  //    s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
  //    decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
  //    s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
  //    SIInstrInfo.cpp .

  // Divergent case (2/3) is handled during instruction selection/splitting.
  if (Op->isDivergent())
    return SDValue();

  SDValue Op0 = Op.getOperand(i: 0);
  SDValue Op1 = Op.getOperand(i: 1);
  // If all the operands are zero-extended to 32-bits, then we replace
  // s_mul_u64 with s_mul_u64_u32_pseudo. If all the operands are sign-extended
  // to 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
  KnownBits Op0KnownBits = DAG.computeKnownBits(Op: Op0);
  unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
  KnownBits Op1KnownBits = DAG.computeKnownBits(Op: Op1);
  unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
  SDLoc SL(Op);
  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
    return SDValue(
        DAG.getMachineNode(Opcode: AMDGPU::S_MUL_U64_U32_PSEUDO, dl: SL, VT, Op1: Op0, Op2: Op1), 0);
  // 33 sign bits guarantee the value fits in a signed 32-bit integer.
  unsigned Op0SignBits = DAG.ComputeNumSignBits(Op: Op0);
  unsigned Op1SignBits = DAG.ComputeNumSignBits(Op: Op1);
  if (Op0SignBits >= 33 && Op1SignBits >= 33)
    return SDValue(
        DAG.getMachineNode(Opcode: AMDGPU::S_MUL_I64_I32_PSEUDO, dl: SL, VT, Op1: Op0, Op2: Op1), 0);
  // If all the operands are uniform, then we lower s_mul_u64 as it is.
  return Op;
}
8409
8410SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8411 EVT VT = Op.getValueType();
8412 SDLoc SL(Op);
8413 SDValue LHS = Op.getOperand(i: 0);
8414 SDValue RHS = Op.getOperand(i: 1);
8415 bool isSigned = Op.getOpcode() == ISD::SMULO;
8416
8417 if (ConstantSDNode *RHSC = isConstOrConstSplat(N: RHS)) {
8418 const APInt &C = RHSC->getAPIntValue();
8419 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
8420 if (C.isPowerOf2()) {
8421 // smulo(x, signed_min) is same as umulo(x, signed_min).
8422 bool UseArithShift = isSigned && !C.isMinSignedValue();
8423 SDValue ShiftAmt = DAG.getConstant(Val: C.logBase2(), DL: SL, VT: MVT::i32);
8424 SDValue Result = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: LHS, N2: ShiftAmt);
8425 SDValue Overflow =
8426 DAG.getSetCC(DL: SL, VT: MVT::i1,
8427 LHS: DAG.getNode(Opcode: UseArithShift ? ISD::SRA : ISD::SRL, DL: SL, VT,
8428 N1: Result, N2: ShiftAmt),
8429 RHS: LHS, Cond: ISD::SETNE);
8430 return DAG.getMergeValues(Ops: {Result, Overflow}, dl: SL);
8431 }
8432 }
8433
8434 SDValue Result = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT, N1: LHS, N2: RHS);
8435 SDValue Top =
8436 DAG.getNode(Opcode: isSigned ? ISD::MULHS : ISD::MULHU, DL: SL, VT, N1: LHS, N2: RHS);
8437
8438 SDValue Sign = isSigned
8439 ? DAG.getNode(Opcode: ISD::SRA, DL: SL, VT, N1: Result,
8440 N2: DAG.getConstant(Val: VT.getScalarSizeInBits() - 1,
8441 DL: SL, VT: MVT::i32))
8442 : DAG.getConstant(Val: 0, DL: SL, VT);
8443 SDValue Overflow = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Top, RHS: Sign, Cond: ISD::SETNE);
8444
8445 return DAG.getMergeValues(Ops: {Result, Overflow}, dl: SL);
8446}
8447
8448SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8449 if (Op->isDivergent()) {
8450 // Select to V_MAD_[IU]64_[IU]32.
8451 return Op;
8452 }
8453 if (Subtarget->hasSMulHi()) {
8454 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8455 return SDValue();
8456 }
8457 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8458 // calculate the high part, so we might as well do the whole thing with
8459 // V_MAD_[IU]64_[IU]32.
8460 return Op;
8461}
8462
8463SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8464 if (!Subtarget->hasTrapHandler() ||
8465 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8466 return lowerTrapEndpgm(Op, DAG);
8467
8468 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8469 : lowerTrapHsaQueuePtr(Op, DAG);
8470}
8471
8472SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8473 SDLoc SL(Op);
8474 SDValue Chain = Op.getOperand(i: 0);
8475 return DAG.getNode(Opcode: AMDGPUISD::ENDPGM_TRAP, DL: SL, VT: MVT::Other, Operand: Chain);
8476}
8477
8478SDValue
8479SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8480 const SDLoc &DL, Align Alignment,
8481 ImplicitParameter Param) const {
8482 MachineFunction &MF = DAG.getMachineFunction();
8483 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8484 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL: DL, Chain: DAG.getEntryNode(), Offset);
8485 MachinePointerInfo PtrInfo =
8486 getKernargSegmentPtrInfo(MF&: DAG.getMachineFunction());
8487 return DAG.getLoad(
8488 VT, dl: DL, Chain: DAG.getEntryNode(), Ptr, PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment,
8489 MMOFlags: MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
8490}
8491
// Lower a trap for HSA targets that cannot fetch the doorbell ID directly:
// the 64-bit queue pointer is copied into SGPR0_SGPR1 before the TRAP node,
// and the copy's glue result is threaded into the TRAP so the two are not
// separated.
SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue Chain = Op.getOperand(i: 0);

  SDValue QueuePtr;
  // For code object version 5, QueuePtr is passed through implicit kernarg.
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
  if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) {
    QueuePtr =
        loadImplicitKernelArgument(DAG, VT: MVT::i64, DL: SL, Alignment: Align(8), Param: QUEUE_PTR);
  } else {
    // Pre-COV5: the queue pointer arrives in a user SGPR pair.
    MachineFunction &MF = DAG.getMachineFunction();
    SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    Register UserSGPR = Info->getQueuePtrUserSGPR();

    if (UserSGPR == AMDGPU::NoRegister) {
      // We probably are in a function incorrectly marked with
      // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
      // trap, so just use a null pointer.
      QueuePtr = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i64);
    } else {
      QueuePtr = CreateLiveInRegister(DAG, RC: &AMDGPU::SReg_64RegClass, Reg: UserSGPR,
                                      VT: MVT::i64);
    }
  }

  // Pin the queue pointer into SGPR0_SGPR1 for the trap handler; SGPR01 and
  // the copy's glue value are passed as extra operands of the TRAP node.
  SDValue SGPR01 = DAG.getRegister(Reg: AMDGPU::SGPR0_SGPR1, VT: MVT::i64);
  SDValue ToReg = DAG.getCopyToReg(Chain, dl: SL, Reg: SGPR01, N: QueuePtr, Glue: SDValue());

  uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
  SDValue Ops[] = {ToReg, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16), SGPR01,
                   ToReg.getValue(R: 1)};
  return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
}
8527
8528SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8529 SDLoc SL(Op);
8530 SDValue Chain = Op.getOperand(i: 0);
8531
8532 // We need to simulate the 's_trap 2' instruction on targets that run in
8533 // PRIV=1 (where it is treated as a nop).
8534 if (Subtarget->hasPrivEnabledTrap2NopBug())
8535 return DAG.getNode(Opcode: AMDGPUISD::SIMULATED_TRAP, DL: SL, VT: MVT::Other, Operand: Chain);
8536
8537 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8538 SDValue Ops[] = {Chain, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16)};
8539 return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
8540}
8541
8542SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8543 SDLoc SL(Op);
8544 SDValue Chain = Op.getOperand(i: 0);
8545 MachineFunction &MF = DAG.getMachineFunction();
8546
8547 if (!Subtarget->hasTrapHandler() ||
8548 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8549 LLVMContext &Ctx = MF.getFunction().getContext();
8550 Ctx.diagnose(DI: DiagnosticInfoUnsupported(MF.getFunction(),
8551 "debugtrap handler not supported",
8552 Op.getDebugLoc(), DS_Warning));
8553 return Chain;
8554 }
8555
8556 uint64_t TrapID =
8557 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8558 SDValue Ops[] = {Chain, DAG.getTargetConstant(Val: TrapID, DL: SL, VT: MVT::i16)};
8559 return DAG.getNode(Opcode: AMDGPUISD::TRAP, DL: SL, VT: MVT::Other, Ops);
8560}
8561
// Return the high 32 bits (the aperture) of the flat address range for the
// given segment address space (LOCAL selects the shared aperture, anything
// else the private aperture). The result is used as the upper half when
// widening a 32-bit segment offset into a 64-bit flat pointer.
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                                             SelectionDAG &DAG) const {
  if (Subtarget->hasApertureRegs()) {
    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                       ? AMDGPU::SRC_SHARED_BASE
                                       : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !Subtarget->hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");
    // Note: this feature (register) is broken. When used as a 32-bit operand,
    // it returns a wrong value (all zeroes?). The real value is in the upper 32
    // bits.
    //
    // To work around the issue, emit a 64 bit copy from this register
    // then extract the high bits. Note that this shouldn't even result in a
    // shift being emitted and simply become a pair of registers (e.g.):
    // s_mov_b64 s[6:7], src_shared_base
    // v_mov_b32_e32 v1, s7
    SDValue Copy =
        DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: ApertureRegNo, VT: MVT::v2i32);
    return DAG.getExtractVectorElt(DL, VT: MVT::i32, Vec: Copy, Idx: 1);
  }

  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
  if (AMDGPU::getAMDHSACodeObjectVersion(M: *M) >= AMDGPU::AMDHSA_COV5) {
    ImplicitParameter Param =
        (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
    return loadImplicitKernelArgument(DAG, VT: MVT::i32, DL, Alignment: Align(4), Param);
  }

  // Pre-COV5 fallback: read the aperture out of the queue descriptor, whose
  // pointer lives in a user SGPR pair.
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  Register UserSGPR = Info->getQueuePtrUserSGPR();
  if (UserSGPR == AMDGPU::NoRegister) {
    // We probably are in a function incorrectly marked with
    // amdgpu-no-queue-ptr. This is undefined.
    return DAG.getPOISON(VT: MVT::i32);
  }

  SDValue QueuePtr =
      CreateLiveInRegister(DAG, RC: &AMDGPU::SReg_64RegClass, Reg: UserSGPR, VT: MVT::i64);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  SDValue Ptr =
      DAG.getObjectPtrOffset(SL: DL, Ptr: QueuePtr, Offset: TypeSize::getFixed(ExactSize: StructOffset));

  // TODO: Use custom target PseudoSourceValue.
  // TODO: We should use the value from the IR intrinsic call, but it might not
  // be available and how do we get it?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  return DAG.getLoad(VT: MVT::i32, dl: DL, Chain: QueuePtr.getValue(R: 1), Ptr, PtrInfo,
                     Alignment: commonAlignment(A: Align(64), Offset: StructOffset),
                     MMOFlags: MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant);
}
8622
8623/// Return true if the value is a known valid address, such that a null check is
8624/// not necessary.
8625static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
8626 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8627 if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
8628 return true;
8629
8630 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8631 return ConstVal->getSExtValue() != AMDGPU::getNullPointerValue(AS: AddrSpace);
8632
8633 // TODO: Search through arithmetic, handle arguments and loads
8634 // marked nonnull.
8635 return false;
8636}
8637
// Lower ISD::ADDRSPACECAST (and the amdgcn.addrspacecast.nonnull intrinsic)
// between flat and segment (local/private) address spaces. The segment null
// value and the flat null value are mapped to each other unless the source is
// known non-null, in which case the select is omitted.
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc SL(Op);

  const AMDGPUTargetMachine &TM =
      static_cast<const AMDGPUTargetMachine &>(getTargetMachine());

  unsigned DestAS, SrcAS;
  SDValue Src;
  bool IsNonNull = false;
  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Val&: Op)) {
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(Num: 0);
    DestAS = ASC->getDestAddressSpace();
  } else {
    // amdgcn.addrspacecast.nonnull carries the source/dest address spaces as
    // constant operands and guarantees the source is not null.
    assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(Num: 1);
    SrcAS = Op->getConstantOperandVal(Num: 2);
    DestAS = Op->getConstantOperandVal(Num: 3);
    IsNonNull = true;
  }

  SDValue FlatNullPtr = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i64);

  // flat -> local/private
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
        DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
      // The segment pointer is the low 32 bits of the flat pointer.
      SDValue Ptr = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Src);

      if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
          Subtarget->hasGloballyAddressableScratch()) {
        // flat -> private with globally addressable scratch: subtract
        // src_flat_scratch_base_lo.
        SDValue FlatScratchBaseLo(
            DAG.getMachineNode(
                Opcode: AMDGPU::S_MOV_B32, dl: SL, VT: MVT::i32,
                Op1: DAG.getRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, VT: MVT::i32)),
            0);
        Ptr = DAG.getNode(Opcode: ISD::SUB, DL: SL, VT: MVT::i32, N1: Ptr, N2: FlatScratchBaseLo);
      }

      // NOTE(review): the non-null query is made on the cast node itself, not
      // on Src, so a constant/frame-index source is not detected here —
      // confirm whether Src was intended.
      if (IsNonNull || isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS))
        return Ptr;

      // Map flat null to the destination segment's null value.
      unsigned NullVal = AMDGPU::getNullPointerValue(AS: DestAS);
      SDValue SegmentNullPtr = DAG.getConstant(Val: NullVal, DL: SL, VT: MVT::i32);
      SDValue NonNull = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Src, RHS: FlatNullPtr, Cond: ISD::SETNE);

      return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i32, N1: NonNull, N2: Ptr,
                         N3: SegmentNullPtr);
    }
  }

  // local/private -> flat
  if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
    if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
        SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
      SDValue CvtPtr;
      if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
          Subtarget->hasGloballyAddressableScratch()) {
        // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
        // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
        // Compute the lane id via mbcnt_lo (plus mbcnt_hi on wave64).
        SDValue AllOnes = DAG.getSignedTargetConstant(Val: -1, DL: SL, VT: MVT::i32);
        SDValue ThreadID = DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
        ThreadID = DAG.getNode(
            Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
            N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_mbcnt_lo, DL: SL, VT: MVT::i32),
            N2: AllOnes, N3: ThreadID);
        if (Subtarget->isWave64())
          ThreadID = DAG.getNode(
              Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32,
              N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_mbcnt_hi, DL: SL, VT: MVT::i32),
              N2: AllOnes, N3: ThreadID);
        SDValue ShAmt = DAG.getShiftAmountConstant(
            Val: 57 - 32 - Subtarget->getWavefrontSizeLog2(), VT: MVT::i32, DL: SL);
        SDValue SrcHi = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: ThreadID, N2: ShAmt);
        CvtPtr = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: SrcHi);
        CvtPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);
        // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
        // 64-bit hi:lo value.
        SDValue FlatScratchBase = {
            DAG.getMachineNode(
                Opcode: AMDGPU::S_MOV_B64, dl: SL, VT: MVT::i64,
                Op1: DAG.getRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE, VT: MVT::i64)),
            0};
        CvtPtr = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i64, N1: CvtPtr, N2: FlatScratchBase);
      } else {
        // Build the 64-bit flat pointer as { segment offset, aperture base }.
        SDValue Aperture = getSegmentAperture(AS: SrcAS, DL: SL, DAG);
        CvtPtr = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: Aperture);
        CvtPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: CvtPtr);
      }

      if (IsNonNull || isKnownNonNull(Val: Op, DAG, TM, AddrSpace: SrcAS))
        return CvtPtr;

      // Map the segment null value to flat null.
      unsigned NullVal = AMDGPU::getNullPointerValue(AS: SrcAS);
      SDValue SegmentNullPtr = DAG.getConstant(Val: NullVal, DL: SL, VT: MVT::i32);

      SDValue NonNull =
          DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: Src, RHS: SegmentNullPtr, Cond: ISD::SETNE);

      return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::i64, N1: NonNull, N2: CvtPtr,
                         N3: FlatNullPtr);
    }
  }

  // 32-bit constant address -> 64-bit: splice in the function's known high
  // address bits (zero-extend when they are all zero).
  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      Op.getValueType() == MVT::i64) {
    const SIMachineFunctionInfo *Info =
        DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
    if (Info->get32BitAddressHighBits() == 0)
      return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i64, Operand: Src);

    SDValue Hi = DAG.getConstant(Val: Info->get32BitAddressHighBits(), DL: SL, VT: MVT::i32);
    SDValue Vec = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: Src, N2: Hi);
    return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
  }

  // 64-bit -> 32-bit constant address: drop the high half.
  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      Src.getValueType() == MVT::i64)
    return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Src);

  // global <-> flat are no-ops and never emitted.

  // Invalid casts are poison.
  return DAG.getPOISON(VT: Op->getValueType(ResNo: 0));
}
8768
// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
// the small vector and inserting them into the big vector. That is better than
// the default expansion of doing it via a stack slot. Even though the use of
// the stack slot would be optimized away afterwards, the stack slot itself
// remains.
SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(i: 0); // Destination (big) vector.
  SDValue Ins = Op.getOperand(i: 1); // Subvector being inserted.
  SDValue Idx = Op.getOperand(i: 2); // Constant start index of the insertion.
  EVT VecVT = Vec.getValueType();
  EVT InsVT = Ins.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned InsNumElts = InsVT.getVectorNumElements();
  unsigned IdxVal = Idx->getAsZExtVal();
  SDLoc SL(Op);

  // 16-bit elements starting at an even offset: operate on whole i32 lanes.
  if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
    // Insert 32-bit registers at a time.
    assert(InsNumElts % 2 == 0 && "expect legal vector types");

    unsigned VecNumElts = VecVT.getVectorNumElements();
    // Reinterpret both vectors as i32 vectors (one i32 per 16-bit pair); a
    // two-element insert degenerates to a single scalar i32.
    EVT NewVecVT =
        EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: VecNumElts / 2);
    EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                                   : EVT::getVectorVT(Context&: *DAG.getContext(),
                                                      VT: MVT::i32, NumElements: InsNumElts / 2);

    Vec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVecVT, Operand: Vec);
    Ins = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewInsVT, Operand: Ins);

    for (unsigned I = 0; I != InsNumElts / 2; ++I) {
      SDValue Elt;
      if (InsNumElts == 2) {
        Elt = Ins;
      } else {
        Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Ins,
                          N2: DAG.getConstant(Val: I, DL: SL, VT: MVT::i32));
      }
      Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: NewVecVT, N1: Vec, N2: Elt,
                        N3: DAG.getConstant(Val: IdxVal / 2 + I, DL: SL, VT: MVT::i32));
    }

    return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Vec);
  }

  // Generic path: move one element at a time.
  for (unsigned I = 0; I != InsNumElts; ++I) {
    SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Ins,
                              N2: DAG.getConstant(Val: I, DL: SL, VT: MVT::i32));
    Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: VecVT, N1: Vec, N2: Elt,
                      N3: DAG.getConstant(Val: IdxVal + I, DL: SL, VT: MVT::i32));
  }
  return Vec;
}
8823
// Custom lowering for INSERT_VECTOR_ELT. A constant index into a 4 x 16-bit
// vector is handled by splitting into two v2i16 halves; a dynamic index into
// a vector of at most 64 bits is lowered to bitmask arithmetic (the
// v_bfi_b32 pattern) instead of a stack round trip.
SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(i: 0);
  SDValue InsVal = Op.getOperand(i: 1);
  SDValue Idx = Op.getOperand(i: 2);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();
  SDLoc SL(Op);

  // Specially handle the case of v4i16 with static indexing.
  unsigned NumElts = VecVT.getVectorNumElements();
  auto *KIdx = dyn_cast<ConstantSDNode>(Val&: Idx);
  if (NumElts == 4 && EltSize == 16 && KIdx) {
    // View the vector as two i32 halves and insert into the half that
    // contains the (constant) index.
    SDValue BCVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Vec);

    SDValue LoHalf = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: BCVec,
                                 N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
    SDValue HiHalf = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: BCVec,
                                 N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));

    SDValue LoVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: LoHalf);
    SDValue HiVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i16, Operand: HiHalf);

    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;
    SDValue InsHalf = DAG.getNode(
        Opcode: ISD::INSERT_VECTOR_ELT, DL: SL, VT: MVT::v2i16, N1: InsertLo ? LoVec : HiVec,
        N2: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: InsVal),
        N3: DAG.getConstant(Val: InsertLo ? Idx : (Idx - 2), DL: SL, VT: MVT::i32));

    InsHalf = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: InsHalf);

    // Reassemble the two halves, replacing the half that was modified.
    SDValue Concat =
        InsertLo ? DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {InsHalf, HiHalf})
                 : DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {LoHalf, InsHalf});

    return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: Concat);
  }

  // Static indexing does not lower to stack access, and hence there is no need
  // for special custom lowering to avoid stack access.
  if (isa<ConstantSDNode>(Val: Idx))
    return SDValue();

  // Avoid stack access for dynamic indexing by custom lowering to
  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec

  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");

  MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize);

  // Convert vector index to bit-index and get the required bit mask.
  assert(isPowerOf2_32(EltSize));
  const auto EltMask = maskTrailingOnes<uint64_t>(N: EltSize);
  SDValue ScaleFactor = DAG.getConstant(Val: Log2_32(Value: EltSize), DL: SL, VT: MVT::i32);
  SDValue ScaledIdx = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Idx, N2: ScaleFactor);
  SDValue BFM = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: IntVT,
                            N1: DAG.getConstant(Val: EltMask, DL: SL, VT: IntVT), N2: ScaledIdx);

  // 1. Create a congruent vector with the target value in each element.
  SDValue ExtVal = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT,
                               Operand: DAG.getSplatBuildVector(VT: VecVT, DL: SL, Op: InsVal));

  // 2. Mask off all other indices except the required index within (1).
  SDValue LHS = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT, N1: BFM, N2: ExtVal);

  // 3. Mask off the required index within the target vector.
  SDValue BCVec = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec);
  SDValue RHS =
      DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IntVT, N1: DAG.getNOT(DL: SL, Val: BFM, VT: IntVT), N2: BCVec);

  // 4. Get (2) and (3) ORed into the target vector.
  SDValue BFI =
      DAG.getNode(Opcode: ISD::OR, DL: SL, VT: IntVT, N1: LHS, N2: RHS, Flags: SDNodeFlags::Disjoint);

  return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecVT, Operand: BFI);
}
8903
// Custom lowering for EXTRACT_VECTOR_ELT. 128/256/512-bit vectors are split
// in half and the half containing the element is selected; vectors of at most
// 64 bits are bitcast to an integer and the element is shifted out.
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SDLoc SL(Op);

  EVT ResultVT = Op.getValueType();
  SDValue Vec = Op.getOperand(i: 0);
  SDValue Idx = Op.getOperand(i: 1);
  EVT VecVT = Vec.getValueType();
  unsigned VecSize = VecVT.getSizeInBits();
  EVT EltVT = VecVT.getVectorElementType();

  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);

  // Make sure we do any optimizations that will make it easier to fold
  // source modifiers before obscuring it with bit operations.

  // XXX - Why doesn't this get called when vector_shuffle is expanded?
  if (SDValue Combined = performExtractVectorEltCombine(N: Op.getNode(), DCI))
    return Combined;

  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
    SDValue Lo, Hi;
    auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VT: VecVT);

    // Rebuild the two halves through i64-element extracts so each half stays
    // in register-sized pieces.
    if (VecSize == 128) {
      SDValue V2 = DAG.getBitcast(VT: MVT::v2i64, V: Vec);
      Lo = DAG.getBitcast(VT: LoVT,
                          V: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
                                       N2: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32)));
      Hi = DAG.getBitcast(VT: HiVT,
                          V: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
                                       N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32)));
    } else if (VecSize == 256) {
      SDValue V2 = DAG.getBitcast(VT: MVT::v4i64, V: Vec);
      SDValue Parts[4];
      for (unsigned P = 0; P < 4; ++P) {
        Parts[P] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
                               N2: DAG.getConstant(Val: P, DL: SL, VT: MVT::i32));
      }

      Lo = DAG.getBitcast(VT: LoVT, V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i64,
                                              N1: Parts[0], N2: Parts[1]));
      Hi = DAG.getBitcast(VT: HiVT, V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i64,
                                              N1: Parts[2], N2: Parts[3]));
    } else {
      assert(VecSize == 512);

      SDValue V2 = DAG.getBitcast(VT: MVT::v8i64, V: Vec);
      SDValue Parts[8];
      for (unsigned P = 0; P < 8; ++P) {
        Parts[P] = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i64, N1: V2,
                               N2: DAG.getConstant(Val: P, DL: SL, VT: MVT::i32));
      }

      Lo = DAG.getBitcast(VT: LoVT,
                          V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v4i64,
                                       N1: Parts[0], N2: Parts[1], N3: Parts[2], N4: Parts[3]));
      Hi = DAG.getBitcast(VT: HiVT,
                          V: DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v4i64,
                                       N1: Parts[4], N2: Parts[5], N3: Parts[6], N4: Parts[7]));
    }

    // Select the half that holds the element (Idx > NElem/2 - 1 means the
    // high half) and extract with the index reduced modulo NElem/2.
    EVT IdxVT = Idx.getValueType();
    unsigned NElem = VecVT.getVectorNumElements();
    assert(isPowerOf2_32(NElem));
    SDValue IdxMask = DAG.getConstant(Val: NElem / 2 - 1, DL: SL, VT: IdxVT);
    SDValue NewIdx = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: IdxVT, N1: Idx, N2: IdxMask);
    SDValue Half = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IdxMask, True: Hi, False: Lo, Cond: ISD::SETUGT);
    return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Half, N2: NewIdx);
  }

  assert(VecSize <= 64);

  MVT IntVT = MVT::getIntegerVT(BitWidth: VecSize);

  // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
  SDValue VecBC = peekThroughBitcasts(V: Vec);
  if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
    SDValue Src = VecBC.getOperand(i: 0);
    Src = DAG.getBitcast(VT: Src.getValueType().changeTypeToInteger(), V: Src);
    Vec = DAG.getAnyExtOrTrunc(Op: Src, DL: SL, VT: IntVT);
  }

  unsigned EltSize = EltVT.getSizeInBits();
  assert(isPowerOf2_32(EltSize));

  SDValue ScaleFactor = DAG.getConstant(Val: Log2_32(Value: EltSize), DL: SL, VT: MVT::i32);

  // Convert vector index to bit-index (* EltSize)
  SDValue ScaledIdx = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Idx, N2: ScaleFactor);

  // Shift the requested element down to the low bits of the integer view.
  SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: IntVT, Operand: Vec);
  SDValue Elt = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: IntVT, N1: BC, N2: ScaledIdx);

  // Floating-point 16-bit results go through an explicit i16 truncate +
  // bitcast rather than an any-ext/trunc of the float type.
  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
    SDValue Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i16, Operand: Elt);
    return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: ResultVT, Operand: Result);
  }

  return DAG.getAnyExtOrTrunc(Op: Elt, DL: SL, VT: ResultVT);
}
9005
9006static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
9007 assert(Elt % 2 == 0);
9008 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
9009}
9010
9011static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
9012 assert(Elt % 2 == 0);
9013 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
9014 !(Mask[Elt + 1] & 1);
9015}
9016
// Lower VECTOR_SHUFFLE by splitting the result into 2-element pieces:
// consecutive even-aligned pairs become EXTRACT_SUBVECTOR, odd-to-even pairs
// become a small 2-element shuffle when that is legal, and everything else is
// scalarized into a build_vector of extracted elements.
SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  EVT ResultVT = Op.getValueType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val&: Op);
  MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
  const int NewSrcNumElts = 2;
  MVT PackVT = MVT::getVectorVT(VT: EltVT, NumElements: NewSrcNumElts);
  int SrcNumElts = Op.getOperand(i: 0).getValueType().getVectorNumElements();

  // Break up the shuffle into registers sized pieces.
  //
  // We're trying to form sub-shuffles that the register allocation pipeline
  // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
  // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
  // pair of copies into a consecutive register copy, so use the ordinary
  // extract_vector_elt lowering unless we can use the shuffle.
  //
  // TODO: This is a bit of hack, and we should probably always use
  // extract_subvector for the largest possible subvector we can (or at least
  // use it for PackVT aligned pieces). However we have worse support for
  // combines on them don't directly treat extract_subvector / insert_subvector
  // as legal. The DAG scheduler also ends up doing a worse job with the
  // extract_subvectors.
  const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;

  // vector_shuffle <0,1,6,7> lhs, rhs
  // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
  //
  // vector_shuffle <6,7,2,3> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
  //
  // vector_shuffle <6,7,0,1> lhs, rhs
  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)

  // Avoid scalarizing when both halves are reading from consecutive elements.

  // If we're treating 2 element shuffles as legal, also create odd-to-even
  // shuffles of neighboring pairs.
  //
  // vector_shuffle <3,2,7,6> lhs, rhs
  // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
  //                   vector_shuffle <1, 0> (extract_subvector rhs, 2)

  SmallVector<SDValue, 16> Pieces;
  for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
    if (ShouldUseConsecutiveExtract &&
        elementPairIsContiguous(Mask: SVN->getMask(), Elt: I)) {
      // Case 1: the pair reads two consecutive, even-aligned elements of one
      // source — extract the aligned subvector directly.
      const int Idx = SVN->getMaskElt(Idx: I);
      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
      SDValue SubVec = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT,
                                   N1: SVN->getOperand(Num: VecIdx),
                                   N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
      Pieces.push_back(Elt: SubVec);
    } else if (elementPairIsOddToEven(Mask: SVN->getMask(), Elt: I) &&
               isOperationLegal(Op: ISD::VECTOR_SHUFFLE, VT: PackVT)) {
      // Case 2: odd-to-even pair — extract the even-aligned pieces containing
      // both elements and emit a small 2-element shuffle over them.
      int Idx0 = SVN->getMaskElt(Idx: I);
      int Idx1 = SVN->getMaskElt(Idx: I + 1);

      SDValue SrcOp0 = SVN->getOperand(Num: 0);
      SDValue SrcOp1 = SrcOp0;
      if (Idx0 >= SrcNumElts) {
        SrcOp0 = SVN->getOperand(Num: 1);
        Idx0 -= SrcNumElts;
      }

      if (Idx1 >= SrcNumElts) {
        SrcOp1 = SVN->getOperand(Num: 1);
        Idx1 -= SrcNumElts;
      }

      int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
      int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);

      // Extract nearest even aligned piece.
      SDValue SubVec0 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT, N1: SrcOp0,
                                    N2: DAG.getConstant(Val: AlignedIdx0, DL: SL, VT: MVT::i32));
      SDValue SubVec1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SL, VT: PackVT, N1: SrcOp1,
                                    N2: DAG.getConstant(Val: AlignedIdx1, DL: SL, VT: MVT::i32));

      int NewMaskIdx0 = Idx0 - AlignedIdx0;
      int NewMaskIdx1 = Idx1 - AlignedIdx1;

      SDValue Result0 = SubVec0;
      SDValue Result1 = SubVec0;

      if (SubVec0 != SubVec1) {
        // Distinct pieces: the second element indexes into the second operand
        // of the new shuffle.
        NewMaskIdx1 += NewSrcNumElts;
        Result1 = SubVec1;
      } else {
        // Same piece on both sides: shuffle within one operand.
        Result1 = DAG.getPOISON(VT: PackVT);
      }

      SDValue Shuf = DAG.getVectorShuffle(VT: PackVT, dl: SL, N1: Result0, N2: Result1,
                                          Mask: {NewMaskIdx0, NewMaskIdx1});
      Pieces.push_back(Elt: Shuf);
    } else {
      // Case 3: fall back to scalarizing the pair into two element extracts.
      const int Idx0 = SVN->getMaskElt(Idx: I);
      const int Idx1 = SVN->getMaskElt(Idx: I + 1);
      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;

      SDValue Vec0 = SVN->getOperand(Num: VecIdx0);
      SDValue Elt0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec0,
                                 N2: DAG.getSignedConstant(Val: EltIdx0, DL: SL, VT: MVT::i32));

      SDValue Vec1 = SVN->getOperand(Num: VecIdx1);
      SDValue Elt1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec1,
                                 N2: DAG.getSignedConstant(Val: EltIdx1, DL: SL, VT: MVT::i32));
      Pieces.push_back(Elt: DAG.getBuildVector(VT: PackVT, DL: SL, Ops: {Elt0, Elt1}));
    }
  }

  return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SL, VT: ResultVT, Ops: Pieces);
}
9135
9136SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
9137 SelectionDAG &DAG) const {
9138 SDValue SVal = Op.getOperand(i: 0);
9139 EVT ResultVT = Op.getValueType();
9140 EVT SValVT = SVal.getValueType();
9141 SDValue UndefVal = DAG.getPOISON(VT: SValVT);
9142 SDLoc SL(Op);
9143
9144 SmallVector<SDValue, 8> VElts;
9145 VElts.push_back(Elt: SVal);
9146 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
9147 VElts.push_back(Elt: UndefVal);
9148
9149 return DAG.getBuildVector(VT: ResultVT, DL: SL, Ops: VElts);
9150}
9151
9152SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
9153 SelectionDAG &DAG) const {
9154 SDLoc SL(Op);
9155 EVT VT = Op.getValueType();
9156
9157 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
9158 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
9159
9160 SDValue Lo = Op.getOperand(i: 0);
9161 SDValue Hi = Op.getOperand(i: 1);
9162
9163 // Avoid adding defined bits with the zero_extend.
9164 if (Hi.isUndef()) {
9165 Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Lo);
9166 SDValue ExtLo = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Lo);
9167 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ExtLo);
9168 }
9169
9170 Hi = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Hi);
9171 Hi = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Hi);
9172
9173 SDValue ShlHi = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT: MVT::i32, N1: Hi,
9174 N2: DAG.getConstant(Val: 16, DL: SL, VT: MVT::i32));
9175 if (Lo.isUndef())
9176 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: ShlHi);
9177
9178 Lo = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Lo);
9179 Lo = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT: MVT::i32, Operand: Lo);
9180
9181 SDValue Or =
9182 DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Lo, N2: ShlHi, Flags: SDNodeFlags::Disjoint);
9183 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Or);
9184 }
9185
9186 // Split into 2-element chunks.
9187 const unsigned NumParts = VT.getVectorNumElements() / 2;
9188 EVT PartVT = MVT::getVectorVT(VT: VT.getVectorElementType().getSimpleVT(), NumElements: 2);
9189 MVT PartIntVT = MVT::getIntegerVT(BitWidth: PartVT.getSizeInBits());
9190
9191 SmallVector<SDValue> Casts;
9192 for (unsigned P = 0; P < NumParts; ++P) {
9193 SDValue Vec = DAG.getBuildVector(
9194 VT: PartVT, DL: SL, Ops: {Op.getOperand(i: P * 2), Op.getOperand(i: P * 2 + 1)});
9195 Casts.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: PartIntVT, Operand: Vec));
9196 }
9197
9198 SDValue Blend =
9199 DAG.getBuildVector(VT: MVT::getVectorVT(VT: PartIntVT, NumElements: NumParts), DL: SL, Ops: Casts);
9200 return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Blend);
9201}
9202
9203bool SITargetLowering::isOffsetFoldingLegal(
9204 const GlobalAddressSDNode *GA) const {
9205 // OSes that use ELF REL relocations (instead of RELA) can only store a
9206 // 32-bit addend in the instruction, so it is not safe to allow offset folding
9207 // which can create arbitrary 64-bit addends. (This is only a problem for
9208 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
9209 // the high 32 bits of the addend.)
9210 //
9211 // This should be kept in sync with how HasRelocationAddend is initialized in
9212 // the constructor of ELFAMDGPUAsmBackend.
9213 if (!Subtarget->isAmdHsaOS())
9214 return false;
9215
9216 // We can fold offsets for anything that doesn't require a GOT relocation.
9217 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
9218 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
9219 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
9220 !shouldEmitGOTReloc(GV: GA->getGlobal());
9221}
9222
9223static SDValue
9224buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
9225 const SDLoc &DL, int64_t Offset, EVT PtrVT,
9226 unsigned GAFlags = SIInstrInfo::MO_NONE) {
9227 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
9228 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
9229 // lowered to the following code sequence:
9230 //
9231 // For constant address space:
9232 // s_getpc_b64 s[0:1]
9233 // s_add_u32 s0, s0, $symbol
9234 // s_addc_u32 s1, s1, 0
9235 //
9236 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
9237 // a fixup or relocation is emitted to replace $symbol with a literal
9238 // constant, which is a pc-relative offset from the encoding of the $symbol
9239 // operand to the global variable.
9240 //
9241 // For global address space:
9242 // s_getpc_b64 s[0:1]
9243 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
9244 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
9245 //
9246 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
9247 // fixups or relocations are emitted to replace $symbol@*@lo and
9248 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
9249 // which is a 64-bit pc-relative offset from the encoding of the $symbol
9250 // operand to the global variable.
9251 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
9252 assert(GAFlags != SIInstrInfo::MO_NONE);
9253
9254 SDValue Ptr =
9255 DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i64, offset: Offset, TargetFlags: GAFlags + 2);
9256 return DAG.getNode(Opcode: AMDGPUISD::PC_ADD_REL_OFFSET64, DL, VT: PtrVT, Operand: Ptr);
9257 }
9258
9259 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: Offset, TargetFlags: GAFlags);
9260 SDValue PtrHi;
9261 if (GAFlags == SIInstrInfo::MO_NONE)
9262 PtrHi = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32);
9263 else
9264 PtrHi = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: Offset, TargetFlags: GAFlags + 1);
9265 return DAG.getNode(Opcode: AMDGPUISD::PC_ADD_REL_OFFSET, DL, VT: PtrVT, N1: PtrLo, N2: PtrHi);
9266}
9267
9268SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
9269 SDValue Op,
9270 SelectionDAG &DAG) const {
9271 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Val&: Op);
9272 SDLoc DL(GSD);
9273 EVT PtrVT = Op.getValueType();
9274
9275 const GlobalValue *GV = GSD->getGlobal();
9276 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
9277 shouldUseLDSConstAddress(GV)) ||
9278 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
9279 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
9280 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
9281 GV->hasExternalLinkage()) {
9282 const GlobalVariable &GVar = *cast<GlobalVariable>(Val: GV);
9283 // HIP uses an unsized array `extern __shared__ T s[]` or similar
9284 // zero-sized type in other languages to declare the dynamic shared
9285 // memory which size is not known at the compile time. They will be
9286 // allocated by the runtime and placed directly after the static
9287 // allocated ones. They all share the same offset.
9288 if (GVar.getGlobalSize(DL: GVar.getDataLayout()) == 0) {
9289 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
9290 // Adjust alignment for that dynamic shared memory array.
9291 Function &F = DAG.getMachineFunction().getFunction();
9292 MFI->setDynLDSAlign(F, GV: GVar);
9293 MFI->setUsesDynamicLDS(true);
9294 return SDValue(
9295 DAG.getMachineNode(Opcode: AMDGPU::GET_GROUPSTATICSIZE, dl: DL, VT: PtrVT), 0);
9296 }
9297 }
9298 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
9299 }
9300
9301 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
9302 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: GSD->getOffset(),
9303 TargetFlags: SIInstrInfo::MO_ABS32_LO);
9304 return DAG.getNode(Opcode: AMDGPUISD::LDS, DL, VT: MVT::i32, Operand: GA);
9305 }
9306
9307 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9308 if (Subtarget->has64BitLiterals()) {
9309 SDValue Addr = DAG.getTargetGlobalAddress(
9310 GV, DL, VT: MVT::i64, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS64);
9311 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B64, dl: DL, VT: MVT::i64, Op1: Addr),
9312 0);
9313 }
9314
9315 SDValue AddrLo = DAG.getTargetGlobalAddress(
9316 GV, DL, VT: MVT::i32, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS32_LO);
9317 AddrLo = {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: AddrLo), 0};
9318
9319 SDValue AddrHi = DAG.getTargetGlobalAddress(
9320 GV, DL, VT: MVT::i32, offset: GSD->getOffset(), TargetFlags: SIInstrInfo::MO_ABS32_HI);
9321 AddrHi = {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: AddrHi), 0};
9322
9323 return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL, VT: MVT::i64, N1: AddrLo, N2: AddrHi);
9324 }
9325
9326 if (shouldEmitFixup(GV))
9327 return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT);
9328
9329 if (shouldEmitPCReloc(GV))
9330 return buildPCRelGlobalAddress(DAG, GV, DL, Offset: GSD->getOffset(), PtrVT,
9331 GAFlags: SIInstrInfo::MO_REL32);
9332
9333 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, Offset: 0, PtrVT,
9334 GAFlags: SIInstrInfo::MO_GOTPCREL32);
9335 PointerType *PtrTy =
9336 PointerType::get(C&: *DAG.getContext(), AddressSpace: AMDGPUAS::CONSTANT_ADDRESS);
9337 const DataLayout &DataLayout = DAG.getDataLayout();
9338 Align Alignment = DataLayout.getABITypeAlign(Ty: PtrTy);
9339 MachinePointerInfo PtrInfo =
9340 MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction());
9341
9342 return DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: GOTAddr, PtrInfo, Alignment,
9343 MMOFlags: MachineMemOperand::MODereferenceable |
9344 MachineMemOperand::MOInvariant);
9345}
9346
9347SDValue SITargetLowering::LowerExternalSymbol(SDValue Op,
9348 SelectionDAG &DAG) const {
9349 // TODO: Handle this. It should be mostly the same as LowerGlobalAddress.
9350 const Function &Fn = DAG.getMachineFunction().getFunction();
9351 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9352 Fn, "unsupported external symbol", Op.getDebugLoc()));
9353 return DAG.getPOISON(VT: Op.getValueType());
9354}
9355
9356SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
9357 const SDLoc &DL, SDValue V) const {
9358 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
9359 // the destination register.
9360 //
9361 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
9362 // so we will end up with redundant moves to m0.
9363 //
9364 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
9365
9366 // A Null SDValue creates a glue result.
9367 SDNode *M0 = DAG.getMachineNode(Opcode: AMDGPU::SI_INIT_M0, dl: DL, VT1: MVT::Other, VT2: MVT::Glue,
9368 Op1: V, Op2: Chain);
9369 return SDValue(M0, 0);
9370}
9371
9372SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
9373 MVT VT,
9374 unsigned Offset) const {
9375 SDLoc SL(Op);
9376 SDValue Param = lowerKernargMemParameter(
9377 DAG, VT: MVT::i32, MemVT: MVT::i32, SL, Chain: DAG.getEntryNode(), Offset, Alignment: Align(4), Signed: false);
9378 // The local size values will have the hi 16-bits as zero.
9379 return DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: MVT::i32, N1: Param,
9380 N2: DAG.getValueType(VT));
9381}
9382
9383static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
9384 EVT VT) {
9385 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9386 DAG.getMachineFunction().getFunction(),
9387 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
9388 return DAG.getPOISON(VT);
9389}
9390
9391static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
9392 EVT VT) {
9393 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9394 DAG.getMachineFunction().getFunction(),
9395 "intrinsic not supported on subtarget", DL.getDebugLoc()));
9396 return DAG.getPOISON(VT);
9397}
9398
9399static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
9400 ArrayRef<SDValue> Elts) {
9401 assert(!Elts.empty());
9402 MVT Type;
9403 unsigned NumElts = Elts.size();
9404
9405 if (NumElts <= 12) {
9406 Type = MVT::getVectorVT(VT: MVT::f32, NumElements: NumElts);
9407 } else {
9408 assert(Elts.size() <= 16);
9409 Type = MVT::v16f32;
9410 NumElts = 16;
9411 }
9412
9413 SmallVector<SDValue, 16> VecElts(NumElts);
9414 for (unsigned i = 0; i < Elts.size(); ++i) {
9415 SDValue Elt = Elts[i];
9416 if (Elt.getValueType() != MVT::f32)
9417 Elt = DAG.getBitcast(VT: MVT::f32, V: Elt);
9418 VecElts[i] = Elt;
9419 }
9420 for (unsigned i = Elts.size(); i < NumElts; ++i)
9421 VecElts[i] = DAG.getPOISON(VT: MVT::f32);
9422
9423 if (NumElts == 1)
9424 return VecElts[0];
9425 return DAG.getBuildVector(VT: Type, DL, Ops: VecElts);
9426}
9427
9428static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9429 SDValue Src, int ExtraElts) {
9430 EVT SrcVT = Src.getValueType();
9431
9432 SmallVector<SDValue, 8> Elts;
9433
9434 if (SrcVT.isVector())
9435 DAG.ExtractVectorElements(Op: Src, Args&: Elts);
9436 else
9437 Elts.push_back(Elt: Src);
9438
9439 SDValue Undef = DAG.getPOISON(VT: SrcVT.getScalarType());
9440 while (ExtraElts--)
9441 Elts.push_back(Elt: Undef);
9442
9443 return DAG.getBuildVector(VT: CastVT, DL, Ops: Elts);
9444}
9445
9446// Re-construct the required return value for a image load intrinsic.
9447// This is more complicated due to the optional use TexFailCtrl which means the
9448// required return type is an aggregate
9449static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
9450 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9451 bool Unpacked, bool IsD16, int DMaskPop,
9452 int NumVDataDwords, bool IsAtomicPacked16Bit,
9453 const SDLoc &DL) {
9454 // Determine the required return type. This is the same regardless of
9455 // IsTexFail flag
9456 EVT ReqRetVT = ResultTypes[0];
9457 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9458 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9459 ? (ReqRetNumElts + 1) / 2
9460 : ReqRetNumElts;
9461
9462 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9463
9464 MVT DataDwordVT =
9465 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(VT: MVT::i32, NumElements: NumDataDwords);
9466
9467 MVT MaskPopVT =
9468 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(VT: MVT::i32, NumElements: MaskPopDwords);
9469
9470 SDValue Data(Result, 0);
9471 SDValue TexFail;
9472
9473 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9474 SDValue ZeroIdx = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
9475 if (MaskPopVT.isVector()) {
9476 Data = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MaskPopVT,
9477 N1: SDValue(Result, 0), N2: ZeroIdx);
9478 } else {
9479 Data = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MaskPopVT,
9480 N1: SDValue(Result, 0), N2: ZeroIdx);
9481 }
9482 }
9483
9484 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9485 Data = padEltsToUndef(DAG, DL, CastVT: DataDwordVT, Src: Data,
9486 ExtraElts: NumDataDwords - MaskPopDwords);
9487
9488 if (IsD16)
9489 Data = adjustLoadValueTypeImpl(Result: Data, LoadVT: ReqRetVT, DL, DAG, Unpacked);
9490
9491 EVT LegalReqRetVT = ReqRetVT;
9492 if (!ReqRetVT.isVector()) {
9493 if (!Data.getValueType().isInteger())
9494 Data = DAG.getNode(Opcode: ISD::BITCAST, DL,
9495 VT: Data.getValueType().changeTypeToInteger(), Operand: Data);
9496 Data = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ReqRetVT.changeTypeToInteger(), Operand: Data);
9497 } else {
9498 // We need to widen the return vector to a legal type
9499 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9500 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9501 LegalReqRetVT =
9502 EVT::getVectorVT(Context&: *DAG.getContext(), VT: ReqRetVT.getVectorElementType(),
9503 NumElements: ReqRetVT.getVectorNumElements() + 1);
9504 }
9505 }
9506 Data = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LegalReqRetVT, Operand: Data);
9507
9508 if (IsTexFail) {
9509 TexFail =
9510 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: SDValue(Result, 0),
9511 N2: DAG.getConstant(Val: MaskPopDwords, DL, VT: MVT::i32));
9512
9513 return DAG.getMergeValues(Ops: {Data, TexFail, SDValue(Result, 1)}, dl: DL);
9514 }
9515
9516 if (Result->getNumValues() == 1)
9517 return Data;
9518
9519 return DAG.getMergeValues(Ops: {Data, SDValue(Result, 1)}, dl: DL);
9520}
9521
9522static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9523 SDValue *LWE, bool &IsTexFail) {
9524 auto *TexFailCtrlConst = cast<ConstantSDNode>(Val: TexFailCtrl.getNode());
9525
9526 uint64_t Value = TexFailCtrlConst->getZExtValue();
9527 if (Value) {
9528 IsTexFail = true;
9529 }
9530
9531 SDLoc DL(TexFailCtrlConst);
9532 *TFE = DAG.getTargetConstant(Val: (Value & 0x1) ? 1 : 0, DL, VT: MVT::i32);
9533 Value &= ~(uint64_t)0x1;
9534 *LWE = DAG.getTargetConstant(Val: (Value & 0x2) ? 1 : 0, DL, VT: MVT::i32);
9535 Value &= ~(uint64_t)0x2;
9536
9537 return Value == 0;
9538}
9539
9540static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
9541 MVT PackVectorVT,
9542 SmallVectorImpl<SDValue> &PackedAddrs,
9543 unsigned DimIdx, unsigned EndIdx,
9544 unsigned NumGradients) {
9545 SDLoc DL(Op);
9546 for (unsigned I = DimIdx; I < EndIdx; I++) {
9547 SDValue Addr = Op.getOperand(i: I);
9548
9549 // Gradients are packed with undef for each coordinate.
9550 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9551 // 1D: undef,dx/dh; undef,dx/dv
9552 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9553 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9554 if (((I + 1) >= EndIdx) ||
9555 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9556 I == DimIdx + NumGradients - 1))) {
9557 if (Addr.getValueType() != MVT::i16)
9558 Addr = DAG.getBitcast(VT: MVT::i16, V: Addr);
9559 Addr = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Addr);
9560 } else {
9561 Addr = DAG.getBuildVector(VT: PackVectorVT, DL, Ops: {Addr, Op.getOperand(i: I + 1)});
9562 I++;
9563 }
9564 Addr = DAG.getBitcast(VT: MVT::f32, V: Addr);
9565 PackedAddrs.push_back(Elt: Addr);
9566 }
9567}
9568
9569SDValue SITargetLowering::lowerImage(SDValue Op,
9570 const AMDGPU::ImageDimIntrinsicInfo *Intr,
9571 SelectionDAG &DAG, bool WithChain) const {
9572 SDLoc DL(Op);
9573 MachineFunction &MF = DAG.getMachineFunction();
9574 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9575 unsigned IntrOpcode = Intr->BaseOpcode;
9576 // For image atomic: use no-return opcode if result is unused.
9577 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
9578 !Op.getNode()->hasAnyUseOfValue(Value: 0))
9579 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
9580 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9581 AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: IntrOpcode);
9582 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(DimEnum: Intr->Dim);
9583 bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI: *Subtarget);
9584 bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: *Subtarget);
9585 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
9586
9587 SmallVector<EVT, 3> ResultTypes(Op->values());
9588 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9589 if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
9590 ResultTypes.erase(CI: &ResultTypes[0]);
9591
9592 bool IsD16 = false;
9593 bool IsG16 = false;
9594 bool IsA16 = false;
9595 SDValue VData;
9596 int NumVDataDwords = 0;
9597 bool AdjustRetType = false;
9598 bool IsAtomicPacked16Bit = false;
9599
9600 // Offset of intrinsic arguments
9601 const unsigned ArgOffset = WithChain ? 2 : 1;
9602
9603 unsigned DMask;
9604 unsigned DMaskLanes = 0;
9605
9606 if (BaseOpcode->Atomic) {
9607 VData = Op.getOperand(i: 2);
9608
9609 IsAtomicPacked16Bit =
9610 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9611 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9612 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9613 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9614
9615 bool Is64Bit = VData.getValueSizeInBits() == 64;
9616 if (BaseOpcode->AtomicX2) {
9617 SDValue VData2 = Op.getOperand(i: 3);
9618 VData = DAG.getBuildVector(VT: Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9619 Ops: {VData, VData2});
9620 if (Is64Bit)
9621 VData = DAG.getBitcast(VT: MVT::v4i32, V: VData);
9622
9623 if (!BaseOpcode->NoReturn)
9624 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9625
9626 DMask = Is64Bit ? 0xf : 0x3;
9627 NumVDataDwords = Is64Bit ? 4 : 2;
9628 } else {
9629 DMask = Is64Bit ? 0x3 : 0x1;
9630 NumVDataDwords = Is64Bit ? 2 : 1;
9631 }
9632 } else {
9633 DMask = Op->getConstantOperandVal(Num: ArgOffset + Intr->DMaskIndex);
9634 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(Value: DMask);
9635
9636 if (BaseOpcode->Store) {
9637 VData = Op.getOperand(i: 2);
9638
9639 MVT StoreVT = VData.getSimpleValueType();
9640 if (StoreVT.getScalarType() == MVT::f16) {
9641 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9642 return Op; // D16 is unsupported for this instruction
9643
9644 IsD16 = true;
9645 VData = handleD16VData(VData, DAG, ImageStore: true);
9646 }
9647
9648 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9649 } else if (!BaseOpcode->NoReturn) {
9650 // Work out the num dwords based on the dmask popcount and underlying type
9651 // and whether packing is supported.
9652 MVT LoadVT = ResultTypes[0].getSimpleVT();
9653 if (LoadVT.getScalarType() == MVT::f16) {
9654 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9655 return Op; // D16 is unsupported for this instruction
9656
9657 IsD16 = true;
9658 }
9659
9660 // Confirm that the return type is large enough for the dmask specified
9661 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9662 (!LoadVT.isVector() && DMaskLanes > 1))
9663 return Op;
9664
9665 // The sq block of gfx8 and gfx9 do not estimate register use correctly
9666 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9667 // instructions.
9668 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9669 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9670 NumVDataDwords = (DMaskLanes + 1) / 2;
9671 else
9672 NumVDataDwords = DMaskLanes;
9673
9674 AdjustRetType = true;
9675 }
9676 }
9677
9678 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9679 SmallVector<SDValue, 4> VAddrs;
9680
9681 // Check for 16 bit addresses or derivatives and pack if true.
9682 MVT VAddrVT =
9683 Op.getOperand(i: ArgOffset + Intr->GradientStart).getSimpleValueType();
9684 MVT VAddrScalarVT = VAddrVT.getScalarType();
9685 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9686 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9687
9688 VAddrVT = Op.getOperand(i: ArgOffset + Intr->CoordStart).getSimpleValueType();
9689 VAddrScalarVT = VAddrVT.getScalarType();
9690 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9691 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9692
9693 // Push back extra arguments.
9694 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9695 if (IsA16 && (Op.getOperand(i: ArgOffset + I).getValueType() == MVT::f16)) {
9696 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9697 // Special handling of bias when A16 is on. Bias is of type half but
9698 // occupies full 32-bit.
9699 SDValue Bias = DAG.getBuildVector(
9700 VT: MVT::v2f16, DL,
9701 Ops: {Op.getOperand(i: ArgOffset + I), DAG.getPOISON(VT: MVT::f16)});
9702 VAddrs.push_back(Elt: Bias);
9703 } else {
9704 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9705 "Bias needs to be converted to 16 bit in A16 mode");
9706 VAddrs.push_back(Elt: Op.getOperand(i: ArgOffset + I));
9707 }
9708 }
9709
9710 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9711 // 16 bit gradients are supported, but are tied to the A16 control
9712 // so both gradients and addresses must be 16 bit
9713 LLVM_DEBUG(
9714 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9715 "require 16 bit args for both gradients and addresses");
9716 return Op;
9717 }
9718
9719 if (IsA16) {
9720 if (!ST->hasA16()) {
9721 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9722 "support 16 bit addresses\n");
9723 return Op;
9724 }
9725 }
9726
9727 // We've dealt with incorrect input so we know that if IsA16, IsG16
9728 // are set then we have to compress/pack operands (either address,
9729 // gradient or both)
9730 // In the case where a16 and gradients are tied (no G16 support) then we
9731 // have already verified that both IsA16 and IsG16 are true
9732 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9733 // Activate g16
9734 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9735 AMDGPU::getMIMGG16MappingInfo(G: Intr->BaseOpcode);
9736 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9737 }
9738
9739 // Add gradients (packed or unpacked)
9740 if (IsG16) {
9741 // Pack the gradients
9742 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9743 packImage16bitOpsToDwords(DAG, Op, PackVectorVT: GradPackVectorVT, PackedAddrs&: VAddrs,
9744 DimIdx: ArgOffset + Intr->GradientStart,
9745 EndIdx: ArgOffset + Intr->CoordStart, NumGradients: Intr->NumGradients);
9746 } else {
9747 for (unsigned I = ArgOffset + Intr->GradientStart;
9748 I < ArgOffset + Intr->CoordStart; I++)
9749 VAddrs.push_back(Elt: Op.getOperand(i: I));
9750 }
9751
9752 // Add addresses (packed or unpacked)
9753 if (IsA16) {
9754 packImage16bitOpsToDwords(DAG, Op, PackVectorVT: AddrPackVectorVT, PackedAddrs&: VAddrs,
9755 DimIdx: ArgOffset + Intr->CoordStart, EndIdx: VAddrEnd,
9756 NumGradients: 0 /* No gradients */);
9757 } else {
9758 // Add uncompressed address
9759 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9760 VAddrs.push_back(Elt: Op.getOperand(i: I));
9761 }
9762
9763 // If the register allocator cannot place the address registers contiguously
9764 // without introducing moves, then using the non-sequential address encoding
9765 // is always preferable, since it saves VALU instructions and is usually a
9766 // wash in terms of code size or even better.
9767 //
9768 // However, we currently have no way of hinting to the register allocator that
9769 // MIMG addresses should be placed contiguously when it is possible to do so,
9770 // so force non-NSA for the common 2-address case as a heuristic.
9771 //
9772 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9773 // allocation when possible.
9774 //
9775 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9776 // set of the remaining addresses.
9777 const unsigned NSAMaxSize = ST->getNSAMaxSize(HasSampler: BaseOpcode->Sampler);
9778 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9779 const bool UseNSA = ST->hasNSAEncoding() &&
9780 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9781 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9782 const bool UsePartialNSA =
9783 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9784
9785 SDValue VAddr;
9786 if (UsePartialNSA) {
9787 VAddr = getBuildDwordsVector(DAG, DL,
9788 Elts: ArrayRef(VAddrs).drop_front(N: NSAMaxSize - 1));
9789 } else if (!UseNSA) {
9790 VAddr = getBuildDwordsVector(DAG, DL, Elts: VAddrs);
9791 }
9792
9793 SDValue True = DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1);
9794 SDValue False = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1);
9795 SDValue Unorm;
9796 if (!BaseOpcode->Sampler) {
9797 Unorm = True;
9798 } else {
9799 uint64_t UnormConst =
9800 Op.getConstantOperandVal(i: ArgOffset + Intr->UnormIndex);
9801
9802 Unorm = UnormConst ? True : False;
9803 }
9804
9805 SDValue TFE;
9806 SDValue LWE;
9807 SDValue TexFail = Op.getOperand(i: ArgOffset + Intr->TexFailCtrlIndex);
9808 bool IsTexFail = false;
9809 if (!parseTexFail(TexFailCtrl: TexFail, DAG, TFE: &TFE, LWE: &LWE, IsTexFail))
9810 return Op;
9811
9812 if (IsTexFail) {
9813 if (!DMaskLanes) {
9814 // Expecting to get an error flag since TFC is on - and dmask is 0
9815 // Force dmask to be at least 1 otherwise the instruction will fail
9816 DMask = 0x1;
9817 DMaskLanes = 1;
9818 NumVDataDwords = 1;
9819 }
9820 NumVDataDwords += 1;
9821 AdjustRetType = true;
9822 }
9823
9824 // Has something earlier tagged that the return type needs adjusting
9825 // This happens if the instruction is a load or has set TexFailCtrl flags
9826 if (AdjustRetType) {
9827 // NumVDataDwords reflects the true number of dwords required in the return
9828 // type
9829 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9830 // This is a no-op load. This can be eliminated
9831 SDValue Undef = DAG.getPOISON(VT: Op.getValueType());
9832 if (isa<MemSDNode>(Val: Op))
9833 return DAG.getMergeValues(Ops: {Undef, Op.getOperand(i: 0)}, dl: DL);
9834 return Undef;
9835 }
9836
9837 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(Context&: *DAG.getContext(),
9838 VT: MVT::i32, NumElements: NumVDataDwords)
9839 : MVT::i32;
9840
9841 ResultTypes[0] = NewVT;
9842 if (ResultTypes.size() == 3) {
9843 // Original result was aggregate type used for TexFailCtrl results
9844 // The actual instruction returns as a vector type which has now been
9845 // created. Remove the aggregate result.
9846 ResultTypes.erase(CI: &ResultTypes[1]);
9847 }
9848 }
9849
9850 unsigned CPol = Op.getConstantOperandVal(i: ArgOffset + Intr->CachePolicyIndex);
9851 // Keep GLC only when the atomic's result is actually used.
9852 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
9853 CPol |= AMDGPU::CPol::GLC;
9854 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9855 AMDGPU::CPol::VOLATILE))
9856 return Op;
9857
9858 SmallVector<SDValue, 26> Ops;
9859 if (BaseOpcode->Store || BaseOpcode->Atomic)
9860 Ops.push_back(Elt: VData); // vdata
9861 if (UsePartialNSA) {
9862 append_range(C&: Ops, R: ArrayRef(VAddrs).take_front(N: NSAMaxSize - 1));
9863 Ops.push_back(Elt: VAddr);
9864 } else if (UseNSA)
9865 append_range(C&: Ops, R&: VAddrs);
9866 else
9867 Ops.push_back(Elt: VAddr);
9868 SDValue Rsrc = Op.getOperand(i: ArgOffset + Intr->RsrcIndex);
9869 EVT RsrcVT = Rsrc.getValueType();
9870 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9871 return Op;
9872 Ops.push_back(Elt: Rsrc);
9873 if (BaseOpcode->Sampler) {
9874 SDValue Samp = Op.getOperand(i: ArgOffset + Intr->SampIndex);
9875 if (Samp.getValueType() != MVT::v4i32)
9876 return Op;
9877 Ops.push_back(Elt: Samp);
9878 }
9879 Ops.push_back(Elt: DAG.getTargetConstant(Val: DMask, DL, VT: MVT::i32));
9880 if (IsGFX10Plus)
9881 Ops.push_back(Elt: DAG.getTargetConstant(Val: DimInfo->Encoding, DL, VT: MVT::i32));
9882 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9883 Ops.push_back(Elt: Unorm);
9884 Ops.push_back(Elt: DAG.getTargetConstant(Val: CPol, DL, VT: MVT::i32));
9885 Ops.push_back(Elt: IsA16 && // r128, a16 for gfx9
9886 ST->hasFeature(Feature: AMDGPU::FeatureR128A16)
9887 ? True
9888 : False);
9889 if (IsGFX10Plus)
9890 Ops.push_back(Elt: IsA16 ? True : False);
9891
9892 if (!Subtarget->hasGFX90AInsts())
9893 Ops.push_back(Elt: TFE); // tfe
9894 else if (TFE->getAsZExtVal()) {
9895 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9896 DAG.getMachineFunction().getFunction(),
9897 "TFE is not supported on this GPU", DL.getDebugLoc()));
9898 }
9899
9900 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9901 Ops.push_back(Elt: LWE); // lwe
9902 if (!IsGFX10Plus)
9903 Ops.push_back(Elt: DimInfo->DA ? True : False);
9904 if (BaseOpcode->HasD16)
9905 Ops.push_back(Elt: IsD16 ? True : False);
9906 if (isa<MemSDNode>(Val: Op))
9907 Ops.push_back(Elt: Op.getOperand(i: 0)); // chain
9908
9909 int NumVAddrDwords =
9910 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9911 int Opcode = -1;
9912
9913 if (IsGFX12Plus) {
9914 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx12,
9915 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9916 } else if (IsGFX11Plus) {
9917 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
9918 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx11NSA
9919 : AMDGPU::MIMGEncGfx11Default,
9920 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9921 } else if (IsGFX10Plus) {
9922 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode,
9923 MIMGEncoding: UseNSA ? AMDGPU::MIMGEncGfx10NSA
9924 : AMDGPU::MIMGEncGfx10Default,
9925 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9926 } else {
9927 if (Subtarget->hasGFX90AInsts()) {
9928 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx90a,
9929 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9930 if (Opcode == -1) {
9931 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9932 DAG.getMachineFunction().getFunction(),
9933 "requested image instruction is not supported on this GPU",
9934 DL.getDebugLoc()));
9935
9936 unsigned Idx = 0;
9937 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9938 for (EVT VT : OrigResultTypes) {
9939 if (VT == MVT::Other)
9940 RetValues[Idx++] = Op.getOperand(i: 0); // Chain
9941 else
9942 RetValues[Idx++] = DAG.getPOISON(VT);
9943 }
9944
9945 return DAG.getMergeValues(Ops: RetValues, dl: DL);
9946 }
9947 }
9948 if (Opcode == -1 &&
9949 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9950 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx8,
9951 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9952 if (Opcode == -1)
9953 Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: IntrOpcode, MIMGEncoding: AMDGPU::MIMGEncGfx6,
9954 VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
9955 }
9956 if (Opcode == -1)
9957 return Op;
9958
9959 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, dl: DL, ResultTys: ResultTypes, Ops);
9960 if (auto *MemOp = dyn_cast<MemSDNode>(Val&: Op)) {
9961 MachineMemOperand *MemRef = MemOp->getMemOperand();
9962 DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
9963 }
9964
9965 if (BaseOpcode->NoReturn) {
9966 if (BaseOpcode->Atomic)
9967 return DAG.getMergeValues(
9968 Ops: {DAG.getPOISON(VT: OrigResultTypes[0]), SDValue(NewNode, 0)}, dl: DL);
9969
9970 return SDValue(NewNode, 0);
9971 }
9972
9973 if (BaseOpcode->AtomicX2) {
9974 SmallVector<SDValue, 1> Elt;
9975 DAG.ExtractVectorElements(Op: SDValue(NewNode, 0), Args&: Elt, Start: 0, Count: 1);
9976 return DAG.getMergeValues(Ops: {Elt[0], SDValue(NewNode, 1)}, dl: DL);
9977 }
9978
9979 return constructRetValue(DAG, Result: NewNode, ResultTypes: OrigResultTypes, IsTexFail,
9980 Unpacked: Subtarget->hasUnpackedD16VMem(), IsD16, DMaskPop: DMaskLanes,
9981 NumVDataDwords, IsAtomicPacked16Bit, DL);
9982}
9983
// Lower an s.buffer.load intrinsic with result type VT.
//
// Uniform offsets take the scalar path and become SBUFFER_LOAD* memory
// intrinsic nodes. Divergent offsets fall back to MUBUF buffer loads
// (assuming an unswizzled buffer), split into 4-dword pieces when the
// result is wider than one MUBUF load can return.
SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
                                       SDValue Offset, SDValue CachePolicy,
                                       SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();

  const DataLayout &DataLayout = DAG.getDataLayout();
  Align Alignment =
      DataLayout.getABITypeAlign(Ty: VT.getTypeForEVT(Context&: *DAG.getContext()));

  // The access is marked dereferenceable and invariant so later passes may
  // freely reorder/CSE it.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo: MachinePointerInfo(),
      F: MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      Size: VT.getStoreSize(), BaseAlignment: Alignment);

  if (!Offset->isDivergent()) {
    SDValue Ops[] = {Rsrc, Offset, CachePolicy};

    // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
    // s_buffer_load_u16 instruction is emitted for both signed and unsigned
    // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
    // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
    if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
      SDValue BufferLoad =
          DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD_USHORT, dl: DL,
                                  VTList: DAG.getVTList(VT: MVT::i32), Ops, MemVT: VT, MMO);
      return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
    }

    // Widen vec3 load to vec4.
    if (VT.isVector() && VT.getVectorNumElements() == 3 &&
        !Subtarget->hasScalarDwordx3Loads()) {
      EVT WidenedVT =
          EVT::getVectorVT(Context&: *DAG.getContext(), VT: VT.getVectorElementType(), NumElements: 4);
      // Clone the MMO with the widened size, then extract the original three
      // elements back out of the vec4 result.
      auto WidenedOp = DAG.getMemIntrinsicNode(
          Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL, VTList: DAG.getVTList(VT: WidenedVT), Ops, MemVT: WidenedVT,
          MMO: MF.getMachineMemOperand(MMO, Offset: 0, Size: WidenedVT.getStoreSize()));
      auto Subvector = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: WidenedOp,
                                   N2: DAG.getVectorIdxConstant(Val: 0, DL));
      return Subvector;
    }

    return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_LOAD, dl: DL,
                                   VTList: DAG.getVTList(VT), Ops, MemVT: VT, MMO);
  }

  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.
  // Slots 3-5 (voffset/soffset/offset) are filled in by setBufferOffsets
  // below.
  SDValue Ops[] = {
      DAG.getEntryNode(),                   // Chain
      Rsrc,                                 // rsrc
      DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
      {},                                   // voffset
      {},                                   // soffset
      {},                                   // offset
      CachePolicy,                          // cachepolicy
      DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
  };
  // Sub-dword loads get a dedicated lowering path.
  if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
    setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3], Alignment: Align(4));
    return handleByteShortBufferLoads(DAG, LoadVT: VT, DL, Ops, MMO);
  }

  SmallVector<SDValue, 4> Loads;
  unsigned NumLoads = 1;
  MVT LoadVT = VT.getSimpleVT();
  unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
  assert((LoadVT.getScalarType() == MVT::i32 ||
          LoadVT.getScalarType() == MVT::f32));

  // v8/v16 results do not fit in one MUBUF load; split into 4-element loads.
  if (NumElts == 8 || NumElts == 16) {
    NumLoads = NumElts / 4;
    LoadVT = MVT::getVectorVT(VT: LoadVT.getScalarType(), NumElements: 4);
  }

  SDVTList VTList = DAG.getVTList(VTs: {LoadVT, MVT::Other});

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  setBufferOffsets(CombinedOffset: Offset, DAG, Offsets: &Ops[3],
                   Alignment: NumLoads > 1 ? Align(16 * NumLoads) : Align(4));

  // Ops[5] is the immediate instruction offset; advance it by 16 bytes
  // (4 dwords) for each piece of a split load.
  uint64_t InstOffset = Ops[5]->getAsZExtVal();
  for (unsigned i = 0; i < NumLoads; ++i) {
    Ops[5] = DAG.getTargetConstant(Val: InstOffset + 16 * i, DL, VT: MVT::i32);
    Loads.push_back(Elt: getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
                                         MemVT: LoadVT, MMO, DAG));
  }

  if (NumElts == 8 || NumElts == 16)
    return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, Ops: Loads);

  return Loads[0];
}
10078
10079SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
10080 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
10081 if (!Subtarget->hasArchitectedSGPRs())
10082 return {};
10083 SDLoc SL(Op);
10084 MVT VT = MVT::i32;
10085 SDValue TTMP8 = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: SL, Reg: AMDGPU::TTMP8, VT);
10086 return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT, N1: TTMP8,
10087 N2: DAG.getConstant(Val: 25, DL: SL, VT), N3: DAG.getConstant(Val: 5, DL: SL, VT));
10088}
10089
10090SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
10091 AMDGPU::Hwreg::Id HwReg,
10092 unsigned LowBit,
10093 unsigned Width) const {
10094 SDLoc SL(Op);
10095 using namespace AMDGPU::Hwreg;
10096 return {DAG.getMachineNode(
10097 Opcode: AMDGPU::S_GETREG_B32_const, dl: SL, VT: MVT::i32,
10098 Op1: DAG.getTargetConstant(Val: HwregEncoding::encode(Values: HwReg, Values: LowBit, Values: Width),
10099 DL: SL, VT: MVT::i32)),
10100 0};
10101}
10102
10103SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
10104 unsigned Dim,
10105 const ArgDescriptor &Arg) const {
10106 SDLoc SL(Op);
10107 MachineFunction &MF = DAG.getMachineFunction();
10108 unsigned MaxID = Subtarget->getMaxWorkitemID(Kernel: MF.getFunction(), Dimension: Dim);
10109 if (MaxID == 0)
10110 return DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32);
10111
10112 // It's undefined behavior if a function marked with the amdgpu-no-*
10113 // attributes uses the corresponding intrinsic.
10114 if (!Arg)
10115 return DAG.getPOISON(VT: Op->getValueType(ResNo: 0));
10116
10117 SDValue Val = loadInputValue(DAG, RC: &AMDGPU::VGPR_32RegClass, VT: MVT::i32,
10118 SL: SDLoc(DAG.getEntryNode()), Arg);
10119
10120 // Don't bother inserting AssertZext for packed IDs since we're emitting the
10121 // masking operations anyway.
10122 //
10123 // TODO: We could assert the top bit is 0 for the source copy.
10124 if (Arg.isMasked())
10125 return Val;
10126
10127 // Preserve the known bits after expansion to a copy.
10128 EVT SmallVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: llvm::bit_width(Value: MaxID));
10129 return DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT: MVT::i32, N1: Val,
10130 N2: DAG.getValueType(SmallVT));
10131}
10132
// Custom lowering for ISD::INTRINSIC_WO_CHAIN nodes.
//
// Dispatches on the intrinsic ID (constant operand 0). Cases returning Op
// unchanged rely on later pattern-based instruction selection; image
// intrinsics are routed through lowerImage in the default case.
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  auto *MFI = MF.getInfo<SIMachineFunctionInfo>();

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);

  // TODO: Should this propagate fast-math-flags?

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_implicit_buffer_ptr: {
    if (getSubtarget()->isAmdHsaOrMesa(F: MF.getFunction()))
      return emitNonHSAIntrinsicError(DAG, DL, VT);
    return getPreloadedValue(DAG, MFI: *MFI, VT,
                             PVID: AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  }
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_queue_ptr: {
    // These pointers only exist under an HSA/Mesa ABI; otherwise diagnose
    // and produce poison.
    if (!Subtarget->isAmdHsaOrMesa(F: MF.getFunction())) {
      DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
          MF.getFunction(), "unsupported hsa intrinsic without hsa target",
          DL.getDebugLoc()));
      return DAG.getPOISON(VT);
    }

    auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
                     ? AMDGPUFunctionArgInfo::DISPATCH_PTR
                     : AMDGPUFunctionArgInfo::QUEUE_PTR;
    return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: RegID);
  }
  case Intrinsic::amdgcn_implicitarg_ptr: {
    if (MFI->isEntryFunction())
      return getImplicitArgPtr(DAG, SL: DL);
    return getPreloadedValue(DAG, MFI: *MFI, VT,
                             PVID: AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr: {
    if (!AMDGPU::isKernel(F: MF.getFunction())) {
      // This only makes sense to call in a kernel, so just lower to null.
      return DAG.getConstant(Val: 0, DL, VT);
    }

    return getPreloadedValue(DAG, MFI: *MFI, VT,
                             PVID: AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  }
  case Intrinsic::amdgcn_dispatch_id: {
    return getPreloadedValue(DAG, MFI: *MFI, VT, PVID: AMDGPUFunctionArgInfo::DISPATCH_ID);
  }
  case Intrinsic::amdgcn_rcp:
    return DAG.getNode(Opcode: AMDGPUISD::RCP, DL, VT, Operand: Op.getOperand(i: 1));
  case Intrinsic::amdgcn_rsq:
    return DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: 1));
  case Intrinsic::amdgcn_rsq_legacy:
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return emitRemovedIntrinsicError(DAG, DL, VT);
    return SDValue();
  case Intrinsic::amdgcn_rcp_legacy:
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return emitRemovedIntrinsicError(DAG, DL, VT);
    return DAG.getNode(Opcode: AMDGPUISD::RCP_LEGACY, DL, VT, Operand: Op.getOperand(i: 1));
  case Intrinsic::amdgcn_rsq_clamp: {
    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return DAG.getNode(Opcode: AMDGPUISD::RSQ_CLAMP, DL, VT, Operand: Op.getOperand(i: 1));

    // VI+: emulate rsq.clamp by clamping RSQ between the most negative and
    // most positive finite values of the type.
    Type *Type = VT.getTypeForEVT(Context&: *DAG.getContext());
    APFloat Max = APFloat::getLargest(Sem: Type->getFltSemantics());
    APFloat Min = APFloat::getLargest(Sem: Type->getFltSemantics(), Negative: true);

    SDValue Rsq = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: Op.getOperand(i: 1));
    SDValue Tmp =
        DAG.getNode(Opcode: ISD::FMINNUM, DL, VT, N1: Rsq, N2: DAG.getConstantFP(Val: Max, DL, VT));
    return DAG.getNode(Opcode: ISD::FMAXNUM, DL, VT, N1: Tmp,
                       N2: DAG.getConstantFP(Val: Min, DL, VT));
  }
  // Legacy r600 dispatch queries: loaded from the implicit kernel arguments;
  // rejected on amdhsa.
  case Intrinsic::r600_read_ngroups_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
                                    Offset: SI::KernelInputOffsets::NGROUPS_X, Alignment: Align(4),
                                    Signed: false);
  case Intrinsic::r600_read_ngroups_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
                                    Offset: SI::KernelInputOffsets::NGROUPS_Y, Alignment: Align(4),
                                    Signed: false);
  case Intrinsic::r600_read_ngroups_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerKernargMemParameter(DAG, VT, MemVT: VT, SL: DL, Chain: DAG.getEntryNode(),
                                    Offset: SI::KernelInputOffsets::NGROUPS_Z, Alignment: Align(4),
                                    Signed: false);
  case Intrinsic::r600_read_local_size_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
                                  Offset: SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
                                  Offset: SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);

    return lowerImplicitZextParam(DAG, Op, VT: MVT::i16,
                                  Offset: SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return lowerWorkGroupId(DAG, MFI: *MFI, VT,
                            WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
                            ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X,
                            ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return lowerWorkGroupId(DAG, MFI: *MFI, VT,
                            WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
                            ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y,
                            ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return lowerWorkGroupId(DAG, MFI: *MFI, VT,
                            WorkGroupIdPV: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
                            ClusterMaxIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z,
                            ClusterWorkGroupIdPV: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z);
  // Cluster intrinsics are poison on subtargets without clusters.
  case Intrinsic::amdgcn_cluster_id_x:
    return Subtarget->hasClusters()
               ? getPreloadedValue(DAG, MFI: *MFI, VT,
                                   PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_X)
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_id_y:
    return Subtarget->hasClusters()
               ? getPreloadedValue(DAG, MFI: *MFI, VT,
                                   PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Y)
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_id_z:
    return Subtarget->hasClusters()
               ? getPreloadedValue(DAG, MFI: *MFI, VT,
                                   PVID: AMDGPUFunctionArgInfo::WORKGROUP_ID_Z)
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_id_x:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, MFI: *MFI, VT,
                     PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_X)
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_id_y:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, MFI: *MFI, VT,
                     PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Y)
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_id_z:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, MFI: *MFI, VT,
                     PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_ID_Z)
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_flat_id:
    return Subtarget->hasClusters()
               ? lowerConstHwRegRead(DAG, Op, HwReg: AMDGPU::Hwreg::ID_IB_STS2, LowBit: 21, Width: 4)
               : SDValue();
  case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, MFI: *MFI, VT,
                     PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_X)
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, MFI: *MFI, VT,
                     PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Y)
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, MFI: *MFI, VT,
                     PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_ID_Z)
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     DAG, MFI: *MFI, VT,
                     PVID: AMDGPUFunctionArgInfo::CLUSTER_WORKGROUP_MAX_FLAT_ID)
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_wave_id:
    return lowerWaveID(DAG, Op);
  case Intrinsic::amdgcn_lds_kernel_id: {
    if (MFI->isEntryFunction())
      return getLDSKernelId(DAG, SL: DL);
    return getPreloadedValue(DAG, MFI: *MFI, VT,
                             PVID: AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  }
  case Intrinsic::amdgcn_workitem_id_x:
    return lowerWorkitemID(DAG, Op, Dim: 0, Arg: MFI->getArgInfo().WorkItemIDX);
  case Intrinsic::amdgcn_workitem_id_y:
    return lowerWorkitemID(DAG, Op, Dim: 1, Arg: MFI->getArgInfo().WorkItemIDY);
  case Intrinsic::amdgcn_workitem_id_z:
    return lowerWorkitemID(DAG, Op, Dim: 2, Arg: MFI->getArgInfo().WorkItemIDZ);
  case Intrinsic::amdgcn_wavefrontsize:
    return DAG.getConstant(Val: MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
                           DL: SDLoc(Op), VT: MVT::i32);
  case Intrinsic::amdgcn_s_buffer_load: {
    unsigned CPol = Op.getConstantOperandVal(i: 3);
    // s_buffer_load, because of how it's optimized, can't be volatile
    // so reject ones with the volatile bit set.
    if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
                     ? AMDGPU::CPol::ALL
                     : AMDGPU::CPol::ALL_pregfx12))
      return Op;
    return lowerSBuffer(VT, DL, Rsrc: Op.getOperand(i: 1), Offset: Op.getOperand(i: 2),
                        CachePolicy: Op.getOperand(i: 3), DAG);
  }
  case Intrinsic::amdgcn_fdiv_fast:
    return lowerFDIV_FAST(Op, DAG);
  case Intrinsic::amdgcn_sin:
    return DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL, VT, Operand: Op.getOperand(i: 1));

  case Intrinsic::amdgcn_cos:
    return DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL, VT, Operand: Op.getOperand(i: 1));

  case Intrinsic::amdgcn_mul_u24:
    return DAG.getNode(Opcode: AMDGPUISD::MUL_U24, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2));
  case Intrinsic::amdgcn_mul_i24:
    return DAG.getNode(Opcode: AMDGPUISD::MUL_I24, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2));

  case Intrinsic::amdgcn_log_clamp: {
    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return SDValue();

    return emitRemovedIntrinsicError(DAG, DL, VT);
  }
  case Intrinsic::amdgcn_fract:
    return DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: Op.getOperand(i: 1));

  case Intrinsic::amdgcn_class:
    return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2));
  case Intrinsic::amdgcn_div_fmas:
    return DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3), N4: Op.getOperand(i: 4));

  case Intrinsic::amdgcn_div_fixup:
    return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));

  case Intrinsic::amdgcn_div_scale: {
    const ConstantSDNode *Param = cast<ConstantSDNode>(Val: Op.getOperand(i: 3));

    // Translate to the operands expected by the machine instruction. The
    // first parameter must be the same as the first instruction.
    SDValue Numerator = Op.getOperand(i: 1);
    SDValue Denominator = Op.getOperand(i: 2);

    // Note this order is opposite of the machine instruction's operations,
    // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
    // intrinsic has the numerator as the first operand to match a normal
    // division operation.

    SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;

    return DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL, VTList: Op->getVTList(), N1: Src0,
                       N2: Denominator, N3: Numerator);
  }
  case Intrinsic::amdgcn_icmp: {
    // There is a Pat that handles this variant, so return it as-is.
    if (Op.getOperand(i: 1).getValueType() == MVT::i1 &&
        Op.getConstantOperandVal(i: 2) == 0 &&
        Op.getConstantOperandVal(i: 3) == ICmpInst::Predicate::ICMP_NE)
      return Op;
    return lowerICMPIntrinsic(TLI: *this, N: Op.getNode(), DAG);
  }
  case Intrinsic::amdgcn_fcmp: {
    return lowerFCMPIntrinsic(TLI: *this, N: Op.getNode(), DAG);
  }
  case Intrinsic::amdgcn_ballot:
    return lowerBALLOTIntrinsic(TLI: *this, N: Op.getNode(), DAG);
  case Intrinsic::amdgcn_fmed3:
    return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
  case Intrinsic::amdgcn_fdot2:
    return DAG.getNode(Opcode: AMDGPUISD::FDOT2, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3), N4: Op.getOperand(i: 4));
  case Intrinsic::amdgcn_fmul_legacy:
    return DAG.getNode(Opcode: AMDGPUISD::FMUL_LEGACY, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2));
  case Intrinsic::amdgcn_sffbh:
    return DAG.getNode(Opcode: AMDGPUISD::FFBH_I32, DL, VT, Operand: Op.getOperand(i: 1));
  case Intrinsic::amdgcn_sbfe:
    return DAG.getNode(Opcode: AMDGPUISD::BFE_I32, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
  case Intrinsic::amdgcn_ubfe:
    return DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
  case Intrinsic::amdgcn_cvt_pkrtz:
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    // FIXME: Stop adding cast if v2f16/v2i16 are legal.
    EVT VT = Op.getValueType();
    unsigned Opcode;

    if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
      Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
      Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
      Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
      Opcode = AMDGPUISD::CVT_PK_I16_I32;
    else
      Opcode = AMDGPUISD::CVT_PK_U16_U32;

    if (isTypeLegal(VT))
      return DAG.getNode(Opcode, DL, VT, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));

    // Illegal result type: produce the packed value as i32 and bitcast.
    SDValue Node =
        DAG.getNode(Opcode, DL, VT: MVT::i32, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
    return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Node);
  }
  case Intrinsic::amdgcn_fmad_ftz:
    return DAG.getNode(Opcode: AMDGPUISD::FMAD_FTZ, DL, VT, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));

  case Intrinsic::amdgcn_if_break:
    return SDValue(DAG.getMachineNode(Opcode: AMDGPU::SI_IF_BREAK, dl: DL, VT,
                                      Op1: Op->getOperand(Num: 1), Op2: Op->getOperand(Num: 2)),
                   0);

  case Intrinsic::amdgcn_groupstaticsize: {
    Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
      return Op;

    // Other OSes: materialize the size via an absolute relocation on the
    // intrinsic's own declaration.
    const Module *M = MF.getFunction().getParent();
    const GlobalValue *GV =
        Intrinsic::getDeclarationIfExists(M, id: Intrinsic::amdgcn_groupstaticsize);
    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, VT: MVT::i32, offset: 0,
                                            TargetFlags: SIInstrInfo::MO_ABS32_LO);
    return {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: GA), 0};
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    SDLoc SL(Op);
    // Compare the high 32 bits of the flat pointer against the address-space
    // aperture.
    SDValue SrcVec =
        DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1));
    SDValue SrcHi = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: SrcVec,
                                N2: DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32));

    unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
                      ? AMDGPUAS::LOCAL_ADDRESS
                      : AMDGPUAS::PRIVATE_ADDRESS;
    if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
        Subtarget->hasGloballyAddressableScratch()) {
      SDValue FlatScratchBaseHi(
          DAG.getMachineNode(
              Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32,
              Op1: DAG.getRegister(Reg: AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, VT: MVT::i32)),
          0);
      // Test bits 63..58 against the aperture address.
      return DAG.getSetCC(
          DL: SL, VT: MVT::i1,
          LHS: DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i32, N1: SrcHi, N2: FlatScratchBaseHi),
          RHS: DAG.getConstant(Val: 1u << 26, DL: SL, VT: MVT::i32), Cond: ISD::SETULT);
    }

    SDValue Aperture = getSegmentAperture(AS, DL: SL, DAG);
    return DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: SrcHi, RHS: Aperture, Cond: ISD::SETEQ);
  }
  case Intrinsic::amdgcn_perm:
    return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: Op.getOperand(i: 1),
                       N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
  case Intrinsic::amdgcn_reloc_constant: {
    // Materialize the named relocation symbol (from metadata operand 1) as a
    // 32-bit absolute address.
    Module *M = MF.getFunction().getParent();
    const MDNode *Metadata = cast<MDNodeSDNode>(Val: Op.getOperand(i: 1))->getMD();
    auto SymbolName = cast<MDString>(Val: Metadata->getOperand(I: 0))->getString();
    auto *RelocSymbol = cast<GlobalVariable>(
        Val: M->getOrInsertGlobal(Name: SymbolName, Ty: Type::getInt32Ty(C&: M->getContext())));
    SDValue GA = DAG.getTargetGlobalAddress(GV: RelocSymbol, DL, VT: MVT::i32, offset: 0,
                                            TargetFlags: SIInstrInfo::MO_ABS32_LO);
    return {DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: GA), 0};
  }
  // SWMMAC intrinsics: legalize the index-key operand (truncate/extend or
  // bitcast it to the integer type selection expects), then rebuild the
  // intrinsic node with the converted operand.
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    if (Op.getOperand(i: 4).getValueType() == MVT::i32)
      return SDValue();

    SDLoc SL(Op);
    auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 4), DL: SL, VT: MVT::i32);
    return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
                       N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1), N3: Op.getOperand(i: 2),
                       N4: Op.getOperand(i: 3), N5: IndexKeyi32);
  }
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
    if (Op.getOperand(i: 4).getValueType() == MVT::i64)
      return SDValue();

    SDLoc SL(Op);
    auto IndexKeyi64 =
        Op.getOperand(i: 4).getValueType() == MVT::v2i32
            ? DAG.getBitcast(VT: MVT::i64, V: Op.getOperand(i: 4))
            : DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 4), DL: SL, VT: MVT::i64);
    return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
                       Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1), Op.getOperand(i: 2),
                            Op.getOperand(i: 3), IndexKeyi64, Op.getOperand(i: 5),
                            Op.getOperand(i: 6)});
  }
  case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
    EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
                         ? MVT::i64
                         : MVT::i32;
    if (Op.getOperand(i: 6).getValueType() == IndexKeyTy)
      return SDValue();

    SDLoc SL(Op);
    auto IndexKey =
        Op.getOperand(i: 6).getValueType().isVector()
            ? DAG.getBitcast(VT: IndexKeyTy, V: Op.getOperand(i: 6))
            : DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 6), DL: SL, VT: IndexKeyTy);
    SmallVector<SDValue> Args{
        Op.getOperand(i: 0), Op.getOperand(i: 1), Op.getOperand(i: 2),
        Op.getOperand(i: 3), Op.getOperand(i: 4), Op.getOperand(i: 5),
        IndexKey,          Op.getOperand(i: 7), Op.getOperand(i: 8)};
    if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
      Args.push_back(Elt: Op.getOperand(i: 9));
    return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(), Ops: Args);
  }
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    if (Op.getOperand(i: 6).getValueType() == MVT::i32)
      return SDValue();

    SDLoc SL(Op);
    auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 6), DL: SL, VT: MVT::i32);
    return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: Op.getValueType(),
                       Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1), Op.getOperand(i: 2),
                            Op.getOperand(i: 3), Op.getOperand(i: 4), Op.getOperand(i: 5),
                            IndexKeyi32, Op.getOperand(i: 7)});
  }
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return lowerADDRSPACECAST(Op, DAG);
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_update_dpp:
    return lowerLaneOp(TLI: *this, N: Op.getNode(), DAG);
  case Intrinsic::amdgcn_dead: {
    // All results are intentionally undefined; materialize poison for each.
    SmallVector<SDValue, 8> Poisons;
    for (const EVT ValTy : Op.getNode()->values())
      Poisons.push_back(Elt: DAG.getPOISON(VT: ValTy));
    return DAG.getMergeValues(Ops: Poisons, dl: SDLoc(Op));
  }
  case Intrinsic::amdgcn_wave_shuffle:
    return lowerWaveShuffle(TLI: *this, N: Op.getNode(), DAG);
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID))
      return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: false);

    return Op;
  }
}
10629
10630// On targets not supporting constant in soffset field, turn zero to
10631// SGPR_NULL to avoid generating an extra s_mov with zero.
10632static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
10633 const GCNSubtarget *Subtarget) {
10634 if (Subtarget->hasRestrictedSOffset() && isNullConstant(V: SOffset))
10635 return DAG.getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32);
10636 return SOffset;
10637}
10638
10639SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10640 SelectionDAG &DAG,
10641 unsigned NewOpcode) const {
10642 SDLoc DL(Op);
10643
10644 SDValue VData = Op.getOperand(i: 2);
10645 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
10646 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
10647 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
10648 SDValue Ops[] = {
10649 Op.getOperand(i: 0), // Chain
10650 VData, // vdata
10651 Rsrc, // rsrc
10652 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
10653 VOffset, // voffset
10654 SOffset, // soffset
10655 Offset, // offset
10656 Op.getOperand(i: 6), // cachepolicy
10657 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
10658 };
10659
10660 auto *M = cast<MemSDNode>(Val&: Op);
10661
10662 EVT MemVT = VData.getValueType();
10663 return DAG.getMemIntrinsicNode(Opcode: NewOpcode, dl: DL, VTList: Op->getVTList(), Ops, MemVT,
10664 MMO: M->getMemOperand());
10665}
10666
10667SDValue
10668SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10669 unsigned NewOpcode) const {
10670 SDLoc DL(Op);
10671
10672 SDValue VData = Op.getOperand(i: 2);
10673 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
10674 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
10675 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
10676 SDValue Ops[] = {
10677 Op.getOperand(i: 0), // Chain
10678 VData, // vdata
10679 Rsrc, // rsrc
10680 Op.getOperand(i: 4), // vindex
10681 VOffset, // voffset
10682 SOffset, // soffset
10683 Offset, // offset
10684 Op.getOperand(i: 7), // cachepolicy
10685 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
10686 };
10687
10688 auto *M = cast<MemSDNode>(Val&: Op);
10689
10690 EVT MemVT = VData.getValueType();
10691 return DAG.getMemIntrinsicNode(Opcode: NewOpcode, dl: DL, VTList: Op->getVTList(), Ops, MemVT,
10692 MMO: M->getMemOperand());
10693}
10694
// Custom lowering for chained intrinsics (ISD::INTRINSIC_W_CHAIN): buffer and
// tbuffer loads, buffer atomics, ds_ordered_count, BVH intersect-ray image
// ops, barrier-state queries, and monitor/cooperative loads. Operand 0 is the
// chain and operand 1 the intrinsic ID. Returns SDValue() to fall back to the
// default expansion for unhandled intrinsics.
SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                                 SelectionDAG &DAG) const {
  unsigned IntrID = Op.getConstantOperandVal(i: 1);
  SDLoc DL(Op);

  switch (IntrID) {
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    MemSDNode *M = cast<MemSDNode>(Val&: Op);
    SDValue Chain = M->getOperand(Num: 0);
    SDValue M0 = M->getOperand(Num: 2);
    SDValue Value = M->getOperand(Num: 3);
    unsigned IndexOperand = M->getConstantOperandVal(Num: 7);
    unsigned WaveRelease = M->getConstantOperandVal(Num: 8);
    unsigned WaveDone = M->getConstantOperandVal(Num: 9);

    // The low 6 bits of the index operand select the ordered-count index;
    // everything else (beyond the GFX10+ dword-count field) must be zero.
    unsigned OrderedCountIndex = IndexOperand & 0x3f;
    IndexOperand &= ~0x3f;
    unsigned CountDw = 0;

    if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
      // On GFX10+, bits [27:24] of the index operand encode the dword count.
      CountDw = (IndexOperand >> 24) & 0xf;
      IndexOperand &= ~(0xf << 24);

      if (CountDw < 1 || CountDw > 4) {
        const Function &Fn = DAG.getMachineFunction().getFunction();
        DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
            Fn, "ds_ordered_count: dword count must be between 1 and 4",
            DL.getDebugLoc()));
        // Clamp to a legal value so lowering can continue after diagnosing.
        CountDw = 1;
      }
    }

    // Any bits left over at this point mean the index operand was malformed.
    if (IndexOperand) {
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
          Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
    }

    if (WaveDone && !WaveRelease) {
      // TODO: Move this to IR verifier
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
          Fn, "ds_ordered_count: wave_done requires wave_release",
          DL.getDebugLoc()));
    }

    // Pack the instruction fields into the 16-bit DS offset:
    // Offset0 = ordered count index * 4; Offset1 = release/done/instruction
    // flags plus (on GFX10+) the dword count, and (pre-GFX11) the shader type.
    unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
    unsigned ShaderType =
        SIInstrInfo::getDSShaderTypeValue(MF: DAG.getMachineFunction());
    unsigned Offset0 = OrderedCountIndex << 2;
    unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);

    if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
      Offset1 |= (CountDw - 1) << 6;

    if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
      Offset1 |= ShaderType << 2;

    unsigned Offset = Offset0 | (Offset1 << 8);

    SDValue Ops[] = {
        Chain, Value, DAG.getTargetConstant(Val: Offset, DL, VT: MVT::i16),
        copyToM0(DAG, Chain, DL, V: M0).getValue(R: 1), // Glue
    };
    return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::DS_ORDERED_COUNT, dl: DL,
                                   VTList: M->getVTList(), Ops, MemVT: M->getMemoryVT(),
                                   MMO: M->getMemOperand());
  }
  // Raw buffer loads: no vindex (constant 0), idxen = 0.
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
    const bool IsFormat =
        IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
        IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;

    SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 3), DAG);
    auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 4), DAG, Subtarget);
    SDValue Ops[] = {
        Op.getOperand(i: 0),                      // Chain
        Rsrc,                                     // rsrc
        DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
        VOffset,                                  // voffset
        SOffset,                                  // soffset
        Offset,                                   // offset
        Op.getOperand(i: 5),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
    };

    auto *M = cast<MemSDNode>(Val&: Op);
    return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
  }
  // Struct buffer loads: a real vindex operand, idxen = 1.
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
    const bool IsFormat =
        IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
        IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;

    SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
    auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
    SDValue Ops[] = {
        Op.getOperand(i: 0),                      // Chain
        Rsrc,                                     // rsrc
        Op.getOperand(i: 3),                      // vindex
        VOffset,                                  // voffset
        SOffset,                                  // soffset
        Offset,                                   // offset
        Op.getOperand(i: 6),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
    };

    return lowerIntrinsicLoad(M: cast<MemSDNode>(Val&: Op), IsFormat, DAG, Ops);
  }
  // Typed buffer loads carry an extra format operand; f16 results use the
  // D16 variant through adjustLoadValueType.
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
    MemSDNode *M = cast<MemSDNode>(Val&: Op);
    EVT LoadVT = Op.getValueType();
    SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 3), DAG);
    auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 4), DAG, Subtarget);

    SDValue Ops[] = {
        Op.getOperand(i: 0),                      // Chain
        Rsrc,                                     // rsrc
        DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
        VOffset,                                  // voffset
        SOffset,                                  // soffset
        Offset,                                   // offset
        Op.getOperand(i: 5),                      // format
        Op.getOperand(i: 6),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
    };

    if (LoadVT.getScalarType() == MVT::f16)
      return adjustLoadValueType(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
                                 Ops);
    return getMemIntrinsicNode(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
                               VTList: Op->getVTList(), Ops, MemVT: LoadVT, MMO: M->getMemOperand(),
                               DAG);
  }
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
    MemSDNode *M = cast<MemSDNode>(Val&: Op);
    EVT LoadVT = Op.getValueType();
    SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
    auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);

    SDValue Ops[] = {
        Op.getOperand(i: 0),                      // Chain
        Rsrc,                                     // rsrc
        Op.getOperand(i: 3),                      // vindex
        VOffset,                                  // voffset
        SOffset,                                  // soffset
        Offset,                                   // offset
        Op.getOperand(i: 6),                      // format
        Op.getOperand(i: 7),                      // cachepolicy, swizzled buffer
        DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
    };

    if (LoadVT.getScalarType() == MVT::f16)
      return adjustLoadValueType(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
                                 Ops);
    return getMemIntrinsicNode(Opcode: AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
                               VTList: Op->getVTList(), Ops, MemVT: LoadVT, MMO: M->getMemOperand(),
                               DAG);
  }
  // Buffer atomics: each raw/struct intrinsic pair maps 1:1 onto the
  // corresponding AMDGPUISD::BUFFER_ATOMIC_* node via the shared helpers.
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD);
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FADD);
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         NewOpcode: AMDGPUISD::BUFFER_ATOMIC_FMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP);
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD);
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB);
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND);
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR);
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR);
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC);
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC);
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SWAP);
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_ADD);
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SUB);
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         NewOpcode: AMDGPUISD::BUFFER_ATOMIC_SMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         NewOpcode: AMDGPUISD::BUFFER_ATOMIC_UMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_AND);
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_OR);
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_XOR);
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_INC);
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return lowerStructBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_DEC);
  case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
    return lowerRawBufferAtomicIntrin(Op, DAG, NewOpcode: AMDGPUISD::BUFFER_ATOMIC_CSUB);
  case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         NewOpcode: AMDGPUISD::BUFFER_ATOMIC_CSUB);
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
    return lowerRawBufferAtomicIntrin(Op, DAG,
                                      NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         NewOpcode: AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
  // Compare-and-swap carries both src and cmp values, so it cannot use the
  // shared atomic helpers above.
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
    SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 4), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
    auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
    SDValue Ops[] = {
        Op.getOperand(i: 0),                      // Chain
        Op.getOperand(i: 2),                      // src
        Op.getOperand(i: 3),                      // cmp
        Rsrc,                                     // rsrc
        DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
        VOffset,                                  // voffset
        SOffset,                                  // soffset
        Offset,                                   // offset
        Op.getOperand(i: 7),                      // cachepolicy
        DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Val&: Op);

    return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, dl: DL,
                                   VTList: Op->getVTList(), Ops, MemVT: VT,
                                   MMO: M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
    SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op->getOperand(Num: 4), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 6), DAG);
    auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 7), DAG, Subtarget);
    SDValue Ops[] = {
        Op.getOperand(i: 0),                      // Chain
        Op.getOperand(i: 2),                      // src
        Op.getOperand(i: 3),                      // cmp
        Rsrc,                                     // rsrc
        Op.getOperand(i: 5),                      // vindex
        VOffset,                                  // voffset
        SOffset,                                  // soffset
        Offset,                                   // offset
        Op.getOperand(i: 8),                      // cachepolicy
        DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
    };
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Val&: Op);

    return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, dl: DL,
                                   VTList: Op->getVTList(), Ops, MemVT: VT,
                                   MMO: M->getMemOperand());
  }
  // GFX12 BVH dual / BVH8 intersect-ray: lowered directly to a MIMG machine
  // node with the packed (ray_extent, instance_mask) pair as the second vaddr.
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
    MemSDNode *M = cast<MemSDNode>(Val&: Op);
    SDValue NodePtr = M->getOperand(Num: 2);
    SDValue RayExtent = M->getOperand(Num: 3);
    SDValue InstanceMask = M->getOperand(Num: 4);
    SDValue RayOrigin = M->getOperand(Num: 5);
    SDValue RayDir = M->getOperand(Num: 6);
    SDValue Offsets = M->getOperand(Num: 7);
    SDValue TDescr = M->getOperand(Num: 8);

    assert(NodePtr.getValueType() == MVT::i64);
    assert(RayDir.getValueType() == MVT::v3f32);

    // These intrinsics only exist on targets with the dual/BVH8 instructions.
    if (!Subtarget->hasBVHDualAndBVH8Insts()) {
      emitRemovedIntrinsicError(DAG, DL, VT: Op.getValueType());
      return SDValue();
    }

    bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
    const unsigned NumVDataDwords = 10;
    const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
    int Opcode = AMDGPU::getMIMGOpcode(
        BaseOpcode: IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
                          : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
        MIMGEncoding: AMDGPU::MIMGEncGfx12, VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
    assert(Opcode != -1);

    SmallVector<SDValue, 7> Ops;
    Ops.push_back(Elt: NodePtr);
    // Pack the f32 ray extent and the extended instance mask into one v2i32.
    Ops.push_back(Elt: DAG.getBuildVector(
        VT: MVT::v2i32, DL,
        Ops: {DAG.getBitcast(VT: MVT::i32, V: RayExtent),
         DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: InstanceMask)}));
    Ops.push_back(Elt: RayOrigin);
    Ops.push_back(Elt: RayDir);
    Ops.push_back(Elt: Offsets);
    Ops.push_back(Elt: TDescr);
    Ops.push_back(Elt: M->getChain());

    auto *NewNode = DAG.getMachineNode(Opcode, dl: DL, VTs: M->getVTList(), Ops);
    MachineMemOperand *MemRef = M->getMemOperand();
    DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
    return SDValue(NewNode, 0);
  }
  // GFX10+ BVH intersect-ray: opcode depends on node pointer width (32/64),
  // A16 ray direction, and whether the NSA encoding can hold all vaddrs.
  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
    MemSDNode *M = cast<MemSDNode>(Val&: Op);
    SDValue NodePtr = M->getOperand(Num: 2);
    SDValue RayExtent = M->getOperand(Num: 3);
    SDValue RayOrigin = M->getOperand(Num: 4);
    SDValue RayDir = M->getOperand(Num: 5);
    SDValue RayInvDir = M->getOperand(Num: 6);
    SDValue TDescr = M->getOperand(Num: 7);

    assert(NodePtr.getValueType() == MVT::i32 ||
           NodePtr.getValueType() == MVT::i64);
    assert(RayDir.getValueType() == MVT::v3f16 ||
           RayDir.getValueType() == MVT::v3f32);

    if (!Subtarget->hasGFX10_AEncoding()) {
      emitRemovedIntrinsicError(DAG, DL, VT: Op.getValueType());
      return SDValue();
    }

    const bool IsGFX11 = AMDGPU::isGFX11(STI: *Subtarget);
    const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI: *Subtarget);
    const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
    const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
    const bool Is64 = NodePtr.getValueType() == MVT::i64;
    const unsigned NumVDataDwords = 4;
    const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
    const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
    // NSA can be used when the address count fits; GFX12+ always uses it.
    const bool UseNSA = (Subtarget->hasNSAEncoding() &&
                         NumVAddrs <= Subtarget->getNSAMaxSize()) ||
                        IsGFX12Plus;
    // Indexed by [Is64][IsA16].
    const unsigned BaseOpcodes[2][2] = {
        {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
        {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
    int Opcode;
    if (UseNSA) {
      Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
                                     MIMGEncoding: IsGFX12Plus ? AMDGPU::MIMGEncGfx12
                                     : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                                                 : AMDGPU::MIMGEncGfx10NSA,
                                     VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
    } else {
      assert(!IsGFX12Plus);
      Opcode = AMDGPU::getMIMGOpcode(BaseOpcode: BaseOpcodes[Is64][IsA16],
                                     MIMGEncoding: IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                                             : AMDGPU::MIMGEncGfx10Default,
                                     VDataDwords: NumVDataDwords, VAddrDwords: NumVAddrDwords);
    }
    assert(Opcode != -1);

    SmallVector<SDValue, 16> Ops;

    // Append the three lanes of a v3f32/v3f16 operand as i32 dwords. For f16
    // lanes, pairs are packed into v2f16; IsAligned selects whether packing
    // starts fresh or continues from the previous operand's leftover lane.
    auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
      SmallVector<SDValue, 3> Lanes;
      DAG.ExtractVectorElements(Op, Args&: Lanes, Start: 0, Count: 3);
      if (Lanes[0].getValueSizeInBits() == 32) {
        for (unsigned I = 0; I < 3; ++I)
          Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: Lanes[I]));
      } else {
        if (IsAligned) {
          Ops.push_back(Elt: DAG.getBitcast(
              VT: MVT::i32,
              V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Lanes[0], Lanes[1]})));
          Ops.push_back(Elt: Lanes[2]);
        } else {
          SDValue Elt0 = Ops.pop_back_val();
          Ops.push_back(Elt: DAG.getBitcast(
              VT: MVT::i32, V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Elt0, Lanes[0]})));
          Ops.push_back(Elt: DAG.getBitcast(
              VT: MVT::i32,
              V: DAG.getBuildVector(VT: MVT::v2f16, DL, Ops: {Lanes[1], Lanes[2]})));
        }
      }
    };

    if (UseNSA && IsGFX11Plus) {
      // GFX11+ NSA takes whole vector operands; A16 merges dir and inv-dir
      // lane-by-lane into packed v2f16 dwords.
      Ops.push_back(Elt: NodePtr);
      Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: RayExtent));
      Ops.push_back(Elt: RayOrigin);
      if (IsA16) {
        SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
        DAG.ExtractVectorElements(Op: RayDir, Args&: DirLanes, Start: 0, Count: 3);
        DAG.ExtractVectorElements(Op: RayInvDir, Args&: InvDirLanes, Start: 0, Count: 3);
        for (unsigned I = 0; I < 3; ++I) {
          MergedLanes.push_back(Elt: DAG.getBitcast(
              VT: MVT::i32, V: DAG.getBuildVector(VT: MVT::v2f16, DL,
                                             Ops: {DirLanes[I], InvDirLanes[I]})));
        }
        Ops.push_back(Elt: DAG.getBuildVector(VT: MVT::v3i32, DL, Ops: MergedLanes));
      } else {
        Ops.push_back(Elt: RayDir);
        Ops.push_back(Elt: RayInvDir);
      }
    } else {
      // Pre-GFX11 (or non-NSA): flatten everything into individual dwords.
      if (Is64)
        DAG.ExtractVectorElements(Op: DAG.getBitcast(VT: MVT::v2i32, V: NodePtr), Args&: Ops, Start: 0,
                                  Count: 2);
      else
        Ops.push_back(Elt: NodePtr);

      Ops.push_back(Elt: DAG.getBitcast(VT: MVT::i32, V: RayExtent));
      packLanes(RayOrigin, true);
      packLanes(RayDir, true);
      packLanes(RayInvDir, false);
    }

    if (!UseNSA) {
      // Build a single vector containing all the operands so far prepared.
      if (NumVAddrDwords > 12) {
        SDValue Undef = DAG.getPOISON(VT: MVT::i32);
        Ops.append(NumInputs: 16 - Ops.size(), Elt: Undef);
      }
      assert(Ops.size() >= 8 && Ops.size() <= 12);
      SDValue MergedOps =
          DAG.getBuildVector(VT: MVT::getVectorVT(VT: MVT::i32, NumElements: Ops.size()), DL, Ops);
      Ops.clear();
      Ops.push_back(Elt: MergedOps);
    }

    Ops.push_back(Elt: TDescr);
    Ops.push_back(Elt: DAG.getTargetConstant(Val: IsA16, DL, VT: MVT::i1));
    Ops.push_back(Elt: M->getChain());

    auto *NewNode = DAG.getMachineNode(Opcode, dl: DL, VTs: M->getVTList(), Ops);
    MachineMemOperand *MemRef = M->getMemOperand();
    DAG.setNodeMemRefs(N: NewNode, NewMemRefs: {MemRef});
    return SDValue(NewNode, 0);
  }
  // fmin/fmax "num" atomics map onto the generic ISD atomic nodes.
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
    MemSDNode *M = cast<MemSDNode>(Val&: Op);
    SDValue Ops[] = {
        M->getOperand(Num: 0), // Chain
        M->getOperand(Num: 2), // Ptr
        M->getOperand(Num: 3)  // Value
    };
    unsigned Opcode = 0;
    switch (IntrID) {
    case Intrinsic::amdgcn_global_atomic_fmin_num:
    case Intrinsic::amdgcn_flat_atomic_fmin_num: {
      Opcode = ISD::ATOMIC_LOAD_FMIN;
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fmax_num:
    case Intrinsic::amdgcn_flat_atomic_fmax_num: {
      Opcode = ISD::ATOMIC_LOAD_FMAX;
      break;
    }
    default:
      llvm_unreachable("unhandled atomic opcode");
    }
    return DAG.getAtomic(Opcode, dl: SDLoc(Op), MemVT: M->getMemoryVT(), VTList: M->getVTList(),
                         Ops, MMO: M->getMemOperand());
  }
  case Intrinsic::amdgcn_s_alloc_vgpr: {
    SDValue NumVGPRs = Op.getOperand(i: 2);
    if (!NumVGPRs->isDivergent())
      return Op;

    // Divergent VGPR count: make it uniform with readfirstlane before use.
    SDValue ReadFirstLaneID =
        DAG.getTargetConstant(Val: Intrinsic::amdgcn_readfirstlane, DL, VT: MVT::i32);
    NumVGPRs = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::i32,
                           N1: ReadFirstLaneID, N2: NumVGPRs);

    return DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL, VTList: Op->getVTList(),
                       N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1), N3: NumVGPRs);
  }
  case Intrinsic::amdgcn_s_get_barrier_state:
  case Intrinsic::amdgcn_s_get_named_barrier_state: {
    SDValue Chain = Op->getOperand(Num: 0);
    SmallVector<SDValue, 2> Ops;
    unsigned Opc;

    if (isa<ConstantSDNode>(Val: Op->getOperand(Num: 2))) {
      // Constant barrier id: use the immediate form. For named barriers the
      // id is encoded in bits [9:4] of the operand.
      uint64_t BarID = cast<ConstantSDNode>(Val: Op->getOperand(Num: 2))->getZExtValue();
      if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
        BarID = (BarID >> 4) & 0x3F;
      Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
      SDValue K = DAG.getTargetConstant(Val: BarID, DL, VT: MVT::i32);
      Ops.push_back(Elt: K);
      Ops.push_back(Elt: Chain);
    } else {
      // Non-constant barrier id: pass it through M0.
      Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
      if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
        SDValue M0Val;
        M0Val = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: Op->getOperand(Num: 2),
                            N2: DAG.getShiftAmountConstant(Val: 4, VT: MVT::i32, DL));
        M0Val = SDValue(
            DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: M0Val,
                               Op2: DAG.getTargetConstant(Val: 0x3F, DL, VT: MVT::i32)),
            0);
        Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: 0));
      } else
        Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: Op->getOperand(Num: 2)).getValue(R: 0));
    }

    auto *NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
    return SDValue(NewMI, 0);
  }
  // Cooperative atomic loads become plain non-extending atomic loads.
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
    MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Val&: Op);
    SDValue Chain = Op->getOperand(Num: 0);
    SDValue Ptr = Op->getOperand(Num: 2);
    EVT VT = Op->getValueType(ResNo: 0);
    return DAG.getAtomicLoad(ExtType: ISD::NON_EXTLOAD, dl: DL, MemVT: MII->getMemoryVT(), VT,
                             Chain, Ptr, MMO: MII->getMemOperand());
  }
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128: {
    MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Val&: Op);
    SDValue Chain = Op->getOperand(Num: 0);
    SDValue Ptr = Op->getOperand(Num: 2);
    return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::FLAT_LOAD_MONITOR, dl: DL,
                                   VTList: Op->getVTList(), Ops: {Chain, Ptr},
                                   MemVT: MII->getMemoryVT(), MMO: MII->getMemOperand());
  }
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128: {
    MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Val&: Op);
    SDValue Chain = Op->getOperand(Num: 0);
    SDValue Ptr = Op->getOperand(Num: 2);
    return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::GLOBAL_LOAD_MONITOR, dl: DL,
                                   VTList: Op->getVTList(), Ops: {Chain, Ptr},
                                   MemVT: MII->getMemoryVT(), MMO: MII->getMemOperand());
  }
  default:

    // Image-dimension intrinsics are table-driven; anything else falls back
    // to the default expansion.
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(Intr: IntrID))
      return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true);

    return SDValue();
  }
}
11317
// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
// dwordx4 if on SI and handle TFE loads.
//
// \p VTList is (value, chain) for a plain load, or (value, TFE status, chain)
// for a TFE load. The returned node always matches \p VTList.
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
                                              SDVTList VTList,
                                              ArrayRef<SDValue> Ops, EVT MemVT,
                                              MachineMemOperand *MMO,
                                              SelectionDAG &DAG) const {
  LLVMContext &C = *DAG.getContext();
  MachineFunction &MF = DAG.getMachineFunction();
  EVT VT = VTList.VTs[0];

  // Two VTs: (value, chain). Three VTs: (value, status, chain) for TFE.
  assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
  bool IsTFE = VTList.NumVTs == 3;
  if (IsTFE) {
    // A TFE load writes one extra status dword after the data. Emit the load
    // as a single i32 vector wide enough for data + status, then split the
    // result back apart into (value, status, chain).
    unsigned NumValueDWords = divideCeil(Numerator: VT.getSizeInBits(), Denominator: 32);
    unsigned NumOpDWords = NumValueDWords + 1;
    EVT OpDWordsVT = EVT::getVectorVT(Context&: C, VT: MVT::i32, NumElements: NumOpDWords);
    SDVTList OpDWordsVTList = DAG.getVTList(VT1: OpDWordsVT, VT2: VTList.VTs[2]);
    // The memory operand must cover the status dword as well.
    MachineMemOperand *OpDWordsMMO =
        MF.getMachineMemOperand(MMO, Offset: 0, Size: NumOpDWords * 4);
    // Recurse so the widened load still gets the SI dwordx3 fixup below.
    SDValue Op = getMemIntrinsicNode(Opcode, DL, VTList: OpDWordsVTList, Ops,
                                     MemVT: OpDWordsVT, MMO: OpDWordsMMO, DAG);
    // The status dword is the last element of the combined result.
    SDValue Status = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
                                 N2: DAG.getVectorIdxConstant(Val: NumValueDWords, DL));
    SDValue ZeroIdx = DAG.getVectorIdxConstant(Val: 0, DL);
    // The data occupies the leading dwords; extract as scalar or subvector.
    SDValue ValueDWords =
        NumValueDWords == 1
            ? DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op, N2: ZeroIdx)
            : DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL,
                          VT: EVT::getVectorVT(Context&: C, VT: MVT::i32, NumElements: NumValueDWords), N1: Op,
                          N2: ZeroIdx);
    SDValue Value = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: ValueDWords);
    return DAG.getMergeValues(Ops: {Value, Status, SDValue(Op.getNode(), 1)}, dl: DL);
  }

  // SI has no dwordx3 load/store encodings: widen v3 results to v4 and
  // extract the low three elements afterwards.
  if (!Subtarget->hasDwordx3LoadStores() &&
      (VT == MVT::v3i32 || VT == MVT::v3f32)) {
    EVT WidenedVT = EVT::getVectorVT(Context&: C, VT: VT.getVectorElementType(), NumElements: 4);
    EVT WidenedMemVT = EVT::getVectorVT(Context&: C, VT: MemVT.getVectorElementType(), NumElements: 4);
    MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, Offset: 0, Size: 16);
    SDVTList WidenedVTList = DAG.getVTList(VT1: WidenedVT, VT2: VTList.VTs[1]);
    SDValue Op = DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList: WidenedVTList, Ops,
                                         MemVT: WidenedMemVT, MMO: WidenedMMO);
    SDValue Value = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Op,
                                N2: DAG.getVectorIdxConstant(Val: 0, DL));
    return DAG.getMergeValues(Ops: {Value, SDValue(Op.getNode(), 1)}, dl: DL);
  }

  // Common case: no widening or TFE handling needed.
  return DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList, Ops, MemVT, MMO);
}
11368
// Rewrite a 16-bit-component (D16) store value \p VData into the register
// layout the target expects. Scalars and already-legal packed vectors pass
// through unchanged; other cases are unpacked, repacked, or widened below.
SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
                                         bool ImageStore) const {
  EVT StoreVT = VData.getValueType();

  // No change for f16 and legal vector D16 types.
  if (!StoreVT.isVector())
    return VData;

  SDLoc DL(VData);
  unsigned NumElements = StoreVT.getVectorNumElements();

  if (Subtarget->hasUnpackedD16VMem()) {
    // We need to unpack the packed data to store: each 16-bit element is
    // zero-extended into its own 32-bit lane, then the vector op is unrolled.
    EVT IntStoreVT = StoreVT.changeTypeToInteger();
    SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);

    EVT EquivStoreVT =
        EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements);
    SDValue ZExt = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: EquivStoreVT, Operand: IntVData);
    return DAG.UnrollVectorOp(N: ZExt.getNode());
  }

  // The sq block of gfx8.1 does not estimate register use correctly for d16
  // image store instructions. The data operand is computed as if it were not a
  // d16 image instruction.
  if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
    // Bitcast to i16
    EVT IntStoreVT = StoreVT.changeTypeToInteger();
    SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);

    // Decompose into scalars
    SmallVector<SDValue, 4> Elts;
    DAG.ExtractVectorElements(Op: IntVData, Args&: Elts);

    // Group pairs of i16 into v2i16 and bitcast to i32
    SmallVector<SDValue, 4> PackedElts;
    for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
      SDValue Pair =
          DAG.getBuildVector(VT: MVT::v2i16, DL, Ops: {Elts[I * 2], Elts[I * 2 + 1]});
      SDValue IntPair = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Pair);
      PackedElts.push_back(Elt: IntPair);
    }
    if ((NumElements % 2) == 1) {
      // Handle v3i16: the lone remaining element is paired with poison.
      unsigned I = Elts.size() / 2;
      SDValue Pair = DAG.getBuildVector(VT: MVT::v2i16, DL,
                                        Ops: {Elts[I * 2], DAG.getPOISON(VT: MVT::i16)});
      SDValue IntPair = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Pair);
      PackedElts.push_back(Elt: IntPair);
    }

    // Pad using UNDEF so the result has as many i32 lanes as there were i16
    // elements, matching the register count of a non-d16 store.
    PackedElts.resize(N: Elts.size(), NV: DAG.getPOISON(VT: MVT::i32));

    // Build final vector
    EVT VecVT =
        EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: PackedElts.size());
    return DAG.getBuildVector(VT: VecVT, DL, Ops: PackedElts);
  }

  if (NumElements == 3) {
    // v3 of 16-bit elements is illegal; widen to v4 by round-tripping
    // through an integer zero-extend of the raw bits.
    EVT IntStoreVT =
        EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: StoreVT.getStoreSizeInBits());
    SDValue IntVData = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntStoreVT, Operand: VData);

    EVT WidenedStoreVT = EVT::getVectorVT(
        Context&: *DAG.getContext(), VT: StoreVT.getVectorElementType(), NumElements: NumElements + 1);
    EVT WidenedIntVT = EVT::getIntegerVT(Context&: *DAG.getContext(),
                                         BitWidth: WidenedStoreVT.getStoreSizeInBits());
    SDValue ZExt = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: WidenedIntVT, Operand: IntVData);
    return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: WidenedStoreVT, Operand: ZExt);
  }

  // Remaining vector types must already be legal packed D16 forms.
  assert(isTypeLegal(StoreVT));
  return VData;
}
11445
11446static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
11447 switch (Intr) {
11448 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
11449 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
11450 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
11451 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
11452 case Intrinsic::amdgcn_load_async_to_lds:
11453 case Intrinsic::amdgcn_global_load_async_lds:
11454 return true;
11455 }
11456 return false;
11457}
11458
11459SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
11460 SelectionDAG &DAG) const {
11461 SDLoc DL(Op);
11462 SDValue Chain = Op.getOperand(i: 0);
11463 unsigned IntrinsicID = Op.getConstantOperandVal(i: 1);
11464
11465 switch (IntrinsicID) {
11466 case Intrinsic::amdgcn_exp_compr: {
11467 if (!Subtarget->hasCompressedExport()) {
11468 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
11469 DAG.getMachineFunction().getFunction(),
11470 "intrinsic not supported on subtarget", DL.getDebugLoc()));
11471 }
11472 SDValue Src0 = Op.getOperand(i: 4);
11473 SDValue Src1 = Op.getOperand(i: 5);
11474 // Hack around illegal type on SI by directly selecting it.
11475 if (isTypeLegal(VT: Src0.getValueType()))
11476 return SDValue();
11477
11478 const ConstantSDNode *Done = cast<ConstantSDNode>(Val: Op.getOperand(i: 6));
11479 SDValue Undef = DAG.getPOISON(VT: MVT::f32);
11480 const SDValue Ops[] = {
11481 Op.getOperand(i: 2), // tgt
11482 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: Src0), // src0
11483 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: Src1), // src1
11484 Undef, // src2
11485 Undef, // src3
11486 Op.getOperand(i: 7), // vm
11487 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // compr
11488 Op.getOperand(i: 3), // en
11489 Op.getOperand(i: 0) // Chain
11490 };
11491
11492 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11493 return SDValue(DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops), 0);
11494 }
11495
11496 case Intrinsic::amdgcn_struct_tbuffer_store:
11497 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11498 SDValue VData = Op.getOperand(i: 2);
11499 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11500 if (IsD16)
11501 VData = handleD16VData(VData, DAG);
11502 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
11503 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
11504 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
11505 SDValue Ops[] = {
11506 Chain,
11507 VData, // vdata
11508 Rsrc, // rsrc
11509 Op.getOperand(i: 4), // vindex
11510 VOffset, // voffset
11511 SOffset, // soffset
11512 Offset, // offset
11513 Op.getOperand(i: 7), // format
11514 Op.getOperand(i: 8), // cachepolicy, swizzled buffer
11515 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
11516 };
11517 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11518 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11519 MemSDNode *M = cast<MemSDNode>(Val&: Op);
11520 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
11521 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
11522 }
11523
11524 case Intrinsic::amdgcn_raw_tbuffer_store:
11525 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11526 SDValue VData = Op.getOperand(i: 2);
11527 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11528 if (IsD16)
11529 VData = handleD16VData(VData, DAG);
11530 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
11531 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
11532 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
11533 SDValue Ops[] = {
11534 Chain,
11535 VData, // vdata
11536 Rsrc, // rsrc
11537 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
11538 VOffset, // voffset
11539 SOffset, // soffset
11540 Offset, // offset
11541 Op.getOperand(i: 6), // format
11542 Op.getOperand(i: 7), // cachepolicy, swizzled buffer
11543 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
11544 };
11545 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11546 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11547 MemSDNode *M = cast<MemSDNode>(Val&: Op);
11548 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
11549 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
11550 }
11551
11552 case Intrinsic::amdgcn_raw_buffer_store:
11553 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11554 case Intrinsic::amdgcn_raw_buffer_store_format:
11555 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11556 const bool IsFormat =
11557 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11558 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11559
11560 SDValue VData = Op.getOperand(i: 2);
11561 EVT VDataVT = VData.getValueType();
11562 EVT EltType = VDataVT.getScalarType();
11563 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11564 if (IsD16) {
11565 VData = handleD16VData(VData, DAG);
11566 VDataVT = VData.getValueType();
11567 }
11568
11569 if (!isTypeLegal(VT: VDataVT)) {
11570 VData =
11571 DAG.getNode(Opcode: ISD::BITCAST, DL,
11572 VT: getEquivalentMemType(Context&: *DAG.getContext(), VT: VDataVT), Operand: VData);
11573 }
11574
11575 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
11576 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 4), DAG);
11577 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 5), DAG, Subtarget);
11578 SDValue Ops[] = {
11579 Chain,
11580 VData,
11581 Rsrc,
11582 DAG.getConstant(Val: 0, DL, VT: MVT::i32), // vindex
11583 VOffset, // voffset
11584 SOffset, // soffset
11585 Offset, // offset
11586 Op.getOperand(i: 6), // cachepolicy, swizzled buffer
11587 DAG.getTargetConstant(Val: 0, DL, VT: MVT::i1), // idxen
11588 };
11589 unsigned Opc =
11590 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11591 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11592 MemSDNode *M = cast<MemSDNode>(Val&: Op);
11593
11594 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11595 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11596 return handleByteShortBufferStores(DAG, VDataType: VDataVT, DL, Ops, M);
11597
11598 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
11599 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
11600 }
11601
11602 case Intrinsic::amdgcn_struct_buffer_store:
11603 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11604 case Intrinsic::amdgcn_struct_buffer_store_format:
11605 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11606 const bool IsFormat =
11607 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11608 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11609
11610 SDValue VData = Op.getOperand(i: 2);
11611 EVT VDataVT = VData.getValueType();
11612 EVT EltType = VDataVT.getScalarType();
11613 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11614
11615 if (IsD16) {
11616 VData = handleD16VData(VData, DAG);
11617 VDataVT = VData.getValueType();
11618 }
11619
11620 if (!isTypeLegal(VT: VDataVT)) {
11621 VData =
11622 DAG.getNode(Opcode: ISD::BITCAST, DL,
11623 VT: getEquivalentMemType(Context&: *DAG.getContext(), VT: VDataVT), Operand: VData);
11624 }
11625
11626 auto Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 3), DAG);
11627 auto [VOffset, Offset] = splitBufferOffsets(Offset: Op.getOperand(i: 5), DAG);
11628 auto SOffset = selectSOffset(SOffset: Op.getOperand(i: 6), DAG, Subtarget);
11629 SDValue Ops[] = {
11630 Chain,
11631 VData,
11632 Rsrc,
11633 Op.getOperand(i: 4), // vindex
11634 VOffset, // voffset
11635 SOffset, // soffset
11636 Offset, // offset
11637 Op.getOperand(i: 7), // cachepolicy, swizzled buffer
11638 DAG.getTargetConstant(Val: 1, DL, VT: MVT::i1), // idxen
11639 };
11640 unsigned Opc =
11641 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11642 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11643 MemSDNode *M = cast<MemSDNode>(Val&: Op);
11644
11645 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11646 EVT VDataType = VData.getValueType().getScalarType();
11647 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11648 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11649
11650 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: Op->getVTList(), Ops,
11651 MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
11652 }
11653 case Intrinsic::amdgcn_raw_buffer_load_lds:
11654 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
11655 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11656 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
11657 case Intrinsic::amdgcn_struct_buffer_load_lds:
11658 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
11659 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
11660 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
11661 if (!Subtarget->hasVMemToLDSLoad())
11662 return SDValue();
11663 unsigned Opc;
11664 bool HasVIndex =
11665 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11666 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds ||
11667 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds ||
11668 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds;
11669 unsigned OpOffset = HasVIndex ? 1 : 0;
11670 SDValue VOffset = Op.getOperand(i: 5 + OpOffset);
11671 bool HasVOffset = !isNullConstant(V: VOffset);
11672 unsigned Size = Op->getConstantOperandVal(Num: 4);
11673
11674 switch (Size) {
11675 default:
11676 return SDValue();
11677 case 1:
11678 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11679 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11680 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11681 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11682 break;
11683 case 2:
11684 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11685 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11686 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11687 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11688 break;
11689 case 4:
11690 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11691 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11692 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11693 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11694 break;
11695 case 12:
11696 if (!Subtarget->hasLDSLoadB96_B128())
11697 return SDValue();
11698 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11699 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11700 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11701 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11702 break;
11703 case 16:
11704 if (!Subtarget->hasLDSLoadB96_B128())
11705 return SDValue();
11706 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11707 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11708 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11709 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11710 break;
11711 }
11712
11713 SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 3));
11714
11715 SmallVector<SDValue, 8> Ops;
11716
11717 if (HasVIndex && HasVOffset)
11718 Ops.push_back(Elt: DAG.getBuildVector(VT: MVT::v2i32, DL,
11719 Ops: {Op.getOperand(i: 5), // VIndex
11720 VOffset}));
11721 else if (HasVIndex)
11722 Ops.push_back(Elt: Op.getOperand(i: 5));
11723 else if (HasVOffset)
11724 Ops.push_back(Elt: VOffset);
11725
11726 SDValue Rsrc = bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG);
11727 Ops.push_back(Elt: Rsrc);
11728 Ops.push_back(Elt: Op.getOperand(i: 6 + OpOffset)); // soffset
11729 Ops.push_back(Elt: Op.getOperand(i: 7 + OpOffset)); // imm offset
11730 bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI: *Subtarget);
11731 unsigned Aux = Op.getConstantOperandVal(i: 8 + OpOffset);
11732 Ops.push_back(Elt: DAG.getTargetConstant(
11733 Val: Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11734 DL, VT: MVT::i8)); // cpol
11735 Ops.push_back(Elt: DAG.getTargetConstant(
11736 Val: Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11737 ? 1
11738 : 0,
11739 DL, VT: MVT::i8)); // swz
11740 Ops.push_back(
11741 Elt: DAG.getTargetConstant(Val: isAsyncLDSDMA(Intr: IntrinsicID), DL, VT: MVT::i8));
11742 Ops.push_back(Elt: M0Val.getValue(R: 0)); // Chain
11743 Ops.push_back(Elt: M0Val.getValue(R: 1)); // Glue
11744
11745 auto *M = cast<MemSDNode>(Val&: Op);
11746 auto *Load = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: M->getVTList(), Ops);
11747 DAG.setNodeMemRefs(N: Load, NewMemRefs: M->memoperands());
11748
11749 return SDValue(Load, 0);
11750 }
11751 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11752 // for "trust me" that the remaining cases are global pointers until
11753 // such time as we can put two mem operands on an intrinsic.
11754 case Intrinsic::amdgcn_load_to_lds:
11755 case Intrinsic::amdgcn_load_async_to_lds:
11756 case Intrinsic::amdgcn_global_load_lds:
11757 case Intrinsic::amdgcn_global_load_async_lds: {
11758 if (!Subtarget->hasVMemToLDSLoad())
11759 return SDValue();
11760
11761 unsigned Opc;
11762 unsigned Size = Op->getConstantOperandVal(Num: 4);
11763 switch (Size) {
11764 default:
11765 return SDValue();
11766 case 1:
11767 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11768 break;
11769 case 2:
11770 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11771 break;
11772 case 4:
11773 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11774 break;
11775 case 12:
11776 if (!Subtarget->hasLDSLoadB96_B128())
11777 return SDValue();
11778 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11779 break;
11780 case 16:
11781 if (!Subtarget->hasLDSLoadB96_B128())
11782 return SDValue();
11783 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11784 break;
11785 }
11786
11787 SDValue M0Val = copyToM0(DAG, Chain, DL, V: Op.getOperand(i: 3));
11788
11789 SmallVector<SDValue, 6> Ops;
11790
11791 SDValue Addr = Op.getOperand(i: 2); // Global ptr
11792 SDValue VOffset;
11793 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11794 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11795 if (Addr->isDivergent() && Addr->isAnyAdd()) {
11796 SDValue LHS = Addr.getOperand(i: 0);
11797 SDValue RHS = Addr.getOperand(i: 1);
11798
11799 if (LHS->isDivergent())
11800 std::swap(a&: LHS, b&: RHS);
11801
11802 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11803 RHS.getOperand(i: 0).getValueType() == MVT::i32) {
11804 // add (i64 sgpr), (zero_extend (i32 vgpr))
11805 Addr = LHS;
11806 VOffset = RHS.getOperand(i: 0);
11807 }
11808 }
11809
11810 Ops.push_back(Elt: Addr);
11811 if (!Addr->isDivergent()) {
11812 Opc = AMDGPU::getGlobalSaddrOp(Opcode: Opc);
11813 if (!VOffset)
11814 VOffset =
11815 SDValue(DAG.getMachineNode(Opcode: AMDGPU::V_MOV_B32_e32, dl: DL, VT: MVT::i32,
11816 Op1: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
11817 0);
11818 Ops.push_back(Elt: VOffset);
11819 }
11820
11821 Ops.push_back(Elt: Op.getOperand(i: 5)); // Offset
11822
11823 unsigned Aux = Op.getConstantOperandVal(i: 6);
11824 Ops.push_back(Elt: DAG.getTargetConstant(Val: Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
11825 VT: MVT::i32)); // CPol
11826 Ops.push_back(
11827 Elt: DAG.getTargetConstant(Val: isAsyncLDSDMA(Intr: IntrinsicID), DL, VT: MVT::i8));
11828
11829 Ops.push_back(Elt: M0Val.getValue(R: 0)); // Chain
11830 Ops.push_back(Elt: M0Val.getValue(R: 1)); // Glue
11831
11832 auto *M = cast<MemSDNode>(Val&: Op);
11833 auto *Load = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
11834 DAG.setNodeMemRefs(N: Load, NewMemRefs: M->memoperands());
11835
11836 return SDValue(Load, 0);
11837 }
11838 case Intrinsic::amdgcn_end_cf:
11839 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::SI_END_CF, dl: DL, VT: MVT::Other,
11840 Op1: Op->getOperand(Num: 2), Op2: Chain),
11841 0);
11842 case Intrinsic::amdgcn_s_barrier_init:
11843 case Intrinsic::amdgcn_s_barrier_signal_var: {
11844 // these two intrinsics have two operands: barrier pointer and member count
11845 SDValue Chain = Op->getOperand(Num: 0);
11846 SmallVector<SDValue, 2> Ops;
11847 SDValue BarOp = Op->getOperand(Num: 2);
11848 SDValue CntOp = Op->getOperand(Num: 3);
11849 SDValue M0Val;
11850 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11851 ? AMDGPU::S_BARRIER_INIT_M0
11852 : AMDGPU::S_BARRIER_SIGNAL_M0;
11853 // extract the BarrierID from bits 4-9 of BarOp
11854 SDValue BarID;
11855 BarID = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: BarOp,
11856 N2: DAG.getShiftAmountConstant(Val: 4, VT: MVT::i32, DL));
11857 BarID =
11858 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: BarID,
11859 Op2: DAG.getTargetConstant(Val: 0x3F, DL, VT: MVT::i32)),
11860 0);
11861 // Member count should be put into M0[ShAmt:+6]
11862 // Barrier ID should be put into M0[5:0]
11863 M0Val =
11864 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: CntOp,
11865 Op2: DAG.getTargetConstant(Val: 0x3F, DL, VT: MVT::i32)),
11866 0);
11867 constexpr unsigned ShAmt = 16;
11868 M0Val = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: CntOp,
11869 N2: DAG.getShiftAmountConstant(Val: ShAmt, VT: MVT::i32, DL));
11870
11871 M0Val = SDValue(
11872 DAG.getMachineNode(Opcode: AMDGPU::S_OR_B32, dl: DL, VT: MVT::i32, Op1: M0Val, Op2: BarID), 0);
11873
11874 Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: 0));
11875
11876 auto *NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
11877 return SDValue(NewMI, 0);
11878 }
11879 case Intrinsic::amdgcn_s_wakeup_barrier: {
11880 if (!Subtarget->hasSWakeupBarrier())
11881 return SDValue();
11882 [[fallthrough]];
11883 }
11884 case Intrinsic::amdgcn_s_barrier_join: {
11885 // these three intrinsics have one operand: barrier pointer
11886 SDValue Chain = Op->getOperand(Num: 0);
11887 SmallVector<SDValue, 2> Ops;
11888 SDValue BarOp = Op->getOperand(Num: 2);
11889 unsigned Opc;
11890
11891 if (isa<ConstantSDNode>(Val: BarOp)) {
11892 uint64_t BarVal = cast<ConstantSDNode>(Val&: BarOp)->getZExtValue();
11893 switch (IntrinsicID) {
11894 default:
11895 return SDValue();
11896 case Intrinsic::amdgcn_s_barrier_join:
11897 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11898 break;
11899 case Intrinsic::amdgcn_s_wakeup_barrier:
11900 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
11901 break;
11902 }
11903 // extract the BarrierID from bits 4-9 of the immediate
11904 unsigned BarID = (BarVal >> 4) & 0x3F;
11905 SDValue K = DAG.getTargetConstant(Val: BarID, DL, VT: MVT::i32);
11906 Ops.push_back(Elt: K);
11907 Ops.push_back(Elt: Chain);
11908 } else {
11909 switch (IntrinsicID) {
11910 default:
11911 return SDValue();
11912 case Intrinsic::amdgcn_s_barrier_join:
11913 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11914 break;
11915 case Intrinsic::amdgcn_s_wakeup_barrier:
11916 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
11917 break;
11918 }
11919 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11920 SDValue M0Val;
11921 M0Val = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: BarOp,
11922 N2: DAG.getShiftAmountConstant(Val: 4, VT: MVT::i32, DL));
11923 M0Val =
11924 SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_AND_B32, dl: DL, VT: MVT::i32, Op1: M0Val,
11925 Op2: DAG.getTargetConstant(Val: 0x3F, DL, VT: MVT::i32)),
11926 0);
11927 Ops.push_back(Elt: copyToM0(DAG, Chain, DL, V: M0Val).getValue(R: 0));
11928 }
11929
11930 auto *NewMI = DAG.getMachineNode(Opcode: Opc, dl: DL, VTs: Op->getVTList(), Ops);
11931 return SDValue(NewMI, 0);
11932 }
11933 case Intrinsic::amdgcn_s_prefetch_data: {
11934 // For non-global address space preserve the chain and remove the call.
11935 if (!AMDGPU::isFlatGlobalAddrSpace(AS: cast<MemSDNode>(Val&: Op)->getAddressSpace()))
11936 return Op.getOperand(i: 0);
11937 return Op;
11938 }
11939 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11940 SDValue Ops[] = {
11941 Chain, bufferRsrcPtrToVector(MaybePointer: Op.getOperand(i: 2), DAG),
11942 Op.getOperand(i: 3), // offset
11943 Op.getOperand(i: 4), // length
11944 };
11945
11946 MemSDNode *M = cast<MemSDNode>(Val&: Op);
11947 return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::SBUFFER_PREFETCH_DATA, dl: DL,
11948 VTList: Op->getVTList(), Ops, MemVT: M->getMemoryVT(),
11949 MMO: M->getMemOperand());
11950 }
11951 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11952 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11953 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11954 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Val&: Op);
11955 SDValue Chain = Op->getOperand(Num: 0);
11956 SDValue Ptr = Op->getOperand(Num: 2);
11957 SDValue Val = Op->getOperand(Num: 3);
11958 return DAG.getAtomic(Opcode: ISD::ATOMIC_STORE, dl: DL, MemVT: MII->getMemoryVT(), Chain, Ptr: Val,
11959 Val: Ptr, MMO: MII->getMemOperand());
11960 }
11961 default: {
11962 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11963 AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinsicID))
11964 return lowerImage(Op, Intr: ImageDimIntr, DAG, WithChain: true);
11965
11966 return Op;
11967 }
11968 }
11969}
11970
11971// Return whether the operation has NoUnsignedWrap property.
11972static bool isNoUnsignedWrap(SDValue Addr) {
11973 return (Addr.getOpcode() == ISD::ADD &&
11974 Addr->getFlags().hasNoUnsignedWrap()) ||
11975 Addr->getOpcode() == ISD::OR;
11976}
11977
11978bool SITargetLowering::shouldPreservePtrArith(const Function &F,
11979 EVT PtrVT) const {
11980 return PtrVT == MVT::i64;
11981}
11982
// Pointer-arithmetic transforms whose intermediate values may go out of
// bounds of the underlying object are unconditionally permitted on this
// target, regardless of function attributes or pointer width.
bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F,
                                                       EVT PtrVT) const {
  return true;
}
11987
11988// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11989// offset (the offset that is included in bounds checking and swizzling, to be
11990// split between the instruction's voffset and immoffset fields) and soffset
11991// (the offset that is excluded from bounds checking and swizzling, to go in
11992// the instruction's soffset field). This function takes the first kind of
11993// offset and figures out how to split it between voffset and immoffset.
// Split a combined buffer offset into (voffset, immoffset).
// Returns {voffset node (i32), immoffset target-constant (i32)}.
std::pair<SDValue, SDValue>
SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
  SDLoc DL(Offset);
  // Maximum value representable in the instruction's immediate offset field.
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST: *Subtarget);
  SDValue N0 = Offset;
  ConstantSDNode *C1 = nullptr;

  // Peel a constant (or base+constant) apart: N0 keeps the variable part,
  // C1 the constant part. Either may end up empty.
  if ((C1 = dyn_cast<ConstantSDNode>(Val&: N0)))
    N0 = SDValue();
  else if (DAG.isBaseWithConstantOffset(Op: N0)) {
    // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
    // being added, so we can only safely match a 32-bit addition with no
    // unsigned overflow.
    bool CheckNUW = Subtarget->hasGFX1250Insts();
    if (!CheckNUW || isNoUnsignedWrap(Addr: N0)) {
      C1 = cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
      N0 = N0.getOperand(i: 0);
    }
  }

  if (C1) {
    unsigned ImmOffset = C1->getZExtValue();
    // If the immediate value is too big for the immoffset field, put only bits
    // that would normally fit in the immoffset field. The remaining value that
    // is copied/added for the voffset field is a large power of 2, and it
    // stands more chance of being CSEd with the copy/add for another similar
    // load/store.
    // However, do not do that rounding down if that is a negative
    // number, as it appears to be illegal to have a negative offset in the
    // vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      ImmOffset = 0;
    }
    C1 = cast<ConstantSDNode>(Val: DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32));
    if (Overflow) {
      // Fold the overflow back into the voffset operand.
      auto OverflowVal = DAG.getConstant(Val: Overflow, DL, VT: MVT::i32);
      if (!N0)
        N0 = OverflowVal;
      else {
        SDValue Ops[] = {N0, OverflowVal};
        N0 = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, Ops);
      }
    }
  }
  // Both halves must be non-null nodes; default any missing part to zero.
  if (!N0)
    N0 = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
  if (!C1)
    C1 = cast<ConstantSDNode>(Val: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
  return {N0, SDValue(C1, 0)};
}
12047
// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        SelectionDAG &DAG, SDValue *Offsets,
                                        Align Alignment) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  SDLoc DL(CombinedOffset);
  // Case 1: fully-constant offset that splits into soffset + instoffset.
  if (auto *C = dyn_cast<ConstantSDNode>(Val&: CombinedOffset)) {
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
      Offsets[0] = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
      Offsets[1] = DAG.getConstant(Val: SOffset, DL, VT: MVT::i32);
      Offsets[2] = DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32);
      return;
    }
  }
  // Case 2: base + constant, with the constant split off into
  // soffset/instoffset and the base used as voffset.
  if (DAG.isBaseWithConstantOffset(Op: CombinedOffset)) {
    // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
    // being added, so we can only safely match a 32-bit addition with no
    // unsigned overflow.
    bool CheckNUW = Subtarget->hasGFX1250Insts();
    SDValue N0 = CombinedOffset.getOperand(i: 0);
    SDValue N1 = CombinedOffset.getOperand(i: 1);
    uint32_t SOffset, ImmOffset;
    int Offset = cast<ConstantSDNode>(Val&: N1)->getSExtValue();
    if (Offset >= 0 && (!CheckNUW || isNoUnsignedWrap(Addr: CombinedOffset)) &&
        TII->splitMUBUFOffset(Imm: Offset, SOffset, ImmOffset, Alignment)) {
      Offsets[0] = N0;
      Offsets[1] = DAG.getConstant(Val: SOffset, DL, VT: MVT::i32);
      Offsets[2] = DAG.getTargetConstant(Val: ImmOffset, DL, VT: MVT::i32);
      return;
    }
  }

  // Fallback: everything in voffset. Subtargets with a restricted soffset
  // field must use the null SGPR rather than an inline zero.
  SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
                            ? DAG.getRegister(Reg: AMDGPU::SGPR_NULL, VT: MVT::i32)
                            : DAG.getConstant(Val: 0, DL, VT: MVT::i32);

  Offsets[0] = CombinedOffset;
  Offsets[1] = SOffsetZero;
  Offsets[2] = DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32);
}
12092
12093SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
12094 SelectionDAG &DAG) const {
12095 if (!MaybePointer.getValueType().isScalarInteger())
12096 return MaybePointer;
12097
12098 SDValue Rsrc = DAG.getBitcast(VT: MVT::v4i32, V: MaybePointer);
12099 return Rsrc;
12100}
12101
// Wrap a global or flat pointer into a buffer intrinsic using the flags
// specified in the intrinsic.
SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
                                                   SelectionDAG &DAG) const {
  SDLoc Loc(Op);

  // Operand layout: (intrinsic id, base pointer, stride, num_records, flags).
  SDValue Pointer = Op->getOperand(Num: 1);
  SDValue Stride = Op->getOperand(Num: 2);
  SDValue NumRecords = Op->getOperand(Num: 3);
  SDValue Flags = Op->getOperand(Num: 4);

  SDValue ExtStride = DAG.getAnyExtOrTrunc(Op: Stride, DL: Loc, VT: MVT::i32);
  SDValue Rsrc;

  // Two descriptor layouts exist; subtargets with 45-bit num_records use a
  // wider, differently packed format built as a v2i64.
  if (Subtarget->has45BitNumRecordsBufferResource()) {
    SDValue Zero = DAG.getConstant(Val: 0, DL: Loc, VT: MVT::i32);
    // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
    // num_records.
    SDValue ExtPointer = DAG.getAnyExtOrTrunc(Op: Pointer, DL: Loc, VT: MVT::i64);
    SDValue NumRecordsLHS =
        DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i64, N1: NumRecords,
                    N2: DAG.getShiftAmountConstant(Val: 57, VT: MVT::i32, DL: Loc));
    SDValue LowHalf =
        DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i64, N1: ExtPointer, N2: NumRecordsLHS);

    // Build the higher 64-bit value, which has the higher 38-bit num_records,
    // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
    SDValue NumRecordsRHS =
        DAG.getNode(Opcode: ISD::SRL, DL: Loc, VT: MVT::i64, N1: NumRecords,
                    N2: DAG.getShiftAmountConstant(Val: 7, VT: MVT::i32, DL: Loc));
    // Stride and flags are positioned in the high 32-bit lane of the i64; a
    // (0, value) v2i32 build + bitcast places the value in the upper half.
    SDValue ShiftedStride =
        DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i32, N1: ExtStride,
                    N2: DAG.getShiftAmountConstant(Val: 12, VT: MVT::i32, DL: Loc));
    SDValue ExtShiftedStrideVec =
        DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v2i32, N1: Zero, N2: ShiftedStride);
    SDValue ExtShiftedStride =
        DAG.getNode(Opcode: ISD::BITCAST, DL: Loc, VT: MVT::i64, Operand: ExtShiftedStrideVec);
    SDValue ShiftedFlags =
        DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i32, N1: Flags,
                    N2: DAG.getShiftAmountConstant(Val: 28, VT: MVT::i32, DL: Loc));
    SDValue ExtShiftedFlagsVec =
        DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v2i32, N1: Zero, N2: ShiftedFlags);
    SDValue ExtShiftedFlags =
        DAG.getNode(Opcode: ISD::BITCAST, DL: Loc, VT: MVT::i64, Operand: ExtShiftedFlagsVec);
    SDValue CombinedFields =
        DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i64, N1: NumRecordsRHS, N2: ExtShiftedStride);
    SDValue HighHalf =
        DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i64, N1: CombinedFields, N2: ExtShiftedFlags);

    Rsrc = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v2i64, N1: LowHalf, N2: HighHalf);
  } else {
    // Legacy v4i32 layout: dword0 = pointer lo, dword1 = pointer hi (lower 16
    // bits) | stride << 16, dword2 = num_records, dword3 = flags.
    NumRecords = DAG.getAnyExtOrTrunc(Op: NumRecords, DL: Loc, VT: MVT::i32);
    auto [LowHalf, HighHalf] =
        DAG.SplitScalar(N: Pointer, DL: Loc, LoVT: MVT::i32, HiVT: MVT::i32);
    SDValue Mask = DAG.getConstant(Val: 0x0000ffff, DL: Loc, VT: MVT::i32);
    SDValue Masked = DAG.getNode(Opcode: ISD::AND, DL: Loc, VT: MVT::i32, N1: HighHalf, N2: Mask);
    SDValue ShiftedStride =
        DAG.getNode(Opcode: ISD::SHL, DL: Loc, VT: MVT::i32, N1: ExtStride,
                    N2: DAG.getShiftAmountConstant(Val: 16, VT: MVT::i32, DL: Loc));
    SDValue NewHighHalf =
        DAG.getNode(Opcode: ISD::OR, DL: Loc, VT: MVT::i32, N1: Masked, N2: ShiftedStride);

    Rsrc = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: Loc, VT: MVT::v4i32, N1: LowHalf, N2: NewHighHalf,
                       N3: NumRecords, N4: Flags);
  }

  // The intrinsic's result type is the i128 "rsrc pointer" form.
  SDValue RsrcPtr = DAG.getNode(Opcode: ISD::BITCAST, DL: Loc, VT: MVT::i128, Operand: Rsrc);
  return RsrcPtr;
}
12171
// Handle 8 bit and 16 bit buffer loads
SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
                                                     EVT LoadVT, SDLoc DL,
                                                     ArrayRef<SDValue> Ops,
                                                     MachineMemOperand *MMO,
                                                     bool IsTFE) const {
  // Integer type with the same bit width as the requested result type, used
  // as the intermediate type for the truncate-then-bitcast sequence.
  EVT IntVT = LoadVT.changeTypeToInteger();

  // TFE loads produce a second result dword: the node returns a v2i32 where
  // element 0 is the loaded data and element 1 is the status value.
  if (IsTFE) {
    unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
                       ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
                       : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
    MachineFunction &MF = DAG.getMachineFunction();
    // Widen the memoperand to cover both the data and the status dword.
    MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, Offset: 0, Size: 8);
    SDVTList VTs = DAG.getVTList(VT1: MVT::v2i32, VT2: MVT::Other);
    SDValue Op = getMemIntrinsicNode(Opcode: Opc, DL, VTList: VTs, Ops, MemVT: MVT::v2i32, MMO: OpMMO, DAG);
    SDValue Status = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
                                 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
    SDValue Data = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: Op,
                               N2: DAG.getConstant(Val: 0, DL, VT: MVT::i32));
    SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: Data);
    SDValue Value = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: Trunc);
    // Results: (value, status, chain).
    return DAG.getMergeValues(Ops: {Value, Status, SDValue(Op.getNode(), 1)}, dl: DL);
  }

  // Plain case: an unsigned byte/short buffer load yields an i32, which is
  // narrowed back down and bitcast to the requested type.
  unsigned Opc = LoadVT.getScalarType() == MVT::i8
                     ? AMDGPUISD::BUFFER_LOAD_UBYTE
                     : AMDGPUISD::BUFFER_LOAD_USHORT;

  SDVTList ResList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::Other);
  SDValue BufferLoad =
      DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: IntVT, MMO);
  SDValue LoadVal = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: IntVT, Operand: BufferLoad);
  LoadVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: LoadVT, Operand: LoadVal);

  // Results: (value, chain).
  return DAG.getMergeValues(Ops: {LoadVal, BufferLoad.getValue(R: 1)}, dl: DL);
}
12209
12210// Handle 8 bit and 16 bit buffer stores
12211SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
12212 EVT VDataType, SDLoc DL,
12213 SDValue Ops[],
12214 MemSDNode *M) const {
12215 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
12216 Ops[1] = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i16, Operand: Ops[1]);
12217
12218 SDValue BufferStoreExt = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Ops[1]);
12219 Ops[1] = BufferStoreExt;
12220 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
12221 : AMDGPUISD::BUFFER_STORE_SHORT;
12222 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
12223 return DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: M->getVTList(), Ops: OpsRef, MemVT: VDataType,
12224 MMO: M->getMemOperand());
12225}
12226
12227static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
12228 SDValue Op, const SDLoc &SL, EVT VT) {
12229 if (VT.bitsLT(VT: Op.getValueType()))
12230 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Op);
12231
12232 switch (ExtType) {
12233 case ISD::SEXTLOAD:
12234 return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: SL, VT, Operand: Op);
12235 case ISD::ZEXTLOAD:
12236 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SL, VT, Operand: Op);
12237 case ISD::EXTLOAD:
12238 return DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT, Operand: Op);
12239 case ISD::NON_EXTLOAD:
12240 return Op;
12241 }
12242
12243 llvm_unreachable("invalid ext type");
12244}
12245
// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
                                    DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  // Only uniform, dword-aligned loads are candidates for scalar loads.
  if (Ld->getAlign() < Align(4) || Ld->isDivergent())
    return SDValue();

  // FIXME: Constant loads should all be marked invariant.
  unsigned AS = Ld->getAddressSpace();
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
    return SDValue();

  // Don't do this early, since it may interfere with adjacent load merging for
  // illegal types. We can avoid losing alignment information for exotic types
  // pre-legalize.
  EVT MemVT = Ld->getMemoryVT();
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
      MemVT.getSizeInBits() >= 32)
    return SDValue();

  SDLoc SL(Ld);

  assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
         "unexpected vector extload");

  // TODO: Drop only high part of range.
  SDValue Ptr = Ld->getBasePtr();
  // Re-issue the load as a plain 32-bit load at the same address/alignment.
  SDValue NewLoad = DAG.getLoad(
      AM: ISD::UNINDEXED, ExtType: ISD::NON_EXTLOAD, VT: MVT::i32, dl: SL, Chain: Ld->getChain(), Ptr,
      Offset: Ld->getOffset(), PtrInfo: Ld->getPointerInfo(), MemVT: MVT::i32, Alignment: Ld->getAlign(),
      MMOFlags: Ld->getMemOperand()->getFlags(), AAInfo: Ld->getAAInfo(),
      Ranges: nullptr); // Drop ranges

  // Integer type matching the original memory width; for FP memory types use
  // the equivalent integer type for the in-register extension below.
  EVT TruncVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getSizeInBits());
  if (MemVT.isFloatingPoint()) {
    assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
           "unexpected fp extload");
    TruncVT = MemVT.changeTypeToInteger();
  }

  // Reproduce the original extension semantics on the widened 32-bit value.
  // An any-extending load needs nothing: the extra bits are unspecified.
  SDValue Cvt = NewLoad;
  if (Ld->getExtensionType() == ISD::SEXTLOAD) {
    Cvt = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL: SL, VT: MVT::i32, N1: NewLoad,
                      N2: DAG.getValueType(TruncVT));
  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
             Ld->getExtensionType() == ISD::NON_EXTLOAD) {
    Cvt = DAG.getZeroExtendInReg(Op: NewLoad, DL: SL, VT: TruncVT);
  } else {
    assert(Ld->getExtensionType() == ISD::EXTLOAD);
  }

  EVT VT = Ld->getValueType(ResNo: 0);
  EVT IntVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: VT.getSizeInBits());

  DCI.AddToWorklist(N: Cvt.getNode());

  // We may need to handle exotic cases, such as i16->i64 extloads, so insert
  // the appropriate extension from the 32-bit load.
  Cvt = getLoadExtOrTrunc(DAG, ExtType: Ld->getExtensionType(), Op: Cvt, SL, VT: IntVT);
  DCI.AddToWorklist(N: Cvt.getNode());

  // Handle conversion back to floating point if necessary.
  Cvt = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Cvt);

  // Results: (converted value, chain of the widened load).
  return DAG.getMergeValues(Ops: {Cvt, NewLoad.getValue(R: 1)}, dl: SL);
}
12315
12316static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
12317 const SIMachineFunctionInfo &Info) {
12318 // TODO: Should check if the address can definitely not access stack.
12319 if (Info.isEntryFunction())
12320 return Info.getUserSGPRInfo().hasFlatScratchInit();
12321 return true;
12322}
12323
// Custom-legalize loads. Returns SDValue() when the default handling is fine;
// otherwise returns a replacement (value, chain) merge.
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();
  MachineMemOperand *MMO = Load->getMemOperand();

  // Sub-dword non-extending loads: widen to a 32-bit extload and narrow the
  // result back down.
  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    if (MemVT == MVT::i16 && isTypeLegal(VT: MVT::i16))
      return SDValue();

    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate to 1 bit.

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();

    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;

    SDValue NewLD = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl: DL, VT: MVT::i32, Chain, Ptr: BasePtr,
                                   MemVT: RealMemVT, MMO);

    if (!MemVT.isVector()) {
      SDValue Ops[] = {DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVT, Operand: NewLD),
                       NewLD.getValue(R: 1)};

      return DAG.getMergeValues(Ops, dl: DL);
    }

    // Vector of i1: unpack each element from successive bits of the widened
    // load by shifting and truncating.
    SmallVector<SDValue, 3> Elts;
    for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
      SDValue Elt = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: NewLD,
                                N2: DAG.getConstant(Val: I, DL, VT: MVT::i32));

      Elts.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Elt));
    }

    SDValue Ops[] = {DAG.getBuildVector(VT: MemVT, DL, Ops: Elts), NewLD.getValue(R: 1)};

    return DAG.getMergeValues(Ops, dl: DL);
  }

  if (!MemVT.isVector())
    return SDValue();

  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");

  Align Alignment = Load->getAlign();
  unsigned AS = Load->getAddressSpace();
  // Split underaligned multi-dword flat loads on subtargets with the LDS
  // misaligned-access bug in WGP mode.
  if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
      AS == AMDGPUAS::FLAT_ADDRESS &&
      Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
    return SplitVectorLoad(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    AS = addressMayBeAccessedAsPrivate(MMO: Load->getMemOperand(), Info: *MFI)
             ? AMDGPUAS::PRIVATE_ADDRESS
             : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = MemVT.getVectorNumElements();

  // Loads that may be selected to scalar (SMEM) instructions: constant
  // address space, or simple invariant/unclobbered global loads when the
  // subtarget scalarizes globals.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      (AS == AMDGPUAS::GLOBAL_ADDRESS &&
       Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
       (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(N: Load)))) {
    if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
        Alignment >= Align(4) && NumElements < 32) {
      // Power-of-two widths (and dwordx3 where supported) are already legal
      // for scalar loads; anything else gets widened or split.
      if (MemVT.isPow2VectorType() ||
          (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
        return SDValue();
      return WidenOrSplitVectorLoad(Op, DAG);
    }
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
    //
  }
  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
      AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
    if (NumElements > 4)
      return SplitVectorLoad(Op, DAG);
    // v3 loads not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return WidenOrSplitVectorLoad(Op, DAG);

    // v3 and v4 loads are supported for private and global memory.
    return SDValue();
  }
  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Depending on the setting of the private_element_size field in the
    // resource descriptor, we can only make private accesses up to a certain
    // size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4: {
      auto [Op0, Op1] = scalarizeVectorLoad(LD: Load, DAG);
      return DAG.getMergeValues(Ops: {Op0, Op1}, dl: DL);
    }
    case 8:
      if (NumElements > 2)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    case 16:
      // Same as global/flat
      if (NumElements > 4)
        return SplitVectorLoad(Op, DAG);
      // v3 loads not supported on SI.
      if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
        return WidenOrSplitVectorLoad(Op, DAG);

      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // LDS/GDS: keep the load intact only if the access is known fast at this
    // alignment; otherwise split vector loads.
    unsigned Fast = 0;
    auto Flags = Load->getMemOperand()->getFlags();
    if (allowsMisalignedMemoryAccessesImpl(Size: MemVT.getSizeInBits(), AddrSpace: AS,
                                           Alignment: Load->getAlign(), Flags, IsFast: &Fast) &&
        Fast > 1)
      return SDValue();

    if (MemVT.isVector())
      return SplitVectorLoad(Op, DAG);
  }

  // Any remaining load that is still unaligned for its type gets the generic
  // unaligned-load expansion.
  if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
                                      VT: MemVT, MMO: *Load->getMemOperand())) {
    auto [Op0, Op1] = expandUnalignedLoad(LD: Load, DAG);
    return DAG.getMergeValues(Ops: {Op0, Op1}, dl: DL);
  }

  return SDValue();
}
12466
12467SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
12468 EVT VT = Op.getValueType();
12469 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
12470 VT.getSizeInBits() == 512)
12471 return splitTernaryVectorOp(Op, DAG);
12472
12473 assert(VT.getSizeInBits() == 64);
12474
12475 SDLoc DL(Op);
12476 SDValue Cond = DAG.getFreeze(V: Op.getOperand(i: 0));
12477
12478 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
12479 SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
12480
12481 SDValue LHS = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 1));
12482 SDValue RHS = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Op.getOperand(i: 2));
12483
12484 SDValue Lo0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: LHS, N2: Zero);
12485 SDValue Lo1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: RHS, N2: Zero);
12486
12487 SDValue Lo = DAG.getSelect(DL, VT: MVT::i32, Cond, LHS: Lo0, RHS: Lo1);
12488
12489 SDValue Hi0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: LHS, N2: One);
12490 SDValue Hi1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: RHS, N2: One);
12491
12492 SDValue Hi = DAG.getSelect(DL, VT: MVT::i32, Cond, LHS: Hi0, RHS: Hi1);
12493
12494 SDValue Res = DAG.getBuildVector(VT: MVT::v2i32, DL, Ops: {Lo, Hi});
12495 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Res);
12496}
12497
// Catch division cases where we can use shortcuts with rcp and rsq
// instructions.
SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(i: 0);
  SDValue RHS = Op.getOperand(i: 1);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();

  // afn permits approximation; without it we may only use rcp where it is
  // known accurate enough (f16/bf16 for the +/-1.0 cases below).
  bool AllowInaccurateRcp = Flags.hasApproximateFuncs();

  // Special-case constant numerators of +/-1.0, which map directly to rcp.
  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) {
    // Without !fpmath accuracy information, we can't do more because we don't
    // know exactly whether rcp is accurate enough to meet !fpmath requirement.
    // f16 is always accurate enough
    if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
      return SDValue();

    if (CLHS->isExactlyValue(V: 1.0)) {
      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
      // the CI documentation has a worst case error of 1 ulp.
      // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
      // use it as long as we aren't trying to use denormals.
      //
      // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.

      // 1.0 / sqrt(x) -> rsq(x)

      // XXX - Is afn sufficient to do this for f64? The maximum ULP
      // error seems really high at 2^29 ULP.
      // 1.0 / x -> rcp(x)
      return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS);
    }

    // Same as for 1.0, but expand the sign out of the constant.
    if (CLHS->isExactlyValue(V: -1.0)) {
      // -1.0 / x -> rcp (fneg x)
      SDValue FNegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);
      return DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: FNegRHS);
    }
  }

  // For f16 and bf16 require afn or arcp.
  // For f32 require afn.
  if (!AllowInaccurateRcp &&
      ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
    return SDValue();

  // Turn into multiply by the reciprocal.
  // x / y -> x * (1.0 / y)
  SDValue Recip = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: RHS);
  return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: LHS, N2: Recip, Flags);
}
12552
// Approximate 64-bit division x / y via rcp plus Newton-Raphson refinement.
// Only applies when afn allows an approximate result.
SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue X = Op.getOperand(i: 0);
  SDValue Y = Op.getOperand(i: 1);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();

  bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
  if (!AllowInaccurateDiv)
    return SDValue();

  SDValue NegY = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Y);
  SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT);

  // Initial estimate r ~= 1/y, then two Newton-Raphson iterations:
  // r' = r + r * (1 - y*r).
  SDValue R = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT, Operand: Y);
  SDValue Tmp0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One);

  R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp0, N2: R, N3: R);
  SDValue Tmp1 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: R, N3: One);
  R = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp1, N2: R, N3: R);
  // Form the quotient and refine it once more with the residual x - y*ret.
  SDValue Ret = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT, N1: X, N2: R);
  SDValue Tmp2 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: NegY, N2: Ret, N3: X);
  return DAG.getNode(Opcode: ISD::FMA, DL: SL, VT, N1: Tmp2, N2: R, N3: Ret);
}
12578
12579static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12580 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12581 SDNodeFlags Flags) {
12582 if (GlueChain->getNumValues() <= 1) {
12583 return DAG.getNode(Opcode, DL: SL, VT, N1: A, N2: B, Flags);
12584 }
12585
12586 assert(GlueChain->getNumValues() == 3);
12587
12588 SDVTList VTList = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue);
12589 switch (Opcode) {
12590 default:
12591 llvm_unreachable("no chain equivalent for opcode");
12592 case ISD::FMUL:
12593 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12594 break;
12595 }
12596
12597 return DAG.getNode(Opcode, DL: SL, VTList,
12598 Ops: {GlueChain.getValue(R: 1), A, B, GlueChain.getValue(R: 2)},
12599 Flags);
12600}
12601
12602static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12603 EVT VT, SDValue A, SDValue B, SDValue C,
12604 SDValue GlueChain, SDNodeFlags Flags) {
12605 if (GlueChain->getNumValues() <= 1) {
12606 return DAG.getNode(Opcode, DL: SL, VT, Ops: {A, B, C}, Flags);
12607 }
12608
12609 assert(GlueChain->getNumValues() == 3);
12610
12611 SDVTList VTList = DAG.getVTList(VT1: VT, VT2: MVT::Other, VT3: MVT::Glue);
12612 switch (Opcode) {
12613 default:
12614 llvm_unreachable("no chain equivalent for opcode");
12615 case ISD::FMA:
12616 Opcode = AMDGPUISD::FMA_W_CHAIN;
12617 break;
12618 }
12619
12620 return DAG.getNode(Opcode, DL: SL, VTList,
12621 Ops: {GlueChain.getValue(R: 1), A, B, C, GlueChain.getValue(R: 2)},
12622 Flags);
12623}
12624
// Lower f16/bf16 FDIV: bf16 divides in f32 and rounds back; f16 uses the
// rcp-based refinement sequence documented inline, finished by
// V_DIV_FIXUP_F16.
SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  EVT VT = Op.getValueType();
  SDValue LHS = Op.getOperand(i: 0);
  SDValue RHS = Op.getOperand(i: 1);

  SDValue LHSExt = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: LHS);
  SDValue RHSExt = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: RHS);

  // bf16: no fixup instruction; divide in f32 and round the result back.
  if (VT == MVT::bf16) {
    SDValue ExtDiv =
        DAG.getNode(Opcode: ISD::FDIV, DL: SL, VT: MVT::f32, N1: LHSExt, N2: RHSExt, Flags: Op->getFlags());
    return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::bf16, N1: ExtDiv,
                       N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32));
  }

  assert(VT == MVT::f16);

  // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
  // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
  // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
  // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
  // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
  // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
  // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
  // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
  // q16.u = opx(V_CVT_F16_F32, q32.u);
  // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)

  // We will use ISD::FMA on targets that don't support ISD::FMAD.
  unsigned FMADOpCode =
      isOperationLegal(Op: ISD::FMAD, VT: MVT::f32) ? ISD::FMAD : ISD::FMA;
  SDValue NegRHSExt = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: RHSExt);
  SDValue Rcp =
      DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: RHSExt, Flags: Op->getFlags());
  SDValue Quot =
      DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: LHSExt, N2: Rcp, Flags: Op->getFlags());
  SDValue Err = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: NegRHSExt, N2: Quot, N3: LHSExt,
                            Flags: Op->getFlags());
  Quot = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: Err, N2: Rcp, N3: Quot, Flags: Op->getFlags());
  Err = DAG.getNode(Opcode: FMADOpCode, DL: SL, VT: MVT::f32, N1: NegRHSExt, N2: Quot, N3: LHSExt,
                    Flags: Op->getFlags());
  SDValue Tmp = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: Err, N2: Rcp, Flags: Op->getFlags());
  // Mask to sign+exponent bits (0xff800000) before the final correction add,
  // matching the V_AND_B32 step in the sequence above.
  SDValue TmpCast = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i32, Operand: Tmp);
  TmpCast = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: TmpCast,
                        N2: DAG.getConstant(Val: 0xff800000, DL: SL, VT: MVT::i32));
  Tmp = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::f32, Operand: TmpCast);
  Quot = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT: MVT::f32, N1: Tmp, N2: Quot, Flags: Op->getFlags());
  SDValue RDst = DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::f16, N1: Quot,
                             N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32));
  return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f16, N1: RDst, N2: RHS, N3: LHS,
                     Flags: Op->getFlags());
}
12683
// Faster 2.5 ULP division that does not support denormals.
// Operands 1 and 2 carry the numerator and denominator (intrinsic form).
// If |denominator| exceeds 2^96 it is prescaled by 2^-32 before the rcp and
// the quotient is rescaled by the same factor to avoid overflow in rcp.
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
  SDNodeFlags Flags = Op->getFlags();
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(i: 1);
  SDValue RHS = Op.getOperand(i: 2);

  // TODO: The combiner should probably handle elimination of redundant fabs.
  // |den| — skip the fabs when the sign bit is already known zero.
  SDValue r1 = DAG.SignBitIsZeroFP(Op: RHS)
                   ? RHS
                   : DAG.getNode(Opcode: ISD::FABS, DL: SL, VT: MVT::f32, Operand: RHS, Flags);

  // Threshold above which the denominator must be scaled down: 2^96.
  const APFloat K0Val(0x1p+96f);
  const SDValue K0 = DAG.getConstantFP(Val: K0Val, DL: SL, VT: MVT::f32);

  // Scale factor applied when over threshold: 2^-32.
  const APFloat K1Val(0x1p-32f);
  const SDValue K1 = DAG.getConstantFP(Val: K1Val, DL: SL, VT: MVT::f32);

  const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f32);

  EVT SetCCVT =
      getSetCCResultType(DL: DAG.getDataLayout(), Ctx&: *DAG.getContext(), VT: MVT::f32);

  // r2 = (|den| > 2^96); r3 = scale (2^-32 or 1.0).
  SDValue r2 = DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: r1, RHS: K0, Cond: ISD::SETOGT);

  SDValue r3 = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: MVT::f32, N1: r2, N2: K1, N3: One, Flags);

  // Scaled denominator: den * scale.
  r1 = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: RHS, N2: r3, Flags);

  // rcp does not support denormals.
  SDValue r0 = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: r1, Flags);

  SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: LHS, N2: r0, Flags);

  // Reapply the scale to the quotient: scale * (num * rcp(den * scale)).
  return DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f32, N1: r3, N2: Mul, Flags);
}
12720
12721// Returns immediate value for setting the F32 denorm mode when using the
12722// S_DENORM_MODE instruction.
12723static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
12724 const SIMachineFunctionInfo *Info,
12725 const GCNSubtarget *ST) {
12726 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12727 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12728 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12729 return DAG.getTargetConstant(Val: Mode, DL: SDLoc(), VT: MVT::i32);
12730}
12731
12732SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12733 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12734 return FastLowered;
12735
12736 // The selection matcher assumes anything with a chain selecting to a
12737 // mayRaiseFPException machine instruction. Since we're introducing a chain
12738 // here, we need to explicitly report nofpexcept for the regular fdiv
12739 // lowering.
12740 SDNodeFlags Flags = Op->getFlags();
12741 Flags.setNoFPExcept(true);
12742
12743 SDLoc SL(Op);
12744 SDValue LHS = Op.getOperand(i: 0);
12745 SDValue RHS = Op.getOperand(i: 1);
12746
12747 const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f32);
12748
12749 SDVTList ScaleVT = DAG.getVTList(VT1: MVT::f32, VT2: MVT::i1);
12750
12751 SDValue DenominatorScaled =
12752 DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, Ops: {RHS, RHS, LHS}, Flags);
12753 SDValue NumeratorScaled =
12754 DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, Ops: {LHS, RHS, LHS}, Flags);
12755
12756 // Denominator is scaled to not be denormal, so using rcp is ok.
12757 SDValue ApproxRcp =
12758 DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f32, Operand: DenominatorScaled, Flags);
12759 SDValue NegDivScale0 =
12760 DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f32, Operand: DenominatorScaled, Flags);
12761
12762 using namespace AMDGPU::Hwreg;
12763 const unsigned Denorm32Reg = HwregEncoding::encode(Values: ID_MODE, Values: 4, Values: 2);
12764 const SDValue BitField = DAG.getTargetConstant(Val: Denorm32Reg, DL: SL, VT: MVT::i32);
12765
12766 const MachineFunction &MF = DAG.getMachineFunction();
12767 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12768 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12769
12770 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12771 const bool HasDynamicDenormals =
12772 (DenormMode.Input == DenormalMode::Dynamic) ||
12773 (DenormMode.Output == DenormalMode::Dynamic);
12774
12775 SDValue SavedDenormMode;
12776
12777 if (!PreservesDenormals) {
12778 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12779 // lowering. The chain dependence is insufficient, and we need glue. We do
12780 // not need the glue variants in a strictfp function.
12781
12782 SDVTList BindParamVTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
12783
12784 SDValue Glue = DAG.getEntryNode();
12785 if (HasDynamicDenormals) {
12786 SDNode *GetReg = DAG.getMachineNode(Opcode: AMDGPU::S_GETREG_B32, dl: SL,
12787 VTs: DAG.getVTList(VT1: MVT::i32, VT2: MVT::Glue),
12788 Ops: {BitField, Glue});
12789 SavedDenormMode = SDValue(GetReg, 0);
12790
12791 Glue = DAG.getMergeValues(
12792 Ops: {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, dl: SL);
12793 }
12794
12795 SDNode *EnableDenorm;
12796 if (Subtarget->hasDenormModeInst()) {
12797 const SDValue EnableDenormValue =
12798 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, ST: Subtarget);
12799
12800 EnableDenorm = DAG.getNode(Opcode: AMDGPUISD::DENORM_MODE, DL: SL, VTList: BindParamVTs, N1: Glue,
12801 N2: EnableDenormValue)
12802 .getNode();
12803 } else {
12804 const SDValue EnableDenormValue =
12805 DAG.getConstant(FP_DENORM_FLUSH_NONE, DL: SL, VT: MVT::i32);
12806 EnableDenorm = DAG.getMachineNode(Opcode: AMDGPU::S_SETREG_B32, dl: SL, VTs: BindParamVTs,
12807 Ops: {EnableDenormValue, BitField, Glue});
12808 }
12809
12810 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12811 SDValue(EnableDenorm, 1)};
12812
12813 NegDivScale0 = DAG.getMergeValues(Ops, dl: SL);
12814 }
12815
12816 SDValue Fma0 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0,
12817 B: ApproxRcp, C: One, GlueChain: NegDivScale0, Flags);
12818
12819 SDValue Fma1 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: Fma0, B: ApproxRcp,
12820 C: ApproxRcp, GlueChain: Fma0, Flags);
12821
12822 SDValue Mul = getFPBinOp(DAG, Opcode: ISD::FMUL, SL, VT: MVT::f32, A: NumeratorScaled, B: Fma1,
12823 GlueChain: Fma1, Flags);
12824
12825 SDValue Fma2 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0, B: Mul,
12826 C: NumeratorScaled, GlueChain: Mul, Flags);
12827
12828 SDValue Fma3 =
12829 getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: Fma2, B: Fma1, C: Mul, GlueChain: Fma2, Flags);
12830
12831 SDValue Fma4 = getFPTernOp(DAG, Opcode: ISD::FMA, SL, VT: MVT::f32, A: NegDivScale0, B: Fma3,
12832 C: NumeratorScaled, GlueChain: Fma3, Flags);
12833
12834 if (!PreservesDenormals) {
12835 SDNode *DisableDenorm;
12836 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12837 const SDValue DisableDenormValue = getSPDenormModeValue(
12838 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, ST: Subtarget);
12839
12840 SDVTList BindParamVTs = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
12841 DisableDenorm =
12842 DAG.getNode(Opcode: AMDGPUISD::DENORM_MODE, DL: SL, VTList: BindParamVTs,
12843 N1: Fma4.getValue(R: 1), N2: DisableDenormValue, N3: Fma4.getValue(R: 2))
12844 .getNode();
12845 } else {
12846 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12847 const SDValue DisableDenormValue =
12848 HasDynamicDenormals
12849 ? SavedDenormMode
12850 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, DL: SL, VT: MVT::i32);
12851
12852 DisableDenorm = DAG.getMachineNode(
12853 Opcode: AMDGPU::S_SETREG_B32, dl: SL, VT: MVT::Other,
12854 Ops: {DisableDenormValue, BitField, Fma4.getValue(R: 1), Fma4.getValue(R: 2)});
12855 }
12856
12857 SDValue OutputChain = DAG.getNode(Opcode: ISD::TokenFactor, DL: SL, VT: MVT::Other,
12858 N1: SDValue(DisableDenorm, 0), N2: DAG.getRoot());
12859 DAG.setRoot(OutputChain);
12860 }
12861
12862 SDValue Scale = NumeratorScaled.getValue(R: 1);
12863 SDValue Fmas = DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL: SL, VT: MVT::f32,
12864 Ops: {Fma4, Fma1, Fma3, Scale}, Flags);
12865
12866 return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f32, N1: Fmas, N2: RHS, N3: LHS, Flags);
12867}
12868
// Lower full-precision f64 fdiv (X / Y) via the hardware
// div_scale / div_fmas / div_fixup sequence, refining an initial reciprocal
// estimate with FMA-based Newton-Raphson style iterations.
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  // Prefer the cheap approximate expansion when fast-math flags allow it.
  if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
    return FastLowered;

  SDLoc SL(Op);
  SDValue X = Op.getOperand(i: 0); // Numerator.
  SDValue Y = Op.getOperand(i: 1); // Denominator.

  const SDValue One = DAG.getConstantFP(Val: 1.0, DL: SL, VT: MVT::f64);

  // div_scale also produces an i1 condition consumed by div_fmas below.
  SDVTList ScaleVT = DAG.getVTList(VT1: MVT::f64, VT2: MVT::i1);

  // Scaled denominator.
  SDValue DivScale0 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: Y, N2: Y, N3: X);

  SDValue NegDivScale0 = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT: MVT::f64, Operand: DivScale0);

  // Initial reciprocal estimate of the scaled denominator.
  SDValue Rcp = DAG.getNode(Opcode: AMDGPUISD::RCP, DL: SL, VT: MVT::f64, Operand: DivScale0);

  // Refine the reciprocal: e = 1 - d * rcp; rcp' = rcp * e + rcp (twice).
  SDValue Fma0 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Rcp, N3: One);

  SDValue Fma1 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: Rcp, N2: Fma0, N3: Rcp);

  SDValue Fma2 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Fma1, N3: One);

  // Scaled numerator.
  SDValue DivScale1 = DAG.getNode(Opcode: AMDGPUISD::DIV_SCALE, DL: SL, VTList: ScaleVT, N1: X, N2: Y, N3: X);

  SDValue Fma3 = DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: Fma1, N2: Fma2, N3: Fma1);
  // Approximate quotient; Fma4 computes its residual for the final
  // correction step performed by div_fmas.
  SDValue Mul = DAG.getNode(Opcode: ISD::FMUL, DL: SL, VT: MVT::f64, N1: DivScale1, N2: Fma3);

  SDValue Fma4 =
      DAG.getNode(Opcode: ISD::FMA, DL: SL, VT: MVT::f64, N1: NegDivScale0, N2: Mul, N3: DivScale1);

  SDValue Scale;

  if (!Subtarget->hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    // Index of the high 32-bit word of an f64 viewed as v2i32.
    const SDValue Hi = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);

    // Figure out which scale to use for div_fmas by recomputing the condition
    // from the exponent-carrying high words of the operands and their
    // div_scale results.
    SDValue NumBC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: X);
    SDValue DenBC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: Y);
    SDValue Scale0BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: DivScale0);
    SDValue Scale1BC = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::v2i32, Operand: DivScale1);

    SDValue NumHi =
        DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: NumBC, N2: Hi);
    SDValue DenHi =
        DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: DenBC, N2: Hi);

    SDValue Scale0Hi =
        DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Scale0BC, N2: Hi);
    SDValue Scale1Hi =
        DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Scale1BC, N2: Hi);

    SDValue CmpDen = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: DenHi, RHS: Scale0Hi, Cond: ISD::SETEQ);
    SDValue CmpNum = DAG.getSetCC(DL: SL, VT: MVT::i1, LHS: NumHi, RHS: Scale1Hi, Cond: ISD::SETEQ);
    Scale = DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: CmpNum, N2: CmpDen);
  } else {
    Scale = DivScale1.getValue(R: 1);
  }

  // Final correction step, then undo the operand scaling and fix up special
  // inputs with div_fixup.
  SDValue Fmas =
      DAG.getNode(Opcode: AMDGPUISD::DIV_FMAS, DL: SL, VT: MVT::f64, N1: Fma4, N2: Fma3, N3: Mul, N4: Scale);

  return DAG.getNode(Opcode: AMDGPUISD::DIV_FIXUP, DL: SL, VT: MVT::f64, N1: Fmas, N2: Y, N3: X);
}
12937
12938SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12939 EVT VT = Op.getValueType();
12940
12941 if (VT == MVT::f32)
12942 return LowerFDIV32(Op, DAG);
12943
12944 if (VT == MVT::f64)
12945 return LowerFDIV64(Op, DAG);
12946
12947 if (VT == MVT::f16 || VT == MVT::bf16)
12948 return LowerFDIV16(Op, DAG);
12949
12950 llvm_unreachable("Unexpected type for fdiv");
12951}
12952
// Lower ISD::FFREXP: decompose Val into (mantissa, exponent) using the
// amdgcn.frexp.mant / amdgcn.frexp.exp intrinsics.
SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Val = Op.getOperand(i: 0);
  EVT VT = Val.getValueType();
  // Result #1 is the exponent. The instruction yields i16 for f16 sources and
  // i32 otherwise, which may differ from the node's expected exponent type.
  EVT ResultExpVT = Op->getValueType(ResNo: 1);
  EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;

  SDValue Mant = DAG.getNode(
      Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT,
      N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_frexp_mant, DL: dl, VT: MVT::i32), N2: Val);

  SDValue Exp = DAG.getNode(
      Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: InstrExpVT,
      N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_frexp_exp, DL: dl, VT: MVT::i32), N2: Val);

  if (Subtarget->hasFractBug()) {
    // NOTE(review): this workaround implies the frexp results are not usable
    // for non-finite inputs on targets with the fract bug — confirm against
    // the ISA docs. Explicitly select exponent 0 and pass the input through
    // as the mantissa when |Val| is not strictly below +inf (i.e. inf/nan).
    SDValue Fabs = DAG.getNode(Opcode: ISD::FABS, DL: dl, VT, Operand: Val);
    SDValue Inf =
        DAG.getConstantFP(Val: APFloat::getInf(Sem: VT.getFltSemantics()), DL: dl, VT);

    SDValue IsFinite = DAG.getSetCC(DL: dl, VT: MVT::i1, LHS: Fabs, RHS: Inf, Cond: ISD::SETOLT);
    SDValue Zero = DAG.getConstant(Val: 0, DL: dl, VT: InstrExpVT);
    Exp = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT: InstrExpVT, N1: IsFinite, N2: Exp, N3: Zero);
    Mant = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT, N1: IsFinite, N2: Mant, N3: Val);
  }

  // Convert the instruction's exponent type to the node's result type.
  SDValue CastExp = DAG.getSExtOrTrunc(Op: Exp, DL: dl, VT: ResultExpVT);
  return DAG.getMergeValues(Ops: {Mant, CastExp}, dl);
}
12982
// Custom store lowering: legalize i1 stores as i32 truncating stores, and
// split / scalarize / expand vector stores according to the address-space
// rules of the subtarget.
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1) {
    // Store i1 as a truncating store of the value extended to i32.
    return DAG.getTruncStore(
        Chain: Store->getChain(), dl: DL,
        Val: DAG.getSExtOrTrunc(Op: Store->getValue(), DL, VT: MVT::i32),
        Ptr: Store->getBasePtr(), SVT: MVT::i1, MMO: Store->getMemOperand());
  }

  // Everything else reaching here is a vector with i32 scalar elements.
  assert(VT.isVector() &&
         Store->getValue().getValueType().getScalarType() == MVT::i32);

  unsigned AS = Store->getAddressSpace();
  // Split underaligned multi-dword flat stores to work around the LDS
  // misaligned-access bug in WGP mode.
  if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
      AS == AMDGPUAS::FLAT_ADDRESS &&
      Store->getAlign().value() < VT.getStoreSize() &&
      VT.getSizeInBits() > 32) {
    return SplitVectorStore(Op, DAG);
  }

  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instruction access scratch memory
  // then we need to use the same legalization rules we use for private.
  if (AS == AMDGPUAS::FLAT_ADDRESS &&
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    AS = addressMayBeAccessedAsPrivate(MMO: Store->getMemOperand(), Info: *MFI)
             ? AMDGPUAS::PRIVATE_ADDRESS
             : AMDGPUAS::GLOBAL_ADDRESS;

  unsigned NumElements = VT.getVectorNumElements();
  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
    // At most four dwords per store.
    if (NumElements > 4)
      return SplitVectorStore(Op, DAG);
    // v3 stores not supported on SI.
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
      return SplitVectorStore(Op, DAG);

    // Expand stores the target cannot perform at this alignment.
    if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
                                        VT, MMO: *Store->getMemOperand()))
      return expandUnalignedStore(ST: Store, DAG);

    // Otherwise the store is legal as-is.
    return SDValue();
  }
  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
    // Private stores are limited by the configured max private element size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorStore(ST: Store, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorStore(Op, DAG);
      return SDValue();
    case 16:
      if (NumElements > 4 ||
          (NumElements == 3 && !Subtarget->hasFlatScratchEnabled()))
        return SplitVectorStore(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    unsigned Fast = 0;
    auto Flags = Store->getMemOperand()->getFlags();
    // Keep the store intact when a sufficiently fast misaligned access is
    // available; otherwise split vectors and expand whatever remains.
    if (allowsMisalignedMemoryAccessesImpl(Size: VT.getSizeInBits(), AddrSpace: AS,
                                           Alignment: Store->getAlign(), Flags, IsFast: &Fast) &&
        Fast > 1)
      return SDValue();

    if (VT.isVector())
      return SplitVectorStore(Op, DAG);

    return expandUnalignedStore(ST: Store, DAG);
  }

  // Probably an invalid store. If so we'll end up emitting a selection error.
  return SDValue();
}
13063
// Avoid the full correct expansion for f32 sqrt when promoting from f16:
// extend the operand to f32, use the hardware sqrt intrinsic, and round the
// result back down to f16.
SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);
  // This path is only used when f16 must be promoted (no 16-bit instructions).
  assert(!Subtarget->has16BitInsts());
  SDNodeFlags Flags = Op->getFlags();
  SDValue Ext =
      DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SL, VT: MVT::f32, Operand: Op.getOperand(i: 0), Flags);

  SDValue SqrtID = DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL: SL, VT: MVT::i32);
  SDValue Sqrt =
      DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::f32, N1: SqrtID, N2: Ext, Flags);

  return DAG.getNode(Opcode: ISD::FP_ROUND, DL: SL, VT: MVT::f16, N1: Sqrt,
                     N2: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i32), Flags);
}
13079
// f32 sqrt lowering. The raw hardware instruction is used when approximate
// math is allowed; otherwise small inputs are pre-scaled out of the denormal
// range, the estimate is corrected, and zero/inf inputs are passed through.
SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDNodeFlags Flags = Op->getFlags();
  MVT VT = Op.getValueType().getSimpleVT();
  const SDValue X = Op.getOperand(i: 0);

  if (allowApproxFunc(DAG, Flags)) {
    // Instruction is 1ulp but ignores denormals.
    return DAG.getNode(
        Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT,
        N1: DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL, VT: MVT::i32), N2: X, Flags);
  }

  // Inputs below 2^-96 are scaled up by 2^32 before the sqrt; the result is
  // compensated with a 2^-16 multiply afterwards.
  SDValue ScaleThreshold = DAG.getConstantFP(Val: 0x1.0p-96f, DL, VT);
  SDValue NeedScale = DAG.getSetCC(DL, VT: MVT::i1, LHS: X, RHS: ScaleThreshold, Cond: ISD::SETOLT);

  SDValue ScaleUpFactor = DAG.getConstantFP(Val: 0x1.0p+32f, DL, VT);

  SDValue ScaledX = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: X, N2: ScaleUpFactor, Flags);

  SDValue SqrtX =
      DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledX, N3: X, Flags);

  SDValue SqrtS;
  if (needsDenormHandlingF32(DAG, Src: X, Flags)) {
    // Use the hardware sqrt, then nudge the result by one ulp up or down
    // based on the signs of the FMA-computed residuals.
    SDValue SqrtID =
        DAG.getTargetConstant(Val: Intrinsic::amdgcn_sqrt, DL, VT: MVT::i32);
    SqrtS = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: SqrtID, N2: SqrtX, Flags);

    // Next representable value below the estimate (bit-pattern decrement).
    SDValue SqrtSAsInt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: SqrtS);
    SDValue SqrtSNextDownInt =
        DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SqrtSAsInt,
                    N2: DAG.getAllOnesConstant(DL, VT: MVT::i32));
    SDValue SqrtSNextDown = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextDownInt);

    SDValue NegSqrtSNextDown =
        DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextDown, Flags);

    // Residual x - s_down * s.
    SDValue SqrtVP =
        DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextDown, N2: SqrtS, N3: SqrtX, Flags);

    // Next representable value above the estimate (bit-pattern increment).
    SDValue SqrtSNextUpInt = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: SqrtSAsInt,
                                         N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
    SDValue SqrtSNextUp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: SqrtSNextUpInt);

    SDValue NegSqrtSNextUp = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtSNextUp, Flags);
    // Residual x - s_up * s.
    SDValue SqrtVS =
        DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtSNextUp, N2: SqrtS, N3: SqrtX, Flags);

    SDValue Zero = DAG.getConstantFP(Val: 0.0f, DL, VT);
    SDValue SqrtVPLE0 = DAG.getSetCC(DL, VT: MVT::i1, LHS: SqrtVP, RHS: Zero, Cond: ISD::SETOLE);

    // Step down if the lower neighbor's residual is <= 0.
    SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPLE0, N2: SqrtSNextDown, N3: SqrtS,
                        Flags);

    // Step up if the upper neighbor's residual is > 0.
    SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, VT: MVT::i1, LHS: SqrtVS, RHS: Zero, Cond: ISD::SETOGT);
    SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: SqrtVPVSGT0, N2: SqrtSNextUp, N3: SqrtS,
                        Flags);
  } else {
    // Denormal handling is not required: refine an rsq-based estimate with
    // FMA iteration steps instead.
    SDValue SqrtR = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT, Operand: SqrtX, Flags);

    SqrtS = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtX, N2: SqrtR, Flags);

    SDValue Half = DAG.getConstantFP(Val: 0.5f, DL, VT);
    SDValue SqrtH = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtR, N2: Half, Flags);
    SDValue NegSqrtH = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtH, Flags);

    SDValue SqrtE = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtH, N2: SqrtS, N3: Half, Flags);
    SqrtH = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtH, N2: SqrtE, N3: SqrtH, Flags);
    SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtS, N2: SqrtE, N3: SqrtS, Flags);

    // Final residual correction: s += (x - s*s) * h.
    SDValue NegSqrtS = DAG.getNode(Opcode: ISD::FNEG, DL, VT, Operand: SqrtS, Flags);
    SDValue SqrtD =
        DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: NegSqrtS, N2: SqrtS, N3: SqrtX, Flags);
    SqrtS = DAG.getNode(Opcode: ISD::FMA, DL, VT, N1: SqrtD, N2: SqrtH, N3: SqrtS, Flags);
  }

  // Undo the input scaling: sqrt(2^32) = 2^16, so multiply by 2^-16.
  SDValue ScaleDownFactor = DAG.getConstantFP(Val: 0x1.0p-16f, DL, VT);

  SDValue ScaledDown =
      DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: SqrtS, N2: ScaleDownFactor, Flags);

  SqrtS = DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: NeedScale, N2: ScaledDown, N3: SqrtS, Flags);
  // For +-0 and +inf inputs, return the (scaled) input itself.
  SDValue IsZeroOrInf =
      DAG.getNode(Opcode: ISD::IS_FPCLASS, DL, VT: MVT::i1, N1: SqrtX,
                  N2: DAG.getTargetConstant(Val: fcZero | fcPosInf, DL, VT: MVT::i32));

  return DAG.getNode(Opcode: ISD::SELECT, DL, VT, N1: IsZeroOrInf, N2: SqrtX, N3: SqrtS, Flags);
}
13169
SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
  // For double type, the SQRT and RSQ instructions don't have required
  // precision, we apply Goldschmidt's algorithm to improve the result:
  //
  // y0 = rsq(x)
  // g0 = x * y0
  // h0 = 0.5 * y0
  //
  // r0 = 0.5 - h0 * g0
  // g1 = g0 * r0 + g0
  // h1 = h0 * r0 + h0
  //
  // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
  // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
  // h2 = h1 * r1 + h1
  //
  // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
  // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
  //
  // sqrt(x) = g3

  SDNodeFlags Flags = Op->getFlags();

  SDLoc DL(Op);

  SDValue X = Op.getOperand(i: 0);
  // Inputs below 2^-767 are scaled up by 2^256 so the intermediate arithmetic
  // stays out of the denormal range; compensated by 2^-128 at the end.
  SDValue ScaleConstant = DAG.getConstantFP(Val: 0x1.0p-767, DL, VT: MVT::f64);

  SDValue Scaling = DAG.getSetCC(DL, VT: MVT::i1, LHS: X, RHS: ScaleConstant, Cond: ISD::SETOLT);

  SDValue ZeroInt = DAG.getConstant(Val: 0, DL, VT: MVT::i32);

  // Scale up input if it is too small.
  SDValue ScaleUpFactor = DAG.getConstant(Val: 256, DL, VT: MVT::i32);
  SDValue ScaleUp =
      DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::i32, N1: Scaling, N2: ScaleUpFactor, N3: ZeroInt);
  SDValue SqrtX = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: X, N2: ScaleUp, Flags);

  // y0 = rsq(x)
  SDValue SqrtY = DAG.getNode(Opcode: AMDGPUISD::RSQ, DL, VT: MVT::f64, Operand: SqrtX);

  // g0 = x * y0
  SDValue SqrtS0 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: SqrtX, N2: SqrtY);

  // h0 = 0.5 * y0
  SDValue Half = DAG.getConstantFP(Val: 0.5, DL, VT: MVT::f64);
  SDValue SqrtH0 = DAG.getNode(Opcode: ISD::FMUL, DL, VT: MVT::f64, N1: SqrtY, N2: Half);

  // r0 = 0.5 - h0 * g0
  SDValue NegSqrtH0 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtH0);
  SDValue SqrtR0 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtH0, N2: SqrtS0, N3: Half);

  // h1 = h0 * r0 + h0
  SDValue SqrtH1 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtH0, N2: SqrtR0, N3: SqrtH0);

  // g1 = g0 * r0 + g0
  SDValue SqrtS1 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtS0, N2: SqrtR0, N3: SqrtS0);

  // d0 = x - g1 * g1
  SDValue NegSqrtS1 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtS1);
  SDValue SqrtD0 =
      DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtS1, N2: SqrtS1, N3: SqrtX);

  // g2 = d0 * h1 + g1
  SDValue SqrtS2 = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtD0, N2: SqrtH1, N3: SqrtS1);

  // d1 = x - g2 * g2
  SDValue NegSqrtS2 = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f64, Operand: SqrtS2);
  SDValue SqrtD1 =
      DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: NegSqrtS2, N2: SqrtS2, N3: SqrtX);

  // g3 = d1 * h1 + g2
  SDValue SqrtRet = DAG.getNode(Opcode: ISD::FMA, DL, VT: MVT::f64, N1: SqrtD1, N2: SqrtH1, N3: SqrtS2);

  // Undo the scaling applied to small inputs (sqrt(2^256) = 2^128).
  SDValue ScaleDownFactor = DAG.getSignedConstant(Val: -128, DL, VT: MVT::i32);
  SDValue ScaleDown =
      DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::i32, N1: Scaling, N2: ScaleDownFactor, N3: ZeroInt);
  SqrtRet = DAG.getNode(Opcode: ISD::FLDEXP, DL, VT: MVT::f64, N1: SqrtRet, N2: ScaleDown, Flags);

  // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
  // with finite only or nsz because rsq(+/-0) = +/-inf

  // TODO: Check for DAZ and expand to subnormals
  SDValue IsZeroOrInf =
      DAG.getNode(Opcode: ISD::IS_FPCLASS, DL, VT: MVT::i1, N1: SqrtX,
                  N2: DAG.getTargetConstant(Val: fcZero | fcPosInf, DL, VT: MVT::i32));

  // If x is +INF, +0, or -0, use its original value
  return DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::f64, N1: IsZeroOrInf, N2: SqrtX, N3: SqrtRet,
                     Flags);
}
13251
// Lower FSIN/FCOS to the hardware SIN_HW/COS_HW nodes. The argument is first
// multiplied by 1/(2*pi), so the hardware ops evidently take their input in
// units of full turns rather than radians.
SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(i: 0);
  SDValue TrigVal;

  // Propagate fast-math flags so that the multiply we introduce can be folded
  // if Arg is already the result of a multiply by constant.
  auto Flags = Op->getFlags();

  // AMDGPUISD nodes of vector type must be unrolled here since
  // they will not be expanded elsewhere.
  auto UnrollIfVec = [&DAG](SDValue V) -> SDValue {
    if (!V.getValueType().isVector())
      return V;

    return DAG.UnrollVectorOp(N: cast<SDNode>(Val&: V));
  };

  SDValue OneOver2Pi = DAG.getConstantFP(Val: 0.5 * numbers::inv_pi, DL, VT);

  if (Subtarget->hasTrigReducedRange()) {
    // Targets with a reduced trig input range need an explicit FRACT for
    // range reduction of the pre-multiplied argument.
    SDValue MulVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags);
    TrigVal = UnrollIfVec(DAG.getNode(Opcode: AMDGPUISD::FRACT, DL, VT, Operand: MulVal, Flags));
  } else {
    TrigVal = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Arg, N2: OneOver2Pi, Flags);
  }

  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigVal = DAG.getNode(Opcode: AMDGPUISD::COS_HW, DL: SDLoc(Op), VT, Operand: TrigVal, Flags);
    break;
  case ISD::FSIN:
    TrigVal = DAG.getNode(Opcode: AMDGPUISD::SIN_HW, DL: SDLoc(Op), VT, Operand: TrigVal, Flags);
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }

  return UnrollIfVec(TrigVal);
}
13293
// Lower cmpxchg for flat/global address spaces by packing the new and old
// values into a single vector operand of the target ATOMIC_CMP_SWAP node.
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
                                               SelectionDAG &DAG) const {
  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Val&: Op);
  assert(AtomicNode->isCompareAndSwap());
  unsigned AS = AtomicNode->getAddressSpace();

  // No custom lowering required for local address space
  if (!AMDGPU::isFlatGlobalAddrSpace(AS))
    return Op;

  // Non-local address space requires custom lowering for atomic compare
  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
  SDLoc DL(Op);
  SDValue ChainIn = Op.getOperand(i: 0);
  SDValue Addr = Op.getOperand(i: 1);
  SDValue Old = Op.getOperand(i: 2);  // Expected (compare) value.
  SDValue New = Op.getOperand(i: 3);  // Replacement value.
  EVT VT = Op.getValueType();
  MVT SimpleVT = VT.getSimpleVT();
  MVT VecType = MVT::getVectorVT(VT: SimpleVT, NumElements: 2);

  // Element 0 carries the new value, element 1 the expected old value.
  SDValue NewOld = DAG.getBuildVector(VT: VecType, DL, Ops: {New, Old});
  SDValue Ops[] = {ChainIn, Addr, NewOld};

  return DAG.getMemIntrinsicNode(Opcode: AMDGPUISD::ATOMIC_CMP_SWAP, dl: DL,
                                 VTList: Op->getVTList(), Ops, MemVT: VT,
                                 MMO: AtomicNode->getMemOperand());
}
13322
13323//===----------------------------------------------------------------------===//
13324// Custom DAG optimizations
13325//===----------------------------------------------------------------------===//
13326
// Fold an int-to-FP conversion whose i32 source provably fits in the low byte
// into CVT_F32_UBYTE0, adding an FP_ROUND for f16 results.
SDValue
SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(ResNo: 0);
  EVT ScalarVT = VT.getScalarType();
  if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue Src = N->getOperand(Num: 0);
  EVT SrcVT = Src.getValueType();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
    // The high 24 bits must be known zero so only byte 0 contributes.
    if (DAG.MaskedValueIsZero(Op: Src, Mask: APInt::getHighBitsSet(numBits: 32, hiBitsSet: 24))) {
      SDValue Cvt = DAG.getNode(Opcode: AMDGPUISD::CVT_F32_UBYTE0, DL, VT: MVT::f32, Operand: Src);
      DCI.AddToWorklist(N: Cvt.getNode());

      // For the f16 case, fold to a cast to f32 and then cast back to f16.
      if (ScalarVT != MVT::f32) {
        Cvt = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Cvt,
                          N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
      }
      return Cvt;
    }
  }

  return SDValue();
}
13361
// Combine fcopysign when either side is (a vector of) f64: since an f64
// copysign only affects the high 32 bits, work on the f32 halves instead.
SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SDValue MagnitudeOp = N->getOperand(Num: 0);
  SDValue SignOp = N->getOperand(Num: 1);

  // The generic combine for fcopysign + fp cast is too conservative with
  // vectors, and also gets confused by the splitting we will perform here, so
  // peek through FP casts.
  if (SignOp.getOpcode() == ISD::FP_EXTEND ||
      SignOp.getOpcode() == ISD::FP_ROUND)
    SignOp = SignOp.getOperand(i: 0);

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT SignVT = SignOp.getValueType();

  // f64 fcopysign is really an f32 copysign on the high bits, so replace the
  // lower half with a copy.
  // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
  EVT MagVT = MagnitudeOp.getValueType();

  unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;

  if (MagVT.getScalarType() == MVT::f64) {
    // View each f64 magnitude element as a pair of f32s.
    EVT F32VT = MagVT.isVector()
                    ? EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::f32, NumElements: 2 * NumElts)
                    : MVT::v2f32;

    SDValue MagAsVector = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: F32VT, Operand: MagnitudeOp);

    SmallVector<SDValue, 8> NewElts;
    for (unsigned I = 0; I != NumElts; ++I) {
      SDValue MagLo =
          DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: MagAsVector,
                      N2: DAG.getConstant(Val: 2 * I, DL, VT: MVT::i32));
      SDValue MagHi =
          DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: MagAsVector,
                      N2: DAG.getConstant(Val: 2 * I + 1, DL, VT: MVT::i32));

      // Matching sign element for this magnitude element.
      SDValue SignOpElt =
          MagVT.isVector()
              ? DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: SignVT.getScalarType(),
                            N1: SignOp, N2: DAG.getConstant(Val: I, DL, VT: MVT::i32))
              : SignOp;

      // Only the high (sign-carrying) half needs the copysign.
      SDValue HiOp =
          DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: MVT::f32, N1: MagHi, N2: SignOpElt);

      SDValue Vector =
          DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: MVT::v2f32, N1: MagLo, N2: HiOp);

      SDValue NewElt = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f64, Operand: Vector);
      NewElts.push_back(Elt: NewElt);
    }

    if (NewElts.size() == 1)
      return NewElts[0];

    return DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: MagVT, Ops: NewElts);
  }

  if (SignVT.getScalarType() != MVT::f64)
    return SDValue();

  // Reduce width of sign operand, we only need the highest bit.
  //
  // fcopysign f64:x, f64:y ->
  // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
  // TODO: In some cases it might make sense to go all the way to f16.

  EVT F32VT = MagVT.isVector()
                  ? EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::f32, NumElements: 2 * NumElts)
                  : MVT::v2f32;

  SDValue SignAsVector = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: F32VT, Operand: SignOp);

  SmallVector<SDValue, 8> F32Signs;
  for (unsigned I = 0; I != NumElts; ++I) {
    // Take sign from odd elements of cast vector
    SDValue SignAsF32 =
        DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::f32, N1: SignAsVector,
                    N2: DAG.getConstant(Val: 2 * I + 1, DL, VT: MVT::i32));
    F32Signs.push_back(Elt: SignAsF32);
  }

  SDValue NewSign =
      NumElts == 1
          ? F32Signs.back()
          : DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL,
                        VT: EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::f32, NumElements: NumElts),
                        Ops: F32Signs);

  return DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 0),
                     N2: NewSign);
}
13457
13458// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
13459// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
13460// bits
13461
13462// This is a variant of
13463// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
13464//
13465// The normal DAG combiner will do this, but only if the add has one use since
13466// that would increase the number of instructions.
13467//
13468// This prevents us from seeing a constant offset that can be folded into a
13469// memory instruction's addressing mode. If we know the resulting add offset of
13470// a pointer can be folded into an addressing offset, we can replace the pointer
13471// operand with the add of new constant offset. This eliminates one of the uses,
13472// and may allow the remaining use to also be simplified.
13473//
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
                                               EVT MemVT,
                                               DAGCombinerInfo &DCI) const {
  // N is the SHL feeding a memory node's pointer operand; AddrSpace and MemVT
  // describe that access so addressing-mode legality can be checked.
  SDValue N0 = N->getOperand(Num: 0);
  SDValue N1 = N->getOperand(Num: 1);

  // We only do this to handle cases where it's profitable when there are
  // multiple uses of the add, so defer to the standard combine.
  if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
    return SDValue();

  // Both the shift amount and the inner offset must be constants.
  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Val&: N1);
  if (!CN1)
    return SDValue();

  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(Val: N0.getOperand(i: 1));
  if (!CAdd)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;

  // An OR only behaves like an add when the operands share no set bits.
  if (N0->getOpcode() == ISD::OR &&
      !DAG.haveNoCommonBitsSet(A: N0.getOperand(i: 0), B: N0.getOperand(i: 1)))
    return SDValue();

  // If the resulting offset is too large, we can't fold it into the
  // addressing mode offset.
  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
  Type *Ty = MemVT.getTypeForEVT(Context&: *DCI.DAG.getContext());

  AddrMode AM;
  AM.HasBaseReg = true;
  AM.BaseOffs = Offset.getSExtValue();
  if (!isLegalAddressingMode(DL: DCI.DAG.getDataLayout(), AM, Ty, AS: AddrSpace))
    return SDValue();

  SDLoc SL(N);
  EVT VT = N->getValueType(ResNo: 0);

  // (x + c1) << c2 -> (x << c2) + (c1 << c2).
  SDValue ShlX = DAG.getNode(Opcode: ISD::SHL, DL: SL, VT, N1: N0.getOperand(i: 0), N2: N1);
  SDValue COffset = DAG.getConstant(Val: Offset, DL: SL, VT);

  SDNodeFlags Flags;
  // nuw carries over only if the shift was nuw and the inner op was nuw (a
  // disjoint-bits OR can never wrap).
  Flags.setNoUnsignedWrap(
      N->getFlags().hasNoUnsignedWrap() &&
      (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));

  // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
  // be sure that the new left operand is a proper base pointer.
  return DAG.getNode(Opcode: ISD::ADD, DL: SL, VT, N1: ShlX, N2: COffset, Flags);
}
13525
/// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
/// index must be offset past the chain and intrinsic ID. Theoretically we would
/// also need to check the specific intrinsic, but they all place the pointer
/// operand first.
13529static unsigned getBasePtrIndex(const MemSDNode *N) {
13530 switch (N->getOpcode()) {
13531 case ISD::STORE:
13532 case ISD::INTRINSIC_W_CHAIN:
13533 case ISD::INTRINSIC_VOID:
13534 return 2;
13535 default:
13536 return 1;
13537 }
13538}
13539
// Try to fold a (shl (add x, c1), c2) feeding a memory node's pointer operand
// into the access's addressing-mode offset (see performSHLPtrCombine).
SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  unsigned PtrIdx = getBasePtrIndex(N);
  SDValue Ptr = N->getOperand(Num: PtrIdx);

  // TODO: We could also do this for multiplies.
  if (Ptr.getOpcode() == ISD::SHL) {
    SDValue NewPtr = performSHLPtrCombine(N: Ptr.getNode(), AddrSpace: N->getAddressSpace(),
                                          MemVT: N->getMemoryVT(), DCI);
    if (NewPtr) {
      // Swap in the rewritten pointer, keeping every other operand intact.
      SmallVector<SDValue, 8> NewOps(N->ops());

      NewOps[PtrIdx] = NewPtr;
      return SDValue(DAG.UpdateNodeOperands(N, Ops: NewOps), 0);
    }
  }

  return SDValue();
}
13561
13562static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13563 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13564 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13565 (Opc == ISD::XOR && Val == 0);
13566}
13567
// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
// integer combine opportunities since most 64-bit operations are decomposed
// this way. TODO: We won't want this for SALU especially if it is an inline
// immediate.
SDValue SITargetLowering::splitBinaryBitConstantOp(
    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
    const ConstantSDNode *CRHS) const {
  uint64_t Val = CRHS->getZExtValue();
  uint32_t ValLo = Lo_32(Value: Val);
  uint32_t ValHi = Hi_32(Value: Val);
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Split when either 32-bit half simplifies away entirely, or when a
  // single-use constant is not an inline immediate (it would need to be
  // materialized as a literal anyway).
  if ((bitOpWithConstantIsReducible(Opc, Val: ValLo) ||
       bitOpWithConstantIsReducible(Opc, Val: ValHi)) ||
      (CRHS->hasOneUse() && !TII->isInlineConstant(Imm: CRHS->getAPIntValue()))) {
    // We have 64-bit scalar and/or/xor, but do not have vector forms.
    // A uniform single use can stay 64-bit when the subtarget supports
    // 64-bit literals, so don't split in that case.
    if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
        !CRHS->user_begin()->isDivergent())
      return SDValue();

    // If we need to materialize a 64-bit immediate, it will be split up later
    // anyway. Avoid creating the harder to understand 64-bit immediate
    // materialization.
    return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
  }

  return SDValue();
}
13597
13598bool llvm::isBoolSGPR(SDValue V) {
13599 if (V.getValueType() != MVT::i1)
13600 return false;
13601 switch (V.getOpcode()) {
13602 default:
13603 break;
13604 case ISD::SETCC:
13605 case ISD::IS_FPCLASS:
13606 case AMDGPUISD::FP_CLASS:
13607 return true;
13608 case ISD::AND:
13609 case ISD::OR:
13610 case ISD::XOR:
13611 return isBoolSGPR(V: V.getOperand(i: 0)) && isBoolSGPR(V: V.getOperand(i: 1));
13612 case ISD::SADDO:
13613 case ISD::UADDO:
13614 case ISD::SSUBO:
13615 case ISD::USUBO:
13616 case ISD::SMULO:
13617 case ISD::UMULO:
13618 return V.getResNo() == 1;
13619 case ISD::INTRINSIC_WO_CHAIN: {
13620 unsigned IntrinsicID = V.getConstantOperandVal(i: 0);
13621 switch (IntrinsicID) {
13622 case Intrinsic::amdgcn_is_shared:
13623 case Intrinsic::amdgcn_is_private:
13624 return true;
13625 default:
13626 return false;
13627 }
13628
13629 return false;
13630 }
13631 }
13632 return false;
13633}
13634
// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
static uint32_t getConstantPermuteMask(uint32_t C) {
  // Walk the four bytes; each must be either 0x00 (fully cleared) or 0xff
  // (fully set) for the constant to be representable as a v_perm selector.
  for (unsigned Shift = 0; Shift < 32; Shift += 8) {
    const uint32_t Byte = (C >> Shift) & 0xff;
    if (Byte != 0x00 && Byte != 0xff)
      return 0; // Partial bytes selected.
  }
  return C;
}
13653
// Check if a node selects whole bytes from its operand 0 starting at a byte
// boundary while masking the rest. Returns select mask as in the v_perm_b32
// or -1 if not succeeded.
// Note byte select encoding:
// value 0-3 selects corresponding source byte;
// value 0xc selects zero;
// value 0xff selects 0xff.
static uint32_t getPermuteMask(SDValue V) {
  assert(V.getValueSizeInBits() == 32);

  // All handled opcodes are binary with a constant on operand 1.
  if (V.getNumOperands() != 2)
    return ~0;

  ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(Val: V.getOperand(i: 1));
  if (!N1)
    return ~0;

  uint32_t C = N1->getZExtValue();

  switch (V.getOpcode()) {
  default:
    break;
  case ISD::AND:
    // Kept bytes select the corresponding source byte (0x00..0x03 pattern
    // 0x03020100); cleared bytes select zero (0x0c).
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    break;

  case ISD::OR:
    // Bytes or'd with 0xff become 0xff selectors; the rest pass through.
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ~ConstMask) | ConstMask;
    break;

  case ISD::SHL:
    // Only whole-byte shifts can be expressed as a byte permute.
    if (C % 8)
      return ~0;

    // Shift the identity selector left; vacated low bytes read zero (0x0c)
    // from the 0x0c0c0c0c half of the 64-bit template.
    return uint32_t((0x030201000c0c0c0cull << C) >> 32);

  case ISD::SRL:
    if (C % 8)
      return ~0;

    // Same idea as SHL, but vacated high bytes become zero selectors.
    return uint32_t(0x0c0c0c0c03020100ull >> C);
  }

  return ~0;
}
13701
// DAG combine for ISD::AND. Tries, in order: splitting 64-bit and-with-
// constant into 32-bit halves, forming BFE for masked shifts (SDWA targets),
// folding a constant mask into an existing v_perm, fusing setcc/fp_class
// pairs into a single fp_class, turning and-with-sext-bool into a select,
// and finally matching two byte-permute operands into a single v_perm.
SDValue SITargetLowering::performAndCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
  if (VT == MVT::i64 && CRHS) {
    if (SDValue Split =
            splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::AND, LHS, CRHS))
      return Split;
  }

  if (CRHS && VT == MVT::i32) {
    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
    // nb = number of trailing zeroes in mask
    // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
    // given that we are selecting 8 or 16 bit fields starting at byte boundary.
    uint64_t Mask = CRHS->getZExtValue();
    unsigned Bits = llvm::popcount(Value: Mask);
    if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
        (Bits == 8 || Bits == 16) && isShiftedMask_64(Value: Mask) && !(Mask & 1)) {
      if (auto *CShift = dyn_cast<ConstantSDNode>(Val: LHS->getOperand(Num: 1))) {
        unsigned Shift = CShift->getZExtValue();
        unsigned NB = CRHS->getAPIntValue().countr_zero();
        unsigned Offset = NB + Shift;
        if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
          SDLoc SL(N);
          // Extract the field, assert its zero-extension so later combines
          // know the high bits, then re-shift into the masked position.
          SDValue BFE =
              DAG.getNode(Opcode: AMDGPUISD::BFE_U32, DL: SL, VT: MVT::i32, N1: LHS->getOperand(Num: 0),
                          N2: DAG.getConstant(Val: Offset, DL: SL, VT: MVT::i32),
                          N3: DAG.getConstant(Val: Bits, DL: SL, VT: MVT::i32));
          EVT NarrowVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: Bits);
          SDValue Ext = DAG.getNode(Opcode: ISD::AssertZext, DL: SL, VT, N1: BFE,
                                    N2: DAG.getValueType(NarrowVT));
          SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL: SDLoc(LHS), VT, N1: Ext,
                                    N2: DAG.getConstant(Val: NB, DL: SDLoc(CRHS), VT: MVT::i32));
          return Shl;
        }
      }
    }

    // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
    if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
        isa<ConstantSDNode>(Val: LHS.getOperand(i: 2))) {
      uint32_t Sel = getConstantPermuteMask(C: Mask);
      if (!Sel)
        return SDValue();

      // Select 0xc for all zero bytes
      Sel = (LHS.getConstantOperandVal(i: 2) & Sel) | (~Sel & 0x0c0c0c0c);
      SDLoc DL(N);
      return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
                         N2: LHS.getOperand(i: 1), N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
    }
  }

  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: 2))->get();
    ISD::CondCode RCC = cast<CondCodeSDNode>(Val: RHS.getOperand(i: 2))->get();

    SDValue X = LHS.getOperand(i: 0);
    SDValue Y = RHS.getOperand(i: 0);
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(i: 0) != X ||
        !isTypeLegal(VT: X.getValueType()))
      return SDValue();

    if (LCC == ISD::SETO) {
      if (X != LHS.getOperand(i: 1))
        return SDValue();

      if (RCC == ISD::SETUNE) {
        const ConstantFPSDNode *C1 =
            dyn_cast<ConstantFPSDNode>(Val: RHS.getOperand(i: 1));
        if (!C1 || !C1->isInfinity() || C1->isNegative())
          return SDValue();

        // "Ordered and not +inf (by magnitude)" == everything finite.
        const uint32_t Mask = SIInstrFlags::N_NORMAL |
                              SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
                              SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
                              SIInstrFlags::P_NORMAL;

        static_assert(
            ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
                SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
             0x3ff) == Mask,
            "mask not equal");

        SDLoc DL(N);
        return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: X,
                           N2: DAG.getConstant(Val: Mask, DL, VT: MVT::i32));
      }
    }
  }

  // Canonicalize so the fp_class (if any) is on the RHS.
  if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
    std::swap(a&: LHS, b&: RHS);

  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
      RHS.hasOneUse()) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(Val: LHS.getOperand(i: 2))->get();
    // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
    // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
    // | n_nan)
    const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: 1));
    if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
        (RHS.getOperand(i: 0) == LHS.getOperand(i: 0) &&
         LHS.getOperand(i: 0) == LHS.getOperand(i: 1))) {
      const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
      unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
                                          : Mask->getZExtValue() & OrdMask;

      SDLoc DL(N);
      return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: RHS.getOperand(i: 0),
                         N2: DAG.getConstant(Val: NewMask, DL, VT: MVT::i32));
    }
  }

  if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
                         LHS.getOpcode() == ISD::SIGN_EXTEND)) {
    // and x, (sext cc from i1) => select cc, x, 0
    if (RHS.getOpcode() != ISD::SIGN_EXTEND)
      std::swap(a&: LHS, b&: RHS);
    if (isBoolSGPR(V: RHS.getOperand(i: 0)))
      return DAG.getSelect(DL: SDLoc(N), VT: MVT::i32, Cond: RHS.getOperand(i: 0), LHS,
                           RHS: DAG.getConstant(Val: 0, DL: SDLoc(N), VT: MVT::i32));
  }

  // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {
    uint32_t LHSMask = getPermuteMask(V: LHS);
    uint32_t RHSMask = getPermuteMask(V: RHS);
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(a&: LHSMask, b&: RHSMask);
        std::swap(a&: LHS, b&: RHS);
      }

      // Select 0xc for each lane used from source operand. Zero has 0xc mask
      // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Each byte in each mask is either selector mask 0-3, or has higher
        // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
        // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
        // mask which is not 0xff wins. By anding both masks we have a correct
        // result except that 0x0c shall be corrected to give 0x0c only.
        uint32_t Mask = LHSMask & RHSMask;
        for (unsigned I = 0; I < 32; I += 8) {
          uint32_t ByteSel = 0xff << I;
          // NOTE(review): for I > 0, (LHSMask & ByteSel) can never compare
          // equal to the unshifted 0x0c, so this correction only ever fires
          // for byte 0. Presumably the comparison was meant to be against
          // (0x0c << I) — verify against upstream intent before changing.
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
        }

        // Add 4 to each active LHS lane. It will not affect any existing 0xff
        // or 0x0c.
        uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
        SDLoc DL(N);

        return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
                           N2: RHS.getOperand(i: 0),
                           N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
      }
    }
  }

  return SDValue();
}
13886
13887// A key component of v_perm is a mapping between byte position of the src
13888// operands, and the byte position of the dest. To provide such, we need: 1. the
13889// node that provides x byte of the dest of the OR, and 2. the byte of the node
13890// used to provide that x byte. calculateByteProvider finds which node provides
13891// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13892// and finds an ultimate src and byte position For example: The supported
13893// LoadCombine pattern for vector loads is as follows
13894// t1
13895// or
13896// / \
13897// t2 t3
13898// zext shl
13899// | | \
13900// t4 t5 16
13901// or anyext
13902// / \ |
13903// t6 t7 t8
13904// srl shl or
13905// / | / \ / \
13906// t9 t10 t11 t12 t13 t14
13907// trunc* 8 trunc* 8 and and
13908// | | / | | \
13909// t15 t16 t17 t18 t19 t20
13910// trunc* 255 srl -256
13911// | / \
13912// t15 t15 16
13913//
13914// *In this example, the truncs are from i32->i16
13915//
13916// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13917// respectively. calculateSrcByte would find (given node) -> ultimate src &
// byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13919// After finding the mapping, we can combine the tree into vperm t15, t16,
13920// 0x05000407
13921
// Find the source and byte position from a node.
// \p DestByte is the byte position of the dest of the or that the src
// ultimately provides. \p SrcIndex is the byte of the src that maps to this
// dest of the or byte. \p Depth tracks how many recursive iterations we have
// performed.
static const std::optional<ByteProvider<SDValue>>
calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
                 unsigned Depth = 0) {
  // We may need to recursively traverse a series of SRLs
  if (Depth >= 6)
    return std::nullopt;

  // A source narrower than a byte cannot provide one.
  if (Op.getValueSizeInBits() < 8)
    return std::nullopt;

  // Vectors terminate the walk; record the vector value as the source.
  if (Op.getValueType().isVector())
    return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex);

  switch (Op->getOpcode()) {
  case ISD::TRUNCATE: {
    // Truncation keeps the low bytes, so the byte index is unchanged.
    return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
  }

  case ISD::ANY_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND_INREG: {
    SDValue NarrowOp = Op->getOperand(Num: 0);
    auto NarrowVT = NarrowOp.getValueType();
    if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
      auto *VTSign = cast<VTSDNode>(Val: Op->getOperand(Num: 1));
      NarrowVT = VTSign->getVT();
    }
    if (!NarrowVT.isByteSized())
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowVT.getStoreSize();

    // Bytes past the narrow width are extension bits, not source bytes.
    if (SrcIndex >= NarrowByteWidth)
      return std::nullopt;
    return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
  }

  case ISD::SRA:
  case ISD::SRL: {
    auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();

    // Only whole-byte shifts preserve byte boundaries.
    if (BitShift % 8 != 0)
      return std::nullopt;

    // A right shift by k bytes means byte i of the result is byte i+k of the
    // shift's source.
    SrcIndex += BitShift / 8;

    return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte, SrcIndex, Depth: Depth + 1);
  }

  default: {
    // Any other node is an ultimate source.
    return ByteProvider<SDValue>::getSrc(Val: Op, ByteOffset: DestByte, VectorOffset: SrcIndex);
  }
  }
  llvm_unreachable("fully handled switch");
}
13986
// For a byte position in the result of an Or, traverse the tree and find the
// node (and the byte of the node) which ultimately provides this {Or,
// BytePosition}. \p Op is the operand we are currently examining. \p Index is
// the byte position of the Op that corresponds with the originally requested
// byte of the Or \p Depth tracks how many recursive iterations we have
// performed. \p StartingIndex is the originally requested byte of the Or
static const std::optional<ByteProvider<SDValue>>
calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
                      unsigned StartingIndex = 0) {
  // Finding Src tree of RHS of or typically requires at least 1 additional
  // depth
  if (Depth > 6)
    return std::nullopt;

  unsigned BitWidth = Op.getScalarValueSizeInBits();
  if (BitWidth % 8 != 0)
    return std::nullopt;
  if (Index > BitWidth / 8 - 1)
    return std::nullopt;

  bool IsVec = Op.getValueType().isVector();
  switch (Op.getOpcode()) {
  case ISD::OR: {
    if (IsVec)
      return std::nullopt;

    auto RHS = calculateByteProvider(Op: Op.getOperand(i: 1), Index, Depth: Depth + 1,
                                     StartingIndex);
    if (!RHS)
      return std::nullopt;
    auto LHS = calculateByteProvider(Op: Op.getOperand(i: 0), Index, Depth: Depth + 1,
                                     StartingIndex);
    if (!LHS)
      return std::nullopt;
    // A well formed Or will have two ByteProviders for each byte, one of which
    // is constant zero
    if (!LHS->isConstantZero() && !RHS->isConstantZero())
      return std::nullopt;
    // NOTE(review): LHS/RHS were already checked non-null above, so the !LHS
    // and !RHS tests below are redundant (but harmless).
    if (!LHS || LHS->isConstantZero())
      return RHS;
    if (!RHS || RHS->isConstantZero())
      return LHS;
    return std::nullopt;
  }

  case ISD::AND: {
    if (IsVec)
      return std::nullopt;

    auto *BitMaskOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
    if (!BitMaskOp)
      return std::nullopt;

    uint32_t BitMask = BitMaskOp->getZExtValue();
    // Bits we expect for our StartingIndex
    uint32_t IndexMask = 0xFF << (Index * 8);

    if ((IndexMask & BitMask) != IndexMask) {
      // If the result of the and partially provides the byte, then it
      // is not well formatted
      if (IndexMask & BitMask)
        return std::nullopt;
      // The byte is fully masked off, so it is a known zero.
      return ByteProvider<SDValue>::getConstantZero();
    }

    // The byte passes through the mask untouched.
    return calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte: StartingIndex, SrcIndex: Index);
  }

  case ISD::FSHR: {
    if (IsVec)
      return std::nullopt;

    // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
    auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 2));
    if (!ShiftOp || Op.getValueType().isVector())
      return std::nullopt;

    uint64_t BitsProvided = Op.getValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getAPIntValue().urem(RHS: BitsProvided);
    if (BitShift % 8)
      return std::nullopt;

    // Model the funnel shift as selecting from the byte concatenation X:Y,
    // which is 2 * BitsProvided bits = BitsProvided / 4 bytes wide.
    uint64_t ConcatSizeInBytes = BitsProvided / 4;
    uint64_t ByteShift = BitShift / 8;

    uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
    uint64_t BytesProvided = BitsProvided / 8;
    // Indices in the high half of the concatenation come from X (operand 0),
    // the low half from Y (operand 1).
    SDValue NextOp = Op.getOperand(i: NewIndex >= BytesProvided ? 0 : 1);
    NewIndex %= BytesProvided;
    return calculateByteProvider(Op: NextOp, Index: NewIndex, Depth: Depth + 1, StartingIndex);
  }

  case ISD::SRA:
  case ISD::SRL: {
    if (IsVec)
      return std::nullopt;

    auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8)
      return std::nullopt;

    auto BitsProvided = Op.getScalarValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;

    uint64_t BytesProvided = BitsProvided / 8;
    uint64_t ByteShift = BitShift / 8;
    // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
    // If the byte we are trying to provide (as tracked by index) falls in this
    // range, then the SRL provides the byte. The byte of interest of the src of
    // the SRL is Index + ByteShift
    return BytesProvided - ByteShift > Index
               ? calculateSrcByte(Op: Op->getOperand(Num: 0), DestByte: StartingIndex,
                                  SrcIndex: Index + ByteShift)
               : ByteProvider<SDValue>::getConstantZero();
  }

  case ISD::SHL: {
    if (IsVec)
      return std::nullopt;

    auto *ShiftOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return std::nullopt;
    uint64_t ByteShift = BitShift / 8;

    // If we are shifting by an amount greater than (or equal to)
    // the index we are trying to provide, then it provides 0s. If not,
    // then this bytes are not definitively 0s, and the corresponding byte
    // of interest is Index - ByteShift of the src
    return Index < ByteShift
               ? ByteProvider<SDValue>::getConstantZero()
               : calculateByteProvider(Op: Op.getOperand(i: 0), Index: Index - ByteShift,
                                       Depth: Depth + 1, StartingIndex);
  }
  case ISD::ANY_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND_INREG:
  case ISD::AssertZext:
  case ISD::AssertSext: {
    if (IsVec)
      return std::nullopt;

    SDValue NarrowOp = Op->getOperand(Num: 0);
    unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
    // For in-register extensions and asserts, the effective narrow width is
    // given by the VT operand, not the operand's full width.
    if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
        Op->getOpcode() == ISD::AssertZext ||
        Op->getOpcode() == ISD::AssertSext) {
      auto *VTSign = cast<VTSDNode>(Val: Op->getOperand(Num: 1));
      NarrowBitWidth = VTSign->getVT().getSizeInBits();
    }
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    // Bytes beyond the narrow value are known zero only for zero-extension.
    if (Index >= NarrowByteWidth)
      return Op.getOpcode() == ISD::ZERO_EXTEND
                 ? std::optional<ByteProvider<SDValue>>(
                       ByteProvider<SDValue>::getConstantZero())
                 : std::nullopt;
    return calculateByteProvider(Op: NarrowOp, Index, Depth: Depth + 1, StartingIndex);
  }

  case ISD::TRUNCATE: {
    if (IsVec)
      return std::nullopt;

    uint64_t NarrowByteWidth = BitWidth / 8;

    // NOTE(review): Index < BitWidth / 8 is already guaranteed by the bounds
    // check at the top of this function, so this condition always holds;
    // presumably ">" was intended but the current form is conservative-safe.
    if (NarrowByteWidth >= Index) {
      return calculateByteProvider(Op: Op.getOperand(i: 0), Index, Depth: Depth + 1,
                                   StartingIndex);
    }

    return std::nullopt;
  }

  case ISD::CopyFromReg: {
    if (BitWidth / 8 > Index)
      return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index);

    return std::nullopt;
  }

  case ISD::LOAD: {
    auto *L = cast<LoadSDNode>(Val: Op.getNode());

    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    // If the width of the load does not reach byte we are trying to provide for
    // and it is not a ZEXTLOAD, then the load does not provide for the byte in
    // question
    if (Index >= NarrowByteWidth) {
      return L->getExtensionType() == ISD::ZEXTLOAD
                 ? std::optional<ByteProvider<SDValue>>(
                       ByteProvider<SDValue>::getConstantZero())
                 : std::nullopt;
    }

    if (NarrowByteWidth > Index) {
      return calculateSrcByte(Op, DestByte: StartingIndex, SrcIndex: Index);
    }

    return std::nullopt;
  }

  case ISD::BSWAP: {
    if (IsVec)
      return std::nullopt;

    // Byte i of a bswap comes from mirrored byte (width - 1 - i) of the input.
    return calculateByteProvider(Op: Op->getOperand(Num: 0), Index: BitWidth / 8 - Index - 1,
                                 Depth: Depth + 1, StartingIndex);
  }

  case ISD::EXTRACT_VECTOR_ELT: {
    auto *IdxOp = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
    if (!IdxOp)
      return std::nullopt;
    auto VecIdx = IdxOp->getZExtValue();
    auto ScalarSize = Op.getScalarValueSizeInBits();
    // For sub-dword elements, rebase Index onto the flattened vector.
    if (ScalarSize < 32)
      Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
    return calculateSrcByte(Op: ScalarSize >= 32 ? Op : Op.getOperand(i: 0),
                            DestByte: StartingIndex, SrcIndex: Index);
  }

  case AMDGPUISD::PERM: {
    if (IsVec)
      return std::nullopt;

    auto *PermMask = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 2));
    if (!PermMask)
      return std::nullopt;

    // Decode the per-byte selector for this Index (see v_perm_b32 encoding).
    auto IdxMask =
        (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
    if (IdxMask > 0x07 && IdxMask != 0x0c)
      return std::nullopt;

    // Selectors 4-7 read operand 0; 0-3 read operand 1; 0x0c is zero.
    auto NextOp = Op.getOperand(i: IdxMask > 0x03 ? 0 : 1);
    auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;

    return IdxMask != 0x0c ? calculateSrcByte(Op: NextOp, DestByte: StartingIndex, SrcIndex: NextIndex)
                           : ByteProvider<SDValue>(
                                 ByteProvider<SDValue>::getConstantZero());
  }

  default: {
    return std::nullopt;
  }
  }

  llvm_unreachable("fully handled switch");
}
14256
14257// Returns true if the Operand is a scalar and is 16 bits
14258static bool isExtendedFrom16Bits(SDValue &Operand) {
14259
14260 switch (Operand.getOpcode()) {
14261 case ISD::ANY_EXTEND:
14262 case ISD::SIGN_EXTEND:
14263 case ISD::ZERO_EXTEND: {
14264 auto OpVT = Operand.getOperand(i: 0).getValueType();
14265 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
14266 }
14267 case ISD::LOAD: {
14268 LoadSDNode *L = cast<LoadSDNode>(Val: Operand.getNode());
14269 auto ExtType = cast<LoadSDNode>(Val: L)->getExtensionType();
14270 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
14271 ExtType == ISD::EXTLOAD) {
14272 auto MemVT = L->getMemoryVT();
14273 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
14274 }
14275 return L->getMemoryVT().getSizeInBits() == 16;
14276 }
14277 default:
14278 return false;
14279 }
14280}
14281
// Returns true if the mask matches consecutive bytes, and the first byte
// begins at a power of 2 byte offset from 0th byte
static bool addresses16Bits(int Mask) {
  const int LowSel = Mask & 0xff;
  const int HiSel = (Mask & 0xff00) >> 8;

  assert(LowSel < 8 && HiSel < 8);
  // The two selected bytes must be adjacent (increasing address order) and
  // the lower one must sit on an even (16-bit aligned) boundary. A counter
  // example is taking 2 consecutive bytes starting at the 8th bit: we would
  // still need code to extract the 16 bit operand, so i8 v_perm is better.
  const bool Adjacent = HiSel == LowSel + 1;
  const bool EvenStart = (LowSel & 1) == 0;
  return Adjacent && EvenStart;
}
14299
14300// Do not lower into v_perm if the operands are actually 16 bit
14301// and the selected bits (based on PermMask) correspond with two
14302// easily addressable 16 bit operands.
14303static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
14304 SDValue &OtherOp) {
14305 int Low16 = PermMask & 0xffff;
14306 int Hi16 = (PermMask & 0xffff0000) >> 16;
14307
14308 auto TempOp = peekThroughBitcasts(V: Op);
14309 auto TempOtherOp = peekThroughBitcasts(V: OtherOp);
14310
14311 auto OpIs16Bit =
14312 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(Operand&: TempOp);
14313 if (!OpIs16Bit)
14314 return true;
14315
14316 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14317 isExtendedFrom16Bits(Operand&: TempOtherOp);
14318 if (!OtherOpIs16Bit)
14319 return true;
14320
14321 // Do we cleanly address both
14322 return !addresses16Bits(Mask: Low16) || !addresses16Bits(Mask: Hi16);
14323}
14324
// Extract the 32-bit word at dword offset \p DWordOffset from \p Src,
// returned as an i32. Handles scalar sources (shift + truncate) and vector
// sources of any element width (element extraction, sub-element shifts, or
// re-packing of sub-dword elements).
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
                                  unsigned DWordOffset) {
  SDValue Ret;

  auto TypeSize = Src.getValueSizeInBits().getFixedValue();
  // ByteProvider must be at least 8 bits
  assert(Src.getValueSizeInBits().isKnownMultipleOf(8));

  // Whole value fits in one dword; just reinterpret it as i32.
  if (TypeSize <= 32)
    return DAG.getBitcastedAnyExtOrTrunc(Op: Src, DL: SL, VT: MVT::i32);

  if (Src.getValueType().isVector()) {
    auto ScalarTySize = Src.getScalarValueSizeInBits();
    auto ScalarTy = Src.getValueType().getScalarType();
    if (ScalarTySize == 32) {
      // Dword offset maps 1:1 onto element index.
      return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Src,
                         N2: DAG.getConstant(Val: DWordOffset, DL: SL, VT: MVT::i32));
    }
    if (ScalarTySize > 32) {
      // Extract the element containing the dword, then shift the wanted
      // dword down to the low 32 bits.
      Ret = DAG.getNode(
          Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ScalarTy, N1: Src,
          N2: DAG.getConstant(Val: DWordOffset / (ScalarTySize / 32), DL: SL, VT: MVT::i32));
      auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
      if (ShiftVal)
        Ret = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: Ret.getValueType(), N1: Ret,
                          N2: DAG.getConstant(Val: ShiftVal, DL: SL, VT: MVT::i32));
      return DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
    }

    // Sub-dword elements: gather the elements covering this dword (clamped
    // to however many remain) and rebuild a vector we can bitcast to i32.
    assert(ScalarTySize < 32);
    auto NumElements = TypeSize / ScalarTySize;
    auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
    auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
    auto NumElementsIn32 = 32 / ScalarTySize;
    auto NumAvailElements = DWordOffset < Trunc32Elements
                                ? NumElementsIn32
                                : NumElements - NormalizedTrunc;

    SmallVector<SDValue, 4> VecSrcs;
    DAG.ExtractVectorElements(Op: Src, Args&: VecSrcs, Start: DWordOffset * NumElementsIn32,
                              Count: NumAvailElements);

    Ret = DAG.getBuildVector(
        VT: MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: ScalarTySize), NumElements: NumAvailElements), DL: SL,
        Ops: VecSrcs);
    return Ret = DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
  }

  /// Scalar Type
  auto ShiftVal = 32 * DWordOffset;
  Ret = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: Src.getValueType(), N1: Src,
                    N2: DAG.getConstant(Val: ShiftVal, DL: SL, VT: MVT::i32));
  return DAG.getBitcastedAnyExtOrTrunc(Op: Ret, DL: SL, VT: MVT::i32);
}
14379
/// Try to rewrite the 4-byte composition rooted at \p N (an i32 node, e.g. an
/// OR-of-shifted-bytes tree or an FSHR) as a single AMDGPUISD::PERM, which
/// selects each of the four result bytes from two 32-bit sources via an
/// 8-nibble selector mask. Returns an empty SDValue when no match is found.
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  [[maybe_unused]] EVT VT = N->getValueType(ResNo: 0);
  SmallVector<ByteProvider<SDValue>, 8> PermNodes;

  // VT is known to be MVT::i32, so we need to provide 4 bytes.
  assert(VT == MVT::i32);
  for (int i = 0; i < 4; i++) {
    // Find the ByteProvider that provides the ith byte of the result of OR
    std::optional<ByteProvider<SDValue>> P =
        calculateByteProvider(Op: SDValue(N, 0), Index: i, Depth: 0, /*StartingIndex = */ i);
    // TODO support constantZero
    if (!P || P->isConstantZero())
      return SDValue();

    PermNodes.push_back(Elt: *P);
  }
  if (PermNodes.size() != 4)
    return SDValue();

  // A source is identified by (index of a representative ByteProvider,
  // dword offset of the used bytes within that provider's ultimate source).
  std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
  std::optional<std::pair<unsigned, unsigned>> SecondSrc;
  uint64_t PermMask = 0x00000000;
  for (size_t i = 0; i < PermNodes.size(); i++) {
    auto PermOp = PermNodes[i];
    // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
    // by sizeof(Src2) = 4
    int SrcByteAdjust = 4;

    // If the Src uses a byte from a different DWORD, then it corresponds
    // with a different source
    if (!PermOp.hasSameSrc(Other: PermNodes[FirstSrc.first]) ||
        ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
      // A third distinct source cannot be expressed by a single v_perm.
      if (SecondSrc)
        if (!PermOp.hasSameSrc(Other: PermNodes[SecondSrc->first]) ||
            ((PermOp.SrcOffset / 4) != SecondSrc->second))
          return SDValue();

      // Set the index of the second distinct Src node
      SecondSrc = {i, PermNodes[i].SrcOffset / 4};
      assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
      SrcByteAdjust = 0;
    }
    assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
    assert(!DAG.getDataLayout().isBigEndian());
    // Accumulate this result byte's selector nibble into the PERM mask.
    PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
  }
  SDLoc DL(N);
  SDValue Op = *PermNodes[FirstSrc.first].Src;
  Op = getDWordFromOffset(DAG, SL: DL, Src: Op, DWordOffset: FirstSrc.second);
  assert(Op.getValueSizeInBits() == 32);

  // Check that we are not just extracting the bytes in order from an op
  if (!SecondSrc) {
    int Low16 = PermMask & 0xffff;
    int Hi16 = (PermMask & 0xffff0000) >> 16;

    // 0x0504/0x0706 select bytes 0-3 of the high src slot; 0x0100/0x0302
    // select bytes 0-3 of the low slot — either way the identity order.
    bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
    bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);

    // The perm op would really just produce Op. So combine into Op
    if (WellFormedLow && WellFormedHi)
      return DAG.getBitcast(VT: MVT::getIntegerVT(BitWidth: 32), V: Op);
  }

  SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;

  if (SecondSrc) {
    OtherOp = getDWordFromOffset(DAG, SL: DL, Src: OtherOp, DWordOffset: SecondSrc->second);
    assert(OtherOp.getValueSizeInBits() == 32);
  }

  // Check that we haven't just recreated the same FSHR node.
  if (N->getOpcode() == ISD::FSHR &&
      (N->getOperand(Num: 0) == Op || N->getOperand(Num: 0) == OtherOp) &&
      (N->getOperand(Num: 1) == Op || N->getOperand(Num: 1) == OtherOp))
    return SDValue();

  // Only emit a PERM when it selects something 16-bit-granular ops couldn't.
  if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {

    assert(Op.getValueType().isByteSized() &&
           OtherOp.getValueType().isByteSized());

    // If the ultimate src is less than 32 bits, then we will only be
    // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
    // CalculateByteProvider would not have returned Op as source if we
    // used a byte that is outside its ValueType. Thus, we are free to
    // ANY_EXTEND as the extended bits are dont-cares.
    Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, VT: MVT::i32);
    OtherOp = DAG.getBitcastedAnyExtOrTrunc(Op: OtherOp, DL, VT: MVT::i32);

    return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: Op, N2: OtherOp,
                       N3: DAG.getConstant(Val: PermMask, DL, VT: MVT::i32));
  }
  return SDValue();
}
14476
/// DAG combine for ISD::OR: folds fp_class pairs, merges constants into an
/// existing PERM, forms new v_perm_b32 patterns from byte-wise ors, collapses
/// an identity v2i32 build_vector pattern, and splits i64 ors with a
/// zero-extended or constant operand after legalization.
SDValue SITargetLowering::performOrCombine(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  EVT VT = N->getValueType(ResNo: 0);
  if (VT == MVT::i1) {
    // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
    if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
        RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
      // Both tests must classify the same value for the masks to be merged.
      SDValue Src = LHS.getOperand(i: 0);
      if (Src != RHS.getOperand(i: 0))
        return SDValue();

      const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
      const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: RHS.getOperand(i: 1));
      if (!CLHS || !CRHS)
        return SDValue();

      // Only 10 bits are used.
      static const uint32_t MaxMask = 0x3ff;

      uint32_t NewMask =
          (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
      SDLoc DL(N);
      return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL, VT: MVT::i1, N1: Src,
                         N2: DAG.getConstant(Val: NewMask, DL, VT: MVT::i32));
    }

    return SDValue();
  }

  // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
  if (isa<ConstantSDNode>(Val: RHS) && LHS.hasOneUse() &&
      LHS.getOpcode() == AMDGPUISD::PERM &&
      isa<ConstantSDNode>(Val: LHS.getOperand(i: 2))) {
    uint32_t Sel = getConstantPermuteMask(C: N->getConstantOperandVal(Num: 1));
    if (!Sel)
      return SDValue();

    Sel |= LHS.getConstantOperandVal(i: 2);
    SDLoc DL(N);
    return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
                       N2: LHS.getOperand(i: 1), N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
  }

  // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() && TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {

    // If all the uses of an or need to extract the individual elements, do not
    // attempt to lower into v_perm
    auto usesCombinedOperand = [](SDNode *OrUse) {
      // If we have any non-vectorized use, then it is a candidate for v_perm
      if (OrUse->getOpcode() != ISD::BITCAST ||
          !OrUse->getValueType(ResNo: 0).isVector())
        return true;

      // If we have any non-vectorized use, then it is a candidate for v_perm
      for (auto *VUser : OrUse->users()) {
        if (!VUser->getValueType(ResNo: 0).isVector())
          return true;

        // If the use of a vector is a store, then combining via a v_perm
        // is beneficial.
        // TODO -- whitelist more uses
        for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
          if (VUser->getOpcode() == VectorwiseOp)
            return true;
      }
      return false;
    };

    if (!any_of(Range: N->users(), P: usesCombinedOperand))
      return SDValue();

    // ~0u means "operand is not a recognized byte-permute of a single src".
    uint32_t LHSMask = getPermuteMask(V: LHS);
    uint32_t RHSMask = getPermuteMask(V: RHS);

    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(a&: LHSMask, b&: RHSMask);
        std::swap(a&: LHS, b&: RHS);
      }

      // Select 0xc for each lane used from source operand. Zero has 0xc mask
      // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Kill zero bytes selected by other mask. Zero value is 0xc.
        LHSMask &= ~RHSUsedLanes;
        RHSMask &= ~LHSUsedLanes;
        // Add 4 to each active LHS lane
        LHSMask |= LHSUsedLanes & 0x04040404;
        // Combine masks
        uint32_t Sel = LHSMask | RHSMask;
        SDLoc DL(N);

        return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: LHS.getOperand(i: 0),
                           N2: RHS.getOperand(i: 0),
                           N3: DAG.getConstant(Val: Sel, DL, VT: MVT::i32));
      }
    }
    if (LHSMask == ~0u || RHSMask == ~0u) {
      // At least one side is not a simple permute; try the general
      // ByteProvider-based matcher instead.
      if (SDValue Perm = matchPERM(N, DCI))
        return Perm;
    }
  }

  // Detect identity v2i32 OR and replace with identity source node.
  // Specifically an Or that has operands constructed from the same source node
  // via extract_vector_elt and build_vector. I.E.
  // v2i32 or(
  //   v2i32 build_vector(
  //     i32 extract_elt(%IdentitySrc, 0),
  //     i32 0
  //   ),
  //   v2i32 build_vector(
  //     i32 0,
  //     i32 extract_elt(%IdentitySrc, 1)
  //   ) )
  // =>
  // v2i32 %IdentitySrc

  if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
      RHS->getOpcode() == ISD::BUILD_VECTOR) {

    ConstantSDNode *LC = dyn_cast<ConstantSDNode>(Val: LHS->getOperand(Num: 1));
    ConstantSDNode *RC = dyn_cast<ConstantSDNode>(Val: RHS->getOperand(Num: 0));

    // Test for and normalise build vectors.
    if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {

      // Get the extract_vector_element operands.
      SDValue LEVE = LHS->getOperand(Num: 0);
      SDValue REVE = RHS->getOperand(Num: 1);

      if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
        // Check that different elements from the same vector are
        // extracted.
        // NOTE(review): only same-source + distinct-indices is checked here;
        // the indices are not pinned to lanes 0 and 1 and the source's type
        // is not checked to be v2i32 — confirm the pattern guarantees this.
        if (LEVE->getOperand(Num: 0) == REVE->getOperand(Num: 0) &&
            LEVE->getOperand(Num: 1) != REVE->getOperand(Num: 1)) {
          SDValue IdentitySrc = LEVE.getOperand(i: 0);
          return IdentitySrc;
        }
      }
    }
  }

  if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
    return SDValue();

  // TODO: This could be a generic combine with a predicate for extracting the
  // high half of an integer being free.

  // (or i64:x, (zero_extend i32:y)) ->
  //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
  if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
      RHS.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(a&: LHS, b&: RHS);

  if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
    SDValue ExtSrc = RHS.getOperand(i: 0);
    EVT SrcVT = ExtSrc.getValueType();
    if (SrcVT == MVT::i32) {
      SDLoc SL(N);
      // The high half of the result is just the high half of x; only the
      // low halves need an actual 32-bit or.
      auto [LowLHS, HiBits] = split64BitValue(Op: LHS, DAG);
      SDValue LowOr = DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: LowLHS, N2: ExtSrc);

      DCI.AddToWorklist(N: LowOr.getNode());
      DCI.AddToWorklist(N: HiBits.getNode());

      SDValue Vec =
          DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL: SL, VT: MVT::v2i32, N1: LowOr, N2: HiBits);
      return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i64, Operand: Vec);
    }
  }

  // or i64:x, K -> split into two 32-bit ors when profitable.
  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
  if (CRHS) {
    if (SDValue Split = splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::OR,
                                                 LHS: N->getOperand(Num: 0), CRHS))
      return Split;
  }

  return SDValue();
}
14675
/// DAG combine for ISD::XOR: reassociate scalar ops, split 64-bit constant
/// xors, and push sign-mask xors (fneg patterns) through selects so they can
/// become source modifiers on CNDMASK.
SDValue SITargetLowering::performXorCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (SDValue RV = reassociateScalarOps(N, DAG&: DCI.DAG))
    return RV;

  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  const ConstantSDNode *CRHS = isConstOrConstSplat(N: RHS);
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(ResNo: 0);
  if (CRHS && VT == MVT::i64) {
    if (SDValue Split =
            splitBinaryBitConstantOp(DCI, SL: SDLoc(N), Opc: ISD::XOR, LHS, CRHS))
      return Split;
  }

  // v2i32 (xor (vselect cc, x, y), K) ->
  // (v2i32 vselect cc, (xor x, K), (xor y, K))
  // This enables the xor to be replaced with source modifiers when the
  // select is lowered to CNDMASK.
  unsigned Opc = LHS.getOpcode();
  if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
       (Opc == ISD::SELECT && VT == MVT::i64)) &&
      CRHS && CRHS->getAPIntValue().isSignMask()) {
    SDValue CC = LHS->getOperand(Num: 0);
    SDValue TRUE = LHS->getOperand(Num: 1);
    SDValue FALSE = LHS->getOperand(Num: 2);
    SDValue XTrue = DAG.getNode(Opcode: ISD::XOR, DL: SDLoc(N), VT, N1: TRUE, N2: RHS);
    SDValue XFalse = DAG.getNode(Opcode: ISD::XOR, DL: SDLoc(N), VT, N1: FALSE, N2: RHS);
    // NOTE(review): the scalar i64 SELECT case also emits ISD::VSELECT here —
    // confirm this is intended for the non-vector type.
    SDValue XSelect =
        DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT, N1: CC, N2: XTrue, N3: XFalse);
    return XSelect;
  }

  // Make sure to apply the 64-bit constant splitting fold before trying to fold
  // fneg-like xors into 64-bit select.
  if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
    // This looks like an fneg, try to fold as a source modifier.
    if (CRHS && CRHS->getAPIntValue().isSignMask() &&
        shouldFoldFNegIntoSrc(FNeg: N, FNegSrc: LHS)) {
      // xor (select c, a, b), 0x80000000 ->
      //   bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
      SDLoc DL(N);
      SDValue CastLHS =
          DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: LHS->getOperand(Num: 1));
      SDValue CastRHS =
          DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: LHS->getOperand(Num: 2));
      SDValue FNegLHS = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f32, Operand: CastLHS);
      SDValue FNegRHS = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::f32, Operand: CastRHS);
      SDValue NewSelect = DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::f32,
                                      N1: LHS->getOperand(Num: 0), N2: FNegLHS, N3: FNegRHS);
      return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewSelect);
    }
  }

  return SDValue();
}
14734
/// Try to fold a (zext/anyext i16 -> i32) into a single AMDGPUISD::PERM by
/// tracing each of the two low result bytes back to a byte of a 32-bit
/// source; the upper two bytes are zeroed via the 0x0c selector.
SDValue
SITargetLowering::performZeroOrAnyExtendCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  if (!Subtarget->has16BitInsts() ||
      DCI.getDAGCombineLevel() < AfterLegalizeTypes)
    return SDValue();

  EVT VT = N->getValueType(ResNo: 0);
  if (VT != MVT::i32)
    return SDValue();

  SDValue Src = N->getOperand(Num: 0);
  if (Src.getValueType() != MVT::i16)
    return SDValue();

  // Only fold when the extend is the sole consumer of the i16 value.
  if (!Src->hasOneUse())
    return SDValue();

  // TODO: We bail out below if SrcOffset is not in the first dword (>= 4). It's
  // possible we're missing out on some combine opportunities, but we'd need to
  // weigh the cost of extracting the byte from the upper dwords.

  std::optional<ByteProvider<SDValue>> BP0 =
      calculateByteProvider(Op: SDValue(N, 0), Index: 0, Depth: 0, StartingIndex: 0);
  if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src)
    return SDValue();
  SDValue V0 = *BP0->Src;

  std::optional<ByteProvider<SDValue>> BP1 =
      calculateByteProvider(Op: SDValue(N, 0), Index: 1, Depth: 0, StartingIndex: 1);
  if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src)
    return SDValue();

  SDValue V1 = *BP1->Src;

  // Bail when both bytes trace to the same source node (presumably not
  // profitable / handled by other combines).
  if (V0 == V1)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  // Start with every byte zeroed (0x0c selects constant zero), then patch in
  // the selector for byte 0 (from the first PERM operand, offset +4) and
  // byte 1 (from the second operand).
  uint32_t PermMask = 0x0c0c0c0c;
  if (V0) {
    V0 = DAG.getBitcastedAnyExtOrTrunc(Op: V0, DL, VT: MVT::i32);
    PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4);
  }

  if (V1) {
    V1 = DAG.getBitcastedAnyExtOrTrunc(Op: V1, DL, VT: MVT::i32);
    PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8);
  }

  return DAG.getNode(Opcode: AMDGPUISD::PERM, DL, VT: MVT::i32, N1: V0, N2: V1,
                     N3: DAG.getConstant(Val: PermMask, DL, VT: MVT::i32));
}
14789
/// Fold sign_extend_inreg of an unsigned subword buffer load into the
/// corresponding signed-load node (s_buffer_load_i8/i16 or
/// buffer_load_byte/short), so the hardware performs the sign extension.
SDValue
SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue Src = N->getOperand(Num: 0);
  auto *VTSign = cast<VTSDNode>(Val: N->getOperand(Num: 1));

  // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
  // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
  if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16))) {
    assert(Subtarget->hasScalarSubwordLoads() &&
           "s_buffer_load_{u8, i8} are supported "
           "in GFX12 (or newer) architectures.");
    EVT VT = Src.getValueType();
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
                       ? AMDGPUISD::SBUFFER_LOAD_BYTE
                       : AMDGPUISD::SBUFFER_LOAD_SHORT;
    SDLoc DL(N);
    SDVTList ResList = DCI.DAG.getVTList(VT: MVT::i32);
    SDValue Ops[] = {
        Src.getOperand(i: 0), // source register
        Src.getOperand(i: 1), // offset
        Src.getOperand(i: 2)  // cachePolicy
    };
    auto *M = cast<MemSDNode>(Val&: Src);
    // New node produces i32; truncate back to the original load's type.
    SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
        Opcode: Opc, dl: DL, VTList: ResList, Ops, MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
    SDValue LoadVal = DCI.DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: BufferLoad);
    return LoadVal;
  }
  if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16)) &&
      Src.hasOneUse()) {
    auto *M = cast<MemSDNode>(Val&: Src);
    SDValue Ops[] = {Src.getOperand(i: 0), // Chain
                     Src.getOperand(i: 1), // rsrc
                     Src.getOperand(i: 2), // vindex
                     Src.getOperand(i: 3), // voffset
                     Src.getOperand(i: 4), // soffset
                     Src.getOperand(i: 5), // offset
                     Src.getOperand(i: 6), Src.getOperand(i: 7)};
    // replace with BUFFER_LOAD_BYTE/SHORT
    SDVTList ResList =
        DCI.DAG.getVTList(VT1: MVT::i32, VT2: Src.getOperand(i: 0).getValueType());
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
                       ? AMDGPUISD::BUFFER_LOAD_BYTE
                       : AMDGPUISD::BUFFER_LOAD_SHORT;
    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
        Opcode: Opc, dl: SDLoc(N), VTList: ResList, Ops, MemVT: M->getMemoryVT(), MMO: M->getMemOperand());
    // Return both the loaded value and the chain.
    return DCI.DAG.getMergeValues(
        Ops: {BufferLoadSignExt, BufferLoadSignExt.getValue(R: 1)}, dl: SDLoc(N));
  }
  return SDValue();
}
14848
14849SDValue SITargetLowering::performClassCombine(SDNode *N,
14850 DAGCombinerInfo &DCI) const {
14851 SelectionDAG &DAG = DCI.DAG;
14852 SDValue Mask = N->getOperand(Num: 1);
14853
14854 // fp_class x, 0 -> false
14855 if (isNullConstant(V: Mask))
14856 return DAG.getConstant(Val: 0, DL: SDLoc(N), VT: MVT::i1);
14857
14858 if (N->getOperand(Num: 0).isUndef())
14859 return DAG.getUNDEF(VT: MVT::i1);
14860
14861 return SDValue();
14862}
14863
14864SDValue SITargetLowering::performRcpCombine(SDNode *N,
14865 DAGCombinerInfo &DCI) const {
14866 EVT VT = N->getValueType(ResNo: 0);
14867 SDValue N0 = N->getOperand(Num: 0);
14868
14869 if (N0.isUndef()) {
14870 return DCI.DAG.getConstantFP(Val: APFloat::getQNaN(Sem: VT.getFltSemantics()),
14871 DL: SDLoc(N), VT);
14872 }
14873
14874 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14875 N0.getOpcode() == ISD::SINT_TO_FP)) {
14876 return DCI.DAG.getNode(Opcode: AMDGPUISD::RCP_IFLAG, DL: SDLoc(N), VT, Operand: N0,
14877 Flags: N->getFlags());
14878 }
14879
14880 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14881 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14882 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14883 return DCI.DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SDLoc(N), VT, Operand: N0.getOperand(i: 0),
14884 Flags: N->getFlags());
14885 }
14886
14887 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
14888}
14889
14890bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14891 SDNodeFlags UserFlags,
14892 unsigned MaxDepth) const {
14893 unsigned Opcode = Op.getOpcode();
14894 if (Opcode == ISD::FCANONICALIZE)
14895 return true;
14896
14897 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
14898 const auto &F = CFP->getValueAPF();
14899 if (F.isNaN() && F.isSignaling())
14900 return false;
14901 if (!F.isDenormal())
14902 return true;
14903
14904 DenormalMode Mode =
14905 DAG.getMachineFunction().getDenormalMode(FPType: F.getSemantics());
14906 return Mode == DenormalMode::getIEEE();
14907 }
14908
14909 // If source is a result of another standard FP operation it is already in
14910 // canonical form.
14911 if (MaxDepth == 0)
14912 return false;
14913
14914 switch (Opcode) {
14915 // These will flush denorms if required.
14916 case ISD::FADD:
14917 case ISD::FSUB:
14918 case ISD::FMUL:
14919 case ISD::FCEIL:
14920 case ISD::FFLOOR:
14921 case ISD::FMA:
14922 case ISD::FMAD:
14923 case ISD::FSQRT:
14924 case ISD::FDIV:
14925 case ISD::FREM:
14926 case ISD::FP_ROUND:
14927 case ISD::FP_EXTEND:
14928 case ISD::FP16_TO_FP:
14929 case ISD::FP_TO_FP16:
14930 case ISD::BF16_TO_FP:
14931 case ISD::FP_TO_BF16:
14932 case ISD::FLDEXP:
14933 case AMDGPUISD::FMUL_LEGACY:
14934 case AMDGPUISD::FMAD_FTZ:
14935 case AMDGPUISD::RCP:
14936 case AMDGPUISD::RSQ:
14937 case AMDGPUISD::RSQ_CLAMP:
14938 case AMDGPUISD::RCP_LEGACY:
14939 case AMDGPUISD::RCP_IFLAG:
14940 case AMDGPUISD::LOG:
14941 case AMDGPUISD::EXP:
14942 case AMDGPUISD::DIV_SCALE:
14943 case AMDGPUISD::DIV_FMAS:
14944 case AMDGPUISD::DIV_FIXUP:
14945 case AMDGPUISD::FRACT:
14946 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14947 case AMDGPUISD::CVT_F32_UBYTE0:
14948 case AMDGPUISD::CVT_F32_UBYTE1:
14949 case AMDGPUISD::CVT_F32_UBYTE2:
14950 case AMDGPUISD::CVT_F32_UBYTE3:
14951 case AMDGPUISD::FP_TO_FP16:
14952 case AMDGPUISD::SIN_HW:
14953 case AMDGPUISD::COS_HW:
14954 return true;
14955
14956 // It can/will be lowered or combined as a bit operation.
14957 // Need to check their input recursively to handle.
14958 case ISD::FNEG:
14959 case ISD::FABS:
14960 case ISD::FCOPYSIGN:
14961 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), UserFlags: MaxDepth - 1);
14962
14963 case ISD::AND:
14964 if (Op.getValueType() == MVT::i32) {
14965 // Be careful as we only know it is a bitcast floating point type. It
14966 // could be f32, v2f16, we have no way of knowing. Luckily the constant
14967 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14968 // is valid to optimize for all types.
14969 if (auto *RHS = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
14970 if (RHS->getZExtValue() == 0xffff0000) {
14971 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), UserFlags: MaxDepth - 1);
14972 }
14973 }
14974 }
14975 break;
14976
14977 case ISD::FSIN:
14978 case ISD::FCOS:
14979 case ISD::FSINCOS:
14980 return Op.getValueType().getScalarType() != MVT::f16;
14981
14982 case ISD::FMINNUM:
14983 case ISD::FMAXNUM:
14984 case ISD::FMINNUM_IEEE:
14985 case ISD::FMAXNUM_IEEE:
14986 case ISD::FMINIMUM:
14987 case ISD::FMAXIMUM:
14988 case ISD::FMINIMUMNUM:
14989 case ISD::FMAXIMUMNUM:
14990 case AMDGPUISD::CLAMP:
14991 case AMDGPUISD::FMED3:
14992 case AMDGPUISD::FMAX3:
14993 case AMDGPUISD::FMIN3:
14994 case AMDGPUISD::FMAXIMUM3:
14995 case AMDGPUISD::FMINIMUM3: {
14996 // FIXME: Shouldn't treat the generic operations different based these.
14997 // However, we aren't really required to flush the result from
14998 // minnum/maxnum..
14999
15000 // snans will be quieted, so we only need to worry about denormals.
15001 if (Subtarget->supportsMinMaxDenormModes() ||
15002 // FIXME: denormalsEnabledForType is broken for dynamic
15003 denormalsEnabledForType(DAG, VT: Op.getValueType()))
15004 return true;
15005
15006 // Flushing may be required.
15007 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
15008 // targets need to check their input recursively.
15009
15010 // FIXME: Does this apply with clamp? It's implemented with max.
15011 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
15012 if (!isCanonicalized(DAG, Op: Op.getOperand(i: I), UserFlags: MaxDepth - 1))
15013 return false;
15014 }
15015
15016 return true;
15017 }
15018 case ISD::SELECT: {
15019 return isCanonicalized(DAG, Op: Op.getOperand(i: 1), UserFlags: MaxDepth - 1) &&
15020 isCanonicalized(DAG, Op: Op.getOperand(i: 2), UserFlags: MaxDepth - 1);
15021 }
15022 case ISD::BUILD_VECTOR: {
15023 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
15024 SDValue SrcOp = Op.getOperand(i);
15025 if (!isCanonicalized(DAG, Op: SrcOp, UserFlags: MaxDepth - 1))
15026 return false;
15027 }
15028
15029 return true;
15030 }
15031 case ISD::EXTRACT_VECTOR_ELT:
15032 case ISD::EXTRACT_SUBVECTOR: {
15033 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), UserFlags: MaxDepth - 1);
15034 }
15035 case ISD::INSERT_VECTOR_ELT: {
15036 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), UserFlags: MaxDepth - 1) &&
15037 isCanonicalized(DAG, Op: Op.getOperand(i: 1), UserFlags: MaxDepth - 1);
15038 }
15039 case ISD::UNDEF:
15040 // Could be anything.
15041 return false;
15042
15043 case ISD::BITCAST:
15044 // TODO: This is incorrect as it loses track of the operand's type. We may
15045 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
15046 // same bits that are canonicalized in one type need not be in the other.
15047 return isCanonicalized(DAG, Op: Op.getOperand(i: 0), UserFlags: MaxDepth - 1);
15048 case ISD::TRUNCATE: {
15049 // Hack round the mess we make when legalizing extract_vector_elt
15050 if (Op.getValueType() == MVT::i16) {
15051 SDValue TruncSrc = Op.getOperand(i: 0);
15052 if (TruncSrc.getValueType() == MVT::i32 &&
15053 TruncSrc.getOpcode() == ISD::BITCAST &&
15054 TruncSrc.getOperand(i: 0).getValueType() == MVT::v2f16) {
15055 return isCanonicalized(DAG, Op: TruncSrc.getOperand(i: 0), UserFlags: MaxDepth - 1);
15056 }
15057 }
15058 return false;
15059 }
15060 case ISD::INTRINSIC_WO_CHAIN: {
15061 unsigned IntrinsicID = Op.getConstantOperandVal(i: 0);
15062 // TODO: Handle more intrinsics
15063 switch (IntrinsicID) {
15064 case Intrinsic::amdgcn_cvt_pkrtz:
15065 case Intrinsic::amdgcn_cubeid:
15066 case Intrinsic::amdgcn_frexp_mant:
15067 case Intrinsic::amdgcn_fdot2:
15068 case Intrinsic::amdgcn_rcp:
15069 case Intrinsic::amdgcn_rsq:
15070 case Intrinsic::amdgcn_rsq_clamp:
15071 case Intrinsic::amdgcn_rcp_legacy:
15072 case Intrinsic::amdgcn_rsq_legacy:
15073 case Intrinsic::amdgcn_trig_preop:
15074 case Intrinsic::amdgcn_tanh:
15075 case Intrinsic::amdgcn_log:
15076 case Intrinsic::amdgcn_exp2:
15077 case Intrinsic::amdgcn_sqrt:
15078 return true;
15079 default:
15080 break;
15081 }
15082
15083 break;
15084 }
15085 default:
15086 break;
15087 }
15088
15089 // FIXME: denormalsEnabledForType is broken for dynamic
15090 return denormalsEnabledForType(DAG, VT: Op.getValueType()) &&
15091 (UserFlags.hasNoNaNs() || DAG.isKnownNeverSNaN(Op));
15092}
15093
/// GlobalISel counterpart of isCanonicalized: return true if the value in
/// \p Reg is known to already be in canonical FP form, walking the defining
/// instructions up to \p MaxDepth levels.
bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
                                       unsigned MaxDepth) const {
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineInstr *MI = MRI.getVRegDef(Reg);
  unsigned Opcode = MI->getOpcode();

  if (Opcode == AMDGPU::G_FCANONICALIZE)
    return true;

  std::optional<FPValueAndVReg> FCR;
  // Constant splat (can be padded with undef) or scalar constant.
  if (mi_match(R: Reg, MRI, P: MIPatternMatch::m_GFCstOrSplat(FPValReg&: FCR))) {
    if (FCR->Value.isSignaling())
      return false;
    if (!FCR->Value.isDenormal())
      return true;

    // Denormal constants are only canonical when they are not flushed.
    DenormalMode Mode = MF.getDenormalMode(FPType: FCR->Value.getSemantics());
    return Mode == DenormalMode::getIEEE();
  }

  if (MaxDepth == 0)
    return false;

  switch (Opcode) {
  // These operations flush denorms if required, so their results are
  // canonical by construction.
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case AMDGPU::G_FPOW:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FLOG:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FLOG10:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
    return true;
  // Sign-bit operations preserve the input bits, so recurse on the source.
  case AMDGPU::G_FNEG:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FCOPYSIGN:
    return isCanonicalized(Reg: MI->getOperand(i: 1).getReg(), MF, MaxDepth: MaxDepth - 1);
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_FMINIMUMNUM:
  case AMDGPU::G_FMAXIMUMNUM: {
    if (Subtarget->supportsMinMaxDenormModes() ||
        // FIXME: denormalsEnabledForType is broken for dynamic
        denormalsEnabledForType(Ty: MRI.getType(Reg), MF))
      return true;

    // Min/max may not flush on this target: all inputs must be canonical,
    // which the G_BUILD_VECTOR operand walk below also covers.
    [[fallthrough]];
  }
  case AMDGPU::G_BUILD_VECTOR:
    for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI->operands()))
      if (!isCanonicalized(Reg: MO.getReg(), MF, MaxDepth: MaxDepth - 1))
        return false;
    return true;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT:
    switch (cast<GIntrinsic>(Val: MI)->getIntrinsicID()) {
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_div_scale:
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_tanh:
      return true;
    default:
      break;
    }

    [[fallthrough]];
  default:
    // Unknown producer: conservatively assume not canonicalized.
    return false;
  }

  llvm_unreachable("invalid operation");
}
15212
15213// Constant fold canonicalize.
15214SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
15215 const SDLoc &SL, EVT VT,
15216 const APFloat &C) const {
15217 // Flush denormals to 0 if not enabled.
15218 if (C.isDenormal()) {
15219 DenormalMode Mode =
15220 DAG.getMachineFunction().getDenormalMode(FPType: C.getSemantics());
15221 if (Mode == DenormalMode::getPreserveSign()) {
15222 return DAG.getConstantFP(
15223 Val: APFloat::getZero(Sem: C.getSemantics(), Negative: C.isNegative()), DL: SL, VT);
15224 }
15225
15226 if (Mode != DenormalMode::getIEEE())
15227 return SDValue();
15228 }
15229
15230 if (C.isNaN()) {
15231 APFloat CanonicalQNaN = APFloat::getQNaN(Sem: C.getSemantics());
15232 if (C.isSignaling()) {
15233 // Quiet a signaling NaN.
15234 // FIXME: Is this supposed to preserve payload bits?
15235 return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT);
15236 }
15237
15238 // Make sure it is the canonical NaN bitpattern.
15239 //
15240 // TODO: Can we use -1 as the canonical NaN value since it's an inline
15241 // immediate?
15242 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
15243 return DAG.getConstantFP(Val: CanonicalQNaN, DL: SL, VT);
15244 }
15245
15246 // Already canonical.
15247 return DAG.getConstantFP(Val: C, DL: SL, VT);
15248}
15249
15250static bool vectorEltWillFoldAway(SDValue Op) {
15251 return Op.isUndef() || isa<ConstantFPSDNode>(Val: Op);
15252}
15253
15254SDValue
15255SITargetLowering::performFCanonicalizeCombine(SDNode *N,
15256 DAGCombinerInfo &DCI) const {
15257 SelectionDAG &DAG = DCI.DAG;
15258 SDValue N0 = N->getOperand(Num: 0);
15259 EVT VT = N->getValueType(ResNo: 0);
15260
15261 // fcanonicalize undef -> qnan
15262 if (N0.isUndef()) {
15263 APFloat QNaN = APFloat::getQNaN(Sem: VT.getFltSemantics());
15264 return DAG.getConstantFP(Val: QNaN, DL: SDLoc(N), VT);
15265 }
15266
15267 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N: N0)) {
15268 EVT VT = N->getValueType(ResNo: 0);
15269 return getCanonicalConstantFP(DAG, SL: SDLoc(N), VT, C: CFP->getValueAPF());
15270 }
15271
15272 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
15273 // (fcanonicalize k)
15274 //
15275 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
15276
15277 // TODO: This could be better with wider vectors that will be split to v2f16,
15278 // and to consider uses since there aren't that many packed operations.
15279 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
15280 isTypeLegal(VT: MVT::v2f16)) {
15281 SDLoc SL(N);
15282 SDValue NewElts[2];
15283 SDValue Lo = N0.getOperand(i: 0);
15284 SDValue Hi = N0.getOperand(i: 1);
15285 EVT EltVT = Lo.getValueType();
15286
15287 if (vectorEltWillFoldAway(Op: Lo) || vectorEltWillFoldAway(Op: Hi)) {
15288 for (unsigned I = 0; I != 2; ++I) {
15289 SDValue Op = N0.getOperand(i: I);
15290 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
15291 NewElts[I] =
15292 getCanonicalConstantFP(DAG, SL, VT: EltVT, C: CFP->getValueAPF());
15293 } else if (Op.isUndef()) {
15294 // Handled below based on what the other operand is.
15295 NewElts[I] = Op;
15296 } else {
15297 NewElts[I] = DAG.getNode(Opcode: ISD::FCANONICALIZE, DL: SL, VT: EltVT, Operand: Op);
15298 }
15299 }
15300
15301 // If one half is undef, and one is constant, prefer a splat vector rather
15302 // than the normal qNaN. If it's a register, prefer 0.0 since that's
15303 // cheaper to use and may be free with a packed operation.
15304 if (NewElts[0].isUndef()) {
15305 if (isa<ConstantFPSDNode>(Val: NewElts[1]))
15306 NewElts[0] = isa<ConstantFPSDNode>(Val: NewElts[1])
15307 ? NewElts[1]
15308 : DAG.getConstantFP(Val: 0.0f, DL: SL, VT: EltVT);
15309 }
15310
15311 if (NewElts[1].isUndef()) {
15312 NewElts[1] = isa<ConstantFPSDNode>(Val: NewElts[0])
15313 ? NewElts[0]
15314 : DAG.getConstantFP(Val: 0.0f, DL: SL, VT: EltVT);
15315 }
15316
15317 return DAG.getBuildVector(VT, DL: SL, Ops: NewElts);
15318 }
15319 }
15320
15321 return SDValue();
15322}
15323
15324static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
15325 switch (Opc) {
15326 case ISD::FMAXNUM:
15327 case ISD::FMAXNUM_IEEE:
15328 case ISD::FMAXIMUMNUM:
15329 return AMDGPUISD::FMAX3;
15330 case ISD::FMAXIMUM:
15331 return AMDGPUISD::FMAXIMUM3;
15332 case ISD::SMAX:
15333 return AMDGPUISD::SMAX3;
15334 case ISD::UMAX:
15335 return AMDGPUISD::UMAX3;
15336 case ISD::FMINNUM:
15337 case ISD::FMINNUM_IEEE:
15338 case ISD::FMINIMUMNUM:
15339 return AMDGPUISD::FMIN3;
15340 case ISD::FMINIMUM:
15341 return AMDGPUISD::FMINIMUM3;
15342 case ISD::SMIN:
15343 return AMDGPUISD::SMIN3;
15344 case ISD::UMIN:
15345 return AMDGPUISD::UMIN3;
15346 default:
15347 llvm_unreachable("Not a min/max opcode");
15348 }
15349}
15350
15351SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
15352 const SDLoc &SL, SDValue Src,
15353 SDValue MinVal,
15354 SDValue MaxVal,
15355 bool Signed) const {
15356
15357 // med3 comes from
15358 // min(max(x, K0), K1), K0 < K1
15359 // max(min(x, K0), K1), K1 < K0
15360 //
15361 // "MinVal" and "MaxVal" respectively refer to the rhs of the
15362 // min/max op.
15363 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(Val&: MinVal);
15364 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(Val&: MaxVal);
15365
15366 if (!MinK || !MaxK)
15367 return SDValue();
15368
15369 if (Signed) {
15370 if (MaxK->getAPIntValue().sge(RHS: MinK->getAPIntValue()))
15371 return SDValue();
15372 } else {
15373 if (MaxK->getAPIntValue().uge(RHS: MinK->getAPIntValue()))
15374 return SDValue();
15375 }
15376
15377 EVT VT = MinK->getValueType(ResNo: 0);
15378 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15379 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15380 return DAG.getNode(Opcode: Med3Opc, DL: SL, VT, N1: Src, N2: MaxVal, N3: MinVal);
15381
15382 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
15383 // not available, but this is unlikely to be profitable as constants
15384 // will often need to be materialized & extended, especially on
15385 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
15386 return SDValue();
15387}
15388
15389static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
15390 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op))
15391 return C;
15392
15393 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Val&: Op)) {
15394 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
15395 return C;
15396 }
15397
15398 return nullptr;
15399}
15400
/// Try to fold min(max(x, K0), K1) (with splat FP constants K0 <= K1) into
/// a clamp or an FMED3 node. \p Op0 is the inner max, \p Op1 the outer min's
/// constant operand. Returns an empty SDValue if the fold does not apply.
SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
                                                  const SDLoc &SL, SDValue Op0,
                                                  SDValue Op1,
                                                  bool IsKnownNoNaNs) const {
  // Outer min's constant (K1).
  ConstantFPSDNode *K1 = getSplatConstantFP(Op: Op1);
  if (!K1)
    return SDValue();

  // Inner max's constant (K0).
  ConstantFPSDNode *K0 = getSplatConstantFP(Op: Op0.getOperand(i: 1));
  if (!K0)
    return SDValue();

  // Ordered >= (although NaN inputs should have folded away by now).
  if (K0->getValueAPF() > K1->getValueAPF())
    return SDValue();

  // med3 with a nan input acts like
  // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
  //
  // So the result depends on whether the IEEE mode bit is enabled or not with a
  // signaling nan input.
  // ieee=1
  // s0 snan: yields s2
  // s1 snan: yields s2
  // s2 snan: qnan

  // s0 qnan: min(s1, s2)
  // s1 qnan: min(s0, s2)
  // s2 qnan: min(s0, s1)

  // ieee=0
  // s0 snan: min(s1, s2)
  // s1 snan: min(s0, s2)
  // s2 snan: qnan

  // s0 qnan: min(s1, s2)
  // s1 qnan: min(s0, s2)
  // s2 qnan: min(s0, s1)
  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
  // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
  // can only form if op0 is fmaxnum_ieee if IEEE=1.
  EVT VT = Op0.getValueType();
  if (Info->getMode().DX10Clamp) {
    // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
    // hardware fmed3 behavior converting to a min.
    // FIXME: Should this be allowing -0.0?
    if (K1->isExactlyValue(V: 1.0) && K0->isExactlyValue(V: 0.0))
      return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Op0.getOperand(i: 0));
  }

  // med3 for f16 is only available on gfx9+, and not available for v2f16.
  if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
    // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
    // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
    // then give the other result, which is different from med3 with a NaN
    // input.
    SDValue Var = Op0.getOperand(i: 0);
    if (!IsKnownNoNaNs && !DAG.isKnownNeverSNaN(Op: Var))
      return SDValue();

    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

    // Only form fmed3 if each constant is either inline or has other uses
    // (so the literal would be materialized anyway).
    if ((!K0->hasOneUse() || TII->isInlineConstant(Imm: K0->getValueAPF())) &&
        (!K1->hasOneUse() || TII->isInlineConstant(Imm: K1->getValueAPF()))) {
      return DAG.getNode(Opcode: AMDGPUISD::FMED3, DL: SL, VT: K0->getValueType(ResNo: 0), N1: Var,
                         N2: SDValue(K0, 0), N3: SDValue(K1, 0));
    }
  }

  return SDValue();
}
15475
15476/// \return true if the subtarget supports minimum3 and maximum3 with the given
15477/// base min/max opcode \p Opc for type \p VT.
15478static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
15479 EVT VT) {
15480 switch (Opc) {
15481 case ISD::FMINNUM:
15482 case ISD::FMAXNUM:
15483 case ISD::FMINNUM_IEEE:
15484 case ISD::FMAXNUM_IEEE:
15485 case ISD::FMINIMUMNUM:
15486 case ISD::FMAXIMUMNUM:
15487 case AMDGPUISD::FMIN_LEGACY:
15488 case AMDGPUISD::FMAX_LEGACY:
15489 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
15490 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15491 case ISD::FMINIMUM:
15492 case ISD::FMAXIMUM:
15493 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
15494 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
15495 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15496 case ISD::SMAX:
15497 case ISD::SMIN:
15498 case ISD::UMAX:
15499 case ISD::UMIN:
15500 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
15501 default:
15502 return false;
15503 }
15504
15505 llvm_unreachable("not a min/max opcode");
15506}
15507
/// Combine nested min/max into min3/max3, min-of-max (or max-of-min) with
/// constants into med3, and relax fminimum/fmaximum to the cheaper
/// *_IEEE forms when NaNs are known absent.
SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(ResNo: 0);
  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(Num: 0);
  SDValue Op1 = N->getOperand(Num: 1);

  // Only do this if the inner op has one use since this will just increase
  // register pressure for no benefit.

  if (supportsMin3Max3(Subtarget: *Subtarget, Opc, VT)) {
    // max(max(a, b), c) -> max3(a, b, c)
    // min(min(a, b), c) -> min3(a, b, c)
    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc), DL, VT: N->getValueType(ResNo: 0),
                         N1: Op0.getOperand(i: 0), N2: Op0.getOperand(i: 1), N3: Op1);
    }

    // Try commuted.
    // max(a, max(b, c)) -> max3(a, b, c)
    // min(a, min(b, c)) -> min3(a, b, c)
    if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(Opcode: minMaxOpcToMin3Max3Opc(Opc), DL, VT: N->getValueType(ResNo: 0),
                         N1: Op0, N2: Op1.getOperand(i: 0), N3: Op1.getOperand(i: 1));
    }
  }

  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
  // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
  // Four variants: {smin-of-smax, smax-of-smin, umin-of-umax, umax-of-umin};
  // performIntMed3ImmCombine validates the constant ordering.
  if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op1, MaxVal: Op0->getOperand(Num: 1), Signed: true))
      return Med3;
  }
  if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op0->getOperand(Num: 1), MaxVal: Op1, Signed: true))
      return Med3;
  }

  if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op1, MaxVal: Op0->getOperand(Num: 1), Signed: false))
      return Med3;
  }
  if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SL: SDLoc(N), Src: Op0->getOperand(Num: 0), MinVal: Op0->getOperand(Num: 1), MaxVal: Op1, Signed: false))
      return Med3;
  }

  // if !is_snan(x):
  //   fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
  //   fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
  //   fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
  //   fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
  // The min/max flavors must match, and the type must be one the fmed3 path
  // can handle.
  if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
       (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
       (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
       (Opc == AMDGPUISD::FMIN_LEGACY &&
        Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
      (VT == MVT::f32 || VT == MVT::f64 ||
       (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
       (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
       (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
       (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
      Op0.hasOneUse()) {
    if (SDValue Res = performFPMed3ImmCombine(DAG, SL: SDLoc(N), Op0, Op1,
                                              IsKnownNoNaNs: N->getFlags().hasNoNaNs()))
      return Res;
  }

  // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
  // for some types, but at a higher cost since it's implemented with a 3
  // operand form.
  const SDNodeFlags Flags = N->getFlags();
  if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) && Flags.hasNoNaNs() &&
      !Subtarget->hasIEEEMinimumMaximumInsts() &&
      isOperationLegal(Op: ISD::FMINNUM_IEEE, VT: VT.getScalarType())) {
    unsigned NewOpc =
        Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
    return DAG.getNode(Opcode: NewOpc, DL: SDLoc(N), VT, N1: Op0, N2: Op1, Flags);
  }

  return SDValue();
}
15598
15599static bool isClampZeroToOne(SDValue A, SDValue B) {
15600 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(Val&: A)) {
15601 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(Val&: B)) {
15602 // FIXME: Should this be allowing -0.0?
15603 return (CA->isExactlyValue(V: 0.0) && CB->isExactlyValue(V: 1.0)) ||
15604 (CA->isExactlyValue(V: 1.0) && CB->isExactlyValue(V: 0.0));
15605 }
15606 }
15607
15608 return false;
15609}
15610
// FIXME: Should only worry about snans for version with chain.
/// Fold an FMED3 node whose operands include the {0.0, 1.0} pair into a
/// CLAMP of the remaining operand, reordering constant operands first when
/// DX10 clamp mode makes that safe.
SDValue SITargetLowering::performFMed3Combine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(ResNo: 0);
  // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
  // NaNs. With a NaN input, the order of the operands may change the result.

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue Src0 = N->getOperand(Num: 0);
  SDValue Src1 = N->getOperand(Num: 1);
  SDValue Src2 = N->getOperand(Num: 2);

  if (isClampZeroToOne(A: Src0, B: Src1)) {
    // const_a, const_b, x -> clamp is safe in all cases including signaling
    // nans.
    // FIXME: Should this be allowing -0.0?
    return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src2);
  }

  const MachineFunction &MF = DAG.getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
  // handling no dx10-clamp?
  if (Info->getMode().DX10Clamp) {
    // If NaNs is clamped to 0, we are free to reorder the inputs.

    // The three swaps below bubble any FP-constant operands toward the front
    // (Src0, Src1) so the isClampZeroToOne check can find the constant pair.
    if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1))
      std::swap(a&: Src0, b&: Src1);

    if (isa<ConstantFPSDNode>(Val: Src1) && !isa<ConstantFPSDNode>(Val: Src2))
      std::swap(a&: Src1, b&: Src2);

    if (isa<ConstantFPSDNode>(Val: Src0) && !isa<ConstantFPSDNode>(Val: Src1))
      std::swap(a&: Src0, b&: Src1);

    if (isClampZeroToOne(A: Src1, B: Src2))
      return DAG.getNode(Opcode: AMDGPUISD::CLAMP, DL: SL, VT, Operand: Src0);
  }

  return SDValue();
}
15655
15656SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15657 DAGCombinerInfo &DCI) const {
15658 SDValue Src0 = N->getOperand(Num: 0);
15659 SDValue Src1 = N->getOperand(Num: 1);
15660 if (Src0.isUndef() && Src1.isUndef())
15661 return DCI.DAG.getUNDEF(VT: N->getValueType(ResNo: 0));
15662 return SDValue();
15663}
15664
15665// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15666// expanded into a set of cmp/select instructions.
15667bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
15668 unsigned NumElem,
15669 bool IsDivergentIdx,
15670 const GCNSubtarget *Subtarget) {
15671 if (UseDivergentRegisterIndexing)
15672 return false;
15673
15674 unsigned VecSize = EltSize * NumElem;
15675
15676 // Sub-dword vectors of size 2 dword or less have better implementation.
15677 if (VecSize <= 64 && EltSize < 32)
15678 return false;
15679
15680 // Always expand the rest of sub-dword instructions, otherwise it will be
15681 // lowered via memory.
15682 if (EltSize < 32)
15683 return true;
15684
15685 // Always do this if var-idx is divergent, otherwise it will become a loop.
15686 if (IsDivergentIdx)
15687 return true;
15688
15689 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15690 unsigned NumInsts = NumElem /* Number of compares */ +
15691 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
15692
15693 // On some architectures (GFX9) movrel is not available and it's better
15694 // to expand.
15695 if (Subtarget->useVGPRIndexMode())
15696 return NumInsts <= 16;
15697
15698 // If movrel is available, use it instead of expanding for vector of 8
15699 // elements.
15700 if (Subtarget->hasMovrel())
15701 return NumInsts <= 15;
15702
15703 return true;
15704}
15705
15706bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
15707 SDValue Idx = N->getOperand(Num: N->getNumOperands() - 1);
15708 if (isa<ConstantSDNode>(Val: Idx))
15709 return false;
15710
15711 SDValue Vec = N->getOperand(Num: 0);
15712 EVT VecVT = Vec.getValueType();
15713 EVT EltVT = VecVT.getVectorElementType();
15714 unsigned EltSize = EltVT.getSizeInBits();
15715 unsigned NumElem = VecVT.getVectorNumElements();
15716
15717 return SITargetLowering::shouldExpandVectorDynExt(
15718 EltSize, NumElem, IsDivergentIdx: Idx->isDivergent(), Subtarget: getSubtarget());
15719}
15720
/// Combines for EXTRACT_VECTOR_ELT: push fneg/fabs through the extract,
/// scalarize extracts of vector binops, expand divergent dynamic indexing
/// into selects, fold extracts of bitcast 64-bit constants, and widen
/// sub-dword extracts of loaded vectors to 32-bit extracts.
SDValue
SITargetLowering::performExtractVectorEltCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(Num: 0);
  SelectionDAG &DAG = DCI.DAG;

  EVT VecVT = Vec.getValueType();
  EVT VecEltVT = VecVT.getVectorElementType();
  EVT ResVT = N->getValueType(ResNo: 0);

  unsigned VecSize = VecVT.getSizeInBits();
  unsigned VecEltSize = VecEltVT.getSizeInBits();

  // extract_vector_elt (fneg/fabs v), i -> fneg/fabs (extract_vector_elt v, i)
  // Only profitable when all users can absorb the source modifier.
  if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
      allUsesHaveSourceMods(N)) {
    SDLoc SL(N);
    SDValue Idx = N->getOperand(Num: 1);
    SDValue Elt =
        DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec.getOperand(i: 0), N2: Idx);
    return DAG.getNode(Opcode: Vec.getOpcode(), DL: SL, VT: ResVT, Operand: Elt);
  }

  // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
  // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
  // There are optimisations to transform 64-bit shifts into 32-bit shifts
  // depending on the shift operand. See e.g. performSraCombine().
  // This combine ensures that the optimisation is compatible with v2i32
  // legalised AND.
  if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
      Vec->getOperand(Num: 1)->getOpcode() == ISD::BUILD_VECTOR) {

    const ConstantSDNode *C = isConstOrConstSplat(N: Vec.getOperand(i: 1));
    if (!C || C->getZExtValue() != 0x1f)
      return SDValue();

    SDLoc SL(N);
    SDValue AndMask = DAG.getConstant(Val: 0x1f, DL: SL, VT: MVT::i32);
    SDValue EVE = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32,
                              N1: Vec->getOperand(Num: 0), N2: N->getOperand(Num: 1));
    SDValue A = DAG.getNode(Opcode: ISD::AND, DL: SL, VT: MVT::i32, N1: EVE, N2: AndMask);
    DAG.ReplaceAllUsesWith(From: N, To: A.getNode());
    // NOTE(review): this path falls through without returning A; presumably
    // intentional since RAUW already rewired N's uses — confirm.
  }

  // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
  // =>
  // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
  // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
  // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
  if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
    SDLoc SL(N);
    SDValue Idx = N->getOperand(Num: 1);
    unsigned Opc = Vec.getOpcode();

    switch (Opc) {
    default:
      break;
    // TODO: Support other binary operations.
    case ISD::FADD:
    case ISD::FSUB:
    case ISD::FMUL:
    case ISD::ADD:
    case ISD::UMIN:
    case ISD::UMAX:
    case ISD::SMIN:
    case ISD::SMAX:
    case ISD::FMAXNUM:
    case ISD::FMINNUM:
    case ISD::FMAXNUM_IEEE:
    case ISD::FMINNUM_IEEE:
    case ISD::FMAXIMUM:
    case ISD::FMINIMUM: {
      SDValue Elt0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT,
                                 N1: Vec.getOperand(i: 0), N2: Idx);
      SDValue Elt1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT,
                                 N1: Vec.getOperand(i: 1), N2: Idx);

      DCI.AddToWorklist(N: Elt0.getNode());
      DCI.AddToWorklist(N: Elt1.getNode());
      return DAG.getNode(Opcode: Opc, DL: SL, VT: ResVT, N1: Elt0, N2: Elt1, Flags: Vec->getFlags());
    }
    }
  }

  // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
  // Builds a chain of selects, one compare per element.
  if (shouldExpandVectorDynExt(N)) {
    SDLoc SL(N);
    SDValue Idx = N->getOperand(Num: 1);
    SDValue V;
    for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
      SDValue IC = DAG.getVectorIdxConstant(Val: I, DL: SL);
      SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: ResVT, N1: Vec, N2: IC);
      if (I == 0)
        V = Elt;
      else
        V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Elt, False: V, Cond: ISD::SETEQ);
    }
    return V;
  }

  // EXTRACT_VECTOR_ELT (v2i32 bitcast (i64/f64:k), Idx)
  // =>
  // i32:Lo(k) if Idx == 0, or
  // i32:Hi(k) if Idx == 1
  auto *Idx = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
  if (Vec.getOpcode() == ISD::BITCAST && VecVT == MVT::v2i32 && Idx) {
    SDLoc SL(N);
    SDValue PeekThrough = Vec.getOperand(i: 0);
    // Integer 64-bit constant source.
    auto *KImm = dyn_cast<ConstantSDNode>(Val&: PeekThrough);
    if (KImm && KImm->getValueType(ResNo: 0).getSizeInBits() == 64) {
      uint64_t KImmValue = KImm->getZExtValue();
      return DAG.getConstant(
          Val: (KImmValue >> (32 * Idx->getZExtValue())) & 0xffffffff, DL: SL, VT: MVT::i32);
    }
    // FP 64-bit constant source: extract a half of its bit pattern.
    auto *KFPImm = dyn_cast<ConstantFPSDNode>(Val&: PeekThrough);
    if (KFPImm && KFPImm->getValueType(ResNo: 0).getSizeInBits() == 64) {
      uint64_t KFPImmValue =
          KFPImm->getValueAPF().bitcastToAPInt().getZExtValue();
      return DAG.getConstant(Val: (KFPImmValue >> (32 * Idx->getZExtValue())) &
                                 0xffffffff,
                             DL: SL, VT: MVT::i32);
    }
  }

  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
  // elements. This exposes more load reduction opportunities by replacing
  // multiple small extract_vector_elements with a single 32-bit extract.
  if (isa<MemSDNode>(Val: Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
      VecSize > 32 && VecSize % 32 == 0 && Idx) {
    EVT NewVT = getEquivalentMemType(Context&: *DAG.getContext(), VT: VecVT);

    // Locate the 32-bit element and bit offset holding the requested lane.
    unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
    unsigned EltIdx = BitIndex / 32;
    unsigned LeftoverBitIdx = BitIndex % 32;
    SDLoc SL(N);

    SDValue Cast = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: NewVT, Operand: Vec);
    DCI.AddToWorklist(N: Cast.getNode());

    SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: MVT::i32, N1: Cast,
                              N2: DAG.getConstant(Val: EltIdx, DL: SL, VT: MVT::i32));
    DCI.AddToWorklist(N: Elt.getNode());
    SDValue Srl = DAG.getNode(Opcode: ISD::SRL, DL: SL, VT: MVT::i32, N1: Elt,
                              N2: DAG.getConstant(Val: LeftoverBitIdx, DL: SL, VT: MVT::i32));
    DCI.AddToWorklist(N: Srl.getNode());

    EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
    SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: VecEltAsIntVT, Operand: Srl);
    DCI.AddToWorklist(N: Trunc.getNode());

    if (VecEltVT == ResVT) {
      return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: VecEltVT, Operand: Trunc);
    }

    assert(ResVT.isScalarInteger());
    return DAG.getAnyExtOrTrunc(Op: Trunc, DL: SL, VT: ResVT);
  }

  return SDValue();
}
15883
15884SDValue
15885SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15886 DAGCombinerInfo &DCI) const {
15887 SDValue Vec = N->getOperand(Num: 0);
15888 SDValue Idx = N->getOperand(Num: 2);
15889 EVT VecVT = Vec.getValueType();
15890 EVT EltVT = VecVT.getVectorElementType();
15891
15892 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15893 // => BUILD_VECTOR n x select (e, const-idx)
15894 if (!shouldExpandVectorDynExt(N))
15895 return SDValue();
15896
15897 SelectionDAG &DAG = DCI.DAG;
15898 SDLoc SL(N);
15899 SDValue Ins = N->getOperand(Num: 1);
15900 EVT IdxVT = Idx.getValueType();
15901
15902 SmallVector<SDValue, 16> Ops;
15903 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15904 SDValue IC = DAG.getConstant(Val: I, DL: SL, VT: IdxVT);
15905 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: SL, VT: EltVT, N1: Vec, N2: IC);
15906 SDValue V = DAG.getSelectCC(DL: SL, LHS: Idx, RHS: IC, True: Ins, False: Elt, Cond: ISD::SETEQ);
15907 Ops.push_back(Elt: V);
15908 }
15909
15910 return DAG.getBuildVector(VT: VecVT, DL: SL, Ops);
15911}
15912
15913/// Return the source of an fp_extend from f16 to f32, or a converted FP
15914/// constant.
15915static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
15916 if (Src.getOpcode() == ISD::FP_EXTEND &&
15917 Src.getOperand(i: 0).getValueType() == MVT::f16) {
15918 return Src.getOperand(i: 0);
15919 }
15920
15921 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Val&: Src)) {
15922 APFloat Val = CFP->getValueAPF();
15923 bool LosesInfo = true;
15924 Val.convert(ToSemantics: APFloat::IEEEhalf(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
15925 if (!LosesInfo)
15926 return DAG.getConstantFP(Val, DL: SDLoc(Src), VT: MVT::f16);
15927 }
15928
15929 return SDValue();
15930}
15931
15932SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15933 DAGCombinerInfo &DCI) const {
15934 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15935 "combine only useful on gfx8");
15936
15937 SDValue TruncSrc = N->getOperand(Num: 0);
15938 EVT VT = N->getValueType(ResNo: 0);
15939 if (VT != MVT::f16)
15940 return SDValue();
15941
15942 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15943 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15944 return SDValue();
15945
15946 SelectionDAG &DAG = DCI.DAG;
15947 SDLoc SL(N);
15948
15949 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15950 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15951 // casting back.
15952
15953 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15954 // fmin(fmax(a, b), fmax(fmin(a, b), c))
15955 SDValue A = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 0));
15956 if (!A)
15957 return SDValue();
15958
15959 SDValue B = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 1));
15960 if (!B)
15961 return SDValue();
15962
15963 SDValue C = strictFPExtFromF16(DAG, Src: TruncSrc.getOperand(i: 2));
15964 if (!C)
15965 return SDValue();
15966
15967 // This changes signaling nan behavior. If an input is a signaling nan, it
15968 // would have been quieted by the fpext originally. We don't care because
15969 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15970 // we would be worse off than just doing the promotion.
15971 SDValue A1 = DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: A, N2: B);
15972 SDValue B1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A, N2: B);
15973 SDValue C1 = DAG.getNode(Opcode: ISD::FMAXNUM_IEEE, DL: SL, VT, N1: A1, N2: C);
15974 return DAG.getNode(Opcode: ISD::FMINNUM_IEEE, DL: SL, VT, N1: B1, N2: C1);
15975}
15976
15977unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15978 const SDNode *N0,
15979 const SDNode *N1) const {
15980 EVT VT = N0->getValueType(ResNo: 0);
15981
15982 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15983 // support denormals ever.
15984 if (((VT == MVT::f32 &&
15985 denormalModeIsFlushAllF32(MF: DAG.getMachineFunction())) ||
15986 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15987 denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction()))) &&
15988 isOperationLegal(Op: ISD::FMAD, VT))
15989 return ISD::FMAD;
15990
15991 const TargetOptions &Options = DAG.getTarget().Options;
15992 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15993 (N0->getFlags().hasAllowContract() &&
15994 N1->getFlags().hasAllowContract())) &&
15995 isFMAFasterThanFMulAndFAdd(MF: DAG.getMachineFunction(), VT)) {
15996 return ISD::FMA;
15997 }
15998
15999 return 0;
16000}
16001
// For a reassociatable opcode perform:
// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
//
// Grouping the two uniform operands lets the inner op be computed on the
// scalar unit, leaving only one VALU op for the divergent operand.
SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
                                               SelectionDAG &DAG) const {
  EVT VT = N->getValueType(ResNo: 0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  // Don't disturb base+offset addressing patterns.
  if (DAG.isBaseWithConstantOffset(Op: SDValue(N, 0)))
    return SDValue();

  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(Num: 0);
  SDValue Op1 = N->getOperand(Num: 1);

  // Exactly one of the outer operands must be divergent.
  if (!(Op0->isDivergent() ^ Op1->isDivergent()))
    return SDValue();

  // Canonicalize: Op0 uniform, Op1 divergent.
  if (Op0->isDivergent())
    std::swap(a&: Op0, b&: Op1);

  // The divergent operand must be a single-use inner op of the same kind.
  if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
    return SDValue();

  SDValue Op2 = Op1.getOperand(i: 1);
  Op1 = Op1.getOperand(i: 0);
  // Exactly one of the inner operands must be divergent as well.
  if (!(Op1->isDivergent() ^ Op2->isDivergent()))
    return SDValue();

  // Canonicalize: Op1 uniform, Op2 divergent.
  if (Op1->isDivergent())
    std::swap(a&: Op1, b&: Op2);

  // (op (op uniform0, uniform1), divergent)
  SDLoc SL(N);
  SDValue Add1 = DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Op0, N2: Op1);
  return DAG.getNode(Opcode: Opc, DL: SL, VT, N1: Add1, N2: Op2);
}
16038
16039static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
16040 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
16041 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
16042 SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT::i1);
16043 SDValue Mad = DAG.getNode(Opcode: MadOpc, DL: SL, VTList: VTs, N1: N0, N2: N1, N3: N2);
16044 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Mad);
16045}
16046
16047// Fold
16048// y = lshr i64 x, 32
16049// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
16050// with Const.hi == -1
16051// To
16052// res = mad_u64_u32 y.lo ,Const.lo, x.lo
16053static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
16054 SDValue MulLHS, SDValue MulRHS,
16055 SDValue AddRHS) {
16056 if (MulRHS.getOpcode() == ISD::SRL)
16057 std::swap(a&: MulLHS, b&: MulRHS);
16058
16059 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
16060 return SDValue();
16061
16062 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(Val: MulLHS.getOperand(i: 1));
16063 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
16064 MulLHS.getOperand(i: 0) != AddRHS)
16065 return SDValue();
16066
16067 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Val: MulRHS.getNode());
16068 if (!Const || Hi_32(Value: Const->getZExtValue()) != uint32_t(-1))
16069 return SDValue();
16070
16071 SDValue ConstMul =
16072 DAG.getConstant(Val: Lo_32(Value: Const->getZExtValue()), DL: SL, VT: MVT::i32);
16073 return getMad64_32(DAG, SL, VT: MVT::i64,
16074 N0: DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulLHS), N1: ConstMul,
16075 N2: DAG.getZeroExtendInReg(Op: AddRHS, DL: SL, VT: MVT::i32), Signed: false);
16076}
16077
// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
// multiplies, if any.
//
// Full 64-bit multiplies that feed into an addition are lowered here instead
// of using the generic expansion. The generic expansion ends up with
// a tree of ADD nodes that prevents us from using the "add" part of the
// MAD instruction. The expansion produced here results in a chain of ADDs
// instead of a tree.
SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  assert(N->isAnyAdd());

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  // Only scalar adds are handled here.
  if (VT.isVector())
    return SDValue();

  // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
  // result in scalar registers for uniform values.
  if (!N->isDivergent() && Subtarget->hasSMulHi())
    return SDValue();

  // Only types wider than 32 and up to 64 bits benefit from MAD_64_32.
  unsigned NumBits = VT.getScalarSizeInBits();
  if (NumBits <= 32 || NumBits > 64)
    return SDValue();

  // Canonicalize the MUL operand into LHS; the caller guarantees at least
  // one side is a MUL.
  if (LHS.getOpcode() != ISD::MUL) {
    assert(RHS.getOpcode() == ISD::MUL);
    std::swap(a&: LHS, b&: RHS);
  }

  // Avoid the fold if it would unduly increase the number of multiplies due to
  // multiple uses, except on hardware with full-rate multiply-add (which is
  // part of full-rate 64-bit ops).
  if (!Subtarget->hasFullRate64Ops()) {
    unsigned NumUsers = 0;
    for (SDNode *User : LHS->users()) {
      // There is a use that does not feed into addition, so the multiply can't
      // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
      if (!User->isAnyAdd())
        return SDValue();

      // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
      // MUL + 3xADD + 3xADDC over 3xMAD.
      ++NumUsers;
      if (NumUsers >= 3)
        return SDValue();
    }
  }

  SDValue MulLHS = LHS.getOperand(i: 0);
  SDValue MulRHS = LHS.getOperand(i: 1);
  SDValue AddRHS = RHS;

  // First try the mul-by-constant-with-high-ones special case, which folds
  // to a single mad_u64_u32.
  if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
    return FoldedMAD;

  // Always check whether operands are small unsigned values, since that
  // knowledge is useful in more cases. Check for small signed values only if
  // doing so can unlock a shorter code sequence.
  bool MulLHSUnsigned32 = numBitsUnsigned(Op: MulLHS, DAG) <= 32;
  bool MulRHSUnsigned32 = numBitsUnsigned(Op: MulRHS, DAG) <= 32;

  bool MulSignedLo = false;
  if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
    MulSignedLo =
        numBitsSigned(Op: MulLHS, DAG) <= 32 && numBitsSigned(Op: MulRHS, DAG) <= 32;
  }

  // The operands and final result all have the same number of bits. If
  // operands need to be extended, they can be extended with garbage. The
  // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
  // truncated away in the end.
  if (VT != MVT::i64) {
    MulLHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: MulLHS);
    MulRHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: MulRHS);
    AddRHS = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i64, Operand: AddRHS);
  }

  // The basic code generated is conceptually straightforward. Pseudo code:
  //
  //   accum = mad_64_32 lhs.lo, rhs.lo, accum
  //   accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
  //   accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
  //
  // The second and third lines are optional, depending on whether the factors
  // are {sign,zero}-extended or not.
  //
  // The actual DAG is noisier than the pseudo code, but only due to
  // instructions that disassemble values into low and high parts, and
  // assemble the final result.
  SDValue One = DAG.getConstant(Val: 1, DL: SL, VT: MVT::i32);

  auto MulLHSLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulLHS);
  auto MulRHSLo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: MulRHS);
  SDValue Accum =
      getMad64_32(DAG, SL, VT: MVT::i64, N0: MulLHSLo, N1: MulRHSLo, N2: AddRHS, Signed: MulSignedLo);

  if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
    // At least one factor has significant high bits: add the cross products
    // into the high half of the accumulator.
    auto [AccumLo, AccumHi] = DAG.SplitScalar(N: Accum, DL: SL, LoVT: MVT::i32, HiVT: MVT::i32);

    if (!MulLHSUnsigned32) {
      auto MulLHSHi =
          DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: SL, VT: MVT::i32, N1: MulLHS, N2: One);
      SDValue MulHi = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT: MVT::i32, N1: MulLHSHi, N2: MulRHSLo);
      AccumHi = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: MulHi, N2: AccumHi);
    }

    if (!MulRHSUnsigned32) {
      auto MulRHSHi =
          DAG.getNode(Opcode: ISD::EXTRACT_ELEMENT, DL: SL, VT: MVT::i32, N1: MulRHS, N2: One);
      SDValue MulHi = DAG.getNode(Opcode: ISD::MUL, DL: SL, VT: MVT::i32, N1: MulLHSLo, N2: MulRHSHi);
      AccumHi = DAG.getNode(Opcode: ISD::ADD, DL: SL, VT: MVT::i32, N1: MulHi, N2: AccumHi);
    }

    // Reassemble the 64-bit accumulator from the corrected halves.
    Accum = DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {AccumLo, AccumHi});
    Accum = DAG.getBitcast(VT: MVT::i64, V: Accum);
  }

  // Drop the garbage high bits for sub-64-bit result types.
  if (VT != MVT::i64)
    Accum = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT, Operand: Accum);
  return Accum;
}
16205
16206SDValue
16207SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
16208 DAGCombinerInfo &DCI) const {
16209 SDValue RHS = N->getOperand(Num: 1);
16210 auto *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
16211 if (!CRHS)
16212 return SDValue();
16213
16214 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
16215 // common.
16216 uint64_t Val = CRHS->getZExtValue();
16217 if (countr_zero(Val) >= 32) {
16218 SelectionDAG &DAG = DCI.DAG;
16219 SDLoc SL(N);
16220 SDValue LHS = N->getOperand(Num: 0);
16221
16222 // Avoid carry machinery if we know the low half of the add does not
16223 // contribute to the final result.
16224 //
16225 // add i64:x, K if computeTrailingZeros(K) >= 32
16226 // => build_pair (add x.hi, K.hi), x.lo
16227
16228 // Breaking the 64-bit add here with this strange constant is unlikely
16229 // to interfere with addressing mode patterns.
16230
16231 SDValue Hi = getHiHalf64(Op: LHS, DAG);
16232 SDValue ConstHi32 = DAG.getConstant(Val: Hi_32(Value: Val), DL: SL, VT: MVT::i32);
16233 unsigned Opcode = N->getOpcode();
16234 if (Opcode == ISD::PTRADD)
16235 Opcode = ISD::ADD;
16236 SDValue AddHi =
16237 DAG.getNode(Opcode, DL: SL, VT: MVT::i32, N1: Hi, N2: ConstHi32, Flags: N->getFlags());
16238
16239 SDValue Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: LHS);
16240 return DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SL, VT: MVT::i64, N1: Lo, N2: AddHi);
16241 }
16242
16243 return SDValue();
16244}
16245
16246// Collect the ultimate src of each of the mul node's operands, and confirm
16247// each operand is 8 bytes.
16248static std::optional<ByteProvider<SDValue>>
16249handleMulOperand(const SDValue &MulOperand) {
16250 auto Byte0 = calculateByteProvider(Op: MulOperand, Index: 0, Depth: 0);
16251 if (!Byte0 || Byte0->isConstantZero()) {
16252 return std::nullopt;
16253 }
16254 auto Byte1 = calculateByteProvider(Op: MulOperand, Index: 1, Depth: 0);
16255 if (Byte1 && !Byte1->isConstantZero()) {
16256 return std::nullopt;
16257 }
16258 return Byte0;
16259}
16260
// Merge two v_perm byte-selection masks. A 0x0c selector picks a constant
// zero byte; for each byte position at least one of the two masks must be a
// zero selector, and the merged mask takes the live selector from whichever
// side has it.
static unsigned addPermMasks(unsigned First, unsigned Second) {
  const unsigned CMask = 0x0c0c0c0c;
  unsigned FirstCs = First & CMask;
  unsigned SecondCs = Second & CMask;
  unsigned FirstNoCs = First & ~CMask;
  unsigned SecondNoCs = Second & ~CMask;

  // Every byte position must be a zero selector in at least one input mask.
  assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
  assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
  assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
  assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));

  return FirstNoCs | SecondNoCs | (FirstCs & SecondCs);
}
16274
// One dword-sized source of a dot-product operand, together with the v_perm
// mask used to place its bytes.
struct DotSrc {
  SDValue SrcOp;       // Ultimate source value the bytes come from.
  int64_t PermMask;    // v_perm byte-selection mask (0x0c selects zero).
  int64_t DWordOffset; // Which 32-bit word of SrcOp holds the bytes.
};
16280
// Distribute the byte providers Src0/Src1 of one multiply in the dot4 chain
// into the accumulated per-operand source lists Src0s/Src1s. Step is the
// iteration index in the chain walk; byte selectors are placed assuming a
// full chain of 4 and are later adjusted by fixMasks if the chain turns out
// shorter.
static void placeSources(ByteProvider<SDValue> &Src0,
                         ByteProvider<SDValue> &Src1,
                         SmallVectorImpl<DotSrc> &Src0s,
                         SmallVectorImpl<DotSrc> &Src1s, int Step) {

  assert(Src0.Src.has_value() && Src1.Src.has_value());
  // Src0s and Src1s are empty, just place arbitrarily.
  if (Step == 0) {
    // The byte goes in the MSB slot (<< 24); the remaining three slots
    // select constant zero (0x0c selectors).
    Src0s.push_back(Elt: {.SrcOp: *Src0.Src, .PermMask: ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
                    .DWordOffset: Src0.SrcOffset / 4});
    Src1s.push_back(Elt: {.SrcOp: *Src1.Src, .PermMask: ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
                    .DWordOffset: Src1.SrcOffset / 4});
    return;
  }

  // Try both pairings: (Src0, Src1) and (Src1, Src0) against the lists.
  for (int BPI = 0; BPI < 2; BPI++) {
    std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
    if (BPI == 1) {
      BPP = {Src1, Src0};
    }
    unsigned ZeroMask = 0x0c0c0c0c;
    unsigned FMask = 0xFF << (8 * (3 - Step));

    unsigned FirstMask =
        (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    unsigned SecondMask =
        (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    // Attempt to find Src vector which contains our SDValue, if so, add our
    // perm mask to the existing one. If we are unable to find a match for the
    // first SDValue, attempt to find match for the second.
    int FirstGroup = -1;
    for (int I = 0; I < 2; I++) {
      SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
      auto MatchesFirst = [&BPP](DotSrc &IterElt) {
        return IterElt.SrcOp == *BPP.first.Src &&
               (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
      };

      auto *Match = llvm::find_if(Range&: Srcs, P: MatchesFirst);
      if (Match != Srcs.end()) {
        Match->PermMask = addPermMasks(First: FirstMask, Second: Match->PermMask);
        FirstGroup = I;
        break;
      }
    }
    if (FirstGroup != -1) {
      // The second provider must land in the other list so the two dot4
      // operands stay paired; merge with an existing entry if possible.
      SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
      auto MatchesSecond = [&BPP](DotSrc &IterElt) {
        return IterElt.SrcOp == *BPP.second.Src &&
               (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
      };
      auto *Match = llvm::find_if(Range&: Srcs, P: MatchesSecond);
      if (Match != Srcs.end()) {
        Match->PermMask = addPermMasks(First: SecondMask, Second: Match->PermMask);
      } else
        Srcs.push_back(Elt: {.SrcOp: *BPP.second.Src, .PermMask: SecondMask, .DWordOffset: BPP.second.SrcOffset / 4});
      return;
    }
  }

  // If we have made it here, then we could not find a match in Src0s or Src1s
  // for either Src0 or Src1, so just place them arbitrarily.

  unsigned ZeroMask = 0x0c0c0c0c;
  unsigned FMask = 0xFF << (8 * (3 - Step));

  Src0s.push_back(
      Elt: {.SrcOp: *Src0.Src,
       .PermMask: ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
       .DWordOffset: Src0.SrcOffset / 4});
  Src1s.push_back(
      Elt: {.SrcOp: *Src1.Src,
       .PermMask: ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
       .DWordOffset: Src1.SrcOffset / 4});
}
16356
// Combine the DotSrc entries of one dot4 operand into a single i32 value:
// each source dword is byte-gathered with v_perm (per its PermMask), and the
// partial results are merged with OR.
//
// NOTE(review): the IsSigned and IsAny parameters are currently unused in
// this function -- confirm with the call sites whether they can be dropped.
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
                              SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
                              bool IsAny) {

  // If we just have one source, just permute it accordingly.
  if (Srcs.size() == 1) {
    auto *Elt = Srcs.begin();
    auto EltOp = getDWordFromOffset(DAG, SL, Src: Elt->SrcOp, DWordOffset: Elt->DWordOffset);

    // v_perm will produce the original value
    if (Elt->PermMask == 0x3020100)
      return EltOp;

    return DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: EltOp, N2: EltOp,
                       N3: DAG.getConstant(Val: Elt->PermMask, DL: SL, VT: MVT::i32));
  }

  auto *FirstElt = Srcs.begin();
  auto *SecondElt = std::next(x: FirstElt);

  SmallVector<SDValue, 2> Perms;

  // If we have multiple sources in the chain, combine them via perms (using
  // calculated perm mask) and Ors.
  while (true) {
    auto FirstMask = FirstElt->PermMask;
    auto SecondMask = SecondElt->PermMask;

    // Re-point FirstMask's live selectors at the first v_perm operand (bytes
    // 4-7), keeping zero selectors as-is.
    unsigned FirstCs = FirstMask & 0x0c0c0c0c;
    unsigned FirstPlusFour = FirstMask | 0x04040404;
    // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
    // original 0x0C.
    FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;

    auto PermMask = addPermMasks(First: FirstMask, Second: SecondMask);
    auto FirstVal =
        getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);
    auto SecondVal =
        getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp, DWordOffset: SecondElt->DWordOffset);

    Perms.push_back(Elt: DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: FirstVal,
                                 N2: SecondVal,
                                 N3: DAG.getConstant(Val: PermMask, DL: SL, VT: MVT::i32)));

    // Advance to the next pair of sources.
    FirstElt = std::next(x: SecondElt);
    if (FirstElt == Srcs.end())
      break;

    SecondElt = std::next(x: FirstElt);
    // If we only have a FirstElt, then just combine that into the cumulative
    // source node.
    if (SecondElt == Srcs.end()) {
      auto EltOp =
          getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);

      Perms.push_back(
          Elt: DAG.getNode(Opcode: AMDGPUISD::PERM, DL: SL, VT: MVT::i32, N1: EltOp, N2: EltOp,
                       N3: DAG.getConstant(Val: FirstElt->PermMask, DL: SL, VT: MVT::i32)));
      break;
    }
  }

  assert(Perms.size() == 1 || Perms.size() == 2);
  return Perms.size() == 2
             ? DAG.getNode(Opcode: ISD::OR, DL: SL, VT: MVT::i32, N1: Perms[0], N2: Perms[1])
             : Perms[0];
}
16424
16425static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
16426 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
16427 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
16428 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
16429 EntryMask += ZeroMask;
16430 }
16431}
16432
16433static bool isMul(const SDValue Op) {
16434 auto Opcode = Op.getOpcode();
16435
16436 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
16437 Opcode == AMDGPUISD::MUL_I24);
16438}
16439
// Decide whether the dot4 built from the multiply N of S0Op * S1Op should use
// the signed (sdot4, true) or unsigned (udot4, false) semantics. Returns
// std::nullopt when the known signedness of the two operands conflicts, i.e.
// the dot4 match must be abandoned.
static std::optional<bool>
checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
                       ByteProvider<SDValue> &Src1, const SDValue &S0Op,
                       const SDValue &S1Op, const SelectionDAG &DAG) {
  // If both ops are i8s (pre legalize-dag), then the signedness semantics
  // of the dot4 is irrelevant.
  if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
    return false;

  // A known-zero MSB means unsigned; a known-one MSB means (negative) signed.
  auto Known0 = DAG.computeKnownBits(Op: S0Op, Depth: 0);
  bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
  bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
  auto Known1 = DAG.computeKnownBits(Op: S1Op, Depth: 0);
  bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
  bool S1IsSigned = Known1.countMinLeadingOnes() > 0;

  assert(!(S0IsUnsigned && S0IsSigned));
  assert(!(S1IsUnsigned && S1IsSigned));

  // There are 9 possible permutations of
  // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}

  // In two permutations, the sign bits are known to be the same for both Ops,
  // so simply return Signed / Unsigned corresponding to the MSB

  if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
    return S0IsSigned;

  // In another two permutations, the sign bits are known to be opposite. In
  // this case return std::nullopt to indicate a bad match.

  if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
    return std::nullopt;

  // In the remaining five permutations, we don't know the value of the sign
  // bit for at least one Op. Since we have a valid ByteProvider, we know that
  // the upper bits must be extension bits. Thus, the only ways for the sign
  // bit to be unknown is if it was sign extended from unknown value, or if it
  // was any extended. In either case, it is correct to use the signed
  // version of the signedness semantics of dot4

  // In two of such permutations, we know the sign bit is set for
  // one op, and the other is unknown. It is okay to use the signed version
  // of dot4.
  if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
    return true;

  // In one such permutation, we don't know either of the sign bits. It is
  // okay to use the signed version of dot4.
  if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
    return true;

  // In two of such permutations, we know the sign bit is unset for
  // one op, and the other is unknown. Return std::nullopt to indicate a
  // bad match.
  if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
    return std::nullopt;

  llvm_unreachable("Fully covered condition");
}
16502
// DAG combine for ISD::ADD. Tries, in order: mad_[iu]64_[iu]32 formation,
// scalar reassociation, the i64 zero-low-bits fold, v_dot4 formation from a
// chain of byte-wide multiplies, and finally carry-op rewrites of adds of
// extended setccs / uaddo_carry.
SDValue SITargetLowering::performAddCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  // add (mul x, y), z --> mad_[iu]64_[iu]32 x, y, z, when profitable.
  if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
    if (Subtarget->hasMad64_32()) {
      if (SDValue Folded = tryFoldToMad64_32(N, DCI))
        return Folded;
    }
  }

  if (SDValue V = reassociateScalarOps(N, DAG)) {
    return V;
  }

  if (VT == MVT::i64) {
    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
      return Folded;
  }

  if ((isMul(Op: LHS) || isMul(Op: RHS)) && Subtarget->hasDot7Insts() &&
      (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
    SDValue TempNode(N, 0);
    std::optional<bool> IsSigned;
    SmallVector<DotSrc, 4> Src0s;
    SmallVector<DotSrc, 4> Src1s;
    SmallVector<SDValue, 4> Src2s;

    // Match the v_dot4 tree, while collecting src nodes.
    int ChainLength = 0;
    for (int I = 0; I < 4; I++) {
      auto MulIdx = isMul(Op: LHS) ? 0 : isMul(Op: RHS) ? 1 : -1;
      if (MulIdx == -1)
        break;
      // Both multiply operands must be effectively 8-bit values.
      auto Src0 = handleMulOperand(MulOperand: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 0));
      if (!Src0)
        break;
      auto Src1 = handleMulOperand(MulOperand: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 1));
      if (!Src1)
        break;

      // Every mul in the chain must agree on signedness.
      auto IterIsSigned = checkDot4MulSignedness(
          N: TempNode->getOperand(Num: MulIdx), Src0&: *Src0, Src1&: *Src1,
          S0Op: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 0),
          S1Op: TempNode->getOperand(Num: MulIdx)->getOperand(Num: 1), DAG);
      if (!IterIsSigned)
        break;
      if (!IsSigned)
        IsSigned = *IterIsSigned;
      if (*IterIsSigned != *IsSigned)
        break;
      placeSources(Src0&: *Src0, Src1&: *Src1, Src0s, Src1s, Step: I);
      auto AddIdx = 1 - MulIdx;
      // Allow the special case where add (add (mul24, 0), mul24) became ->
      // add (mul24, mul24).
      if (I == 2 && isMul(Op: TempNode->getOperand(Num: AddIdx))) {
        Src2s.push_back(Elt: TempNode->getOperand(Num: AddIdx));
        auto Src0 =
            handleMulOperand(MulOperand: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 0));
        if (!Src0)
          break;
        auto Src1 =
            handleMulOperand(MulOperand: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 1));
        if (!Src1)
          break;
        auto IterIsSigned = checkDot4MulSignedness(
            N: TempNode->getOperand(Num: AddIdx), Src0&: *Src0, Src1&: *Src1,
            S0Op: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 0),
            S1Op: TempNode->getOperand(Num: AddIdx)->getOperand(Num: 1), DAG);
        if (!IterIsSigned)
          break;
        assert(IsSigned);
        if (*IterIsSigned != *IsSigned)
          break;
        placeSources(Src0&: *Src0, Src1&: *Src1, Src0s, Src1s, Step: I + 1);
        // The accumulator for this fourth mul is a constant zero.
        Src2s.push_back(Elt: DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32));
        ChainLength = I + 2;
        break;
      }

      // Walk down the add chain through the non-mul operand.
      TempNode = TempNode->getOperand(Num: AddIdx);
      Src2s.push_back(Elt: TempNode);
      ChainLength = I + 1;
      if (TempNode->getNumOperands() < 2)
        break;
      LHS = TempNode->getOperand(Num: 0);
      RHS = TempNode->getOperand(Num: 1);
    }

    if (ChainLength < 2)
      return SDValue();

    // Masks were constructed with assumption that we would find a chain of
    // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
    // 0x0c) so they do not affect dot calculation.
    if (ChainLength < 4) {
      fixMasks(Srcs&: Src0s, ChainLength);
      fixMasks(Srcs&: Src1s, ChainLength);
    }

    SDValue Src0, Src1;

    // If we are just using a single source for both, and have permuted the
    // bytes consistently, we can just use the sources without permuting
    // (commutation).
    bool UseOriginalSrc = false;
    if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
        Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
        Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
        Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
      SmallVector<unsigned, 4> SrcBytes;
      auto Src0Mask = Src0s.begin()->PermMask;
      SrcBytes.push_back(Elt: Src0Mask & 0xFF000000);
      bool UniqueEntries = true;
      for (auto I = 1; I < 4; I++) {
        auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));

        if (is_contained(Range&: SrcBytes, Element: NextByte)) {
          UniqueEntries = false;
          break;
        }
        SrcBytes.push_back(Elt: NextByte);
      }

      if (UniqueEntries) {
        UseOriginalSrc = true;

        auto *FirstElt = Src0s.begin();
        auto FirstEltOp =
            getDWordFromOffset(DAG, SL, Src: FirstElt->SrcOp, DWordOffset: FirstElt->DWordOffset);

        auto *SecondElt = Src1s.begin();
        auto SecondEltOp = getDWordFromOffset(DAG, SL, Src: SecondElt->SrcOp,
                                              DWordOffset: SecondElt->DWordOffset);

        Src0 = DAG.getBitcastedAnyExtOrTrunc(Op: FirstEltOp, DL: SL,
                                             VT: MVT::getIntegerVT(BitWidth: 32));
        Src1 = DAG.getBitcastedAnyExtOrTrunc(Op: SecondEltOp, DL: SL,
                                             VT: MVT::getIntegerVT(BitWidth: 32));
      }
    }

    if (!UseOriginalSrc) {
      Src0 = resolveSources(DAG, SL, Srcs&: Src0s, IsSigned: false, IsAny: true);
      Src1 = resolveSources(DAG, SL, Srcs&: Src1s, IsSigned: false, IsAny: true);
    }

    // The accumulator is the tail of the matched add chain.
    assert(IsSigned);
    SDValue Src2 =
        DAG.getExtOrTrunc(IsSigned: *IsSigned, Op: Src2s[ChainLength - 1], DL: SL, VT: MVT::i32);

    SDValue IID = DAG.getTargetConstant(Val: *IsSigned ? Intrinsic::amdgcn_sdot4
                                                    : Intrinsic::amdgcn_udot4,
                                        DL: SL, VT: MVT::i64);

    assert(!VT.isVector());
    auto Dot = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SL, VT: MVT::i32, N1: IID, N2: Src0,
                           N3: Src1, N4: Src2, N5: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i1));

    return DAG.getExtOrTrunc(IsSigned: *IsSigned, Op: Dot, DL: SL, VT);
  }

  if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
    return SDValue();

  // add x, zext (setcc) => uaddo_carry x, 0, setcc
  // add x, sext (setcc) => usubo_carry x, 0, setcc
  unsigned Opc = LHS.getOpcode();
  if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
      Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
    std::swap(a&: RHS, b&: LHS);

  Opc = RHS.getOpcode();
  switch (Opc) {
  default:
    break;
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ANY_EXTEND: {
    auto Cond = RHS.getOperand(i: 0);
    // If this won't be a real VOPC output, we would still need to insert an
    // extra instruction anyway.
    if (!isBoolSGPR(V: Cond))
      break;
    SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1);
    SDValue Args[] = {LHS, DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), Cond};
    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
    return DAG.getNode(Opcode: Opc, DL: SL, VTList, Ops: Args);
  }
  case ISD::UADDO_CARRY: {
    // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
    if (!isNullConstant(V: RHS.getOperand(i: 1)))
      break;
    SDValue Args[] = {LHS, RHS.getOperand(i: 0), RHS.getOperand(i: 2)};
    return DAG.getNode(Opcode: ISD::UADDO_CARRY, DL: SDLoc(N), VTList: RHS->getVTList(), Ops: Args);
  }
  }
  return SDValue();
}
16706
// DAG combine for ISD::PTRADD: lower PTRADDs to plain arithmetic or
// reassociate them in cases where the pointer arithmetic would not be folded
// into the memory instruction's immediate offset anyway.
SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(ResNo: 0);
  SDValue N0 = N->getOperand(Num: 0);
  SDValue N1 = N->getOperand(Num: 1);

  // The following folds transform PTRADDs into regular arithmetic in cases
  // where the PTRADD wouldn't be folded as an immediate offset into memory
  // instructions anyway. They are target-specific in that other targets might
  // prefer to not lose information about the pointer arithmetic.

  // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
  // Adapted from DAGCombiner::visitADDLikeCommutative.
  SDValue V, K;
  if (sd_match(N: N1, P: m_Shl(L: m_Neg(V: m_Value(N&: V)), R: m_Value(N&: K)))) {
    SDNodeFlags ShlFlags = N1->getFlags();
    // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
    // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
    // preserved.
    SDNodeFlags NewShlFlags =
        ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
            ? SDNodeFlags::NoSignedWrap
            : SDNodeFlags();
    SDValue Inner = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: V, N2: K, Flags: NewShlFlags);
    DCI.AddToWorklist(N: Inner.getNode());
    return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: N0, N2: Inner);
  }

  // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
  // performAddCombine.
  if (N1.getOpcode() == ISD::MUL) {
    if (Subtarget->hasMad64_32()) {
      if (SDValue Folded = tryFoldToMad64_32(N, DCI))
        return Folded;
    }
  }

  // If the 32 low bits of the constant are all zero, there is nothing to fold
  // into an immediate offset, so it's better to eliminate the unnecessary
  // addition for the lower 32 bits than to preserve the PTRADD.
  // Analogous to a fold in performAddCombine.
  if (VT == MVT::i64) {
    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
      return Folded;
  }

  // Remaining fold requires (ptradd x, (add y, z)) with a single-use add.
  if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
    return SDValue();

  SDValue X = N0;
  SDValue Y = N1.getOperand(i: 0);
  SDValue Z = N1.getOperand(i: 1);
  bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(N: Y);
  bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(N: Z);

  if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
      Y->isDivergent() != Z->isDivergent()) {
    // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
    // y are uniform and z isn't.
    // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
    // z are uniform and y isn't.
    // The goal is to push uniform operands up in the computation, so that they
    // can be handled with scalar operations. We can't use reassociateScalarOps
    // for this since it requires two identical commutative operations to
    // reassociate.
    if (Y->isDivergent())
      std::swap(a&: Y, b&: Z);
    // If both additions in the original were NUW, reassociation preserves that.
    SDNodeFlags ReassocFlags =
        (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
    SDValue UniformInner = DAG.getMemBasePlusOffset(Base: X, Offset: Y, DL, Flags: ReassocFlags);
    DCI.AddToWorklist(N: UniformInner.getNode());
    return DAG.getMemBasePlusOffset(Base: UniformInner, Offset: Z, DL, Flags: ReassocFlags);
  }

  return SDValue();
}
16786
16787SDValue SITargetLowering::performSubCombine(SDNode *N,
16788 DAGCombinerInfo &DCI) const {
16789 SelectionDAG &DAG = DCI.DAG;
16790 EVT VT = N->getValueType(ResNo: 0);
16791
16792 if (VT == MVT::i64) {
16793 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16794 return Folded;
16795 }
16796
16797 if (VT != MVT::i32)
16798 return SDValue();
16799
16800 SDLoc SL(N);
16801 SDValue LHS = N->getOperand(Num: 0);
16802 SDValue RHS = N->getOperand(Num: 1);
16803
16804 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16805 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
16806 unsigned Opc = RHS.getOpcode();
16807 switch (Opc) {
16808 default:
16809 break;
16810 case ISD::ZERO_EXTEND:
16811 case ISD::SIGN_EXTEND:
16812 case ISD::ANY_EXTEND: {
16813 auto Cond = RHS.getOperand(i: 0);
16814 // If this won't be a real VOPC output, we would still need to insert an
16815 // extra instruction anyway.
16816 if (!isBoolSGPR(V: Cond))
16817 break;
16818 SDVTList VTList = DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1);
16819 SDValue Args[] = {LHS, DAG.getConstant(Val: 0, DL: SL, VT: MVT::i32), Cond};
16820 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
16821 return DAG.getNode(Opcode: Opc, DL: SL, VTList, Ops: Args);
16822 }
16823 }
16824
16825 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16826 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16827 if (!isNullConstant(V: LHS.getOperand(i: 1)))
16828 return SDValue();
16829 SDValue Args[] = {LHS.getOperand(i: 0), RHS, LHS.getOperand(i: 2)};
16830 return DAG.getNode(Opcode: ISD::USUBO_CARRY, DL: SDLoc(N), VTList: LHS->getVTList(), Ops: Args);
16831 }
16832 return SDValue();
16833}
16834
16835SDValue
16836SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16837 DAGCombinerInfo &DCI) const {
16838
16839 if (N->getValueType(ResNo: 0) != MVT::i32)
16840 return SDValue();
16841
16842 if (!isNullConstant(V: N->getOperand(Num: 1)))
16843 return SDValue();
16844
16845 SelectionDAG &DAG = DCI.DAG;
16846 SDValue LHS = N->getOperand(Num: 0);
16847
16848 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16849 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16850 unsigned LHSOpc = LHS.getOpcode();
16851 unsigned Opc = N->getOpcode();
16852 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16853 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16854 SDValue Args[] = {LHS.getOperand(i: 0), LHS.getOperand(i: 1), N->getOperand(Num: 2)};
16855 return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VTList: N->getVTList(), Ops: Args);
16856 }
16857 return SDValue();
16858}
16859
// Combine an fadd whose operand is (fadd a, a) into a fused multiply-add
// with a constant 2.0 multiplier, when a suitable fused opcode (mad/fma)
// is available via getFusedOpcode.
SDValue SITargetLowering::performFAddCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  // Only run after full DAG legalization so the fused node is not re-expanded.
  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);

  SDLoc SL(N);
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  // These should really be instruction patterns, but writing patterns with
  // source modifiers is a pain.

  // fadd (fadd (a, a), b) -> mad 2.0, a, b
  if (LHS.getOpcode() == ISD::FADD) {
    SDValue A = LHS.getOperand(i: 0);
    if (A == LHS.getOperand(i: 1)) {
      // getFusedOpcode returns 0 when no mad/fma form is profitable/legal.
      unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
        return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: RHS);
      }
    }
  }

  // fadd (b, fadd (a, a)) -> mad 2.0, a, b
  if (RHS.getOpcode() == ISD::FADD) {
    SDValue A = RHS.getOperand(i: 0);
    if (A == RHS.getOperand(i: 1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
        return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: LHS);
      }
    }
  }

  return SDValue();
}
16901
// Combine an fsub with an (fadd a, a) operand into a fused multiply-add with
// a +/-2.0 multiplier, mirroring performFAddCombine for subtraction.
SDValue SITargetLowering::performFSubCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  // Only run after full DAG legalization so the fused node is not re-expanded.
  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  EVT VT = N->getValueType(ResNo: 0);
  assert(!VT.isVector());

  // Try to get the fneg to fold into the source modifier. This undoes generic
  // DAG combines and folds them into the mad.
  //
  // Only do this if we are not trying to support denormals. v_mad_f32 does
  // not support denormals ever.
  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);
  if (LHS.getOpcode() == ISD::FADD) {
    // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
    SDValue A = LHS.getOperand(i: 0);
    if (A == LHS.getOperand(i: 1)) {
      // getFusedOpcode returns 0 when no mad/fma form is profitable/legal.
      unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: LHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(Val: 2.0, DL: SL, VT);
        SDValue NegRHS = DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: RHS);

        return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: Two, N3: NegRHS);
      }
    }
  }

  if (RHS.getOpcode() == ISD::FADD) {
    // (fsub c, (fadd a, a)) -> mad -2.0, a, c

    SDValue A = RHS.getOperand(i: 0);
    if (A == RHS.getOperand(i: 1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N0: N, N1: RHS.getNode());
      if (FusedOp != 0) {
        // Negate the multiplier instead of inserting an fneg on the result.
        const SDValue NegTwo = DAG.getConstantFP(Val: -2.0, DL: SL, VT);
        return DAG.getNode(Opcode: FusedOp, DL: SL, VT, N1: A, N2: NegTwo, N3: LHS);
      }
    }
  }

  return SDValue();
}
16948
// Combine (fdiv +/-1.0, (fsqrt x)) into a single rsq (optionally negated)
// for f16/bf16 when FSQRT is legal and contraction is allowed.
SDValue SITargetLowering::performFDivCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  EVT VT = N->getValueType(ResNo: 0);

  // fsqrt legality correlates to rsq availability.
  if ((VT != MVT::f16 && VT != MVT::bf16) || !isOperationLegal(Op: ISD::FSQRT, VT))
    return SDValue();

  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  // Both the fdiv and the sqrt must permit contraction, and the sqrt must
  // have this division as its only user so it can be absorbed.
  SDNodeFlags Flags = N->getFlags();
  SDNodeFlags RHSFlags = RHS->getFlags();
  if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
      !RHS->hasOneUse())
    return SDValue();

  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(Val&: LHS)) {
    bool IsNegative = false;
    if (CLHS->isExactlyValue(V: 1.0) ||
        (IsNegative = CLHS->isExactlyValue(V: -1.0))) {
      // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
      // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
      if (RHS.getOpcode() == ISD::FSQRT) {
        // TODO: Or in RHS flags, somehow missing from SDNodeFlags
        SDValue Rsq =
            DAG.getNode(Opcode: AMDGPUISD::RSQ, DL: SL, VT, Operand: RHS.getOperand(i: 0), Flags);
        return IsNegative ? DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: Rsq, Flags) : Rsq;
      }
    }
  }

  return SDValue();
}
16985
// Combine fmul by a select of power-of-two FP constants into ldexp with a
// select of i32 exponents, which lets the constants be cheap inline i32
// immediates instead of materialized FP values.
SDValue SITargetLowering::performFMulCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);
  EVT ScalarVT = VT.getScalarType();
  EVT IntVT = VT.changeElementType(Context&: *DAG.getContext(), EltVT: MVT::i32);

  if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
      (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
    // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
    return SDValue();
  }

  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);

  // It is cheaper to realize i32 inline constants as compared against
  // materializing f16 or f64 (or even non-inline f32) values,
  // possible via ldexp usage, as shown below :
  //
  // Given : A = 2^a & B = 2^b ; where a and b are integers.
  // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
  // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
  if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
      (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
    const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(N: RHS.getOperand(i: 1));
    if (!TrueNode)
      return SDValue();
    const ConstantFPSDNode *FalseNode =
        isConstOrConstSplatFP(N: RHS.getOperand(i: 2));
    if (!FalseNode)
      return SDValue();

    // Both arms must share a sign so a single (optional) fneg on x suffices.
    if (TrueNode->isNegative() != FalseNode->isNegative())
      return SDValue();

    // For f32, only non-inline constants should be transformed.
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    if (ScalarVT == MVT::f32 &&
        TII->isInlineConstant(Imm: TrueNode->getValueAPF()) &&
        TII->isInlineConstant(Imm: FalseNode->getValueAPF()))
      return SDValue();

    // getExactLog2Abs returns INT_MIN when |constant| is not an exact
    // power of two.
    int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
    if (TrueNodeExpVal == INT_MIN)
      return SDValue();
    int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
    if (FalseNodeExpVal == INT_MIN)
      return SDValue();

    SDLoc SL(N);
    SDValue SelectNode =
        DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT: IntVT, N1: RHS.getOperand(i: 0),
                    N2: DAG.getSignedConstant(Val: TrueNodeExpVal, DL: SL, VT: IntVT),
                    N3: DAG.getSignedConstant(Val: FalseNodeExpVal, DL: SL, VT: IntVT));

    // Negative constants: fold the shared sign into the other multiplicand.
    LHS = TrueNode->isNegative()
              ? DAG.getNode(Opcode: ISD::FNEG, DL: SL, VT, Operand: LHS, Flags: LHS->getFlags())
              : LHS;

    return DAG.getNode(Opcode: ISD::FLDEXP, DL: SL, VT, N1: LHS, N2: SelectNode, Flags: N->getFlags());
  }

  return SDValue();
}
17051
// Combine a pair of chained f32 FMAs over fp_extended v2f16 vector elements
// into a single FDOT2 node (v_dot2_f32_f16), when the dot10 instructions are
// available and fp contraction permits it.
SDValue SITargetLowering::performFMACombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(ResNo: 0);
  SDLoc SL(N);

  if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
    return SDValue();

  // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
  // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
  SDValue Op1 = N->getOperand(Num: 0);
  SDValue Op2 = N->getOperand(Num: 1);
  SDValue FMA = N->getOperand(Num: 2);

  if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
      Op2.getOpcode() != ISD::FP_EXTEND)
    return SDValue();

  // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
  // regardless of the denorm mode setting. Therefore,
  // fp-contract is sufficient to allow generating fdot2.
  const TargetOptions &Options = DAG.getTarget().Options;
  if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
      (N->getFlags().hasAllowContract() &&
       FMA->getFlags().hasAllowContract())) {
    // Peel the fp_extends; both multiplicands must be vector extracts.
    Op1 = Op1.getOperand(i: 0);
    Op2 = Op2.getOperand(i: 0);
    if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    SDValue Vec1 = Op1.getOperand(i: 0);
    SDValue Idx1 = Op1.getOperand(i: 1);
    SDValue Vec2 = Op2.getOperand(i: 0);

    // Inspect the inner FMA: its multiplicands must follow the same
    // fp_extend(extract_vector_elt(...)) shape.
    SDValue FMAOp1 = FMA.getOperand(i: 0);
    SDValue FMAOp2 = FMA.getOperand(i: 1);
    SDValue FMAAcc = FMA.getOperand(i: 2);

    if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
        FMAOp2.getOpcode() != ISD::FP_EXTEND)
      return SDValue();

    FMAOp1 = FMAOp1.getOperand(i: 0);
    FMAOp2 = FMAOp2.getOperand(i: 0);
    if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    SDValue Vec3 = FMAOp1.getOperand(i: 0);
    SDValue Vec4 = FMAOp2.getOperand(i: 0);
    SDValue Idx2 = FMAOp1.getOperand(i: 1);

    // The two multiplications must use matching indices per level and
    // distinct indices across levels (element .x times .x plus .y times .y).
    if (Idx1 != Op2.getOperand(i: 1) || Idx2 != FMAOp2.getOperand(i: 1) ||
        // Idx1 and Idx2 cannot be the same.
        Idx1 == Idx2)
      return SDValue();

    if (Vec1 == Vec2 || Vec3 == Vec4)
      return SDValue();

    if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
      return SDValue();

    // Both levels must multiply the same pair of vectors (in either order).
    if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
      return DAG.getNode(Opcode: AMDGPUISD::FDOT2, DL: SL, VT: MVT::f32, N1: Vec1, N2: Vec2, N3: FMAAcc,
                         N4: DAG.getTargetConstant(Val: 0, DL: SL, VT: MVT::i1));
    }
  }
  return SDValue();
}
17124
// Target setcc combines:
//  * fold compares of sign-extended i1 / boolean selects back to the i1,
//  * narrow i64 compares to the high 32 bits when the low halves are known,
//  * reuse the carry-out of a 64-bit add/sub in place of the compare,
//  * match isinf/isfinite idioms into AMDGPUISD::FP_CLASS.
SDValue SITargetLowering::performSetCCCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue LHS = N->getOperand(Num: 0);
  SDValue RHS = N->getOperand(Num: 1);
  EVT VT = LHS.getValueType();
  ISD::CondCode CC = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();

  // Canonicalize a constant operand onto the RHS, swapping the condition.
  auto *CRHS = dyn_cast<ConstantSDNode>(Val&: RHS);
  if (!CRHS) {
    CRHS = dyn_cast<ConstantSDNode>(Val&: LHS);
    if (CRHS) {
      std::swap(a&: LHS, b&: RHS);
      CC = getSetCCSwappedOperands(Operation: CC);
    }
  }

  if (CRHS) {
    if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
        isBoolSGPR(V: LHS.getOperand(i: 0))) {
      // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
      // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
      // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
      // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
      if ((CRHS->isAllOnes() &&
           (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
          (CRHS->isZero() &&
           (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
        return DAG.getNode(Opcode: ISD::XOR, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: 0),
                           N2: DAG.getAllOnesConstant(DL: SL, VT: MVT::i1));
      if ((CRHS->isAllOnes() &&
           (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
          (CRHS->isZero() &&
           (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
        return LHS.getOperand(i: 0);
    }

    const APInt &CRHSVal = CRHS->getAPIntValue();
    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
        LHS.getOpcode() == ISD::SELECT &&
        isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) &&
        isa<ConstantSDNode>(Val: LHS.getOperand(i: 2)) &&
        isBoolSGPR(V: LHS.getOperand(i: 0))) {
      // Given CT != FT:
      // setcc (select cc, CT, CF), CF, eq => xor cc, -1
      // setcc (select cc, CT, CF), CF, ne => cc
      // setcc (select cc, CT, CF), CT, ne => xor cc, -1
      // setcc (select cc, CT, CF), CT, eq => cc
      const APInt &CT = LHS.getConstantOperandAPInt(i: 1);
      const APInt &CF = LHS.getConstantOperandAPInt(i: 2);

      if (CT != CF) {
        if ((CF == CRHSVal && CC == ISD::SETEQ) ||
            (CT == CRHSVal && CC == ISD::SETNE))
          return DAG.getNOT(DL: SL, Val: LHS.getOperand(i: 0), VT: MVT::i1);
        if ((CF == CRHSVal && CC == ISD::SETNE) ||
            (CT == CRHSVal && CC == ISD::SETEQ))
          return LHS.getOperand(i: 0);
      }
    }
  }

  // Truncate 64-bit setcc to test only upper 32-bits of its operands in the
  // following cases where information about the lower 32-bits of its operands
  // is known:
  //
  // If LHS.lo32 == RHS.lo32:
  //   setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, eq/ne
  // If LHS.lo32 != RHS.lo32:
  //   setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, false/true
  // If LHS.lo32 >= RHS.lo32 (unsigned):
  //   setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]lt/ge
  // If LHS.lo32 > RHS.lo32 (unsigned):
  //   setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]lt/ge
  // If LHS.lo32 <= RHS.lo32 (unsigned):
  //   setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]le/gt
  // If LHS.lo32 < RHS.lo32 (unsigned):
  //   setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]le/gt
  if (VT == MVT::i64) {
    const KnownBits LHSKnownLo32 = DAG.computeKnownBits(Op: LHS).trunc(BitWidth: 32);
    const KnownBits RHSKnownLo32 = DAG.computeKnownBits(Op: RHS).trunc(BitWidth: 32);

    // NewCC is valid iff we can truncate the setcc to only test the upper 32
    // bits
    ISD::CondCode NewCC = ISD::SETCC_INVALID;

    switch (CC) {
    default:
      break;
    case ISD::SETEQ: {
      const std::optional<bool> KnownEq =
          KnownBits::eq(LHS: LHSKnownLo32, RHS: RHSKnownLo32);
      if (KnownEq)
        NewCC = *KnownEq ? ISD::SETEQ : ISD::SETFALSE;

      break;
    }
    case ISD::SETNE: {
      const std::optional<bool> KnownEq =
          KnownBits::eq(LHS: LHSKnownLo32, RHS: RHSKnownLo32);
      if (KnownEq)
        NewCC = *KnownEq ? ISD::SETNE : ISD::SETTRUE;

      break;
    }
    case ISD::SETULT:
    case ISD::SETUGE:
    case ISD::SETLT:
    case ISD::SETGE: {
      const std::optional<bool> KnownUge =
          KnownBits::uge(LHS: LHSKnownLo32, RHS: RHSKnownLo32);
      if (KnownUge) {
        if (*KnownUge) {
          // LHS.lo32 uge RHS.lo32, so LHS >= RHS iff LHS.hi32 >= RHS.hi32
          NewCC = CC;
        } else {
          // LHS.lo32 ult RHS.lo32, so LHS >= RHS iff LHS.hi32 > RHS.hi32
          NewCC = CC == ISD::SETULT   ? ISD::SETULE
                  : CC == ISD::SETUGE ? ISD::SETUGT
                  : CC == ISD::SETLT  ? ISD::SETLE
                                      : ISD::SETGT;
        }
      }
      break;
    }
    case ISD::SETULE:
    case ISD::SETUGT:
    case ISD::SETLE:
    case ISD::SETGT: {
      const std::optional<bool> KnownUle =
          KnownBits::ule(LHS: LHSKnownLo32, RHS: RHSKnownLo32);
      if (KnownUle) {
        if (*KnownUle) {
          // LHS.lo32 ule RHS.lo32, so LHS <= RHS iff LHS.hi32 <= RHS.hi32
          NewCC = CC;
        } else {
          // LHS.lo32 ugt RHS.lo32, so LHS <= RHS iff LHS.hi32 < RHS.hi32
          NewCC = CC == ISD::SETULE   ? ISD::SETULT
                  : CC == ISD::SETUGT ? ISD::SETUGE
                  : CC == ISD::SETLE  ? ISD::SETLT
                                      : ISD::SETGE;
        }
      }
      break;
    }
    }

    if (NewCC != ISD::SETCC_INVALID)
      return DAG.getSetCC(DL: SL, VT: N->getValueType(ResNo: 0), LHS: getHiHalf64(Op: LHS, DAG),
                          RHS: getHiHalf64(Op: RHS, DAG), Cond: NewCC);
  }

  // Eliminate setcc by using carryout from add/sub instruction

  // LHS = ADD i64 RHS, Z          LHSlo = UADDO       i32 RHSlo, Zlo
  // setcc LHS ult RHS    ->       LHSHi = UADDO_CARRY i32 RHShi, Zhi
  // similarly for subtraction

  // LHS = ADD i64 Y, 1            LHSlo = UADDO       i32 Ylo, 1
  // setcc LHS eq 0       ->       LHSHi = UADDO_CARRY i32 Yhi, 0

  if (VT == MVT::i64 && ((CC == ISD::SETULT &&
                          sd_match(N: LHS, P: m_Add(L: m_Specific(N: RHS), R: m_Value()))) ||
                         (CC == ISD::SETUGT &&
                          sd_match(N: LHS, P: m_Sub(L: m_Specific(N: RHS), R: m_Value()))) ||
                         (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
                          sd_match(N: LHS, P: m_Add(L: m_Value(), R: m_One()))))) {
    bool IsAdd = LHS.getOpcode() == ISD::ADD;

    // Split the 64-bit add/sub into a 32-bit lo op producing a carry and a
    // 32-bit hi op consuming it; the hi op's carry-out is the compare result.
    SDValue Op0 = LHS.getOperand(i: 0);
    SDValue Op1 = LHS.getOperand(i: 1);

    SDValue Op0Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Op0);
    SDValue Op1Lo = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SL, VT: MVT::i32, Operand: Op1);

    SDValue Op0Hi = getHiHalf64(Op: Op0, DAG);
    SDValue Op1Hi = getHiHalf64(Op: Op1, DAG);

    SDValue NodeLo =
        DAG.getNode(Opcode: IsAdd ? ISD::UADDO : ISD::USUBO, DL: SL,
                    VTList: DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1), Ops: {Op0Lo, Op1Lo});

    SDValue CarryInHi = NodeLo.getValue(R: 1);
    SDValue NodeHi = DAG.getNode(Opcode: IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
                                 DL: SL, VTList: DAG.getVTList(VT1: MVT::i32, VT2: MVT::i1),
                                 Ops: {Op0Hi, Op1Hi, CarryInHi});

    SDValue ResultLo = NodeLo.getValue(R: 0);
    SDValue ResultHi = NodeHi.getValue(R: 0);

    SDValue JoinedResult =
        DAG.getBuildVector(VT: MVT::v2i32, DL: SL, Ops: {ResultLo, ResultHi});

    // Replace all uses of the original 64-bit add/sub with the split form,
    // then return the carry-out as the setcc result.
    SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: JoinedResult);
    SDValue Overflow = NodeHi.getValue(R: 1);
    DCI.CombineTo(N: LHS.getNode(), Res: Result);
    return Overflow;
  }

  if (VT != MVT::f32 && VT != MVT::f64 &&
      (!Subtarget->has16BitInsts() || VT != MVT::f16))
    return SDValue();

  // Match isinf/isfinite pattern
  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
  // (fcmp one (fabs x), inf) -> (fp_class x,
  //   (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero))
  if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
      LHS.getOpcode() == ISD::FABS) {
    // Note: shadows the integer CRHS above; this is the FP constant operand.
    const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(Val&: RHS);
    if (!CRHS)
      return SDValue();

    const APFloat &APF = CRHS->getValueAPF();
    if (APF.isInfinity() && !APF.isNegative()) {
      const unsigned IsInfMask =
          SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
      const unsigned IsFiniteMask =
          SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
          SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
          SIInstrFlags::P_SUBNORMAL;
      unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
      return DAG.getNode(Opcode: AMDGPUISD::FP_CLASS, DL: SL, VT: MVT::i1, N1: LHS.getOperand(i: 0),
                         N2: DAG.getConstant(Val: Mask, DL: SL, VT: MVT::i32));
    }
  }

  return SDValue();
}
17356
// Combine for AMDGPUISD::CVT_F32_UBYTEn: fold shifts of the source into the
// byte index of the conversion, and simplify the source based on the single
// byte that is actually demanded.
SDValue
SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  // Byte index n of CVT_F32_UBYTEn, derived from the opcode ordering.
  unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;

  // Both start at operand 0 on purpose: Src stays the original source for
  // the demanded-bits simplification, while Shift is peeled below.
  SDValue Src = N->getOperand(Num: 0);
  SDValue Shift = N->getOperand(Num: 0);

  // TODO: Extend type shouldn't matter (assuming legal types).
  if (Shift.getOpcode() == ISD::ZERO_EXTEND)
    Shift = Shift.getOperand(i: 0);

  if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
    // cvt_f32_ubyte1 (shl x,  8) -> cvt_f32_ubyte0 x
    // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
    // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
    // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
    // cvt_f32_ubyte0 (srl x,  8) -> cvt_f32_ubyte1 x
    if (auto *C = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1))) {
      SDValue Shifted = DAG.getZExtOrTrunc(
          Op: Shift.getOperand(i: 0), DL: SDLoc(Shift.getOperand(i: 0)), VT: MVT::i32);

      // A shl moves the demanded byte down, a srl moves it up.
      unsigned ShiftOffset = 8 * Offset;
      if (Shift.getOpcode() == ISD::SHL)
        ShiftOffset -= C->getZExtValue();
      else
        ShiftOffset += C->getZExtValue();

      // Only fold when the adjusted offset still names a whole byte in i32.
      if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
        return DAG.getNode(Opcode: AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, DL: SL,
                           VT: MVT::f32, Operand: Shifted);
      }
    }
  }

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // Only byte n of the source is consumed.
  APInt DemandedBits = APInt::getBitsSet(numBits: 32, loBit: 8 * Offset, hiBit: 8 * Offset + 8);
  if (TLI.SimplifyDemandedBits(Op: Src, DemandedBits, DCI)) {
    // We simplified Src. If this node is not dead, visit it again so it is
    // folded properly.
    if (N->getOpcode() != ISD::DELETED_NODE)
      DCI.AddToWorklist(N);
    return SDValue(N, 0);
  }

  // Handle (or x, (srl y, 8)) pattern when known bits are zero.
  if (SDValue DemandedSrc =
          TLI.SimplifyMultipleUseDemandedBits(Op: Src, DemandedBits, DAG))
    return DAG.getNode(Opcode: N->getOpcode(), DL: SL, VT: MVT::f32, Operand: DemandedSrc);

  return SDValue();
}
17411
17412SDValue SITargetLowering::performClampCombine(SDNode *N,
17413 DAGCombinerInfo &DCI) const {
17414 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(Val: N->getOperand(Num: 0));
17415 if (!CSrc)
17416 return SDValue();
17417
17418 const MachineFunction &MF = DCI.DAG.getMachineFunction();
17419 const APFloat &F = CSrc->getValueAPF();
17420 APFloat Zero = APFloat::getZero(Sem: F.getSemantics());
17421 if (F < Zero ||
17422 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
17423 return DCI.DAG.getConstantFP(Val: Zero, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
17424 }
17425
17426 APFloat One(F.getSemantics(), "1.0");
17427 if (F > One)
17428 return DCI.DAG.getConstantFP(Val: One, DL: SDLoc(N), VT: N->getValueType(ResNo: 0));
17429
17430 return SDValue(CSrc, 0);
17431}
17432
SDValue SITargetLowering::performSelectCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {

  // Try to fold CMP + SELECT patterns with shared constants (both FP and
  // integer).
  // Detect when CMP and SELECT use the same constant and fold them to avoid
  // loading the constant twice. Specifically handles patterns like:
  // %cmp = icmp eq i32 %val, 4242
  // %sel = select i1 %cmp, i32 4242, i32 %other
  // It can be optimized to reuse %val instead of 4242 in select.
  SDValue Cond = N->getOperand(Num: 0);
  SDValue TrueVal = N->getOperand(Num: 1);
  SDValue FalseVal = N->getOperand(Num: 2);

  // Check if condition is a comparison.
  if (Cond.getOpcode() != ISD::SETCC)
    return SDValue();

  SDValue LHS = Cond.getOperand(i: 0);
  SDValue RHS = Cond.getOperand(i: 1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Val: Cond.getOperand(i: 2))->get();

  bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
  bool isInteger = LHS.getValueType().isInteger();

  // Handle simple floating-point and integer types only.
  if (!isFloatingPoint && !isInteger)
    return SDValue();

  // Only (ordered) equality/inequality compares imply value identity.
  bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
  bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
  if (!isEquality && !isNonEquality)
    return SDValue();

  // Split the compare into its non-constant value and the shared constant,
  // whichever side the constant is on.
  SDValue ArgVal, ConstVal;
  if ((isFloatingPoint && isa<ConstantFPSDNode>(Val: RHS)) ||
      (isInteger && isa<ConstantSDNode>(Val: RHS))) {
    ConstVal = RHS;
    ArgVal = LHS;
  } else if ((isFloatingPoint && isa<ConstantFPSDNode>(Val: LHS)) ||
             (isInteger && isa<ConstantSDNode>(Val: LHS))) {
    ConstVal = LHS;
    ArgVal = RHS;
  } else {
    return SDValue();
  }

  // Skip optimization for inlinable immediates.
  if (isFloatingPoint) {
    const APFloat &Val = cast<ConstantFPSDNode>(Val&: ConstVal)->getValueAPF();
    if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Imm: Val))
      return SDValue();
  } else {
    if (AMDGPU::isInlinableIntLiteral(
            Literal: cast<ConstantSDNode>(Val&: ConstVal)->getSExtValue()))
      return SDValue();
  }

  // For equality and non-equality comparisons, patterns:
  // select (setcc x, const), const, y -> select (setcc x, const), x, y
  // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
  if (!(isEquality && TrueVal == ConstVal) &&
      !(isNonEquality && FalseVal == ConstVal))
    return SDValue();

  // Substitute the compared value for the constant in the arm that the
  // comparison proves equal to it.
  SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
  SDValue SelectRHS =
      (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
  return DCI.DAG.getNode(Opcode: ISD::SELECT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: Cond,
                         N2: SelectLHS, N3: SelectRHS);
}
17504
// Top-level DAG-combine dispatcher for the SI target: first tries the uniform
// i32 promotion for small scalar ops, then routes each opcode to its
// dedicated combine, falling back to the common AMDGPU combines.
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  // Uniform (scalar-unit) operations on small types are promoted to i32
  // even at -O0, before any opcode-specific combining.
  switch (N->getOpcode()) {
  case ISD::ADD:
  case ISD::SUB:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
  case ISD::MUL:
  case ISD::SETCC:
  case ISD::SELECT:
  case ISD::SMIN:
  case ISD::SMAX:
  case ISD::UMIN:
  case ISD::UMAX:
    if (auto Res = promoteUniformOpToI32(Op: SDValue(N, 0), DCI))
      return Res;
    break;
  default:
    break;
  }

  // Everything below is optimization-only.
  if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
    return SDValue();

  switch (N->getOpcode()) {
  case ISD::ADD:
    return performAddCombine(N, DCI);
  case ISD::PTRADD:
    return performPtrAddCombine(N, DCI);
  case ISD::SUB:
    return performSubCombine(N, DCI);
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    return performAddCarrySubCarryCombine(N, DCI);
  case ISD::FADD:
    return performFAddCombine(N, DCI);
  case ISD::FSUB:
    return performFSubCombine(N, DCI);
  case ISD::FDIV:
    return performFDivCombine(N, DCI);
  case ISD::FMUL:
    return performFMulCombine(N, DCI);
  case ISD::SETCC:
    return performSetCCCombine(N, DCI);
  case ISD::SELECT:
    // Fall through to the generic combines if the select combine declines.
    if (auto Res = performSelectCombine(N, DCI))
      return Res;
    break;
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUMNUM:
  case ISD::FMINIMUMNUM:
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
    return performMinMaxCombine(N, DCI);
  case ISD::FMA:
    return performFMACombine(N, DCI);
  case ISD::AND:
    return performAndCombine(N, DCI);
  case ISD::OR:
    return performOrCombine(N, DCI);
  case ISD::FSHR: {
    // A divergent i32 funnel shift may match v_perm_b32 byte permutes.
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    if (N->getValueType(ResNo: 0) == MVT::i32 && N->isDivergent() &&
        TII->pseudoToMCOpcode(Opcode: AMDGPU::V_PERM_B32_e64) != -1) {
      return matchPERM(N, DCI);
    }
    break;
  }
  case ISD::XOR:
    return performXorCombine(N, DCI);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND:
    return performZeroOrAnyExtendCombine(N, DCI);
  case ISD::SIGN_EXTEND_INREG:
    return performSignExtendInRegCombine(N, DCI);
  case AMDGPUISD::FP_CLASS:
    return performClassCombine(N, DCI);
  case ISD::FCANONICALIZE:
    return performFCanonicalizeCombine(N, DCI);
  case AMDGPUISD::RCP:
    return performRcpCombine(N, DCI);
  case ISD::FLDEXP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::RSQ_CLAMP: {
    // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
    SDValue Src = N->getOperand(Num: 0);
    if (Src.isUndef())
      return Src;
    break;
  }
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return performUCharToFloatCombine(N, DCI);
  case ISD::FCOPYSIGN:
    return performFCopySignCombine(N, DCI);
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return performCvtF32UByteNCombine(N, DCI);
  case AMDGPUISD::FMED3:
    return performFMed3Combine(N, DCI);
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
    return performCvtPkRTZCombine(N, DCI);
  case AMDGPUISD::CLAMP:
    return performClampCombine(N, DCI);
  case ISD::SCALAR_TO_VECTOR: {
    SelectionDAG &DAG = DCI.DAG;
    EVT VT = N->getValueType(ResNo: 0);

    // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
    if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
      SDLoc SL(N);
      SDValue Src = N->getOperand(Num: 0);
      EVT EltVT = Src.getValueType();
      if (EltVT != MVT::i16)
        Src = DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT: MVT::i16, Operand: Src);

      SDValue Ext = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SL, VT: MVT::i32, Operand: Src);
      return DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: Ext);
    }

    break;
  }
  case ISD::EXTRACT_VECTOR_ELT:
    return performExtractVectorEltCombine(N, DCI);
  case ISD::INSERT_VECTOR_ELT:
    return performInsertVectorEltCombine(N, DCI);
  case ISD::FP_ROUND:
    return performFPRoundCombine(N, DCI);
  case ISD::LOAD: {
    if (SDValue Widened = widenLoad(Ld: cast<LoadSDNode>(Val: N), DCI))
      return Widened;
    [[fallthrough]];
  }
  default: {
    // Any memory node (loads included, via the fallthrough above) gets the
    // common memory-node combine once legalization has begun.
    if (!DCI.isBeforeLegalize()) {
      if (MemSDNode *MemNode = dyn_cast<MemSDNode>(Val: N))
        return performMemSDNodeCombine(N: MemNode, DCI);
    }

    break;
  }
  }

  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
17668
17669/// Helper function for adjustWritemask
17670static unsigned SubIdx2Lane(unsigned Idx) {
17671 switch (Idx) {
17672 default:
17673 return ~0u;
17674 case AMDGPU::sub0:
17675 return 0;
17676 case AMDGPU::sub1:
17677 return 1;
17678 case AMDGPU::sub2:
17679 return 2;
17680 case AMDGPU::sub3:
17681 return 3;
17682 case AMDGPU::sub4:
17683 return 4; // Possible with TFE/LWE
17684 }
17685}
17686
/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions.
///
/// Image loads/samples write one result channel per bit set in their dmask
/// operand. When only some channels of the vector result are actually
/// extracted (via EXTRACT_SUBREG users), shrink dmask to just the used
/// channels and switch the node to the equivalent opcode with a narrower
/// result register.
///
/// \returns \p Node unchanged when no adjustment is possible, or nullptr
/// when the node was rewritten and all users were updated here.
SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                          SelectionDAG &DAG) const {
  unsigned Opcode = Node->getMachineOpcode();

  // Subtract 1 because the vdata output is not a MachineSDNode operand.
  int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::d16) - 1;
  if (D16Idx >= 0 && Node->getConstantOperandVal(Num: D16Idx))
    return Node; // not implemented for D16

  // Up to 4 dmask channels plus one extra lane for the TFE/LWE status result.
  SDNode *Users[5] = {nullptr};
  unsigned Lane = 0;
  unsigned DmaskIdx =
      AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::dmask) - 1;
  unsigned OldDmask = Node->getConstantOperandVal(Num: DmaskIdx);
  unsigned NewDmask = 0;
  unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::tfe) - 1;
  unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, Name: AMDGPU::OpName::lwe) - 1;
  bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(Num: TFEIdx)) ||
                 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(Num: LWEIdx));
  unsigned TFCLane = 0;
  bool HasChain = Node->getNumValues() > 1;

  if (OldDmask == 0) {
    // These are folded out, but on the chance it happens don't assert.
    return Node;
  }

  unsigned OldBitsSet = llvm::popcount(Value: OldDmask);
  // Work out which is the TFE/LWE lane if that is enabled.
  if (UsesTFC) {
    TFCLane = OldBitsSet;
  }

  // Try to figure out the used register components
  for (SDUse &Use : Node->uses()) {

    // Don't look at users of the chain.
    if (Use.getResNo() != 0)
      continue;

    SDNode *User = Use.getUser();

    // Abort if we can't understand the usage
    if (!User->isMachineOpcode() ||
        User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return Node;

    // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    // set, etc.
    Lane = SubIdx2Lane(Idx: User->getConstantOperandVal(Num: 1));
    if (Lane == ~0u)
      return Node;

    // Check if the use is for the TFE/LWE generated result at VGPRn+1.
    if (UsesTFC && Lane == TFCLane) {
      Users[Lane] = User;
    } else {
      // Set which texture component corresponds to the lane.
      unsigned Comp;
      for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
        Comp = llvm::countr_zero(Val: Dmask);
        Dmask &= ~(1 << Comp);
      }

      // Abort if we have more than one user per component.
      if (Users[Lane])
        return Node;

      Users[Lane] = User;
      NewDmask |= 1 << Comp;
    }
  }

  // Don't allow 0 dmask, as hardware assumes one channel enabled.
  bool NoChannels = !NewDmask;
  if (NoChannels) {
    if (!UsesTFC) {
      // No uses of the result and not using TFC. Then do nothing.
      return Node;
    }
    // If the original dmask has one channel - then nothing to do
    if (OldBitsSet == 1)
      return Node;
    // Use an arbitrary dmask - required for the instruction to work
    NewDmask = 1;
  }
  // Abort if there's no change
  if (NewDmask == OldDmask)
    return Node;

  unsigned BitsSet = llvm::popcount(Value: NewDmask);

  // Check for TFE or LWE - increase the number of channels by one to account
  // for the extra return value
  // This will need adjustment for D16 if this is also included in
  // adjustWriteMask (this function) but at present D16 are excluded.
  unsigned NewChannels = BitsSet + UsesTFC;

  int NewOpcode =
      AMDGPU::getMaskedMIMGOp(Opc: Node->getMachineOpcode(), NewChannels);
  assert(NewOpcode != -1 &&
         NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
         "failed to find equivalent MIMG op");

  // Adjust the writemask in the node: keep all operands except dmask, which
  // is replaced by the shrunk mask.
  SmallVector<SDValue, 12> Ops;
  llvm::append_range(C&: Ops, R: Node->ops().take_front(N: DmaskIdx));
  Ops.push_back(Elt: DAG.getTargetConstant(Val: NewDmask, DL: SDLoc(Node), VT: MVT::i32));
  llvm::append_range(C&: Ops, R: Node->ops().drop_front(N: DmaskIdx + 1));

  MVT SVT = Node->getValueType(ResNo: 0).getVectorElementType().getSimpleVT();

  // Only certain vector widths exist, so round 3 channels up to 4 and 5 up
  // to 8 when forming the new result type.
  MVT ResultVT = NewChannels == 1
                     ? SVT
                     : MVT::getVectorVT(VT: SVT, NumElements: NewChannels == 3 ? 4
                                               : NewChannels == 5 ? 8
                                                                  : NewChannels);
  SDVTList NewVTList =
      HasChain ? DAG.getVTList(VT1: ResultVT, VT2: MVT::Other) : DAG.getVTList(VT: ResultVT);

  MachineSDNode *NewNode =
      DAG.getMachineNode(Opcode: NewOpcode, dl: SDLoc(Node), VTs: NewVTList, Ops);

  if (HasChain) {
    // Update chain.
    DAG.setNodeMemRefs(N: NewNode, NewMemRefs: Node->memoperands());
    DAG.ReplaceAllUsesOfValueWith(From: SDValue(Node, 1), To: SDValue(NewNode, 1));
  }

  if (NewChannels == 1) {
    // Scalar result: replace the single EXTRACT_SUBREG user with a COPY of
    // the new node's scalar value.
    assert(Node->hasNUsesOfValue(1, 0));
    SDNode *Copy =
        DAG.getMachineNode(Opcode: TargetOpcode::COPY, dl: SDLoc(Node),
                           VT: Users[Lane]->getValueType(ResNo: 0), Op1: SDValue(NewNode, 0));
    DAG.ReplaceAllUsesWith(From: Users[Lane], To: Copy);
    return nullptr;
  }

  // Update the users of the node with the new indices
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
    SDNode *User = Users[i];
    if (!User) {
      // Handle the special case of NoChannels. We set NewDmask to 1 above, but
      // Users[0] is still nullptr because channel 0 doesn't really have a use.
      if (i || !NoChannels)
        continue;
    } else {
      SDValue Op = DAG.getTargetConstant(Val: Idx, DL: SDLoc(User), VT: MVT::i32);
      SDNode *NewUser = DAG.UpdateNodeOperands(N: User, Op1: SDValue(NewNode, 0), Op2: Op);
      if (NewUser != User) {
        DAG.ReplaceAllUsesWith(From: SDValue(User, 0), To: SDValue(NewUser, 0));
        DAG.RemoveDeadNode(N: User);
      }
    }

    // Advance to the next packed subregister index.
    switch (Idx) {
    default:
      break;
    case AMDGPU::sub0:
      Idx = AMDGPU::sub1;
      break;
    case AMDGPU::sub1:
      Idx = AMDGPU::sub2;
      break;
    case AMDGPU::sub2:
      Idx = AMDGPU::sub3;
      break;
    case AMDGPU::sub3:
      Idx = AMDGPU::sub4;
      break;
    }
  }

  DAG.RemoveDeadNode(N: Node);
  return nullptr;
}
17866
17867static bool isFrameIndexOp(SDValue Op) {
17868 if (Op.getOpcode() == ISD::AssertZext)
17869 Op = Op.getOperand(i: 0);
17870
17871 return isa<FrameIndexSDNode>(Val: Op);
17872}
17873
/// Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
///
/// Additionally, for CopyToReg of an i1 value into a physical register,
/// route the copy through a VReg_1 virtual register first.
SDNode *
SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
                                                SelectionDAG &DAG) const {
  if (Node->getOpcode() == ISD::CopyToReg) {
    RegisterSDNode *DestReg = cast<RegisterSDNode>(Val: Node->getOperand(Num: 1));
    SDValue SrcVal = Node->getOperand(Num: 2);

    // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
    // to try understanding copies to physical registers.
    if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
      SDLoc SL(Node);
      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
      SDValue VReg = DAG.getRegister(
          Reg: MRI.createVirtualRegister(RegClass: &AMDGPU::VReg_1RegClass), VT: MVT::i1);

      // Thread any incoming glue through the new pair of copies so ordering
      // with the original glued node is preserved.
      SDNode *Glued = Node->getGluedNode();
      SDValue ToVReg = DAG.getCopyToReg(
          Chain: Node->getOperand(Num: 0), dl: SL, Reg: VReg, N: SrcVal,
          Glue: SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
      SDValue ToResultReg = DAG.getCopyToReg(Chain: ToVReg, dl: SL, Reg: SDValue(DestReg, 0),
                                             N: VReg, Glue: ToVReg.getValue(R: 1));
      DAG.ReplaceAllUsesWith(From: Node, To: ToResultReg.getNode());
      DAG.RemoveDeadNode(N: Node);
      return ToResultReg.getNode();
    }
  }

  // Replace every frame-index operand with an S_MOV_B32 of that frame index
  // so the instruction only sees register inputs.
  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    if (!isFrameIndexOp(Op: Node->getOperand(Num: i))) {
      Ops.push_back(Elt: Node->getOperand(Num: i));
      continue;
    }

    SDLoc DL(Node);
    Ops.push_back(Elt: SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL,
                                              VT: Node->getOperand(Num: i).getValueType(),
                                              Op1: Node->getOperand(Num: i)),
                            0));
  }

  return DAG.UpdateNodeOperands(N: Node, Ops);
}
17920
/// Fold the instructions after selecting them.
/// Returns null if users were already updated.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  unsigned Opcode = Node->getMachineOpcode();

  // Shrink the dmask of image loads/samples (not stores, and not gather4)
  // when only some result channels are used.
  if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
      !TII->isGather4(Opcode) &&
      AMDGPU::hasNamedOperand(Opcode, NamedIdx: AMDGPU::OpName::dmask)) {
    return adjustWritemask(Node, DAG);
  }

  // Target-independent nodes need frame-index operands rewritten to
  // registers (see legalizeTargetIndependentNode).
  if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
    legalizeTargetIndependentNode(Node, DAG);
    return Node;
  }

  switch (Opcode) {
  case AMDGPU::V_DIV_SCALE_F32_e64:
  case AMDGPU::V_DIV_SCALE_F64_e64: {
    // Satisfy the operand register constraint when one of the inputs is
    // undefined. Ordinarily each undef value will have its own implicit_def of
    // a vreg, so force these to use a single register.
    SDValue Src0 = Node->getOperand(Num: 1);
    SDValue Src1 = Node->getOperand(Num: 3);
    SDValue Src2 = Node->getOperand(Num: 5);

    // Nothing to do if src0 is defined and already matches src1 or src2.
    if ((Src0.isMachineOpcode() &&
         Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
        (Src0 == Src1 || Src0 == Src2))
      break;

    MVT VT = Src0.getValueType().getSimpleVT();
    const TargetRegisterClass *RC =
        getRegClassFor(VT, isDivergent: Src0.getNode()->isDivergent());

    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
    SDValue UndefReg = DAG.getRegister(Reg: MRI.createVirtualRegister(RegClass: RC), VT);

    SDValue ImpDef = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: SDLoc(Node), Reg: UndefReg,
                                      N: Src0, Glue: SDValue());

    // src0 must be the same register as src1 or src2, even if the value is
    // undefined, so make sure we don't violate this constraint.
    if (Src0.isMachineOpcode() &&
        Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
      if (Src1.isMachineOpcode() &&
          Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src1;
      else if (Src2.isMachineOpcode() &&
               Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src2;
      else {
        // All inputs are undef; share one fresh vreg between src0 and src1.
        assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
        Src0 = UndefReg;
        Src1 = UndefReg;
      }
    } else
      break;

    // Rebuild the node with the deduplicated sources plus the glue from the
    // CopyToReg that defines the shared register.
    SmallVector<SDValue, 9> Ops(Node->ops());
    Ops[1] = Src0;
    Ops[3] = Src1;
    Ops[5] = Src2;
    Ops.push_back(Elt: ImpDef.getValue(R: 1));
    return DAG.getMachineNode(Opcode, dl: SDLoc(Node), VTs: Node->getVTList(), Ops);
  }
  default:
    break;
  }

  return Node;
}
17995
// Any MIMG instructions that use tfe or lwe require an initialization of the
// result register that will be written in the case of a memory access failure.
// The required code is also added to tie this init code to the result of the
// img instruction.
void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  MachineBasicBlock &MBB = *MI.getParent();

  int DstIdx =
      AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), Name: AMDGPU::OpName::vdata);
  unsigned InitIdx = 0; // Number of result dwords that need initialization.

  if (TII->isImage(MI)) {
    MachineOperand *TFE = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::tfe);
    MachineOperand *LWE = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::lwe);
    MachineOperand *D16 = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::d16);

    if (!TFE && !LWE) // intersect_ray
      return;

    unsigned TFEVal = TFE ? TFE->getImm() : 0;
    unsigned LWEVal = LWE ? LWE->getImm() : 0;
    unsigned D16Val = D16 ? D16->getImm() : 0;

    if (!TFEVal && !LWEVal)
      return;

    // At least one of TFE or LWE are non-zero
    // We have to insert a suitable initialization of the result value and
    // tie this to the dest of the image instruction.

    // Calculate which dword we have to initialize to 0.
    MachineOperand *MO_Dmask = TII->getNamedOperand(MI, OperandName: AMDGPU::OpName::dmask);

    // check that dmask operand is found.
    assert(MO_Dmask && "Expected dmask operand in instruction");

    unsigned dmask = MO_Dmask->getImm();
    // Determine the number of active lanes taking into account the
    // Gather4 special case
    unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(Value: dmask);

    bool Packed = !Subtarget->hasUnpackedD16VMem();

    // Result dwords plus one for the TFE/LWE status; packed D16 results take
    // half as many dwords (rounded up).
    InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;

    // Abandon attempt if the dst size isn't large enough
    // - this is in fact an error but this is picked up elsewhere and
    // reported correctly.
    const TargetRegisterClass *DstRC = TII->getRegClass(MCID: MI.getDesc(), OpNum: DstIdx);

    uint32_t DstSize = TRI.getRegSizeInBits(RC: *DstRC) / 32;
    if (DstSize < InitIdx)
      return;
  } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(Opc: MI.getOpcode())) {
    // MUBUF with TFE: initialize the whole destination register.
    const TargetRegisterClass *DstRC = TII->getRegClass(MCID: MI.getDesc(), OpNum: DstIdx);
    InitIdx = TRI.getRegSizeInBits(RC: *DstRC) / 32;
  } else {
    return;
  }

  const DebugLoc &DL = MI.getDebugLoc();

  // Create a register for the initialization value.
  Register PrevDst = MRI.cloneVirtualRegister(VReg: MI.getOperand(i: DstIdx).getReg());
  unsigned NewDst = 0; // Final initialized value will be in here

  // If PRTStrictNull feature is enabled (the default) then initialize
  // all the result registers to 0, otherwise just the error indication
  // register (VGPRn+1)
  unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
  unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);

  // Build a chain of INSERT_SUBREGs, zeroing one 32-bit subregister per step.
  BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::IMPLICIT_DEF), DestReg: PrevDst);
  for (; SizeLeft; SizeLeft--, CurrIdx++) {
    NewDst = MRI.createVirtualRegister(RegClass: TII->getOpRegClass(MI, OpNo: DstIdx));
    // Initialize dword
    Register SubReg = MRI.createVirtualRegister(RegClass: &AMDGPU::VGPR_32RegClass);
    // clang-format off
    BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: AMDGPU::V_MOV_B32_e32), DestReg: SubReg)
        .addImm(Val: 0);
    // clang-format on
    // Insert into the super-reg
    BuildMI(BB&: MBB, I&: MI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::INSERT_SUBREG), DestReg: NewDst)
        .addReg(RegNo: PrevDst)
        .addReg(RegNo: SubReg)
        .addImm(Val: SIRegisterInfo::getSubRegFromChannel(Channel: CurrIdx));

    PrevDst = NewDst;
  }

  // Add as an implicit operand
  MI.addOperand(Op: MachineOperand::CreateReg(Reg: NewDst, isDef: false, isImp: true));

  // Tie the just added implicit operand to the dst
  MI.tieOperands(DefIdx: DstIdx, UseIdx: MI.getNumOperands() - 1);
}
18095
18096/// Assign the register class depending on the number of
18097/// bits set in the writemask
18098void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
18099 SDNode *Node) const {
18100 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18101
18102 MachineFunction *MF = MI.getMF();
18103 MachineRegisterInfo &MRI = MF->getRegInfo();
18104
18105 if (TII->isVOP3(Opcode: MI.getOpcode())) {
18106 // Make sure constant bus requirements are respected.
18107 TII->legalizeOperandsVOP3(MRI, MI);
18108
18109 if (TII->isMAI(MI)) {
18110 // The ordinary src0, src1, src2 were legalized above.
18111 //
18112 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
18113 // as a separate instruction.
18114 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
18115 Name: AMDGPU::OpName::scale_src0);
18116 if (Src0Idx != -1) {
18117 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(),
18118 Name: AMDGPU::OpName::scale_src1);
18119 if (TII->usesConstantBus(MRI, MI, OpIdx: Src0Idx) &&
18120 TII->usesConstantBus(MRI, MI, OpIdx: Src1Idx))
18121 TII->legalizeOpWithMove(MI, OpIdx: Src1Idx);
18122 }
18123 }
18124
18125 return;
18126 }
18127
18128 if (TII->isImage(MI))
18129 TII->enforceOperandRCAlignment(MI, OpName: AMDGPU::OpName::vaddr);
18130}
18131
18132static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
18133 uint64_t Val) {
18134 SDValue K = DAG.getTargetConstant(Val, DL, VT: MVT::i32);
18135 return SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_MOV_B32, dl: DL, VT: MVT::i32, Op1: K), 0);
18136}
18137
/// Wrap a 64-bit pointer into a 128-bit buffer resource descriptor whose
/// upper two dwords hold the subtarget's default resource data format.
MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
                                                const SDLoc &DL,
                                                SDValue Ptr) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  const SDValue Ops0[] = {
      DAG.getTargetConstant(Val: AMDGPU::SGPR_64RegClassID, DL, VT: MVT::i32),
      buildSMovImm32(DAG, DL, Val: 0),
      DAG.getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
      buildSMovImm32(DAG, DL, Val: TII->getDefaultRsrcDataFormat() >> 32),
      DAG.getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32)};

  SDValue SubRegHi = SDValue(
      DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v2i32, Ops: Ops0), 0);

  // Combine the constants and the pointer.
  const SDValue Ops1[] = {
      DAG.getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32), Ptr,
      DAG.getTargetConstant(Val: AMDGPU::sub0_sub1, DL, VT: MVT::i32), SubRegHi,
      DAG.getTargetConstant(Val: AMDGPU::sub2_sub3, DL, VT: MVT::i32)};

  return DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v4i32, Ops: Ops1);
}
18164
/// Return a resource descriptor with the 'Add TID' bit enabled
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to
/// the resource pointer.
///
/// \p RsrcDword1, if non-zero, is OR'd into the high half of the pointer
/// (descriptor dword 1). \p RsrcDword2And3 supplies descriptor dwords 2
/// (low half) and 3 (high half).
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
                                           SDValue Ptr, uint32_t RsrcDword1,
                                           uint64_t RsrcDword2And3) const {
  SDValue PtrLo = DAG.getTargetExtractSubreg(SRIdx: AMDGPU::sub0, DL, VT: MVT::i32, Operand: Ptr);
  SDValue PtrHi = DAG.getTargetExtractSubreg(SRIdx: AMDGPU::sub1, DL, VT: MVT::i32, Operand: Ptr);
  if (RsrcDword1) {
    PtrHi =
        SDValue(DAG.getMachineNode(Opcode: AMDGPU::S_OR_B32, dl: DL, VT: MVT::i32, Op1: PtrHi,
                                   Op2: DAG.getConstant(Val: RsrcDword1, DL, VT: MVT::i32)),
                0);
  }

  SDValue DataLo =
      buildSMovImm32(DAG, DL, Val: RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
  SDValue DataHi = buildSMovImm32(DAG, DL, Val: RsrcDword2And3 >> 32);

  // Assemble the four 32-bit descriptor dwords into one 128-bit register.
  const SDValue Ops[] = {
      DAG.getTargetConstant(Val: AMDGPU::SGPR_128RegClassID, DL, VT: MVT::i32),
      PtrLo,
      DAG.getTargetConstant(Val: AMDGPU::sub0, DL, VT: MVT::i32),
      PtrHi,
      DAG.getTargetConstant(Val: AMDGPU::sub1, DL, VT: MVT::i32),
      DataLo,
      DAG.getTargetConstant(Val: AMDGPU::sub2, DL, VT: MVT::i32),
      DataHi,
      DAG.getTargetConstant(Val: AMDGPU::sub3, DL, VT: MVT::i32)};

  return DAG.getMachineNode(Opcode: AMDGPU::REG_SEQUENCE, dl: DL, VT: MVT::v4i32, Ops);
}
18198
18199//===----------------------------------------------------------------------===//
18200// SI Inline Assembly Support
18201//===----------------------------------------------------------------------===//
18202
/// Map an inline asm register constraint ('s', 'v', 'a', "VA", or an
/// explicit physical-register form) to a register / register class pair.
std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
                                               StringRef Constraint,
                                               MVT VT) const {
  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);

  const TargetRegisterClass *RC = nullptr;
  if (Constraint.size() == 1) {
    // Check if we cannot determine the bit size of the given value type. This
    // can happen, for example, in this situation where we have an empty struct
    // (size 0): `call void asm "", "v"({} poison)`-
    if (VT == MVT::Other)
      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    const unsigned BitWidth = VT.getSizeInBits();
    switch (Constraint[0]) {
    default:
      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    case 's': // SGPR constraint ('r' is treated the same).
    case 'r':
      switch (BitWidth) {
      case 16:
        RC = &AMDGPU::SReg_32RegClass;
        break;
      case 64:
        RC = &AMDGPU::SGPR_64RegClass;
        break;
      default:
        RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth);
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    case 'v': // VGPR constraint.
      switch (BitWidth) {
      case 1:
        return std::pair(0U, nullptr);
      case 16:
        RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
                                             : &AMDGPU::VGPR_32_Lo256RegClass;
        break;
      default:
        RC = Subtarget->has1024AddressableVGPRs()
                 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
                 : TRI->getVGPRClassForBitWidth(BitWidth);
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    case 'a': // AGPR constraint; only meaningful with MAI instructions.
      if (!Subtarget->hasMAIInsts())
        break;
      switch (BitWidth) {
      case 1:
        return std::pair(0U, nullptr);
      case 16:
        RC = &AMDGPU::AGPR_32RegClass;
        break;
      default:
        RC = TRI->getAGPRClassForBitWidth(BitWidth);
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    }
  } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
    // "VA": either VGPR or AGPR (vector super class).
    const unsigned BitWidth = VT.getSizeInBits();
    switch (BitWidth) {
    case 16:
      RC = &AMDGPU::AV_32RegClass;
      break;
    default:
      RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
      if (!RC)
        return std::pair(0U, nullptr);
      break;
    }
  }

  // We actually support i128, i16 and f16 as inline parameters
  // even if they are not reported as legal
  if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
             VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
    return std::pair(0U, RC);

  // Handle explicit physical-register constraints, parsed into a register
  // kind ('v'/'s'/'a'), a starting index, and a register count.
  auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
  if (Kind != '\0') {
    if (Kind == 'v') {
      RC = &AMDGPU::VGPR_32_Lo256RegClass;
    } else if (Kind == 's') {
      RC = &AMDGPU::SGPR_32RegClass;
    } else if (Kind == 'a') {
      RC = &AMDGPU::AGPR_32RegClass;
    }

    if (RC) {
      if (NumRegs > 1) {
        if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
          return std::pair(0U, nullptr);

        uint32_t Width = NumRegs * 32;
        // Prohibit constraints for register ranges with a width that does not
        // match the required type.
        if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
          return std::pair(0U, nullptr);

        MCRegister Reg = RC->getRegister(i: Idx);
        if (SIRegisterInfo::isVGPRClass(RC))
          RC = TRI->getVGPRClassForBitWidth(BitWidth: Width);
        else if (SIRegisterInfo::isSGPRClass(RC))
          RC = TRI->getSGPRClassForBitWidth(BitWidth: Width);
        else if (SIRegisterInfo::isAGPRClass(RC))
          RC = TRI->getAGPRClassForBitWidth(BitWidth: Width);
        if (RC) {
          Reg = TRI->getMatchingSuperReg(Reg, SubIdx: AMDGPU::sub0, RC);
          if (!Reg) {
            // The register class does not contain the requested register,
            // e.g., because it is an SGPR pair that would violate alignment
            // requirements.
            return std::pair(0U, nullptr);
          }
          return std::pair(Reg, RC);
        }
      }

      // Check for lossy scalar/vector conversions.
      if (VT.isVector() && VT.getSizeInBits() != 32)
        return std::pair(0U, nullptr);
      if (Idx < RC->getNumRegs())
        return std::pair(RC->getRegister(i: Idx), RC);
      return std::pair(0U, nullptr);
    }
  }

  // Fall back to the generic handling; refine the class from the chosen
  // physical register when one was found.
  auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
  if (Ret.first)
    Ret.second = TRI->getPhysRegBaseClass(Reg: Ret.first);

  return Ret;
}
18345
18346static bool isImmConstraint(StringRef Constraint) {
18347 if (Constraint.size() == 1) {
18348 switch (Constraint[0]) {
18349 default:
18350 break;
18351 case 'I':
18352 case 'J':
18353 case 'A':
18354 case 'B':
18355 case 'C':
18356 return true;
18357 }
18358 } else if (Constraint == "DA" || Constraint == "DB") {
18359 return true;
18360 }
18361 return false;
18362}
18363
18364SITargetLowering::ConstraintType
18365SITargetLowering::getConstraintType(StringRef Constraint) const {
18366 if (Constraint.size() == 1) {
18367 switch (Constraint[0]) {
18368 default:
18369 break;
18370 case 's':
18371 case 'v':
18372 case 'a':
18373 return C_RegisterClass;
18374 }
18375 } else if (Constraint.size() == 2) {
18376 if (Constraint == "VA")
18377 return C_RegisterClass;
18378 }
18379 if (isImmConstraint(Constraint)) {
18380 return C_Other;
18381 }
18382 return TargetLowering::getConstraintType(Constraint);
18383}
18384
18385static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
18386 if (!AMDGPU::isInlinableIntLiteral(Literal: Val)) {
18387 Val = Val & maskTrailingOnes<uint64_t>(N: Size);
18388 }
18389 return Val;
18390}
18391
18392void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
18393 StringRef Constraint,
18394 std::vector<SDValue> &Ops,
18395 SelectionDAG &DAG) const {
18396 if (isImmConstraint(Constraint)) {
18397 uint64_t Val;
18398 if (getAsmOperandConstVal(Op, Val) &&
18399 checkAsmConstraintVal(Op, Constraint, Val)) {
18400 Val = clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits());
18401 Ops.push_back(x: DAG.getTargetConstant(Val, DL: SDLoc(Op), VT: MVT::i64));
18402 }
18403 } else {
18404 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
18405 }
18406}
18407
/// Extract a constant from \p Op for use as an inline asm immediate.
/// Handles integer and FP constants and 2-element 16-bit splat
/// build_vectors. Returns false when \p Op is not a usable constant.
bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
  unsigned Size = Op.getScalarValueSizeInBits();
  if (Size > 64)
    return false;

  // 16-bit immediates require 16-bit instruction support.
  if (Size == 16 && !Subtarget->has16BitInsts())
    return false;

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op)) {
    Val = C->getSExtValue();
    return true;
  }
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val&: Op)) {
    // FP constants are passed through their bit pattern.
    Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
    return true;
  }
  if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Val&: Op)) {
    // Only 2 x 16-bit splats with both elements defined are accepted.
    if (Size != 16 || Op.getNumOperands() != 2)
      return false;
    if (Op.getOperand(i: 0).isUndef() || Op.getOperand(i: 1).isUndef())
      return false;
    if (ConstantSDNode *C = V->getConstantSplatNode()) {
      Val = C->getSExtValue();
      return true;
    }
    if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
      Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
      return true;
    }
  }

  return false;
}
18441
/// Check whether \p Val satisfies the immediate constraint \p Constraint
/// for operand \p Op. Must only be called with constraints accepted by
/// isImmConstraint; any other constraint is a fatal error.
bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
                                             uint64_t Val) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'I': // Inlinable integer literal.
      return AMDGPU::isInlinableIntLiteral(Literal: Val);
    case 'J': // Signed 16-bit value.
      return isInt<16>(x: Val);
    case 'A': // Inline immediate for the operand's value type.
      return checkAsmConstraintValA(Op, Val);
    case 'B': // Signed 32-bit value.
      return isInt<32>(x: Val);
    case 'C': // Unsigned 32-bit (after masking) or inlinable integer literal.
      return isUInt<32>(x: clearUnusedBits(Val, Size: Op.getScalarValueSizeInBits())) ||
             AMDGPU::isInlinableIntLiteral(Literal: Val);
    default:
      break;
    }
  } else if (Constraint.size() == 2) {
    if (Constraint == "DA") {
      // Both 32-bit halves must independently pass the 'A' check.
      int64_t HiBits = static_cast<int32_t>(Val >> 32);
      int64_t LoBits = static_cast<int32_t>(Val);
      return checkAsmConstraintValA(Op, Val: HiBits, MaxSize: 32) &&
             checkAsmConstraintValA(Op, Val: LoBits, MaxSize: 32);
    }
    if (Constraint == "DB") {
      // Any 64-bit value is accepted.
      return true;
    }
  }
  llvm_unreachable("Invalid asm constraint");
}
18473
/// Check whether \p Val is representable as an inline immediate for the
/// value type of \p Op, considering at most \p MaxSize bits.
bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
                                              unsigned MaxSize) const {
  unsigned Size = std::min<unsigned>(a: Op.getScalarValueSizeInBits(), b: MaxSize);
  bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
  if (Size == 16) {
    // 16-bit inline immediates depend on the exact (scalar or packed)
    // element type.
    MVT VT = Op.getSimpleValueType();
    switch (VT.SimpleTy) {
    default:
      return false;
    case MVT::i16:
      return AMDGPU::isInlinableLiteralI16(Literal: Val, HasInv2Pi);
    case MVT::f16:
      return AMDGPU::isInlinableLiteralFP16(Literal: Val, HasInv2Pi);
    case MVT::bf16:
      return AMDGPU::isInlinableLiteralBF16(Literal: Val, HasInv2Pi);
    case MVT::v2i16:
      return AMDGPU::getInlineEncodingV2I16(Literal: Val).has_value();
    case MVT::v2f16:
      return AMDGPU::getInlineEncodingV2F16(Literal: Val).has_value();
    case MVT::v2bf16:
      return AMDGPU::getInlineEncodingV2BF16(Literal: Val).has_value();
    }
  }
  if ((Size == 32 && AMDGPU::isInlinableLiteral32(Literal: Val, HasInv2Pi)) ||
      (Size == 64 && AMDGPU::isInlinableLiteral64(Literal: Val, HasInv2Pi)))
    return true;
  return false;
}
18502
/// Map an unaligned VGPR/AGPR register class ID to its even-aligned
/// (_Align2) counterpart, or return -1 when no aligned variant applies.
/// Despite the name, this covers both VReg and AReg classes.
static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
  switch (UnalignedClassID) {
  case AMDGPU::VReg_64RegClassID:
    return AMDGPU::VReg_64_Align2RegClassID;
  case AMDGPU::VReg_96RegClassID:
    return AMDGPU::VReg_96_Align2RegClassID;
  case AMDGPU::VReg_128RegClassID:
    return AMDGPU::VReg_128_Align2RegClassID;
  case AMDGPU::VReg_160RegClassID:
    return AMDGPU::VReg_160_Align2RegClassID;
  case AMDGPU::VReg_192RegClassID:
    return AMDGPU::VReg_192_Align2RegClassID;
  case AMDGPU::VReg_224RegClassID:
    return AMDGPU::VReg_224_Align2RegClassID;
  case AMDGPU::VReg_256RegClassID:
    return AMDGPU::VReg_256_Align2RegClassID;
  case AMDGPU::VReg_288RegClassID:
    return AMDGPU::VReg_288_Align2RegClassID;
  case AMDGPU::VReg_320RegClassID:
    return AMDGPU::VReg_320_Align2RegClassID;
  case AMDGPU::VReg_352RegClassID:
    return AMDGPU::VReg_352_Align2RegClassID;
  case AMDGPU::VReg_384RegClassID:
    return AMDGPU::VReg_384_Align2RegClassID;
  case AMDGPU::VReg_512RegClassID:
    return AMDGPU::VReg_512_Align2RegClassID;
  case AMDGPU::VReg_1024RegClassID:
    return AMDGPU::VReg_1024_Align2RegClassID;
  case AMDGPU::AReg_64RegClassID:
    return AMDGPU::AReg_64_Align2RegClassID;
  case AMDGPU::AReg_96RegClassID:
    return AMDGPU::AReg_96_Align2RegClassID;
  case AMDGPU::AReg_128RegClassID:
    return AMDGPU::AReg_128_Align2RegClassID;
  case AMDGPU::AReg_160RegClassID:
    return AMDGPU::AReg_160_Align2RegClassID;
  case AMDGPU::AReg_192RegClassID:
    return AMDGPU::AReg_192_Align2RegClassID;
  case AMDGPU::AReg_256RegClassID:
    return AMDGPU::AReg_256_Align2RegClassID;
  case AMDGPU::AReg_512RegClassID:
    return AMDGPU::AReg_512_Align2RegClassID;
  case AMDGPU::AReg_1024RegClassID:
    return AMDGPU::AReg_1024_Align2RegClassID;
  default:
    return -1;
  }
}
18551
// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  if (Info->isEntryFunction()) {
    // Callable functions have fixed registers used for stack access.
    reservePrivateMemoryRegs(TM: getTargetMachine(), MF, TRI: *TRI, Info&: *Info);
  }

  // TODO: Move this logic to getReservedRegs()
  // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  Register SReg = ST.isWave32()
                      ? AMDGPU::SGPR_32RegClass.getRegister(i: MaxNumSGPRs - 1)
                      : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
                                                     RC: &AMDGPU::SGPR_64RegClass);
  Info->setSGPRForEXECCopy(SReg);

  // Replace the placeholder stack registers with the ones chosen for this
  // function.
  assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
                             Info->getStackPtrOffsetReg()));
  if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
    MRI.replaceRegWith(FromReg: AMDGPU::SP_REG, ToReg: Info->getStackPtrOffsetReg());

  // We need to worry about replacing the default register with itself in case
  // of MIR testcases missing the MFI.
  if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
    MRI.replaceRegWith(FromReg: AMDGPU::PRIVATE_RSRC_REG, ToReg: Info->getScratchRSrcReg());

  if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
    MRI.replaceRegWith(FromReg: AMDGPU::FP_REG, ToReg: Info->getFrameOffsetReg());

  Info->limitOccupancy(MF);

  // On wave32 subtargets, fix up implicit operands on every instruction.
  if (ST.isWave32() && !MF.empty()) {
    for (auto &MBB : MF) {
      for (auto &MI : MBB) {
        TII->fixImplicitOperands(MI);
      }
    }
  }

  // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
  // classes if required. Ideally the register class constraints would differ
  // per-subtarget, but there's no easy way to achieve that right now. This is
  // not a problem for VGPRs because the correctly aligned VGPR class is implied
  // from using them as the register class for legal types.
  if (ST.needsAlignedVGPRs()) {
    for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
      const Register Reg = Register::index2VirtReg(Index: I);
      const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
      if (!RC)
        continue;
      int NewClassID = getAlignedAGPRClassID(UnalignedClassID: RC->getID());
      if (NewClassID != -1)
        MRI.setRegClass(Reg, RC: TRI->getRegClass(i: NewClassID));
    }
  }

  TargetLoweringBase::finalizeLowering(MF);
}
18618
// Compute known bits for AMDGPU-specific SelectionDAG nodes. Handles the
// mbcnt intrinsics directly and defers everything else to the generic AMDGPU
// implementation.
void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                     KnownBits &Known,
                                                     const APInt &DemandedElts,
                                                     const SelectionDAG &DAG,
                                                     unsigned Depth) const {
  Known.resetAll();
  unsigned Opc = Op.getOpcode();
  switch (Opc) {
  case ISD::INTRINSIC_WO_CHAIN: {
    // For chainless intrinsic nodes, operand 0 is the intrinsic ID.
    unsigned IID = Op.getConstantOperandVal(i: 0);
    switch (IID) {
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
      const GCNSubtarget &ST =
          DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
      // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
      // most 31 + src1. Model the node as a count bounded by
      // wavefront-size-log2 bits (5 otherwise) added to src1's known bits.
      Known.Zero.setBitsFrom(
          IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
      KnownBits Known2 = DAG.computeKnownBits(Op: Op.getOperand(i: 2), Depth: Depth + 1);
      Known = KnownBits::add(LHS: Known, RHS: Known2);
      return;
    }
    }
    break;
  }
  }
  // Anything not handled above falls through to the generic AMDGPU handling.
  return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
      Op, Known, DemandedElts, DAG, Depth);
}
18649
18650void SITargetLowering::computeKnownBitsForFrameIndex(
18651 const int FI, KnownBits &Known, const MachineFunction &MF) const {
18652 TargetLowering::computeKnownBitsForFrameIndex(FIOp: FI, Known, MF);
18653
18654 // Set the high bits to zero based on the maximum allowed scratch size per
18655 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
18656 // calculation won't overflow, so assume the sign bit is never set.
18657 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
18658}
18659
18660static void knownBitsForWorkitemID(const GCNSubtarget &ST,
18661 GISelValueTracking &VT, KnownBits &Known,
18662 unsigned Dim) {
18663 unsigned MaxValue =
18664 ST.getMaxWorkitemID(Kernel: VT.getMachineFunction().getFunction(), Dimension: Dim);
18665 Known.Zero.setHighBits(llvm::countl_zero(Val: MaxValue));
18666}
18667
// Compute known bits for the scalar bitfield-extract (S_BFE) family.
// \p BFEWidth is the operation width (32 or 64) and \p SExt selects sign-
// versus zero-extension of the extracted field. Leaves \p Known untouched
// (caller resets it) when the control operand is not a known constant or the
// encoding is ill-formed.
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
                             KnownBits &Known, const APInt &DemandedElts,
                             unsigned BFEWidth, bool SExt, unsigned Depth) {
  const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo();
  const MachineOperand &Src1 = MI.getOperand(i: 2);

  // Src1 packs the offset/width control value; it must be a compile-time
  // constant (immediate or constant vreg) for us to say anything.
  unsigned Src1Cst = 0;
  if (Src1.isImm()) {
    Src1Cst = Src1.getImm();
  } else if (Src1.isReg()) {
    auto Cst = getIConstantVRegValWithLookThrough(VReg: Src1.getReg(), MRI);
    if (!Cst)
      return;
    Src1Cst = Cst->Value.getZExtValue();
  } else {
    return;
  }

  // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
  // Width is always [22:16].
  const unsigned Offset =
      Src1Cst & maskTrailingOnes<unsigned>(N: (BFEWidth == 32) ? 5 : 6);
  const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(N: 6);

  if (Width >= BFEWidth) // Ill-formed.
    return;

  // Known bits of the result are the source's bits at [Offset, Offset+Width),
  // extended back to the full operation width.
  VT.computeKnownBitsImpl(R: MI.getOperand(i: 1).getReg(), Known, DemandedElts,
                          Depth: Depth + 1);

  Known = Known.extractBits(NumBits: Width, BitPosition: Offset);

  if (SExt)
    Known = Known.sext(BitWidth: BFEWidth);
  else
    Known = Known.zext(BitWidth: BFEWidth);
}
18705
// Compute known bits for AMDGPU-specific machine instructions and generic
// intrinsics during GlobalISel value tracking.
void SITargetLowering::computeKnownBitsForTargetInstr(
    GISelValueTracking &VT, Register R, KnownBits &Known,
    const APInt &DemandedElts, const MachineRegisterInfo &MRI,
    unsigned Depth) const {
  Known.resetAll();
  const MachineInstr *MI = MRI.getVRegDef(Reg: R);
  switch (MI->getOpcode()) {
  // Scalar bitfield extracts share one helper, parameterized on width and
  // signedness.
  case AMDGPU::S_BFE_I32:
    return knownBitsForSBFE(MI: *MI, VT, Known, DemandedElts, /*Width=*/BFEWidth: 32,
                            /*SExt=*/true, Depth);
  case AMDGPU::S_BFE_U32:
    return knownBitsForSBFE(MI: *MI, VT, Known, DemandedElts, /*Width=*/BFEWidth: 32,
                            /*SExt=*/false, Depth);
  case AMDGPU::S_BFE_I64:
    return knownBitsForSBFE(MI: *MI, VT, Known, DemandedElts, /*Width=*/BFEWidth: 64,
                            /*SExt=*/true, Depth);
  case AMDGPU::S_BFE_U64:
    return knownBitsForSBFE(MI: *MI, VT, Known, DemandedElts, /*Width=*/BFEWidth: 64,
                            /*SExt=*/false, Depth);
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IID = cast<GIntrinsic>(Val: MI)->getIntrinsicID();
    switch (IID) {
    // Workitem ids are bounded by the subtarget's maximum for the dimension.
    case Intrinsic::amdgcn_workitem_id_x:
      knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: 0);
      break;
    case Intrinsic::amdgcn_workitem_id_y:
      knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: 1);
      break;
    case Intrinsic::amdgcn_workitem_id_z:
      knownBitsForWorkitemID(ST: *getSubtarget(), VT, Known, Dim: 2);
      break;
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
      // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
      // most 31 + src1. Model as a bounded count added to src1 (operand 3).
      Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
                                 ? getSubtarget()->getWavefrontSizeLog2()
                                 : 5);
      KnownBits Known2;
      VT.computeKnownBitsImpl(R: MI->getOperand(i: 3).getReg(), Known&: Known2, DemandedElts,
                              Depth: Depth + 1);
      Known = KnownBits::add(LHS: Known, RHS: Known2);
      break;
    }
    case Intrinsic::amdgcn_groupstaticsize: {
      // We can report everything over the maximum size as 0. We can't report
      // based on the actual size because we don't know if it's accurate or not
      // at any given point.
      Known.Zero.setHighBits(
          llvm::countl_zero(Val: getSubtarget()->getAddressableLocalMemorySize()));
      break;
    }
    }
    break;
  }
  // Sub-word buffer loads zero-extend into the 32-bit result.
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    Known.Zero.setHighBits(24);
    break;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    Known.Zero.setHighBits(16);
    break;
  case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
    // G_AMDGPU_COPY_SCC_VCC converts a uniform boolean in VCC to SGPR s32,
    // producing exactly 0 or 1.
    Known.Zero.setHighBits(Known.getBitWidth() - 1);
    break;
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();

    // Bail out as soon as any operand is completely unknown; the final
    // intersection would be empty anyway.
    KnownBits Known2;
    VT.computeKnownBitsImpl(R: Src2, Known&: Known2, DemandedElts, Depth: Depth + 1);
    if (Known2.isUnknown())
      break;

    KnownBits Known1;
    VT.computeKnownBitsImpl(R: Src1, Known&: Known1, DemandedElts, Depth: Depth + 1);
    if (Known1.isUnknown())
      break;

    KnownBits Known0;
    VT.computeKnownBitsImpl(R: Src0, Known&: Known0, DemandedElts, Depth: Depth + 1);
    if (Known0.isUnknown())
      break;

    // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
    // A bit of the median is known only when it is known to the same value in
    // all three operands.
    Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
    Known.One = Known0.One & Known1.One & Known2.One;
    break;
  }
  }
}
18799
18800Align SITargetLowering::computeKnownAlignForTargetInstr(
18801 GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
18802 unsigned Depth) const {
18803 const MachineInstr *MI = MRI.getVRegDef(Reg: R);
18804 if (auto *GI = dyn_cast<GIntrinsic>(Val: MI)) {
18805 // FIXME: Can this move to generic code? What about the case where the call
18806 // site specifies a lower alignment?
18807 Intrinsic::ID IID = GI->getIntrinsicID();
18808 LLVMContext &Ctx = VT.getMachineFunction().getFunction().getContext();
18809 AttributeList Attrs =
18810 Intrinsic::getAttributes(C&: Ctx, id: IID, FT: Intrinsic::getType(Context&: Ctx, id: IID));
18811 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18812 return *RetAlign;
18813 }
18814 return Align(1);
18815}
18816
// Choose the preferred alignment for a loop header, and on GFX10-class
// targets optionally insert S_INST_PREFETCH instructions around loops that
// benefit from a modified prefetch mode. Note this has the side effect of
// inserting instructions into the preheader/exit blocks.
Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
  const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
  const Align CacheLineAlign = Align(64);

  // GFX950: Prevent an 8-byte instruction at loop header from being split by
  // the 32-byte instruction fetch window boundary. This avoids a significant
  // fetch delay after backward branch. We use 32-byte alignment with max
  // padding of 4 bytes (one s_nop), see getMaxPermittedBytesForAlignment().
  if (ML && !DisableLoopAlignment &&
      getSubtarget()->hasLoopHeadInstSplitSensitivity()) {
    const MachineBasicBlock *Header = ML->getHeader();
    // Respect user-specified or previously set alignment.
    if (Header->getAlignment() != PrefAlign)
      return Header->getAlignment();
    if (needsFetchWindowAlignment(MBB: *Header))
      return Align(32);
  }

  // Pre-GFX10 target did not benefit from loop alignment
  if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
      getSubtarget()->hasInstFwdPrefetchBug())
    return PrefAlign;

  // On GFX10 I$ is 4 x 64 bytes cache lines.
  // By default prefetcher keeps one cache line behind and reads two ahead.
  // We can modify it with S_INST_PREFETCH for larger loops to have two lines
  // behind and one ahead.
  // Therefor we can benefit from aligning loop headers if loop fits 192 bytes.
  // If loop fits 64 bytes it always spans no more than two cache lines and
  // does not need an alignment.
  // Else if loop is less or equal 128 bytes we do not need to modify prefetch,
  // Else if loop is less or equal 192 bytes we need two lines behind.

  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  const MachineBasicBlock *Header = ML->getHeader();
  if (Header->getAlignment() != PrefAlign)
    return Header->getAlignment(); // Already processed.

  // Estimate the loop body size in bytes, giving up early once past 192.
  unsigned LoopSize = 0;
  for (const MachineBasicBlock *MBB : ML->blocks()) {
    // If inner loop block is aligned assume in average half of the alignment
    // size to be added as nops.
    if (MBB != Header)
      LoopSize += MBB->getAlignment().value() / 2;

    for (const MachineInstr &MI : *MBB) {
      LoopSize += TII->getInstSizeInBytes(MI);
      if (LoopSize > 192)
        return PrefAlign;
    }
  }

  if (LoopSize <= 64)
    return PrefAlign;

  if (LoopSize <= 128)
    return CacheLineAlign;

  // If any of parent loops is surrounded by prefetch instructions do not
  // insert new for inner loop, which would reset parent's settings.
  for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
    if (MachineBasicBlock *Exit = P->getExitBlock()) {
      auto I = Exit->getFirstNonDebugInstr();
      if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
        return CacheLineAlign;
    }
  }

  // Loop is 129..192 bytes: switch prefetch mode before the loop and restore
  // it after, avoiding duplicate prefetch instructions.
  MachineBasicBlock *Pre = ML->getLoopPreheader();
  MachineBasicBlock *Exit = ML->getExitBlock();

  if (Pre && Exit) {
    auto PreTerm = Pre->getFirstTerminator();
    if (PreTerm == Pre->begin() ||
        std::prev(x: PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
      BuildMI(BB&: *Pre, I: PreTerm, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_INST_PREFETCH))
          .addImm(Val: 1); // prefetch 2 lines behind PC

    auto ExitHead = Exit->getFirstNonDebugInstr();
    if (ExitHead == Exit->end() ||
        ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
      BuildMI(BB&: *Exit, I: ExitHead, MIMD: DebugLoc(), MCID: TII->get(Opcode: AMDGPU::S_INST_PREFETCH))
          .addImm(Val: 2); // prefetch 1 line behind PC
  }

  return CacheLineAlign;
}
18904
18905unsigned SITargetLowering::getMaxPermittedBytesForAlignment(
18906 MachineBasicBlock *MBB) const {
18907 // GFX950: Limit padding to 4 bytes (one s_nop) for blocks where an 8-byte
18908 // instruction could be split by the 32-byte fetch window boundary.
18909 // See getPrefLoopAlignment() for context.
18910 if (needsFetchWindowAlignment(MBB: *MBB))
18911 return 4;
18912 return TargetLowering::getMaxPermittedBytesForAlignment(MBB);
18913}
18914
18915bool SITargetLowering::needsFetchWindowAlignment(
18916 const MachineBasicBlock &MBB) const {
18917 if (!getSubtarget()->hasLoopHeadInstSplitSensitivity())
18918 return false;
18919 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18920 for (const MachineInstr &MI : MBB) {
18921 if (MI.isMetaInstruction())
18922 continue;
18923 // Instructions larger than 4 bytes can be split by a 32-byte boundary.
18924 return TII->getInstSizeInBytes(MI) > 4;
18925 }
18926 return false;
18927}
18928
18929[[maybe_unused]]
18930static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18931 assert(N->getOpcode() == ISD::CopyFromReg);
18932 do {
18933 // Follow the chain until we find an INLINEASM node.
18934 N = N->getOperand(Num: 0).getNode();
18935 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18936 return true;
18937 } while (N->getOpcode() == ISD::CopyFromReg);
18938 return false;
18939}
18940
// Report whether a DAG node produces a divergent (per-lane varying) value.
bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
                                                  FunctionLoweringInfo *FLI,
                                                  UniformityInfo *UA) const {
  switch (N->getOpcode()) {
  case ISD::CopyFromReg: {
    const RegisterSDNode *R = cast<RegisterSDNode>(Val: N->getOperand(Num: 1));
    const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    Register Reg = R->getReg();

    // FIXME: Why does this need to consider isLiveIn?
    // Physical and live-in registers: classify by register bank; anything not
    // in an SGPR is treated as divergent.
    if (Reg.isPhysical() || MRI.isLiveIn(Reg))
      return !TRI->isSGPRReg(MRI, Reg);

    // Virtual registers with a known IR value: defer to the IR uniformity
    // analysis.
    if (const Value *V = FLI->getValueFromVirtualReg(Vreg: R->getReg()))
      return UA->isDivergent(V);

    // Otherwise fall back to the register bank again.
    assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
    return !TRI->isSGPRReg(MRI, Reg);
  }
  case ISD::LOAD: {
    const LoadSDNode *L = cast<LoadSDNode>(Val: N);
    unsigned AS = L->getAddressSpace();
    // A flat load may access private memory.
    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
  }
  case ISD::CALLSEQ_END:
    // Call results are conservatively treated as divergent.
    return true;
  case ISD::INTRINSIC_WO_CHAIN:
    // Chainless intrinsic: the ID is operand 0.
    return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: 0));
  case ISD::INTRINSIC_W_CHAIN:
    // With a chain, the ID moves to operand 1.
    return AMDGPU::isIntrinsicSourceOfDivergence(IntrID: N->getConstantOperandVal(Num: 1));
  case AMDGPUISD::ATOMIC_CMP_SWAP:
  case AMDGPUISD::BUFFER_ATOMIC_SWAP:
  case AMDGPUISD::BUFFER_ATOMIC_ADD:
  case AMDGPUISD::BUFFER_ATOMIC_SUB:
  case AMDGPUISD::BUFFER_ATOMIC_SMIN:
  case AMDGPUISD::BUFFER_ATOMIC_UMIN:
  case AMDGPUISD::BUFFER_ATOMIC_SMAX:
  case AMDGPUISD::BUFFER_ATOMIC_UMAX:
  case AMDGPUISD::BUFFER_ATOMIC_AND:
  case AMDGPUISD::BUFFER_ATOMIC_OR:
  case AMDGPUISD::BUFFER_ATOMIC_XOR:
  case AMDGPUISD::BUFFER_ATOMIC_INC:
  case AMDGPUISD::BUFFER_ATOMIC_DEC:
  case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
  case AMDGPUISD::BUFFER_ATOMIC_FADD:
  case AMDGPUISD::BUFFER_ATOMIC_FMIN:
  case AMDGPUISD::BUFFER_ATOMIC_FMAX:
    // Target-specific read-modify-write atomics are sources of divergence.
    return true;
  default:
    if (auto *A = dyn_cast<AtomicSDNode>(Val: N)) {
      // Generic read-modify-write atomics are sources of divergence.
      return A->readMem() && A->writeMem();
    }
    return false;
  }
}
19000
19001bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
19002 EVT VT) const {
19003 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
19004 case MVT::f32:
19005 return !denormalModeIsFlushAllF32(MF: DAG.getMachineFunction());
19006 case MVT::f64:
19007 case MVT::f16:
19008 return !denormalModeIsFlushAllF64F16(MF: DAG.getMachineFunction());
19009 default:
19010 return false;
19011 }
19012}
19013
19014bool SITargetLowering::denormalsEnabledForType(
19015 LLT Ty, const MachineFunction &MF) const {
19016 switch (Ty.getScalarSizeInBits()) {
19017 case 32:
19018 return !denormalModeIsFlushAllF32(MF);
19019 case 64:
19020 case 16:
19021 return !denormalModeIsFlushAllF64F16(MF);
19022 default:
19023 return false;
19024 }
19025}
19026
19027bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
19028 const APInt &DemandedElts,
19029 const SelectionDAG &DAG,
19030 bool SNaN,
19031 unsigned Depth) const {
19032 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
19033 const MachineFunction &MF = DAG.getMachineFunction();
19034 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
19035
19036 if (Info->getMode().DX10Clamp)
19037 return true; // Clamped to 0.
19038 return DAG.isKnownNeverNaN(Op: Op.getOperand(i: 0), SNaN, Depth: Depth + 1);
19039 }
19040
19041 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
19042 DAG, SNaN, Depth);
19043}
19044
19045// On older subtargets, global FP atomic instructions have a hardcoded FP mode
19046// and do not support FP32 denormals, and only support v2f16/f64 denormals.
19047static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
19048 if (RMW->hasMetadata(Kind: "amdgpu.ignore.denormal.mode"))
19049 return true;
19050
19051 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
19052 auto DenormMode = RMW->getFunction()->getDenormalMode(FPType: Flt);
19053 if (DenormMode == DenormalMode::getPreserveSign())
19054 return true;
19055
19056 // TODO: Remove this.
19057 return RMW->getFunction()
19058 ->getFnAttribute(Kind: "amdgpu-unsafe-fp-atomics")
19059 .getValueAsBool();
19060}
19061
19062static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
19063 LLVMContext &Ctx = RMW->getContext();
19064 StringRef MemScope =
19065 Ctx.getSyncScopeName(Id: RMW->getSyncScopeID()).value_or(u: "system");
19066
19067 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
19068 << "Hardware instruction generated for atomic "
19069 << RMW->getOperationName(Op: RMW->getOperation())
19070 << " operation at memory scope " << MemScope;
19071}
19072
19073static bool isV2F16OrV2BF16(Type *Ty) {
19074 if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
19075 Type *EltTy = VT->getElementType();
19076 return VT->getNumElements() == 2 &&
19077 (EltTy->isHalfTy() || EltTy->isBFloatTy());
19078 }
19079
19080 return false;
19081}
19082
19083static bool isV2F16(Type *Ty) {
19084 FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty);
19085 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
19086}
19087
19088static bool isV2BF16(Type *Ty) {
19089 FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty);
19090 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
19091}
19092
19093/// \return true if atomicrmw integer ops work for the type.
19094static bool isAtomicRMWLegalIntTy(Type *Ty) {
19095 if (auto *IT = dyn_cast<IntegerType>(Val: Ty)) {
19096 unsigned BW = IT->getBitWidth();
19097 return BW == 32 || BW == 64;
19098 }
19099
19100 return false;
19101}
19102
19103/// \return true if this atomicrmw xchg type can be selected.
19104static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
19105 Type *Ty = RMW->getType();
19106 if (isAtomicRMWLegalIntTy(Ty))
19107 return true;
19108
19109 if (PointerType *PT = dyn_cast<PointerType>(Val: Ty)) {
19110 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
19111 unsigned BW = DL.getPointerSizeInBits(AS: PT->getAddressSpace());
19112 return BW == 32 || BW == 64;
19113 }
19114
19115 if (Ty->isFloatTy() || Ty->isDoubleTy())
19116 return true;
19117
19118 if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
19119 return VT->getNumElements() == 2 &&
19120 VT->getElementType()->getPrimitiveSizeInBits() == 16;
19121 }
19122
19123 return false;
19124}
19125
19126/// \returns true if it's valid to emit a native instruction for \p RMW, based
19127/// on the properties of the target memory.
19128static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
19129 const AtomicRMWInst *RMW,
19130 bool HasSystemScope) {
19131 // The remote/fine-grained access logic is different from the integer
19132 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
19133 // fine-grained access does not work, even for a device local allocation.
19134 //
19135 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
19136 // allocations work.
19137 if (HasSystemScope) {
19138 if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
19139 RMW->hasMetadata(Kind: "amdgpu.no.remote.memory"))
19140 return true;
19141 if (Subtarget.hasEmulatedSystemScopeAtomics())
19142 return true;
19143 } else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
19144 return true;
19145
19146 return RMW->hasMetadata(Kind: "amdgpu.no.fine.grained.memory");
19147}
19148
19149/// \return Action to perform on AtomicRMWInsts for integer operations.
19150static TargetLowering::AtomicExpansionKind
19151atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
19152 return isAtomicRMWLegalIntTy(Ty: RMW->getType())
19153 ? TargetLowering::AtomicExpansionKind::None
19154 : TargetLowering::AtomicExpansionKind::CmpXChg;
19155}
19156
19157/// Return if a flat address space atomicrmw can access private memory.
19158static bool flatInstrMayAccessPrivate(const Instruction *I) {
19159 const MDNode *MD = I->getMetadata(KindID: LLVMContext::MD_noalias_addrspace);
19160 return !MD ||
19161 !AMDGPU::hasValueInRangeLikeMetadata(MD: *MD, Val: AMDGPUAS::PRIVATE_ADDRESS);
19162}
19163
19164static TargetLowering::AtomicExpansionKind
19165getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
19166 // For GAS, lower to flat atomic.
19167 return STI.hasGloballyAddressableScratch()
19168 ? TargetLowering::AtomicExpansionKind::CustomExpand
19169 : TargetLowering::AtomicExpansionKind::NotAtomic;
19170}
19171
19172TargetLowering::AtomicExpansionKind
19173SITargetLowering::shouldExpandAtomicRMWInIR(const AtomicRMWInst *RMW) const {
19174 unsigned AS = RMW->getPointerAddressSpace();
19175 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
19176 return getPrivateAtomicExpansionKind(STI: *getSubtarget());
19177
19178 // 64-bit flat atomics that dynamically reside in private memory will silently
19179 // be dropped.
19180 //
19181 // Note that we will emit a new copy of the original atomic in the expansion,
19182 // which will be incrementally relegalized.
19183 const DataLayout &DL = RMW->getFunction()->getDataLayout();
19184 if (AS == AMDGPUAS::FLAT_ADDRESS &&
19185 DL.getTypeSizeInBits(Ty: RMW->getType()) == 64 &&
19186 flatInstrMayAccessPrivate(I: RMW))
19187 return AtomicExpansionKind::CustomExpand;
19188
19189 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
19190 OptimizationRemarkEmitter ORE(RMW->getFunction());
19191 ORE.emit(RemarkBuilder: [=]() {
19192 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
19193 });
19194 return Kind;
19195 };
19196
19197 auto SSID = RMW->getSyncScopeID();
19198 bool HasSystemScope =
19199 SSID == SyncScope::System ||
19200 SSID == RMW->getContext().getOrInsertSyncScopeID(SSN: "one-as");
19201
19202 auto Op = RMW->getOperation();
19203 switch (Op) {
19204 case AtomicRMWInst::Xchg:
19205 // PCIe supports add and xchg for system atomics.
19206 return isAtomicRMWLegalXChgTy(RMW)
19207 ? TargetLowering::AtomicExpansionKind::None
19208 : TargetLowering::AtomicExpansionKind::CmpXChg;
19209 case AtomicRMWInst::Add:
19210 // PCIe supports add and xchg for system atomics.
19211 return atomicSupportedIfLegalIntType(RMW);
19212 case AtomicRMWInst::Sub:
19213 case AtomicRMWInst::And:
19214 case AtomicRMWInst::Or:
19215 case AtomicRMWInst::Xor:
19216 case AtomicRMWInst::Max:
19217 case AtomicRMWInst::Min:
19218 case AtomicRMWInst::UMax:
19219 case AtomicRMWInst::UMin:
19220 case AtomicRMWInst::UIncWrap:
19221 case AtomicRMWInst::UDecWrap:
19222 case AtomicRMWInst::USubCond:
19223 case AtomicRMWInst::USubSat: {
19224 if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
19225 return AtomicExpansionKind::CmpXChg;
19226 if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
19227 return AtomicExpansionKind::CmpXChg;
19228 if (Op == AtomicRMWInst::USubCond || Op == AtomicRMWInst::USubSat) {
19229 auto *IT = dyn_cast<IntegerType>(Val: RMW->getType());
19230 if (!IT || IT->getBitWidth() != 32)
19231 return AtomicExpansionKind::CmpXChg;
19232 }
19233
19234 if (AMDGPU::isFlatGlobalAddrSpace(AS) ||
19235 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
19236 if (Subtarget->hasEmulatedSystemScopeAtomics())
19237 return atomicSupportedIfLegalIntType(RMW);
19238
19239 // On most subtargets, for atomicrmw operations other than add/xchg,
19240 // whether or not the instructions will behave correctly depends on where
19241 // the address physically resides and what interconnect is used in the
19242 // system configuration. On some some targets the instruction will nop,
19243 // and in others synchronization will only occur at degraded device scope.
19244 //
19245 // If the allocation is known local to the device, the instructions should
19246 // work correctly.
19247 if (RMW->hasMetadata(Kind: "amdgpu.no.remote.memory"))
19248 return atomicSupportedIfLegalIntType(RMW);
19249
19250 // If fine-grained remote memory works at device scope, we don't need to
19251 // do anything.
19252 if (!HasSystemScope &&
19253 Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
19254 return atomicSupportedIfLegalIntType(RMW);
19255
19256 // If we are targeting a remote allocated address, it depends what kind of
19257 // allocation the address belongs to.
19258 //
19259 // If the allocation is fine-grained (in host memory, or in PCIe peer
19260 // device memory), the operation will fail depending on the target.
19261 //
19262 // Note fine-grained host memory access does work on APUs or if XGMI is
19263 // used, but we do not know if we are targeting an APU or the system
19264 // configuration from the ISA version/target-cpu.
19265 if (RMW->hasMetadata(Kind: "amdgpu.no.fine.grained.memory"))
19266 return atomicSupportedIfLegalIntType(RMW);
19267
19268 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
19269 Op == AtomicRMWInst::Xor) {
19270 // Atomic sub/or/xor do not work over PCI express, but atomic add
19271 // does. InstCombine transforms these with 0 to or, so undo that.
19272 if (const Constant *ConstVal = dyn_cast<Constant>(Val: RMW->getValOperand());
19273 ConstVal && ConstVal->isNullValue())
19274 return AtomicExpansionKind::CustomExpand;
19275 }
19276
19277 // If the allocation could be in remote, fine-grained memory, the rmw
19278 // instructions may fail. cmpxchg should work, so emit that. On some
19279 // system configurations, PCIe atomics aren't supported so cmpxchg won't
19280 // even work, so you're out of luck anyway.
19281
19282 // In summary:
19283 //
19284 // Cases that may fail:
19285 // - fine-grained pinned host memory
19286 // - fine-grained migratable host memory
19287 // - fine-grained PCIe peer device
19288 //
19289 // Cases that should work, but may be treated overly conservatively.
19290 // - fine-grained host memory on an APU
19291 // - fine-grained XGMI peer device
19292 return AtomicExpansionKind::CmpXChg;
19293 }
19294
19295 return atomicSupportedIfLegalIntType(RMW);
19296 }
19297 case AtomicRMWInst::FAdd: {
19298 Type *Ty = RMW->getType();
19299
19300 // TODO: Handle REGION_ADDRESS
19301 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
19302 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
19303 // is fixed to round-to-nearest-even.
19304 //
19305 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
19306 // round-to-nearest-even.
19307 //
19308 // We ignore the rounding mode problem, even in strictfp. The C++ standard
19309 // suggests it is OK if the floating-point mode may not match the calling
19310 // thread.
19311 if (Ty->isFloatTy()) {
19312 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
19313 : AtomicExpansionKind::CmpXChg;
19314 }
19315
19316 if (Ty->isDoubleTy()) {
19317 // Ignores denormal mode, but we don't consider flushing mandatory.
19318 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
19319 : AtomicExpansionKind::CmpXChg;
19320 }
19321
19322 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
19323 return AtomicExpansionKind::None;
19324
19325 return AtomicExpansionKind::CmpXChg;
19326 }
19327
19328 // LDS atomics respect the denormal mode from the mode register.
19329 //
19330 // Traditionally f32 global/buffer memory atomics would unconditionally
19331 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
19332 // flush.
19333 //
19334 // On targets with flat atomic fadd, denormals would flush depending on
19335 // whether the target address resides in LDS or global memory. We consider
19336 // this flat-maybe-flush as will-flush.
19337 if (Ty->isFloatTy() &&
19338 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
19339 !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
19340 return AtomicExpansionKind::CmpXChg;
19341
19342 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
19343 // safe. The message phrasing also should be better.
19344 if (globalMemoryFPAtomicIsLegal(Subtarget: *Subtarget, RMW, HasSystemScope)) {
19345 if (AS == AMDGPUAS::FLAT_ADDRESS) {
19346 // gfx942, gfx12
19347 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
19348 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19349 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
19350 // gfx90a, gfx942, gfx12
19351 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
19352 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19353
19354 // gfx942, gfx12
19355 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
19356 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19357 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
19358 // gfx90a, gfx942, gfx12
19359 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
19360 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19361
19362 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
19363 // buffer. gfx12 does have the buffer version.
19364 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
19365 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19366 }
19367
19368 // global and flat atomic fadd f64: gfx90a, gfx942.
19369 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
19370 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19371
19372 if (AS != AMDGPUAS::FLAT_ADDRESS) {
19373 if (Ty->isFloatTy()) {
19374 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
19375 // gfx11+.
19376 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19377 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19378 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
19379 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19380 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19381 } else {
19382 // gfx908
19383 if (RMW->use_empty() &&
19384 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
19385 isV2F16(Ty))
19386 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19387 }
19388 }
19389
19390 // flat atomic fadd f32: gfx942, gfx11+.
19391 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
19392 if (Subtarget->hasFlatAtomicFaddF32Inst())
19393 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19394
19395 // If it is in flat address space, and the type is float, we will try to
19396 // expand it, if the target supports global and lds atomic fadd. The
19397 // reason we need that is, in the expansion, we emit the check of
19398 // address space. If it is in global address space, we emit the global
19399 // atomic fadd; if it is in shared address space, we emit the LDS atomic
19400 // fadd.
19401 if (Subtarget->hasLDSFPAtomicAddF32()) {
19402 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19403 return AtomicExpansionKind::CustomExpand;
19404 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19405 return AtomicExpansionKind::CustomExpand;
19406 }
19407 }
19408 }
19409
19410 return AtomicExpansionKind::CmpXChg;
19411 }
19412 case AtomicRMWInst::FMin:
19413 case AtomicRMWInst::FMax: {
19414 Type *Ty = RMW->getType();
19415
19416 // LDS float and double fmin/fmax were always supported.
19417 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
19418 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
19419 : AtomicExpansionKind::CmpXChg;
19420 }
19421
19422 if (globalMemoryFPAtomicIsLegal(Subtarget: *Subtarget, RMW, HasSystemScope)) {
19423 // For flat and global cases:
19424 // float, double in gfx7. Manual claims denormal support.
19425 // Removed in gfx8.
19426 // float, double restored in gfx10.
19427 // double removed again in gfx11, so only f32 for gfx11/gfx12.
19428 //
19429 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
19430 // no f32.
19431 if (AS == AMDGPUAS::FLAT_ADDRESS) {
19432 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
19433 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19434 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
19435 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19436 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
19437 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
19438 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
19439 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19440 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
19441 return ReportUnsafeHWInst(AtomicExpansionKind::None);
19442 }
19443 }
19444
19445 return AtomicExpansionKind::CmpXChg;
19446 }
19447 case AtomicRMWInst::Nand:
19448 case AtomicRMWInst::FSub:
19449 default:
19450 return AtomicExpansionKind::CmpXChg;
19451 }
19452
19453 llvm_unreachable("covered atomicrmw op switch");
19454}
19455
19456TargetLowering::AtomicExpansionKind
19457SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
19458 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
19459 ? getPrivateAtomicExpansionKind(STI: *getSubtarget())
19460 : AtomicExpansionKind::None;
19461}
19462
19463TargetLowering::AtomicExpansionKind
19464SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
19465 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
19466 ? getPrivateAtomicExpansionKind(STI: *getSubtarget())
19467 : AtomicExpansionKind::None;
19468}
19469
19470TargetLowering::AtomicExpansionKind
19471SITargetLowering::shouldExpandAtomicCmpXchgInIR(
19472 const AtomicCmpXchgInst *CmpX) const {
19473 unsigned AddrSpace = CmpX->getPointerAddressSpace();
19474 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
19475 return getPrivateAtomicExpansionKind(STI: *getSubtarget());
19476
19477 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(I: CmpX))
19478 return AtomicExpansionKind::None;
19479
19480 const DataLayout &DL = CmpX->getDataLayout();
19481
19482 Type *ValTy = CmpX->getNewValOperand()->getType();
19483
19484 // If a 64-bit flat atomic may alias private, we need to avoid using the
19485 // atomic in the private case.
19486 return DL.getTypeSizeInBits(Ty: ValTy) == 64 ? AtomicExpansionKind::CustomExpand
19487 : AtomicExpansionKind::None;
19488}
19489
19490const TargetRegisterClass *
19491SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
19492 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, isDivergent: false);
19493 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
19494 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
19495 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
19496 : &AMDGPU::SReg_32RegClass;
19497 if (!TRI->isSGPRClass(RC) && !isDivergent)
19498 return TRI->getEquivalentSGPRClass(VRC: RC);
19499 if (TRI->isSGPRClass(RC) && isDivergent) {
19500 if (Subtarget->hasGFX90AInsts())
19501 return TRI->getEquivalentAVClass(SRC: RC);
19502 return TRI->getEquivalentVGPRClass(SRC: RC);
19503 }
19504
19505 return RC;
19506}
19507
19508// FIXME: This is a workaround for DivergenceAnalysis not understanding always
19509// uniform values (as produced by the mask results of control flow intrinsics)
19510// used outside of divergent blocks. The phi users need to also be treated as
19511// always uniform.
19512//
19513// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
19514static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
19515 unsigned WaveSize) {
19516 // FIXME: We assume we never cast the mask results of a control flow
19517 // intrinsic.
19518 // Early exit if the type won't be consistent as a compile time hack.
19519 IntegerType *IT = dyn_cast<IntegerType>(Val: V->getType());
19520 if (!IT || IT->getBitWidth() != WaveSize)
19521 return false;
19522
19523 if (!isa<Instruction>(Val: V))
19524 return false;
19525 if (!Visited.insert(Ptr: V).second)
19526 return false;
19527 bool Result = false;
19528 for (const auto *U : V->users()) {
19529 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(Val: U)) {
19530 if (V == U->getOperand(i: 1)) {
19531 switch (Intrinsic->getIntrinsicID()) {
19532 default:
19533 Result = false;
19534 break;
19535 case Intrinsic::amdgcn_if_break:
19536 case Intrinsic::amdgcn_if:
19537 case Intrinsic::amdgcn_else:
19538 Result = true;
19539 break;
19540 }
19541 }
19542 if (V == U->getOperand(i: 0)) {
19543 switch (Intrinsic->getIntrinsicID()) {
19544 default:
19545 Result = false;
19546 break;
19547 case Intrinsic::amdgcn_end_cf:
19548 case Intrinsic::amdgcn_loop:
19549 Result = true;
19550 break;
19551 }
19552 }
19553 } else {
19554 Result = hasCFUser(V: U, Visited, WaveSize);
19555 }
19556 if (Result)
19557 break;
19558 }
19559 return Result;
19560}
19561
19562bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
19563 const Value *V) const {
19564 if (const CallInst *CI = dyn_cast<CallInst>(Val: V)) {
19565 if (CI->isInlineAsm()) {
19566 // FIXME: This cannot give a correct answer. This should only trigger in
19567 // the case where inline asm returns mixed SGPR and VGPR results, used
19568 // outside the defining block. We don't have a specific result to
19569 // consider, so this assumes if any value is SGPR, the overall register
19570 // also needs to be SGPR.
19571 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
19572 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
19573 DL: MF.getDataLayout(), TRI: Subtarget->getRegisterInfo(), Call: *CI);
19574 for (auto &TC : TargetConstraints) {
19575 if (TC.Type == InlineAsm::isOutput) {
19576 ComputeConstraintToUse(OpInfo&: TC, Op: SDValue());
19577 const TargetRegisterClass *RC =
19578 getRegForInlineAsmConstraint(TRI_: SIRI, Constraint: TC.ConstraintCode,
19579 VT: TC.ConstraintVT)
19580 .second;
19581 if (RC && SIRI->isSGPRClass(RC))
19582 return true;
19583 }
19584 }
19585 }
19586 }
19587 SmallPtrSet<const Value *, 16> Visited;
19588 return hasCFUser(V, Visited, WaveSize: Subtarget->getWavefrontSize());
19589}
19590
19591bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
19592 for (SDUse &Use : N->uses()) {
19593 if (MemSDNode *M = dyn_cast<MemSDNode>(Val: Use.getUser())) {
19594 if (getBasePtrIndex(N: M) == Use.getOperandNo())
19595 return true;
19596 }
19597 }
19598 return false;
19599}
19600
19601bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
19602 SDValue N1) const {
19603 if (!N0.hasOneUse())
19604 return false;
19605 // Take care of the opportunity to keep N0 uniform
19606 if (N0->isDivergent() || !N1->isDivergent())
19607 return true;
19608 // Check if we have a good chance to form the memory access pattern with the
19609 // base and offset
19610 return (DAG.isBaseWithConstantOffset(Op: N0) &&
19611 hasMemSDNodeUser(N: *N0->user_begin()));
19612}
19613
19614bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
19615 Register N0, Register N1) const {
19616 return MRI.hasOneNonDBGUse(RegNo: N0); // FIXME: handle regbanks
19617}
19618
19619MachineMemOperand::Flags
19620SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
19621 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
19622 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
19623 if (I.getMetadata(Kind: "amdgpu.noclobber"))
19624 Flags |= MONoClobber;
19625 if (I.getMetadata(Kind: "amdgpu.last.use"))
19626 Flags |= MOLastUse;
19627 return Flags;
19628}
19629
void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
    Instruction *AI) const {
  // Given: atomicrmw fadd ptr %addr, float %val ordering
  //
  // With this expansion we produce the following code:
  // [...]
  // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
  // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
  //
  // atomicrmw.shared:
  // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
  // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
  // float %val ordering
  // br label %atomicrmw.phi
  //
  // atomicrmw.check.private:
  // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
  // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
  //
  // atomicrmw.private:
  // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
  // %loaded.private = load float, ptr addrspace(5) %cast.private
  // %val.new = fadd float %loaded.private, %val
  // store float %val.new, ptr addrspace(5) %cast.private
  // br label %atomicrmw.phi
  //
  // atomicrmw.global:
  // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
  // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
  // float %val ordering
  // br label %atomicrmw.phi
  //
  // atomicrmw.phi:
  // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
  // [ %loaded.private, %atomicrmw.private ],
  // [ %loaded.global, %atomicrmw.global ]
  // br label %atomicrmw.end
  //
  // atomicrmw.end:
  // [...]
  //
  //
  // For 64-bit atomics which may reside in private memory, we perform a simpler
  // version that only inserts the private check, and uses the flat operation.

  IRBuilder<> Builder(AI);
  LLVMContext &Ctx = Builder.getContext();

  // AI is either an atomicrmw or a cmpxchg; RMW is null for the cmpxchg case.
  auto *RMW = dyn_cast<AtomicRMWInst>(Val: AI);
  const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
                                : AtomicCmpXchgInst::getPointerOperandIndex();
  Value *Addr = AI->getOperand(i: PtrOpIdx);

  /// TODO: Only need to check private, then emit flat-known-not private (no
  /// need for shared block, or cast to global).
  AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(Val: AI);

  // NOTE(review): Alignment is computed here but not referenced below — the
  // private path reads RMW->getAlign()/CX->getAlign() directly. Confirm
  // whether this local can be removed.
  Align Alignment;
  if (RMW)
    Alignment = RMW->getAlign();
  else if (CX)
    Alignment = CX->getAlign();
  else
    llvm_unreachable("unhandled atomic operation");

  // FullFlatEmulation is true if we need to issue the private, shared, and
  // global cases.
  //
  // If this is false, we are only dealing with the flat-targeting-private case,
  // where we only insert a check for private and still use the flat instruction
  // for global and shared.

  bool FullFlatEmulation =
      RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
      ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
       (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
        RMW->getType()->isDoubleTy()));

  // If the return value isn't used, do not introduce a false use in the phi.
  bool ReturnValueIsUsed = !AI->use_empty();

  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();
  BasicBlock *ExitBB =
      BB->splitBasicBlock(I: Builder.GetInsertPoint(), BBName: "atomicrmw.end");
  BasicBlock *SharedBB = nullptr;

  // Without full emulation the shared/check blocks are skipped and BB itself
  // performs the private check.
  BasicBlock *CheckPrivateBB = BB;
  if (FullFlatEmulation) {
    SharedBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.shared", Parent: F, InsertBefore: ExitBB);
    CheckPrivateBB =
        BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.check.private", Parent: F, InsertBefore: ExitBB);
  }

  BasicBlock *PrivateBB =
      BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.private", Parent: F, InsertBefore: ExitBB);
  BasicBlock *GlobalBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.global", Parent: F, InsertBefore: ExitBB);
  BasicBlock *PhiBB = BasicBlock::Create(Context&: Ctx, Name: "atomicrmw.phi", Parent: F, InsertBefore: ExitBB);

  // splitBasicBlock appended an unconditional branch to ExitBB; remove it so
  // we can emit our own terminator for BB.
  std::prev(x: BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);

  Value *LoadedShared = nullptr;
  if (FullFlatEmulation) {
    CallInst *IsShared = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_is_shared,
                                                 Args: {Addr}, FMFSource: nullptr, Name: "is.shared");
    Builder.CreateCondBr(Cond: IsShared, True: SharedBB, False: CheckPrivateBB);
    Builder.SetInsertPoint(SharedBB);
    Value *CastToLocal = Builder.CreateAddrSpaceCast(
        V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::LOCAL_ADDRESS));

    // Clone the original atomic for the LDS case, retargeting its pointer
    // operand to the addrspace(3) cast.
    Instruction *Clone = AI->clone();
    Clone->insertInto(ParentBB: SharedBB, It: SharedBB->end());
    Clone->getOperandUse(i: PtrOpIdx).set(CastToLocal);
    LoadedShared = Clone;

    Builder.CreateBr(Dest: PhiBB);
    Builder.SetInsertPoint(CheckPrivateBB);
  }

  CallInst *IsPrivate = Builder.CreateIntrinsic(ID: Intrinsic::amdgcn_is_private,
                                                Args: {Addr}, FMFSource: nullptr, Name: "is.private");
  Builder.CreateCondBr(Cond: IsPrivate, True: PrivateBB, False: GlobalBB);

  Builder.SetInsertPoint(PrivateBB);

  Value *CastToPrivate = Builder.CreateAddrSpaceCast(
      V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::PRIVATE_ADDRESS));

  // Private memory is only accessible by the current lane, so the atomic is
  // emulated with a plain load/modify/store (RMW) or load/compare/store
  // (cmpxchg).
  Value *LoadedPrivate;
  if (RMW) {
    LoadedPrivate = Builder.CreateAlignedLoad(
        Ty: RMW->getType(), Ptr: CastToPrivate, Align: RMW->getAlign(), Name: "loaded.private");

    Value *NewVal = buildAtomicRMWValue(Op: RMW->getOperation(), Builder,
                                        Loaded: LoadedPrivate, Val: RMW->getValOperand());

    Builder.CreateAlignedStore(Val: NewVal, Ptr: CastToPrivate, Align: RMW->getAlign());
  } else {
    auto [ResultLoad, Equal] =
        buildCmpXchgValue(Builder, Ptr: CastToPrivate, Cmp: CX->getCompareOperand(),
                          Val: CX->getNewValOperand(), Alignment: CX->getAlign());

    // Repackage the {loaded, success} pair to match cmpxchg's aggregate
    // result type.
    Value *Insert = Builder.CreateInsertValue(Agg: PoisonValue::get(T: CX->getType()),
                                              Val: ResultLoad, Idxs: 0);
    LoadedPrivate = Builder.CreateInsertValue(Agg: Insert, Val: Equal, Idxs: 1);
  }

  Builder.CreateBr(Dest: PhiBB);

  Builder.SetInsertPoint(GlobalBB);

  // Continue using a flat instruction if we only emitted the check for private.
  Instruction *LoadedGlobal = AI;
  if (FullFlatEmulation) {
    Value *CastToGlobal = Builder.CreateAddrSpaceCast(
        V: Addr, DestTy: PointerType::get(C&: Ctx, AddressSpace: AMDGPUAS::GLOBAL_ADDRESS));
    AI->getOperandUse(i: PtrOpIdx).set(CastToGlobal);
  }

  // Reuse the original atomic as the global (or still-flat) case by moving it
  // into GlobalBB.
  AI->removeFromParent();
  AI->insertInto(ParentBB: GlobalBB, It: GlobalBB->end());

  // The new atomicrmw may go through another round of legalization later.
  if (!FullFlatEmulation) {
    // We inserted the runtime check already, make sure we do not try to
    // re-expand this.
    // TODO: Should union with any existing metadata.
    MDBuilder MDB(F->getContext());
    MDNode *RangeNotPrivate =
        MDB.createRange(Lo: APInt(32, AMDGPUAS::PRIVATE_ADDRESS),
                        Hi: APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1));
    LoadedGlobal->setMetadata(KindID: LLVMContext::MD_noalias_addrspace,
                              Node: RangeNotPrivate);
  }

  Builder.CreateBr(Dest: PhiBB);

  Builder.SetInsertPoint(PhiBB);

  if (ReturnValueIsUsed) {
    PHINode *Loaded = Builder.CreatePHI(Ty: AI->getType(), NumReservedValues: 3);
    AI->replaceAllUsesWith(V: Loaded);
    if (FullFlatEmulation)
      Loaded->addIncoming(V: LoadedShared, BB: SharedBB);
    Loaded->addIncoming(V: LoadedPrivate, BB: PrivateBB);
    Loaded->addIncoming(V: LoadedGlobal, BB: GlobalBB);
    Loaded->takeName(V: AI);
  }

  Builder.CreateBr(Dest: ExitBB);
}
19822
19823static void convertScratchAtomicToFlatAtomic(Instruction *I,
19824 unsigned PtrOpIdx) {
19825 Value *PtrOp = I->getOperand(i: PtrOpIdx);
19826 assert(PtrOp->getType()->getPointerAddressSpace() ==
19827 AMDGPUAS::PRIVATE_ADDRESS);
19828
19829 Type *FlatPtr = PointerType::get(C&: I->getContext(), AddressSpace: AMDGPUAS::FLAT_ADDRESS);
19830 Value *ASCast = CastInst::CreatePointerCast(S: PtrOp, Ty: FlatPtr, Name: "scratch.ascast",
19831 InsertBefore: I->getIterator());
19832 I->setOperand(i: PtrOpIdx, Val: ASCast);
19833}
19834
19835void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
19836 AtomicRMWInst::BinOp Op = AI->getOperation();
19837
19838 if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19839 return convertScratchAtomicToFlatAtomic(I: AI, PtrOpIdx: AI->getPointerOperandIndex());
19840
19841 if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
19842 Op == AtomicRMWInst::Xor) {
19843 if (const auto *ConstVal = dyn_cast<Constant>(Val: AI->getValOperand());
19844 ConstVal && ConstVal->isNullValue()) {
19845 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19846 AI->setOperation(AtomicRMWInst::Add);
19847
19848 // We may still need the private-alias-flat handling below.
19849
19850 // TODO: Skip this for cases where we cannot access remote memory.
19851 }
19852 }
19853
19854 // The non-flat expansions should only perform the de-canonicalization of
19855 // identity values.
19856 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
19857 return;
19858
19859 emitExpandAtomicAddrSpacePredicate(AI);
19860}
19861
19862void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
19863 if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19864 return convertScratchAtomicToFlatAtomic(I: CI, PtrOpIdx: CI->getPointerOperandIndex());
19865
19866 emitExpandAtomicAddrSpacePredicate(AI: CI);
19867}
19868
19869void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
19870 if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19871 return convertScratchAtomicToFlatAtomic(I: LI, PtrOpIdx: LI->getPointerOperandIndex());
19872
19873 llvm_unreachable(
19874 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19875}
19876
19877void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
19878 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19879 return convertScratchAtomicToFlatAtomic(I: SI, PtrOpIdx: SI->getPointerOperandIndex());
19880
19881 llvm_unreachable(
19882 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19883}
19884
19885LoadInst *
19886SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19887 IRBuilder<> Builder(AI);
19888 auto Order = AI->getOrdering();
19889
19890 // The optimization removes store aspect of the atomicrmw. Therefore, cache
19891 // must be flushed if the atomic ordering had a release semantics. This is
19892 // not necessary a fence, a release fence just coincides to do that flush.
19893 // Avoid replacing of an atomicrmw with a release semantics.
19894 if (isReleaseOrStronger(AO: Order))
19895 return nullptr;
19896
19897 LoadInst *LI = Builder.CreateAlignedLoad(
19898 Ty: AI->getType(), Ptr: AI->getPointerOperand(), Align: AI->getAlign());
19899 LI->setAtomic(Ordering: Order, SSID: AI->getSyncScopeID());
19900 LI->copyMetadata(SrcInst: *AI);
19901 LI->takeName(V: AI);
19902 AI->replaceAllUsesWith(V: LI);
19903 AI->eraseFromParent();
19904 return LI;
19905}
19906